docling-core 2.44.1__tar.gz → 2.45.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of docling-core might be problematic.
- {docling_core-2.44.1 → docling_core-2.45.0}/PKG-INFO +1 -1
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/serializer/html.py +34 -74
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/doc/document.py +165 -236
- docling_core-2.45.0/docling_core/types/doc/utils.py +282 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core.egg-info/PKG-INFO +1 -1
- {docling_core-2.44.1 → docling_core-2.45.0}/pyproject.toml +1 -1
- {docling_core-2.44.1 → docling_core-2.45.0}/test/test_docling_doc.py +119 -1
- {docling_core-2.44.1 → docling_core-2.45.0}/test/test_serialization.py +18 -0
- docling_core-2.44.1/docling_core/types/doc/utils.py +0 -86
- {docling_core-2.44.1 → docling_core-2.45.0}/LICENSE +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/README.md +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/__init__.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/cli/view.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/experimental/__init__.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/py.typed +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/search/__init__.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/search/mapping.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/search/meta.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/search/package.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/chunker/page_chunker.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/chunker/tokenizer/__init__.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/chunker/tokenizer/base.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/chunker/tokenizer/huggingface.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/chunker/tokenizer/openai.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/serializer/__init__.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/serializer/base.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/serializer/common.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/serializer/doctags.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/serializer/html_styles.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/serializer/markdown.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/visualizer/__init__.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/visualizer/base.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/visualizer/key_value_visualizer.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/visualizer/layout_visualizer.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/visualizer/reading_order_visualizer.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/transforms/visualizer/table_visualizer.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/__init__.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/base.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/doc/labels.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/doc/page.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/utils/alias.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/utils/file.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/utils/legacy.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/utils/validate.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core/utils/validators.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core.egg-info/SOURCES.txt +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core.egg-info/dependency_links.txt +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core.egg-info/entry_points.txt +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core.egg-info/requires.txt +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/docling_core.egg-info/top_level.txt +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/setup.cfg +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/test/test_base.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/test/test_collection.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/test/test_data_gen_flag.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/test/test_doc_base.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/test/test_doc_legacy_convert.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/test/test_doc_schema.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/test/test_doc_schema_extractor.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/test/test_doctags_load.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/test/test_hierarchical_chunker.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/test/test_hybrid_chunker.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/test/test_json_schema_to_search_mapper.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/test/test_nlp_qa.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/test/test_otsl_table_export.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/test/test_page.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/test/test_page_chunker.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/test/test_rec_schema.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/test/test_search_meta.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/test/test_utils.py +0 -0
- {docling_core-2.44.1 → docling_core-2.45.0}/test/test_visualization.py +0 -0
--- docling_core-2.44.1/PKG-INFO
+++ docling_core-2.45.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling-core
-Version: 2.44.1
+Version: 2.45.0
 Summary: A python library to define and validate data types in Docling.
 Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
 Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
--- docling_core-2.44.1/docling_core/transforms/serializer/html.py
+++ docling_core-2.45.0/docling_core/transforms/serializer/html.py
@@ -130,11 +130,14 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
         doc_serializer: BaseDocSerializer,
         doc: DoclingDocument,
         is_inline_scope: bool = False,
+        visited: Optional[set[str]] = None,
         **kwargs: Any,
     ) -> SerializationResult:
         """Serializes the passed text item to HTML."""
         params = HTMLParams(**kwargs)
+        my_visited: set[str] = visited if visited is not None else set()
         res_parts: list[SerializationResult] = []
+        post_processed = False

         # Prepare the HTML based on item type
         if isinstance(item, TitleItem):
@@ -162,7 +165,28 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):

         elif isinstance(item, ListItem):
             # List items are handled by list serializer
-            …
+            text_parts: list[str] = []
+            if item_text := self._prepare_content(item.text):
+                item_text = doc_serializer.post_process(
+                    text=item_text,
+                    formatting=item.formatting,
+                    hyperlink=item.hyperlink,
+                )
+                post_processed = True
+                text_parts.append(item_text)
+            nested_parts = [
+                r.text
+                for r in doc_serializer.get_parts(
+                    item=item,
+                    is_inline_scope=is_inline_scope,
+                    visited=my_visited,
+                    **kwargs,
+                )
+            ]
+            text_parts.extend(nested_parts)
+            text_inner = "\n".join(text_parts)
+            if nested_parts:
+                text_inner = f"\n{text_inner}\n"
             text = (
                 get_html_tag_with_text_direction(
                     html_tag="li",
@@ -185,11 +209,12 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
             text = get_html_tag_with_text_direction(html_tag="p", text=text_inner)

         # Apply formatting and hyperlinks
-        text = doc_serializer.post_process(
-            text=text,
-            formatting=item.formatting,
-            hyperlink=item.hyperlink,
-        )
+        if not post_processed:
+            text = doc_serializer.post_process(
+                text=text,
+                formatting=item.formatting,
+                hyperlink=item.hyperlink,
+            )

         if text:
             text_res = create_ser_result(text=text, span_source=item)
@@ -703,7 +728,6 @@ class HTMLListSerializer(BaseModel, BaseListSerializer):
     ) -> SerializationResult:
         """Serializes a list to HTML."""
         my_visited: set[str] = visited if visited is not None else set()
-        params = HTMLParams(**kwargs)
         # Get all child parts
         parts = doc_serializer.get_parts(
             item=item,
@@ -713,72 +737,8 @@ class HTMLListSerializer(BaseModel, BaseListSerializer):
             **kwargs,
         )

-        # Append nested list to parent list item:
-        i = 0
-        while i < len(parts):
-            prt = parts[i]
-            if prt.text.startswith(("<ul>", "<ol>")):
-                for j in range(i - 1, -1, -1):
-                    if parts[j].text.startswith(("<li>", "<li ")) and parts[
-                        j
-                    ].text.endswith("</li>"):
-                        before, _, _ = parts[j].text.rpartition("</li>")
-                        parts[j].text = f"{before}\n{prt.text}\n</li>"
-                        break
-                if j > -1:
-                    parts.pop(i)
-                else:
-                    i += 1
-
         # Add all child parts
-        text_res = "\n".join(
-            [
-                (
-                    p.text
-                    if (
-                        (
-                            p.text.startswith(("<li>", "<li "))
-                            and p.text.endswith("</li>")
-                        )
-                        or (
-                            p.text.startswith(("<ol>", "<ol "))
-                            and p.text.endswith("</ol>")
-                        )
-                        or (
-                            p.text.startswith(("<ul>", "<ul "))
-                            and p.text.endswith("</ul>")
-                        )
-                    )
-                    else (
-                        get_html_tag_with_text_direction(
-                            html_tag="li",
-                            text=p.text,
-                            attrs=(
-                                {
-                                    "style": f"list-style-type: '{grandparent_item.marker} ';"
-                                }
-                                if params.show_original_list_item_marker
-                                and grandparent_item.marker
-                                else {}
-                            ),
-                        )
-                        if p.spans
-                        and p.spans[0].item.parent
-                        and isinstance(
-                            (parent_item := p.spans[0].item.parent.resolve(doc)),
-                            InlineGroup,
-                        )
-                        and parent_item.parent
-                        and isinstance(
-                            (grandparent_item := parent_item.parent.resolve(doc)),
-                            ListItem,
-                        )
-                        else f"<li>{p.text}</li>"
-                    )
-                )
-                for p in parts
-            ]
-        )
+        text_res = "\n".join(p.text for p in parts if p.text)
         if text_res:
             tag = "ol" if item.first_item_is_enumerated(doc) else "ul"
             text_res = f"<{tag}>\n{text_res}\n</{tag}>"
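The reworked list handling above moves nesting into the ListItem branch of HTMLTextSerializer (nested parts are pulled in via get_parts and embedded inside the parent <li>), which is why HTMLListSerializer can now simply join its child parts. Below is a minimal sketch, not taken from this diff, of how the resulting HTML could be inspected; the input file name is hypothetical and the serializer entry point is assumed to be the usual one:

```python
# Hedged sketch: regenerate HTML for an existing document to see the new
# nested-list output; "my_doc.json" is a hypothetical input file.
from docling_core.types.doc.document import DoclingDocument
from docling_core.transforms.serializer.html import HTMLDocSerializer

doc = DoclingDocument.load_from_json("my_doc.json")  # hypothetical file
html = HTMLDocSerializer(doc=doc).serialize().text
# With this release, a nested list is emitted inside its parent <li>,
# roughly of the form:
#   <li>parent item
#   <ul>
#   <li>child item</li>
#   </ul>
#   </li>
print(html)
```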
@@ -1097,7 +1057,7 @@ class HTMLDocSerializer(DocSerializer):
         if self.params.html_head is not None:
             return self.params.html_head

-        head_parts = ["<head>", '<meta charset="UTF-8">']
+        head_parts = ["<head>", '<meta charset="UTF-8"/>']

         # Add metadata if requested
         if params.add_document_metadata:
@@ -1107,7 +1067,7 @@ class HTMLDocSerializer(DocSerializer):
             head_parts.append("<title>Docling Document</title>")

         head_parts.append(
-            '<meta name="generator" content="Docling HTML Serializer">'
+            '<meta name="generator" content="Docling HTML Serializer"/>'
         )

         # Add default styles or custom CSS
--- docling_core-2.44.1/docling_core/types/doc/document.py
+++ docling_core-2.45.0/docling_core/types/doc/document.py
@@ -3,7 +3,6 @@
 import base64
 import copy
 import hashlib
-import itertools
 import json
 import logging
 import mimetypes
@@ -54,8 +53,8 @@ from docling_core.types.doc.labels import (
     GroupLabel,
     PictureClassificationLabel,
 )
-from docling_core.types.doc.tokens import …
-from docling_core.types.doc.utils import relative_path
+from docling_core.types.doc.tokens import DocumentToken, TableToken
+from docling_core.types.doc.utils import parse_otsl_table_content, relative_path

 _logger = logging.getLogger(__name__)

@@ -4688,181 +4687,6 @@ class DoclingDocument(BaseModel):
                 bbox = None
             return caption_item, bbox

-        def otsl_parse_texts(texts, tokens):
-            split_word = TableToken.OTSL_NL.value
-            # CLEAN tokens from extra tags, only structural OTSL allowed
-            clean_tokens = []
-            for t in tokens:
-                if t in [
-                    TableToken.OTSL_ECEL.value,
-                    TableToken.OTSL_FCEL.value,
-                    TableToken.OTSL_LCEL.value,
-                    TableToken.OTSL_UCEL.value,
-                    TableToken.OTSL_XCEL.value,
-                    TableToken.OTSL_NL.value,
-                    TableToken.OTSL_CHED.value,
-                    TableToken.OTSL_RHED.value,
-                    TableToken.OTSL_SROW.value,
-                ]:
-                    clean_tokens.append(t)
-            tokens = clean_tokens
-            split_row_tokens = [
-                list(y)
-                for x, y in itertools.groupby(tokens, lambda z: z == split_word)
-                if not x
-            ]
-
-            table_cells = []
-            r_idx = 0
-            c_idx = 0
-
-            def count_right(tokens, c_idx, r_idx, which_tokens):
-                span = 0
-                c_idx_iter = c_idx
-                while tokens[r_idx][c_idx_iter] in which_tokens:
-                    c_idx_iter += 1
-                    span += 1
-                    if c_idx_iter >= len(tokens[r_idx]):
-                        return span
-                return span
-
-            def count_down(tokens, c_idx, r_idx, which_tokens):
-                span = 0
-                r_idx_iter = r_idx
-                while tokens[r_idx_iter][c_idx] in which_tokens:
-                    r_idx_iter += 1
-                    span += 1
-                    if r_idx_iter >= len(tokens):
-                        return span
-                return span
-
-            for i, text in enumerate(texts):
-                cell_text = ""
-                if text in [
-                    TableToken.OTSL_FCEL.value,
-                    TableToken.OTSL_ECEL.value,
-                    TableToken.OTSL_CHED.value,
-                    TableToken.OTSL_RHED.value,
-                    TableToken.OTSL_SROW.value,
-                ]:
-                    row_span = 1
-                    col_span = 1
-                    right_offset = 1
-                    if text != TableToken.OTSL_ECEL.value:
-                        cell_text = texts[i + 1]
-                        right_offset = 2
-
-                    # Check next element(s) for lcel / ucel / xcel,
-                    # set properly row_span, col_span
-                    next_right_cell = ""
-                    if i + right_offset < len(texts):
-                        next_right_cell = texts[i + right_offset]
-
-                    next_bottom_cell = ""
-                    if r_idx + 1 < len(split_row_tokens):
-                        if c_idx < len(split_row_tokens[r_idx + 1]):
-                            next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]
-
-                    if next_right_cell in [
-                        TableToken.OTSL_LCEL.value,
-                        TableToken.OTSL_XCEL.value,
-                    ]:
-                        # we have horisontal spanning cell or 2d spanning cell
-                        col_span += count_right(
-                            split_row_tokens,
-                            c_idx + 1,
-                            r_idx,
-                            [TableToken.OTSL_LCEL.value, TableToken.OTSL_XCEL.value],
-                        )
-                    if next_bottom_cell in [
-                        TableToken.OTSL_UCEL.value,
-                        TableToken.OTSL_XCEL.value,
-                    ]:
-                        # we have a vertical spanning cell or 2d spanning cell
-                        row_span += count_down(
-                            split_row_tokens,
-                            c_idx,
-                            r_idx + 1,
-                            [TableToken.OTSL_UCEL.value, TableToken.OTSL_XCEL.value],
-                        )
-
-                    table_cells.append(
-                        TableCell(
-                            text=cell_text.strip(),
-                            row_span=row_span,
-                            col_span=col_span,
-                            start_row_offset_idx=r_idx,
-                            end_row_offset_idx=r_idx + row_span,
-                            start_col_offset_idx=c_idx,
-                            end_col_offset_idx=c_idx + col_span,
-                        )
-                    )
-                if text in [
-                    TableToken.OTSL_FCEL.value,
-                    TableToken.OTSL_ECEL.value,
-                    TableToken.OTSL_CHED.value,
-                    TableToken.OTSL_RHED.value,
-                    TableToken.OTSL_SROW.value,
-                    TableToken.OTSL_LCEL.value,
-                    TableToken.OTSL_UCEL.value,
-                    TableToken.OTSL_XCEL.value,
-                ]:
-                    c_idx += 1
-                if text == TableToken.OTSL_NL.value:
-                    r_idx += 1
-                    c_idx = 0
-            return table_cells, split_row_tokens
-
-        def otsl_extract_tokens_and_text(s: str):
-            # Pattern to match anything enclosed by < >
-            # (including the angle brackets themselves)
-            pattern = r"(<[^>]+>)"
-            # Find all tokens (e.g. "<otsl>", "<loc_140>", etc.)
-            tokens = re.findall(pattern, s)
-            # Remove any tokens that start with "<loc_"
-            tokens = [
-                token
-                for token in tokens
-                if not (
-                    token.startswith(rf"<{_LOC_PREFIX}")
-                    or token
-                    in [
-                        rf"<{DocumentToken.OTSL.value}>",
-                        rf"</{DocumentToken.OTSL.value}>",
-                    ]
-                )
-            ]
-            # Split the string by those tokens to get the in-between text
-            text_parts = re.split(pattern, s)
-            text_parts = [
-                token
-                for token in text_parts
-                if not (
-                    token.startswith(rf"<{_LOC_PREFIX}")
-                    or token
-                    in [
-                        rf"<{DocumentToken.OTSL.value}>",
-                        rf"</{DocumentToken.OTSL.value}>",
-                    ]
-                )
-            ]
-            # Remove any empty or purely whitespace strings from text_parts
-            text_parts = [part for part in text_parts if part.strip()]
-
-            return tokens, text_parts
-
-        def parse_table_content(otsl_content: str) -> TableData:
-            tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
-            table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)
-
-            return TableData(
-                num_rows=len(split_row_tokens),
-                num_cols=(
-                    max(len(row) for row in split_row_tokens) if split_row_tokens else 0
-                ),
-                table_cells=table_cells,
-            )
-
         def extract_chart_type(text_chunk: str):
             label = None
             chart_labels = [
@@ -5094,7 +4918,7 @@ class DoclingDocument(BaseModel):
             doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.TEXT)

             if tag_name == DocumentToken.OTSL.value:
-                table_data = parse_table_content(full_chunk)
+                table_data = parse_otsl_table_content(full_chunk)
                 caption, caption_bbox = extract_caption(full_chunk)
                 if caption is not None and caption_bbox is not None:
                     caption.prov.append(
@@ -5137,7 +4961,7 @@ class DoclingDocument(BaseModel):
                 table_data = None
                 chart_type = None
                 if tag_name == DocumentToken.CHART.value:
-                    table_data = parse_table_content(full_chunk)
+                    table_data = parse_otsl_table_content(full_chunk)
                     chart_type = extract_chart_type(full_chunk)
                     if image:
                         if bbox:
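The OTSL helpers removed above are not dropped: per the file list, the new docling_core/types/doc/utils.py (+282 lines) now provides parse_otsl_table_content, which the two hunks above call in place of the old nested parse_table_content. A hedged sketch of calling the relocated parser directly, assuming it keeps the behavior of the removed helper and using only the TableToken members referenced in the removed code:

```python
# Hedged sketch: parse an OTSL snippet with the relocated utility.
# Assumes parse_otsl_table_content behaves like the removed
# parse_table_content helper shown in the hunk above.
from docling_core.types.doc.tokens import TableToken
from docling_core.types.doc.utils import parse_otsl_table_content

otsl = (
    f"{TableToken.OTSL_CHED.value}Name{TableToken.OTSL_CHED.value}Score{TableToken.OTSL_NL.value}"
    f"{TableToken.OTSL_FCEL.value}Alice{TableToken.OTSL_FCEL.value}42{TableToken.OTSL_NL.value}"
)
table_data = parse_otsl_table_content(otsl)  # returns a TableData instance
print(table_data.num_rows, table_data.num_cols)  # expected: 2 2
```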
@@ -5683,69 +5507,174 @@ class DoclingDocument(BaseModel):
         )
         return self

+    class _DocIndex(BaseModel):
+        """A document merge buffer."""
+
+        groups: list[GroupItem] = []
+        texts: list[TextItem] = []
+        pictures: list[PictureItem] = []
+        tables: list[TableItem] = []
+        key_value_items: list[KeyValueItem] = []
+        form_items: list[FormItem] = []
+
+        pages: dict[int, PageItem] = {}
+
+        _body: Optional[GroupItem] = None
+        _max_page: int = 0
+        _names: list[str] = []
+
+        def get_item_list(self, key: str) -> list[NodeItem]:
+            return getattr(self, key)
+
+        def index(self, doc: "DoclingDocument") -> None:
+
+            orig_ref_to_new_ref: dict[str, str] = {}
+            page_delta = self._max_page - min(doc.pages.keys()) + 1 if doc.pages else 0
+
+            if self._body is None:
+                self._body = GroupItem(**doc.body.model_dump(exclude={"children"}))
+
+            self._names.append(doc.name)
+
+            # collect items in traversal order
+            for item, _ in doc.iterate_items(
+                with_groups=True,
+                traverse_pictures=True,
+                included_content_layers={c for c in ContentLayer},
+            ):
+                key = item.self_ref.split("/")[1]
+                is_body = key == "body"
+                new_cref = (
+                    "#/body" if is_body else f"#/{key}/{len(self.get_item_list(key))}"
+                )
+                # register cref mapping:
+                orig_ref_to_new_ref[item.self_ref] = new_cref
+
+                if not is_body:
+                    new_item = copy.deepcopy(item)
+                    new_item.children = []
+
+                    # put item in the right list
+                    self.get_item_list(key).append(new_item)
+
+                    # update item's self reference
+                    new_item.self_ref = new_cref
+
+                    if isinstance(new_item, DocItem):
+                        # update page numbers
+                        # NOTE other prov sources (e.g. GraphCell) currently not covered
+                        for prov in new_item.prov:
+                            prov.page_no += page_delta
+
+                if item.parent:
+                    # set item's parent
+                    new_parent_cref = orig_ref_to_new_ref[item.parent.cref]
+                    new_item.parent = RefItem(cref=new_parent_cref)
+
+                    # add item to parent's children
+                    path_components = new_parent_cref.split("/")
+                    num_components = len(path_components)
+                    if num_components == 3:
+                        _, parent_key, parent_index_str = path_components
+                        parent_index = int(parent_index_str)
+                        parent_item = self.get_item_list(parent_key)[parent_index]
+
+                        # update captions field (not possible in iterate_items order):
+                        if isinstance(parent_item, FloatingItem):
+                            for cap_it, cap in enumerate(parent_item.captions):
+                                if cap.cref == item.self_ref:
+                                    parent_item.captions[cap_it] = RefItem(
+                                        cref=new_cref
+                                    )
+                                    break
+
+                    elif num_components == 2 and path_components[1] == "body":
+                        parent_item = self._body
+                    else:
+                        raise RuntimeError(
+                            f"Unsupported ref format: {new_parent_cref}"
+                        )
+                    parent_item.children.append(RefItem(cref=new_cref))
+
+            # update pages
+            new_max_page = None
+            for page_nr in doc.pages:
+                new_page = copy.deepcopy(doc.pages[page_nr])
+                new_page_nr = page_nr + page_delta
+                new_page.page_no = new_page_nr
+                self.pages[new_page_nr] = new_page
+                if new_max_page is None or new_page_nr > new_max_page:
+                    new_max_page = new_page_nr
+            if new_max_page is not None:
+                self._max_page = new_max_page
+
+        def get_name(self) -> str:
+            return " + ".join(self._names)
+
+    def _update_from_index(self, doc_index: "_DocIndex") -> None:
+        if doc_index._body is not None:
+            self.body = doc_index._body
+        self.groups = doc_index.groups
+        self.texts = doc_index.texts
+        self.pictures = doc_index.pictures
+        self.tables = doc_index.tables
+        self.key_value_items = doc_index.key_value_items
+        self.form_items = doc_index.form_items
+        self.pages = doc_index.pages
+        self.name = doc_index.get_name()
+
     def _normalize_references(self) -> None:
-        …
+        doc_index = DoclingDocument._DocIndex()
+        doc_index.index(doc=self)
+        self._update_from_index(doc_index)
+
+    @classmethod
+    def concatenate(cls, docs: Sequence["DoclingDocument"]) -> "DoclingDocument":
+        """Concatenate multiple documents into a single document."""
+        doc_index = DoclingDocument._DocIndex()
+        for doc in docs:
+            doc_index.index(doc=doc)
+
+        res_doc = DoclingDocument(name=" + ".join([doc.name for doc in docs]))
+        res_doc._update_from_index(doc_index)
+        return res_doc
+
+    def _validate_rules(self):
+        def validate_list_group(doc: DoclingDocument, item: ListGroup):
+            for ref in item.children:
+                child = ref.resolve(doc)
+                if not isinstance(child, ListItem):
+                    raise ValueError(
+                        f"ListGroup {item.self_ref} contains non-ListItem {child.self_ref} ({child.label=})"
+                    )
+
+        def validate_list_item(doc: DoclingDocument, item: ListItem):
+            if item.parent is None:
+                raise ValueError(f"ListItem {item.self_ref} has no parent")
+            if not isinstance(item.parent.resolve(doc), ListGroup):
+                raise ValueError(
+                    f"ListItem {item.self_ref} has non-ListGroup parent: {item.parent.cref}"
+                )
+
+        def validate_group(doc: DoclingDocument, item: GroupItem):
+            if (
+                item.parent and not item.children
+            ):  # tolerate empty body, but not other groups
+                raise ValueError(f"Group {item.self_ref} has no children")

-        # collect items in traversal order
         for item, _ in self.iterate_items(
             with_groups=True,
             traverse_pictures=True,
             included_content_layers={c for c in ContentLayer},
         ):
-            …
-            new_item.children = []
-
-            # put item in the right list
-            item_lists[key].append(new_item)
-
-            # update item's self reference
-            new_item.self_ref = new_cref
-
-            if item.parent:
-                # set item's parent
-                new_parent_cref = orig_ref_to_new_ref[item.parent.cref]
-                new_item.parent = RefItem(cref=new_parent_cref)
-
-                # add item to parent's children
-                path_components = new_parent_cref.split("/")
-                num_components = len(path_components)
-                parent_node: NodeItem
-                if num_components == 3:
-                    _, parent_key, parent_index_str = path_components
-                    parent_index = int(parent_index_str)
-                    parent_node = item_lists[parent_key][parent_index]
-                elif num_components == 2 and path_components[1] == "body":
-                    parent_node = new_body
-                else:
-                    raise RuntimeError(f"Unsupported ref format: {new_parent_cref}")
-                parent_node.children.append(RefItem(cref=new_cref))
-
-        # update document
-        self.groups = item_lists["groups"]  # type: ignore
-        self.texts = item_lists["texts"]  # type: ignore
-        self.pictures = item_lists["pictures"]  # type: ignore
-        self.tables = item_lists["tables"]  # type: ignore
-        self.key_value_items = item_lists["key_value_items"]  # type: ignore
-        self.form_items = item_lists["form_items"]  # type: ignore
-        self.body = new_body
+            if isinstance(item, ListGroup):
+                validate_list_group(self, item)
+
+            elif isinstance(item, GroupItem):
+                validate_group(self, item)
+
+            elif isinstance(item, ListItem):
+                validate_list_item(self, item)


 # deprecated aliases (kept for backwards compatibility):