docling-core 2.44.0__tar.gz → 2.44.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.44.0 → docling_core-2.44.2}/PKG-INFO +1 -1
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/transforms/serializer/html.py +32 -72
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/types/doc/document.py +14 -12
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core.egg-info/PKG-INFO +1 -1
- {docling_core-2.44.0 → docling_core-2.44.2}/pyproject.toml +1 -1
- {docling_core-2.44.0 → docling_core-2.44.2}/test/test_docling_doc.py +14 -13
- {docling_core-2.44.0 → docling_core-2.44.2}/test/test_serialization.py +18 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/LICENSE +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/README.md +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/__init__.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/cli/view.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/experimental/__init__.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/py.typed +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/search/__init__.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/search/mapping.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/search/meta.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/search/package.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/transforms/chunker/page_chunker.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/transforms/chunker/tokenizer/__init__.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/transforms/chunker/tokenizer/base.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/transforms/chunker/tokenizer/huggingface.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/transforms/chunker/tokenizer/openai.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/transforms/serializer/__init__.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/transforms/serializer/base.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/transforms/serializer/common.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/transforms/serializer/doctags.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/transforms/serializer/html_styles.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/transforms/serializer/markdown.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/transforms/visualizer/__init__.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/transforms/visualizer/base.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/transforms/visualizer/key_value_visualizer.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/transforms/visualizer/layout_visualizer.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/transforms/visualizer/reading_order_visualizer.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/transforms/visualizer/table_visualizer.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/types/__init__.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/types/base.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/types/doc/labels.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/types/doc/page.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/types/doc/utils.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/utils/alias.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/utils/file.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/utils/legacy.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/utils/validate.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core/utils/validators.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core.egg-info/SOURCES.txt +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core.egg-info/dependency_links.txt +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core.egg-info/entry_points.txt +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core.egg-info/requires.txt +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/docling_core.egg-info/top_level.txt +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/setup.cfg +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/test/test_base.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/test/test_collection.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/test/test_data_gen_flag.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/test/test_doc_base.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/test/test_doc_legacy_convert.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/test/test_doc_schema.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/test/test_doc_schema_extractor.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/test/test_doctags_load.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/test/test_hierarchical_chunker.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/test/test_hybrid_chunker.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/test/test_json_schema_to_search_mapper.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/test/test_nlp_qa.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/test/test_otsl_table_export.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/test/test_page.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/test/test_page_chunker.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/test/test_rec_schema.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/test/test_search_meta.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/test/test_utils.py +0 -0
- {docling_core-2.44.0 → docling_core-2.44.2}/test/test_visualization.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.44.0
|
|
3
|
+
Version: 2.44.2
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
|
|
@@ -130,11 +130,14 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
130
130
|
doc_serializer: BaseDocSerializer,
|
|
131
131
|
doc: DoclingDocument,
|
|
132
132
|
is_inline_scope: bool = False,
|
|
133
|
+
visited: Optional[set[str]] = None,
|
|
133
134
|
**kwargs: Any,
|
|
134
135
|
) -> SerializationResult:
|
|
135
136
|
"""Serializes the passed text item to HTML."""
|
|
136
137
|
params = HTMLParams(**kwargs)
|
|
138
|
+
my_visited: set[str] = visited if visited is not None else set()
|
|
137
139
|
res_parts: list[SerializationResult] = []
|
|
140
|
+
post_processed = False
|
|
138
141
|
|
|
139
142
|
# Prepare the HTML based on item type
|
|
140
143
|
if isinstance(item, TitleItem):
|
|
@@ -162,7 +165,28 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
162
165
|
|
|
163
166
|
elif isinstance(item, ListItem):
|
|
164
167
|
# List items are handled by list serializer
|
|
165
|
-
|
|
168
|
+
text_parts: list[str] = []
|
|
169
|
+
if item_text := self._prepare_content(item.text):
|
|
170
|
+
item_text = doc_serializer.post_process(
|
|
171
|
+
text=item_text,
|
|
172
|
+
formatting=item.formatting,
|
|
173
|
+
hyperlink=item.hyperlink,
|
|
174
|
+
)
|
|
175
|
+
post_processed = True
|
|
176
|
+
text_parts.append(item_text)
|
|
177
|
+
nested_parts = [
|
|
178
|
+
r.text
|
|
179
|
+
for r in doc_serializer.get_parts(
|
|
180
|
+
item=item,
|
|
181
|
+
is_inline_scope=is_inline_scope,
|
|
182
|
+
visited=my_visited,
|
|
183
|
+
**kwargs,
|
|
184
|
+
)
|
|
185
|
+
]
|
|
186
|
+
text_parts.extend(nested_parts)
|
|
187
|
+
text_inner = "\n".join(text_parts)
|
|
188
|
+
if nested_parts:
|
|
189
|
+
text_inner = f"\n{text_inner}\n"
|
|
166
190
|
text = (
|
|
167
191
|
get_html_tag_with_text_direction(
|
|
168
192
|
html_tag="li",
|
|
@@ -185,11 +209,12 @@ class HTMLTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
185
209
|
text = get_html_tag_with_text_direction(html_tag="p", text=text_inner)
|
|
186
210
|
|
|
187
211
|
# Apply formatting and hyperlinks
|
|
188
|
-
|
|
189
|
-
text=
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
212
|
+
if not post_processed:
|
|
213
|
+
text = doc_serializer.post_process(
|
|
214
|
+
text=text,
|
|
215
|
+
formatting=item.formatting,
|
|
216
|
+
hyperlink=item.hyperlink,
|
|
217
|
+
)
|
|
193
218
|
|
|
194
219
|
if text:
|
|
195
220
|
text_res = create_ser_result(text=text, span_source=item)
|
|
@@ -703,7 +728,6 @@ class HTMLListSerializer(BaseModel, BaseListSerializer):
|
|
|
703
728
|
) -> SerializationResult:
|
|
704
729
|
"""Serializes a list to HTML."""
|
|
705
730
|
my_visited: set[str] = visited if visited is not None else set()
|
|
706
|
-
params = HTMLParams(**kwargs)
|
|
707
731
|
# Get all child parts
|
|
708
732
|
parts = doc_serializer.get_parts(
|
|
709
733
|
item=item,
|
|
@@ -713,72 +737,8 @@ class HTMLListSerializer(BaseModel, BaseListSerializer):
|
|
|
713
737
|
**kwargs,
|
|
714
738
|
)
|
|
715
739
|
|
|
716
|
-
# Append nested list to parent list item:
|
|
717
|
-
i = 0
|
|
718
|
-
while i < len(parts):
|
|
719
|
-
prt = parts[i]
|
|
720
|
-
if prt.text.startswith(("<ul>", "<ol>")):
|
|
721
|
-
for j in range(i - 1, -1, -1):
|
|
722
|
-
if parts[j].text.startswith(("<li>", "<li ")) and parts[
|
|
723
|
-
j
|
|
724
|
-
].text.endswith("</li>"):
|
|
725
|
-
before, _, _ = parts[j].text.rpartition("</li>")
|
|
726
|
-
parts[j].text = f"{before}\n{prt.text}\n</li>"
|
|
727
|
-
break
|
|
728
|
-
if j > -1:
|
|
729
|
-
parts.pop(i)
|
|
730
|
-
else:
|
|
731
|
-
i += 1
|
|
732
|
-
|
|
733
740
|
# Add all child parts
|
|
734
|
-
text_res = "\n".join(
|
|
735
|
-
[
|
|
736
|
-
(
|
|
737
|
-
p.text
|
|
738
|
-
if (
|
|
739
|
-
(
|
|
740
|
-
p.text.startswith(("<li>", "<li "))
|
|
741
|
-
and p.text.endswith("</li>")
|
|
742
|
-
)
|
|
743
|
-
or (
|
|
744
|
-
p.text.startswith(("<ol>", "<ol "))
|
|
745
|
-
and p.text.endswith("</ol>")
|
|
746
|
-
)
|
|
747
|
-
or (
|
|
748
|
-
p.text.startswith(("<ul>", "<ul "))
|
|
749
|
-
and p.text.endswith("</ul>")
|
|
750
|
-
)
|
|
751
|
-
)
|
|
752
|
-
else (
|
|
753
|
-
get_html_tag_with_text_direction(
|
|
754
|
-
html_tag="li",
|
|
755
|
-
text=p.text,
|
|
756
|
-
attrs=(
|
|
757
|
-
{
|
|
758
|
-
"style": f"list-style-type: '{grandparent_item.marker} ';"
|
|
759
|
-
}
|
|
760
|
-
if params.show_original_list_item_marker
|
|
761
|
-
and grandparent_item.marker
|
|
762
|
-
else {}
|
|
763
|
-
),
|
|
764
|
-
)
|
|
765
|
-
if p.spans
|
|
766
|
-
and p.spans[0].item.parent
|
|
767
|
-
and isinstance(
|
|
768
|
-
(parent_item := p.spans[0].item.parent.resolve(doc)),
|
|
769
|
-
InlineGroup,
|
|
770
|
-
)
|
|
771
|
-
and parent_item.parent
|
|
772
|
-
and isinstance(
|
|
773
|
-
(grandparent_item := parent_item.parent.resolve(doc)),
|
|
774
|
-
ListItem,
|
|
775
|
-
)
|
|
776
|
-
else f"<li>{p.text}</li>"
|
|
777
|
-
)
|
|
778
|
-
)
|
|
779
|
-
for p in parts
|
|
780
|
-
]
|
|
781
|
-
)
|
|
741
|
+
text_res = "\n".join(p.text for p in parts if p.text)
|
|
782
742
|
if text_res:
|
|
783
743
|
tag = "ol" if item.first_item_is_enumerated(doc) else "ul"
|
|
784
744
|
text_res = f"<{tag}>\n{text_res}\n</{tag}>"
|
|
@@ -1373,11 +1373,12 @@ class PictureItem(FloatingItem):
|
|
|
1373
1373
|
) # Encode to Base64 and decode to string
|
|
1374
1374
|
return img_base64
|
|
1375
1375
|
|
|
1376
|
-
|
|
1376
|
+
@staticmethod
|
|
1377
|
+
def _image_to_hexhash(img: Optional[PILImage.Image]) -> Optional[str]:
|
|
1377
1378
|
"""Hexash from the image."""
|
|
1378
|
-
if
|
|
1379
|
+
if img is not None:
|
|
1379
1380
|
# Convert the image to raw bytes
|
|
1380
|
-
image_bytes =
|
|
1381
|
+
image_bytes = img.tobytes()
|
|
1381
1382
|
|
|
1382
1383
|
# Create a hash object (e.g., SHA-256)
|
|
1383
1384
|
hasher = hashlib.sha256(usedforsecurity=False)
|
|
@@ -4116,16 +4117,10 @@ class DoclingDocument(BaseModel):
|
|
|
4116
4117
|
if image_dir.is_dir():
|
|
4117
4118
|
for item, level in result.iterate_items(page_no=page_no, with_groups=False):
|
|
4118
4119
|
if isinstance(item, PictureItem):
|
|
4120
|
+
img = item.get_image(doc=self)
|
|
4121
|
+
if img is not None:
|
|
4119
4122
|
|
|
4120
|
-
|
|
4121
|
-
item.image is not None
|
|
4122
|
-
and isinstance(item.image.uri, AnyUrl)
|
|
4123
|
-
and item.image.uri.scheme == "data"
|
|
4124
|
-
and item.image.pil_image is not None
|
|
4125
|
-
):
|
|
4126
|
-
img = item.image.pil_image
|
|
4127
|
-
|
|
4128
|
-
hexhash = item._image_to_hexhash()
|
|
4123
|
+
hexhash = PictureItem._image_to_hexhash(img)
|
|
4129
4124
|
|
|
4130
4125
|
# loc_path = image_dir / f"image_{img_count:06}.png"
|
|
4131
4126
|
if hexhash is not None:
|
|
@@ -4140,6 +4135,11 @@ class DoclingDocument(BaseModel):
|
|
|
4140
4135
|
else:
|
|
4141
4136
|
obj_path = loc_path
|
|
4142
4137
|
|
|
4138
|
+
if item.image is None:
|
|
4139
|
+
scale = img.size[0] / item.prov[0].bbox.width
|
|
4140
|
+
item.image = ImageRef.from_pil(
|
|
4141
|
+
image=img, dpi=round(72 * scale)
|
|
4142
|
+
)
|
|
4143
4143
|
item.image.uri = Path(obj_path)
|
|
4144
4144
|
|
|
4145
4145
|
# if item.image._pil is not None:
|
|
@@ -4539,6 +4539,8 @@ class DoclingDocument(BaseModel):
|
|
|
4539
4539
|
reference_path = None
|
|
4540
4540
|
else:
|
|
4541
4541
|
reference_path = filename.parent
|
|
4542
|
+
artifacts_dir = reference_path / artifacts_dir
|
|
4543
|
+
|
|
4542
4544
|
return artifacts_dir, reference_path
|
|
4543
4545
|
|
|
4544
4546
|
def _make_copy_with_refmode(
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.44.0
|
|
3
|
+
Version: 2.44.2
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "docling-core"
|
|
3
|
-
version = "2.44.0" # DO NOT EDIT, updated automatically
|
|
3
|
+
version = "2.44.2" # DO NOT EDIT, updated automatically
|
|
4
4
|
description = "A python library to define and validate data types in Docling."
|
|
5
5
|
license = "MIT"
|
|
6
6
|
license-files = ["LICENSE"]
|
|
@@ -1442,10 +1442,11 @@ def test_save_to_disk():
|
|
|
1442
1442
|
|
|
1443
1443
|
doc: DoclingDocument = _construct_doc()
|
|
1444
1444
|
|
|
1445
|
-
|
|
1445
|
+
test_dir = Path("./test/data/doc")
|
|
1446
|
+
image_dir = Path("constructed_images/") # will be relative to test_dir
|
|
1446
1447
|
|
|
1447
1448
|
doc_with_references = doc._with_pictures_refs(
|
|
1448
|
-
image_dir=
|
|
1449
|
+
image_dir=(test_dir / image_dir),
|
|
1449
1450
|
page_no=None,
|
|
1450
1451
|
)
|
|
1451
1452
|
|
|
@@ -1455,19 +1456,19 @@ def test_save_to_disk():
|
|
|
1455
1456
|
|
|
1456
1457
|
### MarkDown
|
|
1457
1458
|
|
|
1458
|
-
filename =
|
|
1459
|
+
filename = test_dir / "constructed_doc.placeholder.md"
|
|
1459
1460
|
doc.save_as_markdown(
|
|
1460
1461
|
filename=filename, artifacts_dir=image_dir, image_mode=ImageRefMode.PLACEHOLDER
|
|
1461
1462
|
)
|
|
1462
1463
|
_verify_saved_output(filename=filename, paths=paths)
|
|
1463
1464
|
|
|
1464
|
-
filename =
|
|
1465
|
+
filename = test_dir / "constructed_doc.embedded.md"
|
|
1465
1466
|
doc.save_as_markdown(
|
|
1466
1467
|
filename=filename, artifacts_dir=image_dir, image_mode=ImageRefMode.EMBEDDED
|
|
1467
1468
|
)
|
|
1468
1469
|
_verify_saved_output(filename=filename, paths=paths)
|
|
1469
1470
|
|
|
1470
|
-
filename =
|
|
1471
|
+
filename = test_dir / "constructed_doc.referenced.md"
|
|
1471
1472
|
doc.save_as_markdown(
|
|
1472
1473
|
filename=filename, artifacts_dir=image_dir, image_mode=ImageRefMode.REFERENCED
|
|
1473
1474
|
)
|
|
@@ -1475,19 +1476,19 @@ def test_save_to_disk():
|
|
|
1475
1476
|
|
|
1476
1477
|
### HTML
|
|
1477
1478
|
|
|
1478
|
-
filename =
|
|
1479
|
+
filename = test_dir / "constructed_doc.placeholder.html"
|
|
1479
1480
|
doc.save_as_html(
|
|
1480
1481
|
filename=filename, artifacts_dir=image_dir, image_mode=ImageRefMode.PLACEHOLDER
|
|
1481
1482
|
)
|
|
1482
1483
|
_verify_saved_output(filename=filename, paths=paths)
|
|
1483
1484
|
|
|
1484
|
-
filename =
|
|
1485
|
+
filename = test_dir / "constructed_doc.embedded.html"
|
|
1485
1486
|
doc.save_as_html(
|
|
1486
1487
|
filename=filename, artifacts_dir=image_dir, image_mode=ImageRefMode.EMBEDDED
|
|
1487
1488
|
)
|
|
1488
1489
|
_verify_saved_output(filename=filename, paths=paths)
|
|
1489
1490
|
|
|
1490
|
-
filename =
|
|
1491
|
+
filename = test_dir / "constructed_doc.referenced.html"
|
|
1491
1492
|
doc.save_as_html(
|
|
1492
1493
|
filename=filename, artifacts_dir=image_dir, image_mode=ImageRefMode.REFERENCED
|
|
1493
1494
|
)
|
|
@@ -1495,13 +1496,13 @@ def test_save_to_disk():
|
|
|
1495
1496
|
|
|
1496
1497
|
### Document Tokens
|
|
1497
1498
|
|
|
1498
|
-
filename =
|
|
1499
|
+
filename = test_dir / "constructed_doc.dt"
|
|
1499
1500
|
doc.save_as_doctags(filename=filename)
|
|
1500
1501
|
_verify_saved_output(filename=filename, paths=paths)
|
|
1501
1502
|
|
|
1502
1503
|
### JSON
|
|
1503
1504
|
|
|
1504
|
-
filename =
|
|
1505
|
+
filename = test_dir / "constructed_doc.embedded.json"
|
|
1505
1506
|
doc.save_as_json(
|
|
1506
1507
|
filename=filename,
|
|
1507
1508
|
artifacts_dir=image_dir,
|
|
@@ -1512,7 +1513,7 @@ def test_save_to_disk():
|
|
|
1512
1513
|
doc_emb_loaded = DoclingDocument.load_from_json(filename)
|
|
1513
1514
|
_verify_loaded_output(filename=filename, pred=doc_emb_loaded)
|
|
1514
1515
|
|
|
1515
|
-
filename =
|
|
1516
|
+
filename = test_dir / "constructed_doc.referenced.json"
|
|
1516
1517
|
doc.save_as_json(
|
|
1517
1518
|
filename=filename,
|
|
1518
1519
|
artifacts_dir=image_dir,
|
|
@@ -1525,7 +1526,7 @@ def test_save_to_disk():
|
|
|
1525
1526
|
|
|
1526
1527
|
### YAML
|
|
1527
1528
|
|
|
1528
|
-
filename =
|
|
1529
|
+
filename = test_dir / "constructed_doc.embedded.yaml"
|
|
1529
1530
|
doc.save_as_yaml(
|
|
1530
1531
|
filename=filename,
|
|
1531
1532
|
artifacts_dir=image_dir,
|
|
@@ -1533,7 +1534,7 @@ def test_save_to_disk():
|
|
|
1533
1534
|
)
|
|
1534
1535
|
_verify_saved_output(filename=filename, paths=paths)
|
|
1535
1536
|
|
|
1536
|
-
filename =
|
|
1537
|
+
filename = test_dir / "constructed_doc.referenced.yaml"
|
|
1537
1538
|
doc.save_as_yaml(
|
|
1538
1539
|
filename=filename,
|
|
1539
1540
|
artifacts_dir=image_dir,
|
|
@@ -352,6 +352,15 @@ def test_md_mark_annotations_true():
|
|
|
352
352
|
)
|
|
353
353
|
|
|
354
354
|
|
|
355
|
+
def test_md_nested_lists():
|
|
356
|
+
src = Path("./test/data/doc/polymers.json")
|
|
357
|
+
doc = DoclingDocument.load_from_json(src)
|
|
358
|
+
|
|
359
|
+
ser = MarkdownDocSerializer(doc=doc)
|
|
360
|
+
actual = ser.serialize().text
|
|
361
|
+
verify(exp_file=src.parent / f"{src.stem}.gt.md", actual=actual)
|
|
362
|
+
|
|
363
|
+
|
|
355
364
|
def test_html_split_page():
|
|
356
365
|
src = Path("./test/data/doc/2408.09869v3_enriched.json")
|
|
357
366
|
doc = DoclingDocument.load_from_json(src)
|
|
@@ -482,6 +491,15 @@ def test_html_list_item_markers():
|
|
|
482
491
|
)
|
|
483
492
|
|
|
484
493
|
|
|
494
|
+
def test_html_nested_lists():
|
|
495
|
+
src = Path("./test/data/doc/polymers.json")
|
|
496
|
+
doc = DoclingDocument.load_from_json(src)
|
|
497
|
+
|
|
498
|
+
ser = HTMLDocSerializer(doc=doc)
|
|
499
|
+
actual = ser.serialize().text
|
|
500
|
+
verify(exp_file=src.parent / f"{src.stem}.gt.html", actual=actual)
|
|
501
|
+
|
|
502
|
+
|
|
485
503
|
def test_doctags_inline_loc_tags():
|
|
486
504
|
src = Path("./test/data/doc/2408.09869v3_enriched.json")
|
|
487
505
|
doc = DoclingDocument.load_from_json(src)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.44.0 → docling_core-2.44.2}/docling_core/resources/schemas/doc/OCR-output.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.44.0 → docling_core-2.44.2}/docling_core/search/json_schema_to_search_mapper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.44.0 → docling_core-2.44.2}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
|
File without changes
|
{docling_core-2.44.0 → docling_core-2.44.2}/docling_core/transforms/chunker/hybrid_chunker.py
RENAMED
|
File without changes
|
|
File without changes
|
{docling_core-2.44.0 → docling_core-2.44.2}/docling_core/transforms/chunker/tokenizer/__init__.py
RENAMED
|
File without changes
|
{docling_core-2.44.0 → docling_core-2.44.2}/docling_core/transforms/chunker/tokenizer/base.py
RENAMED
|
File without changes
|
{docling_core-2.44.0 → docling_core-2.44.2}/docling_core/transforms/chunker/tokenizer/huggingface.py
RENAMED
|
File without changes
|
{docling_core-2.44.0 → docling_core-2.44.2}/docling_core/transforms/chunker/tokenizer/openai.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.44.0 → docling_core-2.44.2}/docling_core/transforms/serializer/html_styles.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.44.0 → docling_core-2.44.2}/docling_core/transforms/visualizer/layout_visualizer.py
RENAMED
|
File without changes
|
|
File without changes
|
{docling_core-2.44.0 → docling_core-2.44.2}/docling_core/transforms/visualizer/table_visualizer.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|