docling-core 2.42.0__tar.gz → 2.43.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.42.0 → docling_core-2.43.1}/PKG-INFO +1 -1
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/chunker/__init__.py +1 -0
- docling_core-2.43.1/docling_core/transforms/chunker/page_chunker.py +59 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/serializer/base.py +10 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/serializer/common.py +1 -1
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/serializer/html.py +17 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/visualizer/layout_visualizer.py +3 -1
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/doc/document.py +11 -7
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core.egg-info/PKG-INFO +1 -1
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core.egg-info/SOURCES.txt +2 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/pyproject.toml +1 -1
- {docling_core-2.42.0 → docling_core-2.43.1}/test/test_base.py +2 -2
- {docling_core-2.42.0 → docling_core-2.43.1}/test/test_docling_doc.py +53 -11
- {docling_core-2.42.0 → docling_core-2.43.1}/test/test_doctags_load.py +8 -4
- docling_core-2.43.1/test/test_page_chunker.py +36 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/LICENSE +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/README.md +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/__init__.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/cli/view.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/experimental/__init__.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/py.typed +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/search/__init__.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/search/mapping.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/search/meta.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/search/package.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/chunker/tokenizer/__init__.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/chunker/tokenizer/base.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/chunker/tokenizer/huggingface.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/chunker/tokenizer/openai.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/serializer/__init__.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/serializer/doctags.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/serializer/html_styles.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/serializer/markdown.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/visualizer/__init__.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/visualizer/base.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/visualizer/reading_order_visualizer.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/visualizer/table_visualizer.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/__init__.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/base.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/doc/labels.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/doc/page.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/doc/utils.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/utils/alias.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/utils/file.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/utils/legacy.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/utils/validate.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core/utils/validators.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core.egg-info/dependency_links.txt +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core.egg-info/entry_points.txt +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core.egg-info/requires.txt +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/docling_core.egg-info/top_level.txt +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/setup.cfg +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/test/test_collection.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/test/test_data_gen_flag.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/test/test_doc_base.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/test/test_doc_legacy_convert.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/test/test_doc_schema.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/test/test_doc_schema_extractor.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/test/test_hierarchical_chunker.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/test/test_hybrid_chunker.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/test/test_json_schema_to_search_mapper.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/test/test_nlp_qa.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/test/test_otsl_table_export.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/test/test_page.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/test/test_rec_schema.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/test/test_search_meta.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/test/test_serialization.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/test/test_utils.py +0 -0
- {docling_core-2.42.0 → docling_core-2.43.1}/test/test_visualization.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.42.0
|
|
3
|
+
Version: 2.43.1
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Page-based chunker implementation: each chunk corresponds to a single page."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, Iterator
|
|
6
|
+
|
|
7
|
+
from pydantic import ConfigDict
|
|
8
|
+
from typing_extensions import override
|
|
9
|
+
|
|
10
|
+
from docling_core.transforms.chunker import BaseChunker, DocChunk, DocMeta
|
|
11
|
+
from docling_core.transforms.chunker.hierarchical_chunker import (
|
|
12
|
+
ChunkingSerializerProvider,
|
|
13
|
+
)
|
|
14
|
+
from docling_core.types import DoclingDocument as DLDocument
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class PageChunker(BaseChunker):
|
|
18
|
+
r"""Chunker implementation that yields one chunk per page."""
|
|
19
|
+
|
|
20
|
+
model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
21
|
+
|
|
22
|
+
serializer_provider: ChunkingSerializerProvider = ChunkingSerializerProvider()
|
|
23
|
+
|
|
24
|
+
@override
|
|
25
|
+
def chunk(
|
|
26
|
+
self,
|
|
27
|
+
dl_doc: DLDocument,
|
|
28
|
+
**kwargs: Any,
|
|
29
|
+
) -> Iterator[DocChunk]:
|
|
30
|
+
"""Chunk the provided document by page."""
|
|
31
|
+
my_doc_ser = self.serializer_provider.get_serializer(doc=dl_doc)
|
|
32
|
+
if dl_doc.pages:
|
|
33
|
+
# chunk by page
|
|
34
|
+
for page_no in sorted(dl_doc.pages.keys()):
|
|
35
|
+
ser_res = my_doc_ser.serialize(pages={page_no})
|
|
36
|
+
if not ser_res.text:
|
|
37
|
+
continue
|
|
38
|
+
yield DocChunk(
|
|
39
|
+
text=ser_res.text,
|
|
40
|
+
meta=DocMeta(
|
|
41
|
+
doc_items=ser_res.get_unique_doc_items(),
|
|
42
|
+
headings=None,
|
|
43
|
+
captions=None,
|
|
44
|
+
origin=dl_doc.origin,
|
|
45
|
+
),
|
|
46
|
+
)
|
|
47
|
+
else:
|
|
48
|
+
# if no pages, treat whole document as single chunk
|
|
49
|
+
ser_res = my_doc_ser.serialize()
|
|
50
|
+
if ser_res.text:
|
|
51
|
+
yield DocChunk(
|
|
52
|
+
text=ser_res.text,
|
|
53
|
+
meta=DocMeta(
|
|
54
|
+
doc_items=ser_res.get_unique_doc_items(),
|
|
55
|
+
headings=None,
|
|
56
|
+
captions=None,
|
|
57
|
+
origin=dl_doc.origin,
|
|
58
|
+
),
|
|
59
|
+
)
|
|
@@ -39,6 +39,16 @@ class SerializationResult(BaseModel):
|
|
|
39
39
|
spans: list[Span] = []
|
|
40
40
|
# group: Optional[GroupItem] = None # set when result reflects specific group item
|
|
41
41
|
|
|
42
|
+
def get_unique_doc_items(self) -> list[DocItem]:
|
|
43
|
+
"""Get the doc items corresponding to this result."""
|
|
44
|
+
seen_doc_item_refs: set[str] = set()
|
|
45
|
+
doc_items: list[DocItem] = []
|
|
46
|
+
for span in self.spans:
|
|
47
|
+
if span.item.self_ref not in seen_doc_item_refs:
|
|
48
|
+
seen_doc_item_refs.add(span.item.self_ref)
|
|
49
|
+
doc_items.append(span.item)
|
|
50
|
+
return doc_items
|
|
51
|
+
|
|
42
52
|
|
|
43
53
|
class BaseTextSerializer(ABC):
|
|
44
54
|
"""Base class for text item serializers."""
|
|
@@ -285,7 +285,7 @@ class DocSerializer(BaseModel, BaseDocSerializer):
|
|
|
285
285
|
|
|
286
286
|
def _serialize_body(self, **kwargs) -> SerializationResult:
|
|
287
287
|
"""Serialize the document body."""
|
|
288
|
-
subparts = self.get_parts()
|
|
288
|
+
subparts = self.get_parts(**kwargs)
|
|
289
289
|
res = self.serialize_doc(parts=subparts, **kwargs)
|
|
290
290
|
return res
|
|
291
291
|
|
|
@@ -713,6 +713,23 @@ class HTMLListSerializer(BaseModel, BaseListSerializer):
|
|
|
713
713
|
**kwargs,
|
|
714
714
|
)
|
|
715
715
|
|
|
716
|
+
# Append nested list to parent list item:
|
|
717
|
+
i = 0
|
|
718
|
+
while i < len(parts):
|
|
719
|
+
prt = parts[i]
|
|
720
|
+
if prt.text.startswith(("<ul>", "<ol>")):
|
|
721
|
+
for j in range(i - 1, -1, -1):
|
|
722
|
+
if parts[j].text.startswith(("<li>", "<li ")) and parts[
|
|
723
|
+
j
|
|
724
|
+
].text.endswith("</li>"):
|
|
725
|
+
before, _, _ = parts[j].text.rpartition("</li>")
|
|
726
|
+
parts[j].text = f"{before}\n{prt.text}\n</li>"
|
|
727
|
+
break
|
|
728
|
+
if j > -1:
|
|
729
|
+
parts.pop(i)
|
|
730
|
+
else:
|
|
731
|
+
i += 1
|
|
732
|
+
|
|
716
733
|
# Add all child parts
|
|
717
734
|
text_res = "\n".join(
|
|
718
735
|
[
|
{docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/visualizer/layout_visualizer.py
RENAMED
|
@@ -148,7 +148,9 @@ class LayoutVisualizer(BaseVisualizer):
|
|
|
148
148
|
prev_image = None
|
|
149
149
|
prev_page_nr = None
|
|
150
150
|
for idx, (elem, _) in enumerate(
|
|
151
|
-
doc.iterate_items(included_content_layers=included_content_layers)
|
|
151
|
+
doc.iterate_items(
|
|
152
|
+
included_content_layers=included_content_layers, traverse_pictures=True
|
|
153
|
+
)
|
|
152
154
|
):
|
|
153
155
|
if not isinstance(elem, DocItem):
|
|
154
156
|
continue
|
|
@@ -4098,7 +4098,10 @@ class DoclingDocument(BaseModel):
|
|
|
4098
4098
|
return result
|
|
4099
4099
|
|
|
4100
4100
|
def _with_pictures_refs(
|
|
4101
|
-
self, image_dir: Path, reference_path: Optional[Path] = None
|
|
4101
|
+
self,
|
|
4102
|
+
image_dir: Path,
|
|
4103
|
+
page_no: Optional[int],
|
|
4104
|
+
reference_path: Optional[Path] = None,
|
|
4102
4105
|
) -> "DoclingDocument":
|
|
4103
4106
|
"""Document with images as refs.
|
|
4104
4107
|
|
|
@@ -4111,7 +4114,7 @@ class DoclingDocument(BaseModel):
|
|
|
4111
4114
|
image_dir.mkdir(parents=True, exist_ok=True)
|
|
4112
4115
|
|
|
4113
4116
|
if image_dir.is_dir():
|
|
4114
|
-
for item, level in result.iterate_items(with_groups=False):
|
|
4117
|
+
for item, level in result.iterate_items(page_no=page_no, with_groups=False):
|
|
4115
4118
|
if isinstance(item, PictureItem):
|
|
4116
4119
|
|
|
4117
4120
|
if (
|
|
@@ -4211,7 +4214,7 @@ class DoclingDocument(BaseModel):
|
|
|
4211
4214
|
os.makedirs(artifacts_dir, exist_ok=True)
|
|
4212
4215
|
|
|
4213
4216
|
new_doc = self._make_copy_with_refmode(
|
|
4214
|
-
artifacts_dir, image_mode, reference_path=reference_path
|
|
4217
|
+
artifacts_dir, image_mode, page_no=None, reference_path=reference_path
|
|
4215
4218
|
)
|
|
4216
4219
|
|
|
4217
4220
|
out = new_doc.export_to_dict(
|
|
@@ -4254,7 +4257,7 @@ class DoclingDocument(BaseModel):
|
|
|
4254
4257
|
os.makedirs(artifacts_dir, exist_ok=True)
|
|
4255
4258
|
|
|
4256
4259
|
new_doc = self._make_copy_with_refmode(
|
|
4257
|
-
artifacts_dir, image_mode, reference_path=reference_path
|
|
4260
|
+
artifacts_dir, image_mode, page_no=None, reference_path=reference_path
|
|
4258
4261
|
)
|
|
4259
4262
|
|
|
4260
4263
|
out = new_doc.export_to_dict(
|
|
@@ -4327,7 +4330,7 @@ class DoclingDocument(BaseModel):
|
|
|
4327
4330
|
os.makedirs(artifacts_dir, exist_ok=True)
|
|
4328
4331
|
|
|
4329
4332
|
new_doc = self._make_copy_with_refmode(
|
|
4330
|
-
artifacts_dir, image_mode, reference_path=reference_path
|
|
4333
|
+
artifacts_dir, image_mode, page_no, reference_path=reference_path
|
|
4331
4334
|
)
|
|
4332
4335
|
|
|
4333
4336
|
md_out = new_doc.export_to_markdown(
|
|
@@ -4503,7 +4506,7 @@ class DoclingDocument(BaseModel):
|
|
|
4503
4506
|
os.makedirs(artifacts_dir, exist_ok=True)
|
|
4504
4507
|
|
|
4505
4508
|
new_doc = self._make_copy_with_refmode(
|
|
4506
|
-
artifacts_dir, image_mode, reference_path=reference_path
|
|
4509
|
+
artifacts_dir, image_mode, page_no, reference_path=reference_path
|
|
4507
4510
|
)
|
|
4508
4511
|
|
|
4509
4512
|
html_out = new_doc.export_to_html(
|
|
@@ -4542,6 +4545,7 @@ class DoclingDocument(BaseModel):
|
|
|
4542
4545
|
self,
|
|
4543
4546
|
artifacts_dir: Path,
|
|
4544
4547
|
image_mode: ImageRefMode,
|
|
4548
|
+
page_no: Optional[int],
|
|
4545
4549
|
reference_path: Optional[Path] = None,
|
|
4546
4550
|
):
|
|
4547
4551
|
new_doc = None
|
|
@@ -4549,7 +4553,7 @@ class DoclingDocument(BaseModel):
|
|
|
4549
4553
|
new_doc = self
|
|
4550
4554
|
elif image_mode == ImageRefMode.REFERENCED:
|
|
4551
4555
|
new_doc = self._with_pictures_refs(
|
|
4552
|
-
image_dir=artifacts_dir, reference_path=reference_path
|
|
4556
|
+
image_dir=artifacts_dir, page_no=page_no, reference_path=reference_path
|
|
4553
4557
|
)
|
|
4554
4558
|
elif image_mode == ImageRefMode.EMBEDDED:
|
|
4555
4559
|
new_doc = self._with_embedded_pictures()
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.42.0
|
|
3
|
+
Version: 2.43.1
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
|
|
@@ -30,6 +30,7 @@ docling_core/transforms/chunker/__init__.py
|
|
|
30
30
|
docling_core/transforms/chunker/base.py
|
|
31
31
|
docling_core/transforms/chunker/hierarchical_chunker.py
|
|
32
32
|
docling_core/transforms/chunker/hybrid_chunker.py
|
|
33
|
+
docling_core/transforms/chunker/page_chunker.py
|
|
33
34
|
docling_core/transforms/chunker/tokenizer/__init__.py
|
|
34
35
|
docling_core/transforms/chunker/tokenizer/base.py
|
|
35
36
|
docling_core/transforms/chunker/tokenizer/huggingface.py
|
|
@@ -98,6 +99,7 @@ test/test_json_schema_to_search_mapper.py
|
|
|
98
99
|
test/test_nlp_qa.py
|
|
99
100
|
test/test_otsl_table_export.py
|
|
100
101
|
test/test_page.py
|
|
102
|
+
test/test_page_chunker.py
|
|
101
103
|
test/test_rec_schema.py
|
|
102
104
|
test/test_search_meta.py
|
|
103
105
|
test/test_serialization.py
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "docling-core"
|
|
3
|
-
version = "2.42.0" # DO NOT EDIT, updated automatically
|
|
3
|
+
version = "2.43.1" # DO NOT EDIT, updated automatically
|
|
4
4
|
description = "A python library to define and validate data types in Docling."
|
|
5
5
|
license = "MIT"
|
|
6
6
|
license-files = ["LICENSE"]
|
|
@@ -36,8 +36,8 @@ def test_identifier():
|
|
|
36
36
|
)
|
|
37
37
|
|
|
38
38
|
# schema_json(): no need to set by_alias since it is True by the default
|
|
39
|
-
|
|
40
|
-
|
|
39
|
+
with open("test/data/json_schemas/base_identifier.json", encoding="utf-8") as tf:
|
|
40
|
+
gold_json = json.load(tf)
|
|
41
41
|
|
|
42
42
|
assert Identifier.model_json_schema() == gold_json
|
|
43
43
|
|
|
@@ -1349,12 +1349,51 @@ def test_save_pictures():
|
|
|
1349
1349
|
|
|
1350
1350
|
doc: DoclingDocument = _construct_doc()
|
|
1351
1351
|
|
|
1352
|
-
new_doc = doc._with_pictures_refs(image_dir=Path("./test/data/constructed_images/"))
|
|
1352
|
+
new_doc = doc._with_pictures_refs(
|
|
1353
|
+
image_dir=Path("./test/data/constructed_images/"), page_no=None
|
|
1354
|
+
)
|
|
1353
1355
|
|
|
1354
1356
|
img_paths = new_doc._list_images_on_disk()
|
|
1355
1357
|
assert len(img_paths) == 1, "len(img_paths)!=1"
|
|
1356
1358
|
|
|
1357
1359
|
|
|
1360
|
+
def test_save_pictures_with_page():
|
|
1361
|
+
# Given
|
|
1362
|
+
doc = DoclingDocument(name="Dummy")
|
|
1363
|
+
|
|
1364
|
+
doc.add_page(page_no=1, size=Size(width=2000, height=4000), image=None)
|
|
1365
|
+
doc.add_page(
|
|
1366
|
+
page_no=2,
|
|
1367
|
+
size=Size(width=2000, height=4000),
|
|
1368
|
+
)
|
|
1369
|
+
image = PILImage.new(mode="RGB", size=(200, 400), color=(0, 0, 0))
|
|
1370
|
+
doc.add_picture(
|
|
1371
|
+
image=ImageRef.from_pil(image=image, dpi=72),
|
|
1372
|
+
prov=ProvenanceItem(
|
|
1373
|
+
page_no=2,
|
|
1374
|
+
bbox=BoundingBox(
|
|
1375
|
+
b=0, l=0, r=200, t=400, coord_origin=CoordOrigin.BOTTOMLEFT
|
|
1376
|
+
),
|
|
1377
|
+
charspan=(1, 2),
|
|
1378
|
+
),
|
|
1379
|
+
)
|
|
1380
|
+
|
|
1381
|
+
# When
|
|
1382
|
+
with_ref = doc._with_pictures_refs(
|
|
1383
|
+
image_dir=Path("./test/data/constructed_images/"), page_no=1
|
|
1384
|
+
)
|
|
1385
|
+
# Then
|
|
1386
|
+
n_images = len(with_ref._list_images_on_disk())
|
|
1387
|
+
assert n_images == 0
|
|
1388
|
+
# When
|
|
1389
|
+
with_ref = with_ref._with_pictures_refs(
|
|
1390
|
+
image_dir=Path("./test/data/constructed_images/"), page_no=2
|
|
1391
|
+
)
|
|
1392
|
+
n_images = len(with_ref._list_images_on_disk())
|
|
1393
|
+
# Then
|
|
1394
|
+
assert n_images == 1
|
|
1395
|
+
|
|
1396
|
+
|
|
1358
1397
|
def _normalise_string_wrt_filepaths(instr: str, paths: List[Path]):
|
|
1359
1398
|
|
|
1360
1399
|
for p in paths:
|
|
@@ -1406,7 +1445,8 @@ def test_save_to_disk():
|
|
|
1406
1445
|
image_dir = Path("./test/data/doc/constructed_images/")
|
|
1407
1446
|
|
|
1408
1447
|
doc_with_references = doc._with_pictures_refs(
|
|
1409
|
-
image_dir=image_dir # Path("./test/data/constructed_images/")
|
|
1448
|
+
image_dir=image_dir, # Path("./test/data/constructed_images/")
|
|
1449
|
+
page_no=None,
|
|
1410
1450
|
)
|
|
1411
1451
|
|
|
1412
1452
|
# paths will be different on different machines, so needs to be kept!
|
|
@@ -1779,9 +1819,10 @@ def test_document_manipulation():
|
|
|
1779
1819
|
|
|
1780
1820
|
# Test the handling of list items in insert_* methods, both with and without parent groups
|
|
1781
1821
|
|
|
1782
|
-
|
|
1783
|
-
|
|
1784
|
-
|
|
1822
|
+
with pytest.warns(DeprecationWarning, match="ListItem parent must be a ListGroup"):
|
|
1823
|
+
li_sibling = doc.insert_list_item(
|
|
1824
|
+
sibling=node, text="Inserted List Item, Incorrect Parent", after=False
|
|
1825
|
+
)
|
|
1785
1826
|
doc.insert_list_item(
|
|
1786
1827
|
sibling=li_sibling, text="Inserted List Item, Correct Parent", after=True
|
|
1787
1828
|
)
|
|
@@ -1791,12 +1832,13 @@ def test_document_manipulation():
|
|
|
1791
1832
|
text="Inserted Text with LIST_ITEM Label, Correct Parent",
|
|
1792
1833
|
after=False,
|
|
1793
1834
|
)
|
|
1794
|
-
|
|
1795
|
-
|
|
1796
|
-
|
|
1797
|
-
|
|
1798
|
-
|
|
1799
|
-
|
|
1835
|
+
with pytest.warns(DeprecationWarning, match="ListItem parent must be a ListGroup"):
|
|
1836
|
+
doc.insert_text(
|
|
1837
|
+
sibling=node,
|
|
1838
|
+
label=DocItemLabel.LIST_ITEM,
|
|
1839
|
+
text="Inserted Text with LIST_ITEM Label, Incorrect Parent",
|
|
1840
|
+
after=True,
|
|
1841
|
+
)
|
|
1800
1842
|
|
|
1801
1843
|
filename = Path(
|
|
1802
1844
|
"test/data/doc/constructed_doc.inserted_list_items_with_insert_*.json"
|
|
@@ -60,7 +60,8 @@ def test_doctags_load_from_files():
|
|
|
60
60
|
|
|
61
61
|
def test_doctags_load_from_memory():
|
|
62
62
|
|
|
63
|
-
|
|
63
|
+
with Path("test/data/doc/page_with_pic.dt").open() as file:
|
|
64
|
+
doctags = file.read()
|
|
64
65
|
image = PILImage.open(Path("test/data/doc/page_with_pic.png"))
|
|
65
66
|
|
|
66
67
|
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
|
|
@@ -75,7 +76,8 @@ def test_doctags_load_from_memory():
|
|
|
75
76
|
|
|
76
77
|
|
|
77
78
|
def test_doctags_load_without_image():
|
|
78
|
-
|
|
79
|
+
with Path("test/data/doc/page_with_pic.dt").open() as file:
|
|
80
|
+
doctags = file.read()
|
|
79
81
|
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], None)
|
|
80
82
|
doc = DoclingDocument.load_from_doctags(doctags_doc)
|
|
81
83
|
exp = "test/data/doc/page_without_pic.dt.json"
|
|
@@ -86,7 +88,8 @@ def test_doctags_load_without_image():
|
|
|
86
88
|
|
|
87
89
|
|
|
88
90
|
def test_doctags_load_for_kv_region():
|
|
89
|
-
|
|
91
|
+
with Path("test/data/doc/doc_with_kv.dt").open() as file:
|
|
92
|
+
doctags = file.read()
|
|
90
93
|
image = PILImage.open(Path("test/data/doc/doc_with_kv.png"))
|
|
91
94
|
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
|
|
92
95
|
doc = DoclingDocument.load_from_doctags(doctags_doc)
|
|
@@ -98,7 +101,8 @@ def test_doctags_load_for_kv_region():
|
|
|
98
101
|
|
|
99
102
|
|
|
100
103
|
def test_multipage_doctags_load():
|
|
101
|
-
|
|
104
|
+
with Path("test/data/doc/2206.01062.yaml.dt").open() as file:
|
|
105
|
+
doctags = file.read()
|
|
102
106
|
doctags_doc = DocTagsDocument.from_multipage_doctags_and_images(doctags, None)
|
|
103
107
|
doc = DoclingDocument.load_from_doctags(doctags_doc)
|
|
104
108
|
exp = "test/data/doc/2206.01062.yaml.dt.json"
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from docling_core.transforms.chunker.hierarchical_chunker import DocChunk
|
|
5
|
+
from docling_core.transforms.chunker.page_chunker import PageChunker
|
|
6
|
+
from docling_core.types.doc.document import DoclingDocument
|
|
7
|
+
|
|
8
|
+
from .test_data_gen_flag import GEN_TEST_DATA
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _process(act_data, exp_path_str):
|
|
12
|
+
if GEN_TEST_DATA:
|
|
13
|
+
with open(exp_path_str, mode="w", encoding="utf-8") as f:
|
|
14
|
+
json.dump(act_data, fp=f, indent=4)
|
|
15
|
+
f.write("\n")
|
|
16
|
+
else:
|
|
17
|
+
with open(exp_path_str, encoding="utf-8") as f:
|
|
18
|
+
exp_data = json.load(fp=f)
|
|
19
|
+
assert exp_data == act_data
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def test_page_chunks():
|
|
23
|
+
src = Path("./test/data/doc/cross_page_lists.json")
|
|
24
|
+
doc = DoclingDocument.load_from_json(src)
|
|
25
|
+
|
|
26
|
+
chunker = PageChunker()
|
|
27
|
+
|
|
28
|
+
chunk_iter = chunker.chunk(dl_doc=doc)
|
|
29
|
+
chunks = list(chunk_iter)
|
|
30
|
+
act_data = dict(
|
|
31
|
+
root=[DocChunk.model_validate(n).export_json_dict() for n in chunks]
|
|
32
|
+
)
|
|
33
|
+
_process(
|
|
34
|
+
act_data=act_data,
|
|
35
|
+
exp_path_str=src.parent / f"{src.stem}_chunks.json",
|
|
36
|
+
)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.42.0 → docling_core-2.43.1}/docling_core/resources/schemas/doc/OCR-output.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.42.0 → docling_core-2.43.1}/docling_core/search/json_schema_to_search_mapper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
|
File without changes
|
{docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/chunker/hybrid_chunker.py
RENAMED
|
File without changes
|
{docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/chunker/tokenizer/__init__.py
RENAMED
|
File without changes
|
{docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/chunker/tokenizer/base.py
RENAMED
|
File without changes
|
{docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/chunker/tokenizer/huggingface.py
RENAMED
|
File without changes
|
{docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/chunker/tokenizer/openai.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/serializer/html_styles.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.42.0 → docling_core-2.43.1}/docling_core/transforms/visualizer/table_visualizer.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|