docling-core 2.43.0__tar.gz → 2.44.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- {docling_core-2.43.0 → docling_core-2.44.0}/PKG-INFO +1 -1
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/transforms/serializer/html.py +17 -0
- docling_core-2.44.0/docling_core/transforms/visualizer/key_value_visualizer.py +217 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/transforms/visualizer/layout_visualizer.py +3 -1
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/types/doc/document.py +45 -10
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core.egg-info/PKG-INFO +1 -1
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core.egg-info/SOURCES.txt +1 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/pyproject.toml +1 -1
- {docling_core-2.43.0 → docling_core-2.44.0}/test/test_base.py +2 -2
- {docling_core-2.43.0 → docling_core-2.44.0}/test/test_docling_doc.py +11 -9
- {docling_core-2.43.0 → docling_core-2.44.0}/test/test_doctags_load.py +8 -4
- {docling_core-2.43.0 → docling_core-2.44.0}/LICENSE +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/README.md +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/__init__.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/cli/__init__.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/cli/view.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/experimental/__init__.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/py.typed +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/search/__init__.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/search/mapping.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/search/meta.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/search/package.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/transforms/chunker/__init__.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/transforms/chunker/base.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/transforms/chunker/page_chunker.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/transforms/chunker/tokenizer/__init__.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/transforms/chunker/tokenizer/base.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/transforms/chunker/tokenizer/huggingface.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/transforms/chunker/tokenizer/openai.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/transforms/serializer/__init__.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/transforms/serializer/base.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/transforms/serializer/common.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/transforms/serializer/doctags.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/transforms/serializer/html_styles.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/transforms/serializer/markdown.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/transforms/visualizer/__init__.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/transforms/visualizer/base.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/transforms/visualizer/reading_order_visualizer.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/transforms/visualizer/table_visualizer.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/types/__init__.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/types/base.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/types/doc/__init__.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/types/doc/base.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/types/doc/labels.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/types/doc/page.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/types/doc/tokens.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/types/doc/utils.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/types/io/__init__.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/utils/alias.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/utils/file.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/utils/legacy.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/utils/validate.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core/utils/validators.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core.egg-info/dependency_links.txt +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core.egg-info/entry_points.txt +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core.egg-info/requires.txt +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/docling_core.egg-info/top_level.txt +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/setup.cfg +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/test/test_collection.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/test/test_data_gen_flag.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/test/test_doc_base.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/test/test_doc_legacy_convert.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/test/test_doc_schema.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/test/test_doc_schema_extractor.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/test/test_hierarchical_chunker.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/test/test_hybrid_chunker.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/test/test_json_schema_to_search_mapper.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/test/test_nlp_qa.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/test/test_otsl_table_export.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/test/test_page.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/test/test_page_chunker.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/test/test_rec_schema.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/test/test_search_meta.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/test/test_serialization.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/test/test_utils.py +0 -0
- {docling_core-2.43.0 → docling_core-2.44.0}/test/test_visualization.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.43.0
|
|
3
|
+
Version: 2.44.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
|
|
@@ -713,6 +713,23 @@ class HTMLListSerializer(BaseModel, BaseListSerializer):
|
|
|
713
713
|
**kwargs,
|
|
714
714
|
)
|
|
715
715
|
|
|
716
|
+
# Append nested list to parent list item:
|
|
717
|
+
i = 0
|
|
718
|
+
while i < len(parts):
|
|
719
|
+
prt = parts[i]
|
|
720
|
+
if prt.text.startswith(("<ul>", "<ol>")):
|
|
721
|
+
for j in range(i - 1, -1, -1):
|
|
722
|
+
if parts[j].text.startswith(("<li>", "<li ")) and parts[
|
|
723
|
+
j
|
|
724
|
+
].text.endswith("</li>"):
|
|
725
|
+
before, _, _ = parts[j].text.rpartition("</li>")
|
|
726
|
+
parts[j].text = f"{before}\n{prt.text}\n</li>"
|
|
727
|
+
break
|
|
728
|
+
if j > -1:
|
|
729
|
+
parts.pop(i)
|
|
730
|
+
else:
|
|
731
|
+
i += 1
|
|
732
|
+
|
|
716
733
|
# Add all child parts
|
|
717
734
|
text_res = "\n".join(
|
|
718
735
|
[
|
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
"""Key‑value visualizer overlaying key/value cells and their links on page images.
|
|
2
|
+
|
|
3
|
+
This module complements :py:class:`layout_visualizer.LayoutVisualizer` by drawing
|
|
4
|
+
*key* and *value* cells plus the directed links between them. It can be stacked
|
|
5
|
+
on top of any other :py:class:`BaseVisualizer` – e.g. first draw the general
|
|
6
|
+
layout, then add the key‑value layer.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from copy import deepcopy
|
|
10
|
+
from typing import Optional, Union
|
|
11
|
+
|
|
12
|
+
from PIL import ImageDraw, ImageFont
|
|
13
|
+
from PIL.Image import Image
|
|
14
|
+
from PIL.ImageFont import FreeTypeFont
|
|
15
|
+
from pydantic import BaseModel
|
|
16
|
+
from typing_extensions import override
|
|
17
|
+
|
|
18
|
+
from docling_core.transforms.visualizer.base import BaseVisualizer
|
|
19
|
+
from docling_core.types.doc.document import ContentLayer, DoclingDocument
|
|
20
|
+
from docling_core.types.doc.labels import GraphCellLabel, GraphLinkLabel
|
|
21
|
+
|
|
22
|
+
# ---------------------------------------------------------------------------
|
|
23
|
+
# Helper functions / constants
|
|
24
|
+
# ---------------------------------------------------------------------------
|
|
25
|
+
|
|
26
|
+
# Semi‑transparent RGBA colours for key / value cells and their connecting link
|
|
27
|
+
_KEY_FILL = (0, 170, 0, 70) # greenish
|
|
28
|
+
_VALUE_FILL = (0, 0, 200, 70) # bluish
|
|
29
|
+
_LINK_COLOUR = (255, 0, 0, 255) # red line (solid)
|
|
30
|
+
|
|
31
|
+
_LABEL_TXT_COLOUR = (0, 0, 0, 255)
|
|
32
|
+
_LABEL_BG_COLOUR = (255, 255, 255, 180) # semi‑transparent white
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class KeyValueVisualizer(BaseVisualizer):
|
|
36
|
+
"""Draw key/value graphs stored in :py:attr:`DoclingDocument.key_value_items`."""
|
|
37
|
+
|
|
38
|
+
class Params(BaseModel):
|
|
39
|
+
"""Parameters for KeyValueVisualizer controlling label and cell id display, and content layers to visualize."""
|
|
40
|
+
|
|
41
|
+
show_label: bool = True # draw cell text close to bbox
|
|
42
|
+
show_cell_id: bool = False # annotate each rectangle with its cell_id
|
|
43
|
+
content_layers: set[ContentLayer] = {cl for cl in ContentLayer}
|
|
44
|
+
|
|
45
|
+
base_visualizer: Optional[BaseVisualizer] = None
|
|
46
|
+
params: Params = Params()
|
|
47
|
+
|
|
48
|
+
# ---------------------------------------------------------------------
|
|
49
|
+
# Internal helpers
|
|
50
|
+
# ---------------------------------------------------------------------
|
|
51
|
+
|
|
52
|
+
def _cell_fill(self, label: GraphCellLabel) -> tuple[int, int, int, int]:
|
|
53
|
+
"""Return RGBA fill colour depending on *label*."""
|
|
54
|
+
return _KEY_FILL if label == GraphCellLabel.KEY else _VALUE_FILL
|
|
55
|
+
|
|
56
|
+
def _draw_key_value_layer(
|
|
57
|
+
self,
|
|
58
|
+
*,
|
|
59
|
+
image: Image,
|
|
60
|
+
doc: DoclingDocument,
|
|
61
|
+
page_no: int,
|
|
62
|
+
scale_x: float,
|
|
63
|
+
scale_y: float,
|
|
64
|
+
) -> None:
|
|
65
|
+
"""Draw every key‑value graph that has cells on *page_no* onto *image*."""
|
|
66
|
+
draw = ImageDraw.Draw(image, "RGBA")
|
|
67
|
+
# Choose a small truetype font if available, otherwise default bitmap font
|
|
68
|
+
font: Union[ImageFont.ImageFont, FreeTypeFont]
|
|
69
|
+
try:
|
|
70
|
+
font = ImageFont.truetype("arial.ttf", 12)
|
|
71
|
+
except OSError:
|
|
72
|
+
font = ImageFont.load_default()
|
|
73
|
+
|
|
74
|
+
for kv_item in doc.key_value_items:
|
|
75
|
+
cell_dict = {cell.cell_id: cell for cell in kv_item.graph.cells}
|
|
76
|
+
|
|
77
|
+
# ------------------------------------------------------------------
|
|
78
|
+
# First draw cells (rectangles + optional labels)
|
|
79
|
+
# ------------------------------------------------------------------
|
|
80
|
+
for cell in cell_dict.values():
|
|
81
|
+
if cell.prov is None or cell.prov.page_no != page_no:
|
|
82
|
+
continue # skip cells not on this page or without bbox
|
|
83
|
+
|
|
84
|
+
tl_bbox = cell.prov.bbox.to_top_left_origin(
|
|
85
|
+
page_height=doc.pages[page_no].size.height
|
|
86
|
+
)
|
|
87
|
+
x0, y0, x1, y1 = tl_bbox.as_tuple()
|
|
88
|
+
x0 *= scale_x
|
|
89
|
+
x1 *= scale_x
|
|
90
|
+
y0 *= scale_y
|
|
91
|
+
y1 *= scale_y
|
|
92
|
+
fill_rgba = self._cell_fill(cell.label)
|
|
93
|
+
|
|
94
|
+
draw.rectangle(
|
|
95
|
+
[(x0, y0), (x1, y1)],
|
|
96
|
+
outline=fill_rgba[:-1] + (255,),
|
|
97
|
+
fill=fill_rgba,
|
|
98
|
+
)
|
|
99
|
+
|
|
100
|
+
if self.params.show_label:
|
|
101
|
+
txt_parts = []
|
|
102
|
+
if self.params.show_cell_id:
|
|
103
|
+
txt_parts.append(str(cell.cell_id))
|
|
104
|
+
txt_parts.append(cell.text)
|
|
105
|
+
label_text = " | ".join(txt_parts)
|
|
106
|
+
|
|
107
|
+
tbx = draw.textbbox((x0, y0), label_text, font=font)
|
|
108
|
+
pad = 2
|
|
109
|
+
draw.rectangle(
|
|
110
|
+
[(tbx[0] - pad, tbx[1] - pad), (tbx[2] + pad, tbx[3] + pad)],
|
|
111
|
+
fill=_LABEL_BG_COLOUR,
|
|
112
|
+
)
|
|
113
|
+
draw.text((x0, y0), label_text, font=font, fill=_LABEL_TXT_COLOUR)
|
|
114
|
+
|
|
115
|
+
# ------------------------------------------------------------------
|
|
116
|
+
# Then draw links (after rectangles so they appear on top)
|
|
117
|
+
# ------------------------------------------------------------------
|
|
118
|
+
for link in kv_item.graph.links:
|
|
119
|
+
if link.label != GraphLinkLabel.TO_VALUE:
|
|
120
|
+
# Future‑proof: ignore other link types silently
|
|
121
|
+
continue
|
|
122
|
+
|
|
123
|
+
src_cell = cell_dict.get(link.source_cell_id)
|
|
124
|
+
tgt_cell = cell_dict.get(link.target_cell_id)
|
|
125
|
+
if src_cell is None or tgt_cell is None:
|
|
126
|
+
continue
|
|
127
|
+
if (
|
|
128
|
+
src_cell.prov is None
|
|
129
|
+
or tgt_cell.prov is None
|
|
130
|
+
or src_cell.prov.page_no != page_no
|
|
131
|
+
or tgt_cell.prov.page_no != page_no
|
|
132
|
+
):
|
|
133
|
+
continue # only draw if both ends are on this page
|
|
134
|
+
|
|
135
|
+
def _centre(bbox):
|
|
136
|
+
tl = bbox.to_top_left_origin(
|
|
137
|
+
page_height=doc.pages[page_no].size.height
|
|
138
|
+
)
|
|
139
|
+
l, t, r, b = tl.as_tuple()
|
|
140
|
+
return ((l + r) / 2 * scale_x, (t + b) / 2 * scale_y)
|
|
141
|
+
|
|
142
|
+
src_xy = _centre(src_cell.prov.bbox)
|
|
143
|
+
tgt_xy = _centre(tgt_cell.prov.bbox)
|
|
144
|
+
|
|
145
|
+
draw.line([src_xy, tgt_xy], fill=_LINK_COLOUR, width=2)
|
|
146
|
+
|
|
147
|
+
# draw a small arrow‑head by rendering a short orthogonal line
|
|
148
|
+
# segment; exact geometry is not critical for visual inspection
|
|
149
|
+
arrow_len = 6
|
|
150
|
+
dx = tgt_xy[0] - src_xy[0]
|
|
151
|
+
dy = tgt_xy[1] - src_xy[1]
|
|
152
|
+
length = (dx**2 + dy**2) ** 0.5 or 1.0
|
|
153
|
+
ux, uy = dx / length, dy / length
|
|
154
|
+
# perpendicular vector
|
|
155
|
+
px, py = -uy, ux
|
|
156
|
+
# two points forming the arrow head triangle base
|
|
157
|
+
head_base_left = (
|
|
158
|
+
tgt_xy[0] - ux * arrow_len - px * arrow_len / 2,
|
|
159
|
+
tgt_xy[1] - uy * arrow_len - py * arrow_len / 2,
|
|
160
|
+
)
|
|
161
|
+
head_base_right = (
|
|
162
|
+
tgt_xy[0] - ux * arrow_len + px * arrow_len / 2,
|
|
163
|
+
tgt_xy[1] - uy * arrow_len + py * arrow_len / 2,
|
|
164
|
+
)
|
|
165
|
+
draw.polygon(
|
|
166
|
+
[tgt_xy, head_base_left, head_base_right], fill=_LINK_COLOUR
|
|
167
|
+
)
|
|
168
|
+
|
|
169
|
+
# ---------------------------------------------------------------------
|
|
170
|
+
# Public API – BaseVisualizer implementation
|
|
171
|
+
# ---------------------------------------------------------------------
|
|
172
|
+
|
|
173
|
+
@override
|
|
174
|
+
def get_visualization(
|
|
175
|
+
self,
|
|
176
|
+
*,
|
|
177
|
+
doc: DoclingDocument,
|
|
178
|
+
included_content_layers: Optional[set[ContentLayer]] = None,
|
|
179
|
+
**kwargs,
|
|
180
|
+
) -> dict[Optional[int], Image]:
|
|
181
|
+
"""Return page‑wise images with key/value overlay (incl. base layer)."""
|
|
182
|
+
base_images = (
|
|
183
|
+
self.base_visualizer.get_visualization(
|
|
184
|
+
doc=doc, included_content_layers=included_content_layers, **kwargs
|
|
185
|
+
)
|
|
186
|
+
if self.base_visualizer
|
|
187
|
+
else None
|
|
188
|
+
)
|
|
189
|
+
|
|
190
|
+
if included_content_layers is None:
|
|
191
|
+
included_content_layers = {cl for cl in ContentLayer}
|
|
192
|
+
|
|
193
|
+
images: dict[Optional[int], Image] = {}
|
|
194
|
+
|
|
195
|
+
# Ensure we have page images to draw on
|
|
196
|
+
for page_nr, page in doc.pages.items():
|
|
197
|
+
base_img = (base_images or {}).get(page_nr)
|
|
198
|
+
if base_img is None:
|
|
199
|
+
if page.image is None or (pil_img := page.image.pil_image) is None:
|
|
200
|
+
raise RuntimeError("Cannot visualize document without page images")
|
|
201
|
+
base_img = deepcopy(pil_img)
|
|
202
|
+
images[page_nr] = base_img
|
|
203
|
+
|
|
204
|
+
# Overlay key‑value content
|
|
205
|
+
for page_nr, img in images.items(): # type: ignore
|
|
206
|
+
assert isinstance(page_nr, int)
|
|
207
|
+
scale_x = img.width / doc.pages[page_nr].size.width
|
|
208
|
+
scale_y = img.height / doc.pages[page_nr].size.height
|
|
209
|
+
self._draw_key_value_layer(
|
|
210
|
+
image=img,
|
|
211
|
+
doc=doc,
|
|
212
|
+
page_no=page_nr,
|
|
213
|
+
scale_x=scale_x,
|
|
214
|
+
scale_y=scale_y,
|
|
215
|
+
)
|
|
216
|
+
|
|
217
|
+
return images
|
{docling_core-2.43.0 → docling_core-2.44.0}/docling_core/transforms/visualizer/layout_visualizer.py
RENAMED
|
@@ -148,7 +148,9 @@ class LayoutVisualizer(BaseVisualizer):
|
|
|
148
148
|
prev_image = None
|
|
149
149
|
prev_page_nr = None
|
|
150
150
|
for idx, (elem, _) in enumerate(
|
|
151
|
-
doc.iterate_items(
|
|
151
|
+
doc.iterate_items(
|
|
152
|
+
included_content_layers=included_content_layers, traverse_pictures=True
|
|
153
|
+
)
|
|
152
154
|
):
|
|
153
155
|
if not isinstance(elem, DocItem):
|
|
154
156
|
continue
|
|
@@ -5543,8 +5543,27 @@ class DoclingDocument(BaseModel):
|
|
|
5543
5543
|
self,
|
|
5544
5544
|
show_label: bool = True,
|
|
5545
5545
|
show_branch_numbering: bool = False,
|
|
5546
|
+
viz_mode: Literal["reading_order", "key_value"] = "reading_order",
|
|
5547
|
+
show_cell_id: bool = False,
|
|
5546
5548
|
) -> dict[Optional[int], PILImage.Image]:
|
|
5547
|
-
"""Get visualization of the document as images by page.
|
|
5549
|
+
"""Get visualization of the document as images by page.
|
|
5550
|
+
|
|
5551
|
+
:param show_label: Show labels on elements (applies to all visualizers).
|
|
5552
|
+
:type show_label: bool
|
|
5553
|
+
:param show_branch_numbering: Show branch numbering (reading order visualizer only).
|
|
5554
|
+
:type show_branch_numbering: bool
|
|
5555
|
+
:param visualizer: Which visualizer to use. One of 'reading_order' (default), 'key_value'.
|
|
5556
|
+
:type visualizer: str
|
|
5557
|
+
:param show_cell_id: Show cell IDs (key value visualizer only).
|
|
5558
|
+
:type show_cell_id: bool
|
|
5559
|
+
|
|
5560
|
+
:returns: Dictionary mapping page numbers to PIL images.
|
|
5561
|
+
:rtype: dict[Optional[int], PILImage.Image]
|
|
5562
|
+
"""
|
|
5563
|
+
from docling_core.transforms.visualizer.base import BaseVisualizer
|
|
5564
|
+
from docling_core.transforms.visualizer.key_value_visualizer import (
|
|
5565
|
+
KeyValueVisualizer,
|
|
5566
|
+
)
|
|
5548
5567
|
from docling_core.transforms.visualizer.layout_visualizer import (
|
|
5549
5568
|
LayoutVisualizer,
|
|
5550
5569
|
)
|
|
@@ -5552,18 +5571,34 @@ class DoclingDocument(BaseModel):
|
|
|
5552
5571
|
ReadingOrderVisualizer,
|
|
5553
5572
|
)
|
|
5554
5573
|
|
|
5555
|
-
|
|
5556
|
-
|
|
5557
|
-
|
|
5574
|
+
visualizer_obj: BaseVisualizer
|
|
5575
|
+
if viz_mode == "reading_order":
|
|
5576
|
+
visualizer_obj = ReadingOrderVisualizer(
|
|
5577
|
+
base_visualizer=LayoutVisualizer(
|
|
5578
|
+
params=LayoutVisualizer.Params(
|
|
5579
|
+
show_label=show_label,
|
|
5580
|
+
),
|
|
5581
|
+
),
|
|
5582
|
+
params=ReadingOrderVisualizer.Params(
|
|
5583
|
+
show_branch_numbering=show_branch_numbering,
|
|
5584
|
+
),
|
|
5585
|
+
)
|
|
5586
|
+
elif viz_mode == "key_value":
|
|
5587
|
+
visualizer_obj = KeyValueVisualizer(
|
|
5588
|
+
base_visualizer=LayoutVisualizer(
|
|
5589
|
+
params=LayoutVisualizer.Params(
|
|
5590
|
+
show_label=show_label,
|
|
5591
|
+
),
|
|
5592
|
+
),
|
|
5593
|
+
params=KeyValueVisualizer.Params(
|
|
5558
5594
|
show_label=show_label,
|
|
5595
|
+
show_cell_id=show_cell_id,
|
|
5559
5596
|
),
|
|
5560
|
-
)
|
|
5561
|
-
|
|
5562
|
-
|
|
5563
|
-
),
|
|
5564
|
-
)
|
|
5565
|
-
images = visualizer.get_visualization(doc=self)
|
|
5597
|
+
)
|
|
5598
|
+
else:
|
|
5599
|
+
raise ValueError(f"Unknown visualization mode: {viz_mode}")
|
|
5566
5600
|
|
|
5601
|
+
images = visualizer_obj.get_visualization(doc=self)
|
|
5567
5602
|
return images
|
|
5568
5603
|
|
|
5569
5604
|
@field_validator("version")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.43.0
|
|
3
|
+
Version: 2.44.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
|
|
@@ -44,6 +44,7 @@ docling_core/transforms/serializer/html_styles.py
|
|
|
44
44
|
docling_core/transforms/serializer/markdown.py
|
|
45
45
|
docling_core/transforms/visualizer/__init__.py
|
|
46
46
|
docling_core/transforms/visualizer/base.py
|
|
47
|
+
docling_core/transforms/visualizer/key_value_visualizer.py
|
|
47
48
|
docling_core/transforms/visualizer/layout_visualizer.py
|
|
48
49
|
docling_core/transforms/visualizer/reading_order_visualizer.py
|
|
49
50
|
docling_core/transforms/visualizer/table_visualizer.py
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "docling-core"
|
|
3
|
-
version = "2.43.0" # DO NOT EDIT, updated automatically
|
|
3
|
+
version = "2.44.0" # DO NOT EDIT, updated automatically
|
|
4
4
|
description = "A python library to define and validate data types in Docling."
|
|
5
5
|
license = "MIT"
|
|
6
6
|
license-files = ["LICENSE"]
|
|
@@ -36,8 +36,8 @@ def test_identifier():
|
|
|
36
36
|
)
|
|
37
37
|
|
|
38
38
|
# schema_json(): no need to set by_alias since it is True by the default
|
|
39
|
-
|
|
40
|
-
|
|
39
|
+
with open("test/data/json_schemas/base_identifier.json", encoding="utf-8") as tf:
|
|
40
|
+
gold_json = json.load(tf)
|
|
41
41
|
|
|
42
42
|
assert Identifier.model_json_schema() == gold_json
|
|
43
43
|
|
|
@@ -1819,9 +1819,10 @@ def test_document_manipulation():
|
|
|
1819
1819
|
|
|
1820
1820
|
# Test the handling of list items in insert_* methods, both with and without parent groups
|
|
1821
1821
|
|
|
1822
|
-
|
|
1823
|
-
|
|
1824
|
-
|
|
1822
|
+
with pytest.warns(DeprecationWarning, match="ListItem parent must be a ListGroup"):
|
|
1823
|
+
li_sibling = doc.insert_list_item(
|
|
1824
|
+
sibling=node, text="Inserted List Item, Incorrect Parent", after=False
|
|
1825
|
+
)
|
|
1825
1826
|
doc.insert_list_item(
|
|
1826
1827
|
sibling=li_sibling, text="Inserted List Item, Correct Parent", after=True
|
|
1827
1828
|
)
|
|
@@ -1831,12 +1832,13 @@ def test_document_manipulation():
|
|
|
1831
1832
|
text="Inserted Text with LIST_ITEM Label, Correct Parent",
|
|
1832
1833
|
after=False,
|
|
1833
1834
|
)
|
|
1834
|
-
|
|
1835
|
-
|
|
1836
|
-
|
|
1837
|
-
|
|
1838
|
-
|
|
1839
|
-
|
|
1835
|
+
with pytest.warns(DeprecationWarning, match="ListItem parent must be a ListGroup"):
|
|
1836
|
+
doc.insert_text(
|
|
1837
|
+
sibling=node,
|
|
1838
|
+
label=DocItemLabel.LIST_ITEM,
|
|
1839
|
+
text="Inserted Text with LIST_ITEM Label, Incorrect Parent",
|
|
1840
|
+
after=True,
|
|
1841
|
+
)
|
|
1840
1842
|
|
|
1841
1843
|
filename = Path(
|
|
1842
1844
|
"test/data/doc/constructed_doc.inserted_list_items_with_insert_*.json"
|
|
@@ -60,7 +60,8 @@ def test_doctags_load_from_files():
|
|
|
60
60
|
|
|
61
61
|
def test_doctags_load_from_memory():
|
|
62
62
|
|
|
63
|
-
|
|
63
|
+
with Path("test/data/doc/page_with_pic.dt").open() as file:
|
|
64
|
+
doctags = file.read()
|
|
64
65
|
image = PILImage.open(Path("test/data/doc/page_with_pic.png"))
|
|
65
66
|
|
|
66
67
|
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
|
|
@@ -75,7 +76,8 @@ def test_doctags_load_from_memory():
|
|
|
75
76
|
|
|
76
77
|
|
|
77
78
|
def test_doctags_load_without_image():
|
|
78
|
-
|
|
79
|
+
with Path("test/data/doc/page_with_pic.dt").open() as file:
|
|
80
|
+
doctags = file.read()
|
|
79
81
|
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], None)
|
|
80
82
|
doc = DoclingDocument.load_from_doctags(doctags_doc)
|
|
81
83
|
exp = "test/data/doc/page_without_pic.dt.json"
|
|
@@ -86,7 +88,8 @@ def test_doctags_load_without_image():
|
|
|
86
88
|
|
|
87
89
|
|
|
88
90
|
def test_doctags_load_for_kv_region():
|
|
89
|
-
|
|
91
|
+
with Path("test/data/doc/doc_with_kv.dt").open() as file:
|
|
92
|
+
doctags = file.read()
|
|
90
93
|
image = PILImage.open(Path("test/data/doc/doc_with_kv.png"))
|
|
91
94
|
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
|
|
92
95
|
doc = DoclingDocument.load_from_doctags(doctags_doc)
|
|
@@ -98,7 +101,8 @@ def test_doctags_load_for_kv_region():
|
|
|
98
101
|
|
|
99
102
|
|
|
100
103
|
def test_multipage_doctags_load():
|
|
101
|
-
|
|
104
|
+
with Path("test/data/doc/2206.01062.yaml.dt").open() as file:
|
|
105
|
+
doctags = file.read()
|
|
102
106
|
doctags_doc = DocTagsDocument.from_multipage_doctags_and_images(doctags, None)
|
|
103
107
|
doc = DoclingDocument.load_from_doctags(doctags_doc)
|
|
104
108
|
exp = "test/data/doc/2206.01062.yaml.dt.json"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.43.0 → docling_core-2.44.0}/docling_core/resources/schemas/doc/OCR-output.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.43.0 → docling_core-2.44.0}/docling_core/search/json_schema_to_search_mapper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.43.0 → docling_core-2.44.0}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
|
File without changes
|
{docling_core-2.43.0 → docling_core-2.44.0}/docling_core/transforms/chunker/hybrid_chunker.py
RENAMED
|
File without changes
|
|
File without changes
|
{docling_core-2.43.0 → docling_core-2.44.0}/docling_core/transforms/chunker/tokenizer/__init__.py
RENAMED
|
File without changes
|
{docling_core-2.43.0 → docling_core-2.44.0}/docling_core/transforms/chunker/tokenizer/base.py
RENAMED
|
File without changes
|
{docling_core-2.43.0 → docling_core-2.44.0}/docling_core/transforms/chunker/tokenizer/huggingface.py
RENAMED
|
File without changes
|
{docling_core-2.43.0 → docling_core-2.44.0}/docling_core/transforms/chunker/tokenizer/openai.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.43.0 → docling_core-2.44.0}/docling_core/transforms/serializer/html_styles.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.43.0 → docling_core-2.44.0}/docling_core/transforms/visualizer/table_visualizer.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|