docling-core 2.34.2__tar.gz → 2.35.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. {docling_core-2.34.2 → docling_core-2.35.0}/PKG-INFO +1 -1
  2. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/transforms/visualizer/layout_visualizer.py +8 -4
  3. docling_core-2.35.0/docling_core/transforms/visualizer/table_visualizer.py +135 -0
  4. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core.egg-info/PKG-INFO +1 -1
  5. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core.egg-info/SOURCES.txt +1 -0
  6. {docling_core-2.34.2 → docling_core-2.35.0}/pyproject.toml +1 -1
  7. {docling_core-2.34.2 → docling_core-2.35.0}/test/test_visualization.py +14 -0
  8. {docling_core-2.34.2 → docling_core-2.35.0}/LICENSE +0 -0
  9. {docling_core-2.34.2 → docling_core-2.35.0}/README.md +0 -0
  10. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/__init__.py +0 -0
  11. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/cli/__init__.py +0 -0
  12. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/cli/view.py +0 -0
  13. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/experimental/__init__.py +0 -0
  14. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/py.typed +0 -0
  15. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
  16. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
  17. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  18. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
  19. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  20. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  21. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  22. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  23. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/search/__init__.py +0 -0
  24. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  25. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/search/mapping.py +0 -0
  26. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/search/meta.py +0 -0
  27. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/search/package.py +0 -0
  28. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/transforms/__init__.py +0 -0
  29. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/transforms/chunker/__init__.py +0 -0
  30. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/transforms/chunker/base.py +0 -0
  31. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
  32. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
  33. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/transforms/chunker/tokenizer/__init__.py +0 -0
  34. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/transforms/chunker/tokenizer/base.py +0 -0
  35. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/transforms/chunker/tokenizer/huggingface.py +0 -0
  36. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/transforms/chunker/tokenizer/openai.py +0 -0
  37. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/transforms/serializer/__init__.py +0 -0
  38. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/transforms/serializer/base.py +0 -0
  39. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/transforms/serializer/common.py +0 -0
  40. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/transforms/serializer/doctags.py +0 -0
  41. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/transforms/serializer/html.py +0 -0
  42. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/transforms/serializer/html_styles.py +0 -0
  43. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/transforms/serializer/markdown.py +0 -0
  44. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/transforms/visualizer/__init__.py +0 -0
  45. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/transforms/visualizer/base.py +0 -0
  46. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/transforms/visualizer/reading_order_visualizer.py +0 -0
  47. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/types/__init__.py +0 -0
  48. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/types/base.py +0 -0
  49. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/types/doc/__init__.py +0 -0
  50. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/types/doc/base.py +0 -0
  51. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/types/doc/document.py +0 -0
  52. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/types/doc/labels.py +0 -0
  53. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/types/doc/page.py +0 -0
  54. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/types/doc/tokens.py +0 -0
  55. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/types/doc/utils.py +0 -0
  56. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/types/gen/__init__.py +0 -0
  57. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/types/gen/generic.py +0 -0
  58. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/types/io/__init__.py +0 -0
  59. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/types/legacy_doc/__init__.py +0 -0
  60. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/types/legacy_doc/base.py +0 -0
  61. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  62. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  63. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  64. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/types/legacy_doc/document.py +0 -0
  65. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/types/legacy_doc/tokens.py +0 -0
  66. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/types/nlp/__init__.py +0 -0
  67. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/types/nlp/qa.py +0 -0
  68. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/types/nlp/qa_labels.py +0 -0
  69. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/types/rec/__init__.py +0 -0
  70. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/types/rec/attribute.py +0 -0
  71. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/types/rec/base.py +0 -0
  72. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/types/rec/predicate.py +0 -0
  73. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/types/rec/record.py +0 -0
  74. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/types/rec/statement.py +0 -0
  75. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/types/rec/subject.py +0 -0
  76. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/utils/__init__.py +0 -0
  77. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/utils/alias.py +0 -0
  78. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/utils/file.py +0 -0
  79. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/utils/generate_docs.py +0 -0
  80. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/utils/generate_jsonschema.py +0 -0
  81. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/utils/legacy.py +0 -0
  82. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/utils/validate.py +0 -0
  83. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core/utils/validators.py +0 -0
  84. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core.egg-info/dependency_links.txt +0 -0
  85. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core.egg-info/entry_points.txt +0 -0
  86. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core.egg-info/requires.txt +0 -0
  87. {docling_core-2.34.2 → docling_core-2.35.0}/docling_core.egg-info/top_level.txt +0 -0
  88. {docling_core-2.34.2 → docling_core-2.35.0}/setup.cfg +0 -0
  89. {docling_core-2.34.2 → docling_core-2.35.0}/test/test_base.py +0 -0
  90. {docling_core-2.34.2 → docling_core-2.35.0}/test/test_collection.py +0 -0
  91. {docling_core-2.34.2 → docling_core-2.35.0}/test/test_data_gen_flag.py +0 -0
  92. {docling_core-2.34.2 → docling_core-2.35.0}/test/test_doc_base.py +0 -0
  93. {docling_core-2.34.2 → docling_core-2.35.0}/test/test_doc_legacy_convert.py +0 -0
  94. {docling_core-2.34.2 → docling_core-2.35.0}/test/test_doc_schema.py +0 -0
  95. {docling_core-2.34.2 → docling_core-2.35.0}/test/test_doc_schema_extractor.py +0 -0
  96. {docling_core-2.34.2 → docling_core-2.35.0}/test/test_docling_doc.py +0 -0
  97. {docling_core-2.34.2 → docling_core-2.35.0}/test/test_doctags_load.py +0 -0
  98. {docling_core-2.34.2 → docling_core-2.35.0}/test/test_hierarchical_chunker.py +0 -0
  99. {docling_core-2.34.2 → docling_core-2.35.0}/test/test_hybrid_chunker.py +0 -0
  100. {docling_core-2.34.2 → docling_core-2.35.0}/test/test_json_schema_to_search_mapper.py +0 -0
  101. {docling_core-2.34.2 → docling_core-2.35.0}/test/test_nlp_qa.py +0 -0
  102. {docling_core-2.34.2 → docling_core-2.35.0}/test/test_otsl_table_export.py +0 -0
  103. {docling_core-2.34.2 → docling_core-2.35.0}/test/test_page.py +0 -0
  104. {docling_core-2.34.2 → docling_core-2.35.0}/test/test_rec_schema.py +0 -0
  105. {docling_core-2.34.2 → docling_core-2.35.0}/test/test_search_meta.py +0 -0
  106. {docling_core-2.34.2 → docling_core-2.35.0}/test/test_serialization.py +0 -0
  107. {docling_core-2.34.2 → docling_core-2.35.0}/test/test_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling-core
3
- Version: 2.34.2
3
+ Version: 2.35.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
@@ -119,7 +119,10 @@ class LayoutVisualizer(BaseVisualizer):
119
119
  )
120
120
 
121
121
  def _draw_doc_layout(
122
- self, doc: DoclingDocument, images: Optional[dict[Optional[int], Image]] = None
122
+ self,
123
+ doc: DoclingDocument,
124
+ images: Optional[dict[Optional[int], Image]] = None,
125
+ included_content_layers: Optional[set[ContentLayer]] = None,
123
126
  ):
124
127
  """Draw the document clusters and optionally the reading order."""
125
128
  clusters = []
@@ -128,6 +131,9 @@ class LayoutVisualizer(BaseVisualizer):
128
131
  if images is not None:
129
132
  my_images = images
130
133
 
134
+ if included_content_layers is None:
135
+ included_content_layers = {c for c in ContentLayer}
136
+
131
137
  # Initialise `my_images` beforehand: sometimes, you have the
132
138
  # page-images but no DocItems!
133
139
  for page_nr, page in doc.pages.items():
@@ -141,9 +147,7 @@ class LayoutVisualizer(BaseVisualizer):
141
147
  prev_image = None
142
148
  prev_page_nr = None
143
149
  for idx, (elem, _) in enumerate(
144
- doc.iterate_items(
145
- included_content_layers={ContentLayer.BODY, ContentLayer.FURNITURE}
146
- )
150
+ doc.iterate_items(included_content_layers=included_content_layers)
147
151
  ):
148
152
  if not isinstance(elem, DocItem):
149
153
  continue
@@ -0,0 +1,135 @@
1
+ """Define classes for table visualization."""
2
+
3
+ import logging
4
+ from copy import deepcopy
5
+ from typing import Optional
6
+
7
+ from PIL import ImageDraw
8
+ from PIL.Image import Image
9
+ from pydantic import BaseModel
10
+ from typing_extensions import override
11
+
12
+ from docling_core.transforms.visualizer.base import BaseVisualizer
13
+ from docling_core.types.doc.document import ContentLayer, DoclingDocument, TableItem
14
+
15
+ _log = logging.getLogger(__name__)
16
+
17
+
18
+ class TableVisualizer(BaseVisualizer):
19
+ """Table visualizer."""
20
+
21
+ class Params(BaseModel):
22
+ """Table visualization parameters."""
23
+
24
+ # show_Label: bool = False
25
+ show_cells: bool = True
26
+ # show_rows: bool = False
27
+ # show_cols: bool = False
28
+
29
+ base_visualizer: Optional[BaseVisualizer] = None
30
+ params: Params = Params()
31
+
32
+ def _draw_table_cells(
33
+ self,
34
+ table: TableItem,
35
+ page_image: Image,
36
+ page_height: float,
37
+ scale_x: float,
38
+ scale_y: float,
39
+ ):
40
+ """Draw individual table cells."""
41
+ draw = ImageDraw.Draw(page_image, "RGBA")
42
+
43
+ for cell in table.data.table_cells:
44
+ if cell.bbox is not None:
45
+
46
+ tl_bbox = cell.bbox.to_top_left_origin(page_height=page_height)
47
+
48
+ cell_color = (256, 0, 0, 32) # Mostly transparent red fill for cells
49
+
50
+ cx0, cy0, cx1, cy1 = tl_bbox.as_tuple()
51
+ cx0 *= scale_x
52
+ cx1 *= scale_x
53
+ cy0 *= scale_y
54
+ cy1 *= scale_y
55
+
56
+ draw.rectangle(
57
+ [(cx0, cy0), (cx1, cy1)],
58
+ outline=(256, 0, 0, 128),
59
+ fill=cell_color,
60
+ )
61
+
62
+ def _draw_doc_tables(
63
+ self,
64
+ doc: DoclingDocument,
65
+ images: Optional[dict[Optional[int], Image]] = None,
66
+ included_content_layers: Optional[set[ContentLayer]] = None,
67
+ ):
68
+ """Draw the document tables."""
69
+ my_images: dict[Optional[int], Image] = {}
70
+
71
+ if images is not None:
72
+ my_images = images
73
+
74
+ if included_content_layers is None:
75
+ included_content_layers = {c for c in ContentLayer}
76
+
77
+ # Initialise `my_images` beforehand: sometimes, you have the
78
+ # page-images but no DocItems!
79
+ for page_nr, page in doc.pages.items():
80
+ page_image = doc.pages[page_nr].image
81
+ if page_image is None or (pil_img := page_image.pil_image) is None:
82
+ raise RuntimeError("Cannot visualize document without images")
83
+ elif page_nr not in my_images:
84
+ image = deepcopy(pil_img)
85
+ my_images[page_nr] = image
86
+
87
+ for idx, (elem, _) in enumerate(
88
+ doc.iterate_items(included_content_layers=included_content_layers)
89
+ ):
90
+ if not isinstance(elem, TableItem):
91
+ continue
92
+ if len(elem.prov) == 0:
93
+ continue # Skip elements without provenances
94
+
95
+ if len(elem.prov) == 1:
96
+
97
+ page_nr = elem.prov[0].page_no
98
+
99
+ if page_nr in my_images:
100
+ image = my_images[page_nr]
101
+
102
+ if self.params.show_cells:
103
+ self._draw_table_cells(
104
+ table=elem,
105
+ page_height=doc.pages[page_nr].size.height,
106
+ page_image=image,
107
+ scale_x=image.width / doc.pages[page_nr].size.width,
108
+ scale_y=image.height / doc.pages[page_nr].size.height,
109
+ )
110
+
111
+ else:
112
+ raise RuntimeError(f"Cannot visualize page-image for {page_nr}")
113
+
114
+ else:
115
+ _log.error("Can not yet visualise tables with multiple provenances")
116
+
117
+ return my_images
118
+
119
+ @override
120
+ def get_visualization(
121
+ self,
122
+ *,
123
+ doc: DoclingDocument,
124
+ **kwargs,
125
+ ) -> dict[Optional[int], Image]:
126
+ """Get visualization of the document as images by page."""
127
+ base_images = (
128
+ self.base_visualizer.get_visualization(doc=doc, **kwargs)
129
+ if self.base_visualizer
130
+ else None
131
+ )
132
+ return self._draw_doc_tables(
133
+ doc=doc,
134
+ images=base_images,
135
+ )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling-core
3
- Version: 2.34.2
3
+ Version: 2.35.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Author-email: Cesar Berrospi Ramis <ceb@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  Maintainer-email: Panos Vagenas <pva@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Christoph Auer <cau@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>, Cesar Berrospi Ramis <ceb@zurich.ibm.com>
@@ -45,6 +45,7 @@ docling_core/transforms/visualizer/__init__.py
45
45
  docling_core/transforms/visualizer/base.py
46
46
  docling_core/transforms/visualizer/layout_visualizer.py
47
47
  docling_core/transforms/visualizer/reading_order_visualizer.py
48
+ docling_core/transforms/visualizer/table_visualizer.py
48
49
  docling_core/types/__init__.py
49
50
  docling_core/types/base.py
50
51
  docling_core/types/doc/__init__.py
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "docling-core"
3
- version = "2.34.2" # DO NOT EDIT, updated automatically
3
+ version = "2.35.0" # DO NOT EDIT, updated automatically
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  license-files = ["LICENSE"]
@@ -2,6 +2,7 @@ from pathlib import Path
2
2
 
3
3
  import PIL.Image
4
4
 
5
+ from docling_core.transforms.visualizer.table_visualizer import TableVisualizer
5
6
  from docling_core.types.doc.document import DoclingDocument
6
7
 
7
8
  from .test_data_gen_flag import GEN_TEST_DATA
@@ -52,3 +53,16 @@ def test_doc_visualization_no_label():
52
53
  exp_file=VIZ_TEST_DATA_PATH / f"{src.stem}_viz_wout_lbl_p{k}.png",
53
54
  actual=viz_pages[k],
54
55
  )
56
+
57
+
58
+ def test_table_visualization_no_label():
59
+ src = Path("./test/data/doc/2408.09869v3_enriched.json")
60
+ doc = DoclingDocument.load_from_json(src)
61
+
62
+ visualizer = TableVisualizer()
63
+ viz_pages = visualizer.get_visualization(doc=doc)
64
+
65
+ verify(
66
+ exp_file=VIZ_TEST_DATA_PATH / f"{src.stem}_table_viz_wout_lbl_p5.png",
67
+ actual=viz_pages[5],
68
+ )
File without changes
File without changes
File without changes