docling 2.28.4__tar.gz → 2.30.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.28.4 → docling-2.30.0}/PKG-INFO +3 -3
- {docling-2.28.4 → docling-2.30.0}/docling/backend/docx/latex/latex_dict.py +3 -0
- {docling-2.28.4 → docling-2.30.0}/docling/backend/docx/latex/omml.py +14 -14
- {docling-2.28.4 → docling-2.30.0}/docling/backend/html_backend.py +2 -1
- docling-2.30.0/docling/backend/msexcel_backend.py +525 -0
- {docling-2.28.4 → docling-2.30.0}/docling/backend/mspowerpoint_backend.py +4 -3
- {docling-2.28.4 → docling-2.30.0}/docling/backend/msword_backend.py +320 -118
- {docling-2.28.4 → docling-2.30.0}/docling/cli/main.py +70 -2
- {docling-2.28.4 → docling-2.30.0}/docling/datamodel/base_models.py +33 -0
- {docling-2.28.4 → docling-2.30.0}/docling/datamodel/document.py +7 -0
- {docling-2.28.4 → docling-2.30.0}/docling/datamodel/pipeline_options.py +29 -3
- docling-2.30.0/docling/models/api_vlm_model.py +67 -0
- docling-2.30.0/docling/models/picture_description_api_model.py +58 -0
- {docling-2.28.4 → docling-2.30.0}/docling/models/picture_description_base_model.py +14 -2
- {docling-2.28.4 → docling-2.30.0}/docling/models/tesseract_ocr_cli_model.py +1 -1
- {docling-2.28.4 → docling-2.30.0}/docling/pipeline/standard_pdf_pipeline.py +6 -2
- {docling-2.28.4 → docling-2.30.0}/docling/pipeline/vlm_pipeline.py +27 -17
- docling-2.30.0/docling/utils/api_image_request.py +61 -0
- {docling-2.28.4 → docling-2.30.0}/pyproject.toml +3 -3
- docling-2.28.4/docling/backend/msexcel_backend.py +0 -343
- docling-2.28.4/docling/models/picture_description_api_model.py +0 -125
- {docling-2.28.4 → docling-2.30.0}/LICENSE +0 -0
- {docling-2.28.4 → docling-2.30.0}/README.md +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/__init__.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/backend/__init__.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/backend/csv_backend.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/backend/docling_parse_v4_backend.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/backend/docx/__init__.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/backend/docx/latex/__init__.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/backend/json/__init__.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/backend/md_backend.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/backend/xml/jats_backend.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/chunking/__init__.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/cli/__init__.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/cli/models.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/cli/tools.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/datamodel/settings.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/document_converter.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/exceptions.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/models/__init__.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/models/base_model.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/models/base_ocr_model.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/models/code_formula_model.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/models/document_picture_classifier.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/models/easyocr_model.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/models/factories/__init__.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/models/factories/base_factory.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/models/factories/ocr_factory.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/models/factories/picture_description_factory.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/models/hf_mlx_model.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/models/hf_vlm_model.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/models/layout_model.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/models/picture_description_vlm_model.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/models/plugins/__init__.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/models/plugins/defaults.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/models/readingorder_model.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/models/table_structure_model.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/py.typed +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/utils/__init__.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/utils/export.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/utils/glm_utils.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/utils/layout_postprocessor.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/utils/locks.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/utils/model_downloader.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/utils/ocr_utils.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/utils/profiling.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/utils/utils.py +0 -0
- {docling-2.28.4 → docling-2.30.0}/docling/utils/visualization.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.28.4
|
3
|
+
Version: 2.30.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/docling-project/docling
|
6
6
|
License: MIT
|
@@ -28,7 +28,7 @@ Provides-Extra: vlm
|
|
28
28
|
Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
|
29
29
|
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
30
30
|
Requires-Dist: certifi (>=2024.7.4)
|
31
|
-
Requires-Dist: docling-core[chunking] (>=2.
|
31
|
+
Requires-Dist: docling-core[chunking] (>=2.26.0,<3.0.0)
|
32
32
|
Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
|
33
33
|
Requires-Dist: docling-parse (>=4.0.0,<5.0.0)
|
34
34
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
@@ -58,7 +58,7 @@ Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
|
|
58
58
|
Requires-Dist: tqdm (>=4.65.0,<5.0.0)
|
59
59
|
Requires-Dist: transformers (>=4.42.0,<4.43.0) ; (sys_platform == "darwin" and platform_machine == "x86_64") and (extra == "vlm")
|
60
60
|
Requires-Dist: transformers (>=4.46.0,<5.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
|
61
|
-
Requires-Dist: typer (>=0.12.5,<0.
|
61
|
+
Requires-Dist: typer (>=0.12.5,<0.16.0)
|
62
62
|
Project-URL: Repository, https://github.com/docling-project/docling
|
63
63
|
Description-Content-Type: text/markdown
|
64
64
|
|
@@ -5,6 +5,8 @@ Adapted from https://github.com/xiilei/dwml/blob/master/dwml/omml.py
|
|
5
5
|
On 23/01/2025
|
6
6
|
"""
|
7
7
|
|
8
|
+
import logging
|
9
|
+
|
8
10
|
import lxml.etree as ET
|
9
11
|
from pylatexenc.latexencode import UnicodeToLatexEncoder
|
10
12
|
|
@@ -39,6 +41,8 @@ from docling.backend.docx.latex.latex_dict import (
|
|
39
41
|
|
40
42
|
OMML_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/math}"
|
41
43
|
|
44
|
+
_log = logging.getLogger(__name__)
|
45
|
+
|
42
46
|
|
43
47
|
def load(stream):
|
44
48
|
tree = ET.parse(stream)
|
@@ -281,8 +285,10 @@ class oMath2Latex(Tag2Method):
|
|
281
285
|
if FUNC.get(t):
|
282
286
|
latex_chars.append(FUNC[t])
|
283
287
|
else:
|
284
|
-
|
285
|
-
|
288
|
+
_log.warning("Function not supported, will default to text: %s", t)
|
289
|
+
if isinstance(t, str):
|
290
|
+
latex_chars.append(t)
|
291
|
+
elif isinstance(t, str):
|
286
292
|
latex_chars.append(t)
|
287
293
|
t = BLANK.join(latex_chars)
|
288
294
|
return t if FUNC_PLACE in t else t + FUNC_PLACE # do_func will replace this
|
@@ -382,8 +388,6 @@ class oMath2Latex(Tag2Method):
|
|
382
388
|
|
383
389
|
out_latex_str = self.u.unicode_to_latex(s)
|
384
390
|
|
385
|
-
# print(s, out_latex_str)
|
386
|
-
|
387
391
|
if (
|
388
392
|
s.startswith("{") is False
|
389
393
|
and out_latex_str.startswith("{")
|
@@ -392,19 +396,13 @@ class oMath2Latex(Tag2Method):
|
|
392
396
|
):
|
393
397
|
out_latex_str = f" {out_latex_str[1:-1]} "
|
394
398
|
|
395
|
-
# print(s, out_latex_str)
|
396
|
-
|
397
399
|
if "ensuremath" in out_latex_str:
|
398
400
|
out_latex_str = out_latex_str.replace("\\ensuremath{", " ")
|
399
401
|
out_latex_str = out_latex_str.replace("}", " ")
|
400
402
|
|
401
|
-
# print(s, out_latex_str)
|
402
|
-
|
403
403
|
if out_latex_str.strip().startswith("\\text"):
|
404
404
|
out_latex_str = f" \\text{{{out_latex_str}}} "
|
405
405
|
|
406
|
-
# print(s, out_latex_str)
|
407
|
-
|
408
406
|
return out_latex_str
|
409
407
|
|
410
408
|
def do_r(self, elm):
|
@@ -415,10 +413,12 @@ class oMath2Latex(Tag2Method):
|
|
415
413
|
"""
|
416
414
|
_str = []
|
417
415
|
_base_str = []
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
416
|
+
found_text = elm.findtext("./{0}t".format(OMML_NS))
|
417
|
+
if found_text:
|
418
|
+
for s in found_text:
|
419
|
+
out_latex_str = self.process_unicode(s)
|
420
|
+
_str.append(out_latex_str)
|
421
|
+
_base_str.append(s)
|
422
422
|
|
423
423
|
proc_str = escape_latex(BLANK.join(_str))
|
424
424
|
base_proc_str = BLANK.join(_base_str)
|
@@ -34,6 +34,7 @@ TAGS_FOR_NODE_ITEMS: Final = [
|
|
34
34
|
"h6",
|
35
35
|
"p",
|
36
36
|
"pre",
|
37
|
+
"code",
|
37
38
|
"ul",
|
38
39
|
"ol",
|
39
40
|
"li",
|
@@ -165,7 +166,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
165
166
|
self.handle_header(tag, doc)
|
166
167
|
elif tag.name in ["p"]:
|
167
168
|
self.handle_paragraph(tag, doc)
|
168
|
-
elif tag.name in ["pre"]:
|
169
|
+
elif tag.name in ["pre", "code"]:
|
169
170
|
self.handle_code(tag, doc)
|
170
171
|
elif tag.name in ["ul", "ol"]:
|
171
172
|
self.handle_list(tag, doc)
|
@@ -0,0 +1,525 @@
|
|
1
|
+
import logging
|
2
|
+
from io import BytesIO
|
3
|
+
from pathlib import Path
|
4
|
+
from typing import Any, Union, cast
|
5
|
+
|
6
|
+
from docling_core.types.doc import (
|
7
|
+
BoundingBox,
|
8
|
+
CoordOrigin,
|
9
|
+
DocItem,
|
10
|
+
DoclingDocument,
|
11
|
+
DocumentOrigin,
|
12
|
+
GroupLabel,
|
13
|
+
ImageRef,
|
14
|
+
ProvenanceItem,
|
15
|
+
Size,
|
16
|
+
TableCell,
|
17
|
+
TableData,
|
18
|
+
)
|
19
|
+
from openpyxl import load_workbook
|
20
|
+
from openpyxl.drawing.image import Image
|
21
|
+
from openpyxl.drawing.spreadsheet_drawing import TwoCellAnchor
|
22
|
+
from openpyxl.worksheet.worksheet import Worksheet
|
23
|
+
from PIL import Image as PILImage
|
24
|
+
from pydantic import BaseModel, NonNegativeInt, PositiveInt
|
25
|
+
from typing_extensions import override
|
26
|
+
|
27
|
+
from docling.backend.abstract_backend import (
|
28
|
+
DeclarativeDocumentBackend,
|
29
|
+
PaginatedDocumentBackend,
|
30
|
+
)
|
31
|
+
from docling.datamodel.base_models import InputFormat
|
32
|
+
from docling.datamodel.document import InputDocument
|
33
|
+
|
34
|
+
_log = logging.getLogger(__name__)
|
35
|
+
|
36
|
+
|
37
|
+
class ExcelCell(BaseModel):
|
38
|
+
"""Represents an Excel cell.
|
39
|
+
|
40
|
+
Attributes:
|
41
|
+
row: The row number of the cell.
|
42
|
+
col: The column number of the cell.
|
43
|
+
text: The text content of the cell.
|
44
|
+
row_span: The number of rows the cell spans.
|
45
|
+
col_span: The number of columns the cell spans.
|
46
|
+
"""
|
47
|
+
|
48
|
+
row: int
|
49
|
+
col: int
|
50
|
+
text: str
|
51
|
+
row_span: int
|
52
|
+
col_span: int
|
53
|
+
|
54
|
+
|
55
|
+
class ExcelTable(BaseModel):
|
56
|
+
"""Represents an Excel table on a worksheet.
|
57
|
+
|
58
|
+
Attributes:
|
59
|
+
anchor: The column and row indices of the upper-left cell of the table
|
60
|
+
(0-based index).
|
61
|
+
num_rows: The number of rows in the table.
|
62
|
+
num_cols: The number of columns in the table.
|
63
|
+
data: The data in the table, represented as a list of ExcelCell objects.
|
64
|
+
"""
|
65
|
+
|
66
|
+
anchor: tuple[NonNegativeInt, NonNegativeInt]
|
67
|
+
num_rows: int
|
68
|
+
num_cols: int
|
69
|
+
data: list[ExcelCell]
|
70
|
+
|
71
|
+
|
72
|
+
class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
|
73
|
+
"""Backend for parsing Excel workbooks.
|
74
|
+
|
75
|
+
The backend converts an Excel workbook into a DoclingDocument object.
|
76
|
+
Each worksheet is converted into a separate page.
|
77
|
+
The following elements are parsed:
|
78
|
+
- Cell contents, parsed as tables. If two groups of cells are disconnected
|
79
|
+
between each other, they will be parsed as two different tables.
|
80
|
+
- Images, parsed as PictureItem objects.
|
81
|
+
|
82
|
+
The DoclingDocument tables and pictures have their provenance information, including
|
83
|
+
the position in their original Excel worksheet. The position is represented by a
|
84
|
+
bounding box object with the cell indices as units (0-based index). The size of this
|
85
|
+
bounding box is the number of columns and rows that the table or picture spans.
|
86
|
+
"""
|
87
|
+
|
88
|
+
@override
|
89
|
+
def __init__(
|
90
|
+
self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
|
91
|
+
) -> None:
|
92
|
+
"""Initialize the MsExcelDocumentBackend object.
|
93
|
+
|
94
|
+
Parameters:
|
95
|
+
in_doc: The input document object.
|
96
|
+
path_or_stream: The path or stream to the Excel file.
|
97
|
+
|
98
|
+
Raises:
|
99
|
+
RuntimeError: An error occurred parsing the file.
|
100
|
+
"""
|
101
|
+
super().__init__(in_doc, path_or_stream)
|
102
|
+
|
103
|
+
# Initialise the parents for the hierarchy
|
104
|
+
self.max_levels = 10
|
105
|
+
|
106
|
+
self.parents: dict[int, Any] = {}
|
107
|
+
for i in range(-1, self.max_levels):
|
108
|
+
self.parents[i] = None
|
109
|
+
|
110
|
+
self.workbook = None
|
111
|
+
try:
|
112
|
+
if isinstance(self.path_or_stream, BytesIO):
|
113
|
+
self.workbook = load_workbook(filename=self.path_or_stream)
|
114
|
+
|
115
|
+
elif isinstance(self.path_or_stream, Path):
|
116
|
+
self.workbook = load_workbook(filename=str(self.path_or_stream))
|
117
|
+
|
118
|
+
self.valid = self.workbook is not None
|
119
|
+
except Exception as e:
|
120
|
+
self.valid = False
|
121
|
+
|
122
|
+
raise RuntimeError(
|
123
|
+
f"MsExcelDocumentBackend could not load document with hash {self.document_hash}"
|
124
|
+
) from e
|
125
|
+
|
126
|
+
@override
|
127
|
+
def is_valid(self) -> bool:
|
128
|
+
_log.debug(f"valid: {self.valid}")
|
129
|
+
return self.valid
|
130
|
+
|
131
|
+
@classmethod
|
132
|
+
@override
|
133
|
+
def supports_pagination(cls) -> bool:
|
134
|
+
return True
|
135
|
+
|
136
|
+
@override
|
137
|
+
def page_count(self) -> int:
|
138
|
+
if self.is_valid() and self.workbook:
|
139
|
+
return len(self.workbook.sheetnames)
|
140
|
+
else:
|
141
|
+
return 0
|
142
|
+
|
143
|
+
@classmethod
|
144
|
+
@override
|
145
|
+
def supported_formats(cls) -> set[InputFormat]:
|
146
|
+
return {InputFormat.XLSX}
|
147
|
+
|
148
|
+
@override
|
149
|
+
def convert(self) -> DoclingDocument:
|
150
|
+
"""Parse the Excel workbook into a DoclingDocument object.
|
151
|
+
|
152
|
+
Raises:
|
153
|
+
RuntimeError: Unable to run the conversion since the backend object failed to
|
154
|
+
initialize.
|
155
|
+
|
156
|
+
Returns:
|
157
|
+
The DoclingDocument object representing the Excel workbook.
|
158
|
+
"""
|
159
|
+
origin = DocumentOrigin(
|
160
|
+
filename=self.file.name or "file.xlsx",
|
161
|
+
mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
162
|
+
binary_hash=self.document_hash,
|
163
|
+
)
|
164
|
+
|
165
|
+
doc = DoclingDocument(name=self.file.stem or "file.xlsx", origin=origin)
|
166
|
+
|
167
|
+
if self.is_valid():
|
168
|
+
doc = self._convert_workbook(doc)
|
169
|
+
else:
|
170
|
+
raise RuntimeError(
|
171
|
+
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
|
172
|
+
)
|
173
|
+
|
174
|
+
return doc
|
175
|
+
|
176
|
+
def _convert_workbook(self, doc: DoclingDocument) -> DoclingDocument:
|
177
|
+
"""Parse the Excel workbook and attach its structure to a DoclingDocument.
|
178
|
+
|
179
|
+
Args:
|
180
|
+
doc: A DoclingDocument object.
|
181
|
+
|
182
|
+
Returns:
|
183
|
+
A DoclingDocument object with the parsed items.
|
184
|
+
"""
|
185
|
+
|
186
|
+
if self.workbook is not None:
|
187
|
+
|
188
|
+
# Iterate over all sheets
|
189
|
+
for sheet_name in self.workbook.sheetnames:
|
190
|
+
_log.info(f"Processing sheet: {sheet_name}")
|
191
|
+
|
192
|
+
sheet = self.workbook[sheet_name]
|
193
|
+
page_no = self.workbook.index(sheet) + 1
|
194
|
+
# do not rely on sheet.max_column, sheet.max_row if there are images
|
195
|
+
page = doc.add_page(page_no=page_no, size=Size(width=0, height=0))
|
196
|
+
|
197
|
+
self.parents[0] = doc.add_group(
|
198
|
+
parent=None,
|
199
|
+
label=GroupLabel.SECTION,
|
200
|
+
name=f"sheet: {sheet_name}",
|
201
|
+
)
|
202
|
+
doc = self._convert_sheet(doc, sheet)
|
203
|
+
width, height = self._find_page_size(doc, page_no)
|
204
|
+
page.size = Size(width=width, height=height)
|
205
|
+
else:
|
206
|
+
_log.error("Workbook is not initialized.")
|
207
|
+
|
208
|
+
return doc
|
209
|
+
|
210
|
+
def _convert_sheet(self, doc: DoclingDocument, sheet: Worksheet) -> DoclingDocument:
|
211
|
+
"""Parse an Excel worksheet and attach its structure to a DoclingDocument
|
212
|
+
|
213
|
+
Args:
|
214
|
+
doc: The DoclingDocument to be updated.
|
215
|
+
sheet: The Excel worksheet to be parsed.
|
216
|
+
|
217
|
+
Returns:
|
218
|
+
The updated DoclingDocument.
|
219
|
+
"""
|
220
|
+
|
221
|
+
doc = self._find_tables_in_sheet(doc, sheet)
|
222
|
+
|
223
|
+
doc = self._find_images_in_sheet(doc, sheet)
|
224
|
+
|
225
|
+
return doc
|
226
|
+
|
227
|
+
def _find_tables_in_sheet(
|
228
|
+
self, doc: DoclingDocument, sheet: Worksheet
|
229
|
+
) -> DoclingDocument:
|
230
|
+
"""Find all tables in an Excel sheet and attach them to a DoclingDocument.
|
231
|
+
|
232
|
+
Args:
|
233
|
+
doc: The DoclingDocument to be updated.
|
234
|
+
sheet: The Excel worksheet to be parsed.
|
235
|
+
|
236
|
+
Returns:
|
237
|
+
The updated DoclingDocument.
|
238
|
+
"""
|
239
|
+
|
240
|
+
if self.workbook is not None:
|
241
|
+
tables = self._find_data_tables(sheet)
|
242
|
+
|
243
|
+
for excel_table in tables:
|
244
|
+
origin_col = excel_table.anchor[0]
|
245
|
+
origin_row = excel_table.anchor[1]
|
246
|
+
num_rows = excel_table.num_rows
|
247
|
+
num_cols = excel_table.num_cols
|
248
|
+
|
249
|
+
table_data = TableData(
|
250
|
+
num_rows=num_rows,
|
251
|
+
num_cols=num_cols,
|
252
|
+
table_cells=[],
|
253
|
+
)
|
254
|
+
|
255
|
+
for excel_cell in excel_table.data:
|
256
|
+
|
257
|
+
cell = TableCell(
|
258
|
+
text=excel_cell.text,
|
259
|
+
row_span=excel_cell.row_span,
|
260
|
+
col_span=excel_cell.col_span,
|
261
|
+
start_row_offset_idx=excel_cell.row,
|
262
|
+
end_row_offset_idx=excel_cell.row + excel_cell.row_span,
|
263
|
+
start_col_offset_idx=excel_cell.col,
|
264
|
+
end_col_offset_idx=excel_cell.col + excel_cell.col_span,
|
265
|
+
column_header=excel_cell.row == 0,
|
266
|
+
row_header=False,
|
267
|
+
)
|
268
|
+
table_data.table_cells.append(cell)
|
269
|
+
|
270
|
+
page_no = self.workbook.index(sheet) + 1
|
271
|
+
doc.add_table(
|
272
|
+
data=table_data,
|
273
|
+
parent=self.parents[0],
|
274
|
+
prov=ProvenanceItem(
|
275
|
+
page_no=page_no,
|
276
|
+
charspan=(0, 0),
|
277
|
+
bbox=BoundingBox.from_tuple(
|
278
|
+
(
|
279
|
+
origin_col,
|
280
|
+
origin_row,
|
281
|
+
origin_col + num_cols,
|
282
|
+
origin_row + num_rows,
|
283
|
+
),
|
284
|
+
origin=CoordOrigin.TOPLEFT,
|
285
|
+
),
|
286
|
+
),
|
287
|
+
)
|
288
|
+
|
289
|
+
return doc
|
290
|
+
|
291
|
+
def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
|
292
|
+
"""Find all compact rectangular data tables in an Excel worksheet.
|
293
|
+
|
294
|
+
Args:
|
295
|
+
sheet: The Excel worksheet to be parsed.
|
296
|
+
|
297
|
+
Returns:
|
298
|
+
A list of ExcelTable objects representing the data tables.
|
299
|
+
"""
|
300
|
+
tables: list[ExcelTable] = [] # List to store found tables
|
301
|
+
visited: set[tuple[int, int]] = set() # Track already visited cells
|
302
|
+
|
303
|
+
# Iterate over all cells in the sheet
|
304
|
+
for ri, row in enumerate(sheet.iter_rows(values_only=False)):
|
305
|
+
for rj, cell in enumerate(row):
|
306
|
+
|
307
|
+
# Skip empty or already visited cells
|
308
|
+
if cell.value is None or (ri, rj) in visited:
|
309
|
+
continue
|
310
|
+
|
311
|
+
# If the cell starts a new table, find its bounds
|
312
|
+
table_bounds, visited_cells = self._find_table_bounds(sheet, ri, rj)
|
313
|
+
|
314
|
+
visited.update(visited_cells) # Mark these cells as visited
|
315
|
+
tables.append(table_bounds)
|
316
|
+
|
317
|
+
return tables
|
318
|
+
|
319
|
+
def _find_table_bounds(
|
320
|
+
self,
|
321
|
+
sheet: Worksheet,
|
322
|
+
start_row: int,
|
323
|
+
start_col: int,
|
324
|
+
) -> tuple[ExcelTable, set[tuple[int, int]]]:
|
325
|
+
"""Determine the bounds of a compact rectangular table.
|
326
|
+
|
327
|
+
Args:
|
328
|
+
sheet: The Excel worksheet to be parsed.
|
329
|
+
start_row: The row number of the starting cell.
|
330
|
+
start_col: The column number of the starting cell.
|
331
|
+
|
332
|
+
Returns:
|
333
|
+
A tuple with an Excel table and a set of cell coordinates.
|
334
|
+
"""
|
335
|
+
_log.debug("find_table_bounds")
|
336
|
+
|
337
|
+
max_row = self._find_table_bottom(sheet, start_row, start_col)
|
338
|
+
max_col = self._find_table_right(sheet, start_row, start_col)
|
339
|
+
|
340
|
+
# Collect the data within the bounds
|
341
|
+
data = []
|
342
|
+
visited_cells: set[tuple[int, int]] = set()
|
343
|
+
for ri in range(start_row, max_row + 1):
|
344
|
+
for rj in range(start_col, max_col + 1):
|
345
|
+
|
346
|
+
cell = sheet.cell(row=ri + 1, column=rj + 1) # 1-based indexing
|
347
|
+
|
348
|
+
# Check if the cell belongs to a merged range
|
349
|
+
row_span = 1
|
350
|
+
col_span = 1
|
351
|
+
|
352
|
+
for merged_range in sheet.merged_cells.ranges:
|
353
|
+
|
354
|
+
if (
|
355
|
+
merged_range.min_row <= ri + 1
|
356
|
+
and ri + 1 <= merged_range.max_row
|
357
|
+
and merged_range.min_col <= rj + 1
|
358
|
+
and rj + 1 <= merged_range.max_col
|
359
|
+
):
|
360
|
+
|
361
|
+
row_span = merged_range.max_row - merged_range.min_row + 1
|
362
|
+
col_span = merged_range.max_col - merged_range.min_col + 1
|
363
|
+
break
|
364
|
+
|
365
|
+
if (ri, rj) not in visited_cells:
|
366
|
+
data.append(
|
367
|
+
ExcelCell(
|
368
|
+
row=ri - start_row,
|
369
|
+
col=rj - start_col,
|
370
|
+
text=str(cell.value),
|
371
|
+
row_span=row_span,
|
372
|
+
col_span=col_span,
|
373
|
+
)
|
374
|
+
)
|
375
|
+
|
376
|
+
# Mark all cells in the span as visited
|
377
|
+
for span_row in range(ri, ri + row_span):
|
378
|
+
for span_col in range(rj, rj + col_span):
|
379
|
+
visited_cells.add((span_row, span_col))
|
380
|
+
|
381
|
+
return (
|
382
|
+
ExcelTable(
|
383
|
+
anchor=(start_col, start_row),
|
384
|
+
num_rows=max_row + 1 - start_row,
|
385
|
+
num_cols=max_col + 1 - start_col,
|
386
|
+
data=data,
|
387
|
+
),
|
388
|
+
visited_cells,
|
389
|
+
)
|
390
|
+
|
391
|
+
def _find_table_bottom(
|
392
|
+
self, sheet: Worksheet, start_row: int, start_col: int
|
393
|
+
) -> int:
|
394
|
+
"""Find the bottom boundary of a table.
|
395
|
+
|
396
|
+
Args:
|
397
|
+
sheet: The Excel worksheet to be parsed.
|
398
|
+
start_row: The starting row of the table.
|
399
|
+
start_col: The starting column of the table.
|
400
|
+
|
401
|
+
Returns:
|
402
|
+
The row index representing the bottom boundary of the table.
|
403
|
+
"""
|
404
|
+
max_row: int = start_row
|
405
|
+
|
406
|
+
while max_row < sheet.max_row - 1:
|
407
|
+
# Get the cell value or check if it is part of a merged cell
|
408
|
+
cell = sheet.cell(row=max_row + 2, column=start_col + 1)
|
409
|
+
|
410
|
+
# Check if the cell is part of a merged range
|
411
|
+
merged_range = next(
|
412
|
+
(mr for mr in sheet.merged_cells.ranges if cell.coordinate in mr),
|
413
|
+
None,
|
414
|
+
)
|
415
|
+
|
416
|
+
if cell.value is None and not merged_range:
|
417
|
+
break # Stop if the cell is empty and not merged
|
418
|
+
|
419
|
+
# Expand max_row to include the merged range if applicable
|
420
|
+
if merged_range:
|
421
|
+
max_row = max(max_row, merged_range.max_row - 1)
|
422
|
+
else:
|
423
|
+
max_row += 1
|
424
|
+
|
425
|
+
return max_row
|
426
|
+
|
427
|
+
def _find_table_right(
|
428
|
+
self, sheet: Worksheet, start_row: int, start_col: int
|
429
|
+
) -> int:
|
430
|
+
"""Find the right boundary of a table.
|
431
|
+
|
432
|
+
Args:
|
433
|
+
sheet: The Excel worksheet to be parsed.
|
434
|
+
start_row: The starting row of the table.
|
435
|
+
start_col: The starting column of the table.
|
436
|
+
|
437
|
+
Returns:
|
438
|
+
The column index representing the right boundary of the table."
|
439
|
+
"""
|
440
|
+
max_col: int = start_col
|
441
|
+
|
442
|
+
while max_col < sheet.max_column - 1:
|
443
|
+
# Get the cell value or check if it is part of a merged cell
|
444
|
+
cell = sheet.cell(row=start_row + 1, column=max_col + 2)
|
445
|
+
|
446
|
+
# Check if the cell is part of a merged range
|
447
|
+
merged_range = next(
|
448
|
+
(mr for mr in sheet.merged_cells.ranges if cell.coordinate in mr),
|
449
|
+
None,
|
450
|
+
)
|
451
|
+
|
452
|
+
if cell.value is None and not merged_range:
|
453
|
+
break # Stop if the cell is empty and not merged
|
454
|
+
|
455
|
+
# Expand max_col to include the merged range if applicable
|
456
|
+
if merged_range:
|
457
|
+
max_col = max(max_col, merged_range.max_col - 1)
|
458
|
+
else:
|
459
|
+
max_col += 1
|
460
|
+
|
461
|
+
return max_col
|
462
|
+
|
463
|
+
def _find_images_in_sheet(
|
464
|
+
self, doc: DoclingDocument, sheet: Worksheet
|
465
|
+
) -> DoclingDocument:
|
466
|
+
"""Find images in the Excel sheet and attach them to the DoclingDocument.
|
467
|
+
|
468
|
+
Args:
|
469
|
+
doc: The DoclingDocument to be updated.
|
470
|
+
sheet: The Excel worksheet to be parsed.
|
471
|
+
|
472
|
+
Returns:
|
473
|
+
The updated DoclingDocument.
|
474
|
+
"""
|
475
|
+
if self.workbook is not None:
|
476
|
+
# Iterate over byte images in the sheet
|
477
|
+
for item in sheet._images: # type: ignore[attr-defined]
|
478
|
+
try:
|
479
|
+
image: Image = cast(Image, item)
|
480
|
+
pil_image = PILImage.open(image.ref) # type: ignore[arg-type]
|
481
|
+
page_no = self.workbook.index(sheet) + 1
|
482
|
+
anchor = (0, 0, 0, 0)
|
483
|
+
if isinstance(image.anchor, TwoCellAnchor):
|
484
|
+
anchor = (
|
485
|
+
image.anchor._from.col,
|
486
|
+
image.anchor._from.row,
|
487
|
+
image.anchor.to.col + 1,
|
488
|
+
image.anchor.to.row + 1,
|
489
|
+
)
|
490
|
+
doc.add_picture(
|
491
|
+
parent=self.parents[0],
|
492
|
+
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
493
|
+
caption=None,
|
494
|
+
prov=ProvenanceItem(
|
495
|
+
page_no=page_no,
|
496
|
+
charspan=(0, 0),
|
497
|
+
bbox=BoundingBox.from_tuple(
|
498
|
+
anchor, origin=CoordOrigin.TOPLEFT
|
499
|
+
),
|
500
|
+
),
|
501
|
+
)
|
502
|
+
except:
|
503
|
+
_log.error("could not extract the image from excel sheets")
|
504
|
+
|
505
|
+
return doc
|
506
|
+
|
507
|
+
@staticmethod
|
508
|
+
def _find_page_size(
|
509
|
+
doc: DoclingDocument, page_no: PositiveInt
|
510
|
+
) -> tuple[float, float]:
|
511
|
+
left: float = -1.0
|
512
|
+
top: float = -1.0
|
513
|
+
right: float = -1.0
|
514
|
+
bottom: float = -1.0
|
515
|
+
for item, _ in doc.iterate_items(traverse_pictures=True, page_no=page_no):
|
516
|
+
if not isinstance(item, DocItem):
|
517
|
+
continue
|
518
|
+
for provenance in item.prov:
|
519
|
+
bbox = provenance.bbox
|
520
|
+
left = min(left, bbox.l) if left != -1 else bbox.l
|
521
|
+
right = max(right, bbox.r) if right != -1 else bbox.r
|
522
|
+
top = min(top, bbox.t) if top != -1 else bbox.t
|
523
|
+
bottom = max(bottom, bbox.b) if bottom != -1 else bbox.b
|
524
|
+
|
525
|
+
return (right - left, bottom - top)
|
@@ -392,9 +392,10 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
392
392
|
self.handle_tables(shape, parent_slide, slide_ind, doc, slide_size)
|
393
393
|
if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
|
394
394
|
# Handle Pictures
|
395
|
-
|
396
|
-
|
397
|
-
|
395
|
+
if hasattr(shape, "image"):
|
396
|
+
self.handle_pictures(
|
397
|
+
shape, parent_slide, slide_ind, doc, slide_size
|
398
|
+
)
|
398
399
|
# If shape doesn't have any text, move on to the next shape
|
399
400
|
if not hasattr(shape, "text"):
|
400
401
|
return
|