docling 1.19.1__py3-none-any.whl → 2.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/abstract_backend.py +33 -37
- docling/backend/asciidoc_backend.py +431 -0
- docling/backend/docling_parse_backend.py +20 -16
- docling/backend/docling_parse_v2_backend.py +248 -0
- docling/backend/html_backend.py +429 -0
- docling/backend/md_backend.py +346 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +496 -0
- docling/backend/pdf_backend.py +78 -0
- docling/backend/pypdfium2_backend.py +16 -11
- docling/cli/main.py +96 -65
- docling/datamodel/base_models.py +79 -193
- docling/datamodel/document.py +405 -320
- docling/datamodel/pipeline_options.py +19 -3
- docling/datamodel/settings.py +16 -1
- docling/document_converter.py +240 -251
- docling/models/base_model.py +28 -0
- docling/models/base_ocr_model.py +40 -10
- docling/models/ds_glm_model.py +244 -30
- docling/models/easyocr_model.py +57 -42
- docling/models/layout_model.py +158 -116
- docling/models/page_assemble_model.py +127 -101
- docling/models/page_preprocessing_model.py +79 -0
- docling/models/table_structure_model.py +162 -116
- docling/models/tesseract_ocr_cli_model.py +76 -59
- docling/models/tesseract_ocr_model.py +90 -58
- docling/pipeline/base_pipeline.py +189 -0
- docling/pipeline/simple_pipeline.py +56 -0
- docling/pipeline/standard_pdf_pipeline.py +201 -0
- docling/utils/export.py +4 -3
- docling/utils/layout_utils.py +17 -11
- docling/utils/profiling.py +62 -0
- docling-2.4.1.dist-info/METADATA +154 -0
- docling-2.4.1.dist-info/RECORD +45 -0
- docling/pipeline/base_model_pipeline.py +0 -18
- docling/pipeline/standard_model_pipeline.py +0 -66
- docling-1.19.1.dist-info/METADATA +0 -380
- docling-1.19.1.dist-info/RECORD +0 -34
- {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/LICENSE +0 -0
- {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/WHEEL +0 -0
- {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,248 @@
|
|
1
|
+
import logging
|
2
|
+
import random
|
3
|
+
from io import BytesIO
|
4
|
+
from pathlib import Path
|
5
|
+
from typing import TYPE_CHECKING, Iterable, List, Optional, Union
|
6
|
+
|
7
|
+
import pypdfium2 as pdfium
|
8
|
+
from docling_core.types.doc import BoundingBox, CoordOrigin
|
9
|
+
from docling_parse.docling_parse import pdf_parser_v2
|
10
|
+
from PIL import Image, ImageDraw
|
11
|
+
from pypdfium2 import PdfPage
|
12
|
+
|
13
|
+
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
14
|
+
from docling.datamodel.base_models import Cell, Size
|
15
|
+
|
16
|
+
if TYPE_CHECKING:
|
17
|
+
from docling.datamodel.document import InputDocument
|
18
|
+
|
19
|
+
_log = logging.getLogger(__name__)
|
20
|
+
|
21
|
+
|
22
|
+
class DoclingParseV2PageBackend(PdfPageBackend):
    """Page backend on top of the docling-parse v2 parser.

    Combines a pypdfium2 page object (used for rendering and page size) with
    the parsed cell/image data returned by ``pdf_parser_v2`` for the same page.
    """

    def __init__(
        self, parser: pdf_parser_v2, document_hash: str, page_no: int, page_obj: PdfPage
    ):
        self._ppage = page_obj
        parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)

        # A successful parse yields exactly one entry under "pages".
        self.valid = "pages" in parsed_page and len(parsed_page["pages"]) == 1
        if self.valid:
            self._dpage = parsed_page["pages"][0]
        else:
            # FIX: always define _dpage so accessors on an invalid page fail
            # gracefully instead of raising AttributeError.
            self._dpage = None
            _log.info(
                f"An error occurred when loading page {page_no} of document {document_hash}."
            )

    def is_valid(self) -> bool:
        """Return True when the parser produced usable data for this page."""
        return self.valid

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Return the concatenated text of all cells overlapping *bbox*.

        A cell contributes its text when more than half of its own area
        intersects *bbox*. Returns "" for invalid pages.
        """
        if not self.valid:
            return ""
        # Find intersecting cells on the page
        text_piece = ""
        page_size = self.get_size()

        parser_width = self._dpage["sanitized"]["dimension"]["width"]
        parser_height = self._dpage["sanitized"]["dimension"]["height"]

        scale = (
            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
        )

        cells_data = self._dpage["sanitized"]["cells"]["data"]
        cells_header = self._dpage["sanitized"]["cells"]["header"]

        # FIX: resolve column positions once instead of calling
        # cells_header.index(...) for every cell (O(cells*header) -> O(header)).
        ix0 = cells_header.index("x0")
        iy0 = cells_header.index("y0")
        ix1 = cells_header.index("x1")
        iy1 = cells_header.index("y1")
        itext = cells_header.index("text")

        for cell_data in cells_data:
            x0 = cell_data[ix0]
            y0 = cell_data[iy0]
            x1 = cell_data[ix1]
            y1 = cell_data[iy1]

            # Parser coordinates are bottom-left based and use the parser's
            # own page dimensions; rescale to the pdfium page size.
            cell_bbox = BoundingBox(
                l=x0 * scale * page_size.width / parser_width,
                b=y0 * scale * page_size.height / parser_height,
                r=x1 * scale * page_size.width / parser_width,
                t=y1 * scale * page_size.height / parser_height,
                coord_origin=CoordOrigin.BOTTOMLEFT,
            ).to_top_left_origin(page_height=page_size.height * scale)

            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()

            if overlap_frac > 0.5:
                if len(text_piece) > 0:
                    text_piece += " "
                text_piece += cell_data[itext]

        return text_piece

    def get_text_cells(self) -> Iterable[Cell]:
        """Return all sanitized text cells, rescaled to the pdfium page size
        and converted to a top-left coordinate origin.
        """
        cells: List[Cell] = []
        cell_counter = 0

        if not self.valid:
            return cells

        page_size = self.get_size()

        parser_width = self._dpage["sanitized"]["dimension"]["width"]
        parser_height = self._dpage["sanitized"]["dimension"]["height"]

        cells_data = self._dpage["sanitized"]["cells"]["data"]
        cells_header = self._dpage["sanitized"]["cells"]["header"]

        # FIX: hoist the header lookups out of the per-cell loop.
        ix0 = cells_header.index("x0")
        iy0 = cells_header.index("y0")
        ix1 = cells_header.index("x1")
        iy1 = cells_header.index("y1")
        itext = cells_header.index("text")

        for cell_data in cells_data:
            x0 = cell_data[ix0]
            y0 = cell_data[iy0]
            x1 = cell_data[ix1]
            y1 = cell_data[iy1]

            # Normalise inverted rectangles (the parser may emit reversed corners).
            if x1 < x0:
                x0, x1 = x1, x0
            if y1 < y0:
                y0, y1 = y1, y0

            text_piece = cell_data[itext]
            cells.append(
                Cell(
                    id=cell_counter,
                    text=text_piece,
                    bbox=BoundingBox(
                        # l=x0, b=y0, r=x1, t=y1,
                        l=x0 * page_size.width / parser_width,
                        b=y0 * page_size.height / parser_height,
                        r=x1 * page_size.width / parser_width,
                        t=y1 * page_size.height / parser_height,
                        coord_origin=CoordOrigin.BOTTOMLEFT,
                    ).to_top_left_origin(page_size.height),
                )
            )
            cell_counter += 1

        def draw_clusters_and_cells():
            # Debug helper: render the page and outline every extracted cell.
            image = (
                self.get_page_image()
            )  # make new image to avoid drawing on the saved ones
            draw = ImageDraw.Draw(image)
            for c in cells:
                x0, y0, x1, y1 = c.bbox.as_tuple()
                cell_color = (
                    random.randint(30, 140),
                    random.randint(30, 140),
                    random.randint(30, 140),
                )
                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
            image.show()

        # draw_clusters_and_cells()

        return cells

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        """Yield bounding boxes of embedded bitmaps larger than a minimum area,
        converted to top-left origin and multiplied by *scale*.
        """
        AREA_THRESHOLD = 32 * 32

        # FIX: an invalid page has no parsed data; yield nothing instead of
        # crashing on self._dpage (the text accessors already guard like this).
        if not self.valid:
            return

        images = self._dpage["sanitized"]["images"]["data"]
        images_header = self._dpage["sanitized"]["images"]["header"]

        # FIX: hoist the header lookups out of the per-image loop.
        ix0 = images_header.index("x0")
        iy0 = images_header.index("y0")
        ix1 = images_header.index("x1")
        iy1 = images_header.index("y1")

        for row in images:
            x0 = row[ix0]
            y0 = row[iy0]
            x1 = row[ix1]
            y1 = row[iy1]

            cropbox = BoundingBox.from_tuple(
                (x0, y0, x1, y1), origin=CoordOrigin.BOTTOMLEFT
            ).to_top_left_origin(self.get_size().height)

            # Skip tiny bitmaps (decorations, artifacts).
            if cropbox.area() > AREA_THRESHOLD:
                cropbox = cropbox.scaled(scale=scale)

                yield cropbox

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Render the page (or the *cropbox* region) with pypdfium2 at *scale*."""

        page_size = self.get_size()

        if not cropbox:
            cropbox = BoundingBox(
                l=0,
                r=page_size.width,
                t=0,
                b=page_size.height,
                coord_origin=CoordOrigin.TOPLEFT,
            )
            padbox = BoundingBox(
                l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
            )
        else:
            # pdfium expresses the crop as margins measured from each page edge.
            # NOTE(review): if to_bottom_left_origin returns the same object,
            # the in-place edits below mutate the caller's cropbox — confirm
            # against docling-core.
            padbox = cropbox.to_bottom_left_origin(page_size.height)
            padbox.r = page_size.width - padbox.r
            padbox.t = page_size.height - padbox.t

        image = (
            self._ppage.render(
                scale=scale * 1.5,
                rotation=0,  # no additional rotation
                crop=padbox.as_tuple(),
            )
            .to_pil()
            .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
        )  # We resize the image from 1.5x the given scale to make it sharper.

        return image

    def get_size(self) -> Size:
        """Return the page dimensions as reported by pypdfium2."""
        return Size(width=self._ppage.get_width(), height=self._ppage.get_height())

    def unload(self):
        """Release references to the pdfium page and the parsed page data."""
        self._ppage = None
        self._dpage = None
|
203
|
+
|
204
|
+
|
205
|
+
class DoclingParseV2DocumentBackend(PdfDocumentBackend):
    """Document backend pairing pypdfium2 with the docling-parse v2 parser.

    The document is opened twice: through pypdfium2 (rendering, page access)
    and through ``pdf_parser_v2`` (text/image extraction), both keyed by the
    document hash.
    """

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        super().__init__(in_doc, path_or_stream)

        self._pdoc = pdfium.PdfDocument(self.path_or_stream)
        self.parser = pdf_parser_v2("fatal")

        loaded = False
        if isinstance(path_or_stream, BytesIO):
            loaded = self.parser.load_document_from_bytesio(
                self.document_hash, path_or_stream
            )
        elif isinstance(path_or_stream, Path):
            loaded = self.parser.load_document(self.document_hash, str(path_or_stream))

        if not loaded:
            raise RuntimeError(
                f"docling-parse v2 could not load document {self.document_hash}."
            )

    def page_count(self) -> int:
        """Return the page count, cross-checking pdfium against the parser."""
        pdfium_count = len(self._pdoc)
        parser_count = self.parser.number_of_pages(self.document_hash)

        if pdfium_count != parser_count:
            _log.error(f"Inconsistent number of pages: {pdfium_count}!={parser_count}")

        # The parser's count is authoritative for this backend.
        return parser_count

    def load_page(self, page_no: int) -> DoclingParseV2PageBackend:
        """Build a page backend for *page_no* (index into the pdfium document)."""
        return DoclingParseV2PageBackend(
            self.parser, self.document_hash, page_no, self._pdoc[page_no]
        )

    def is_valid(self) -> bool:
        """A document is usable when it contains at least one page."""
        return self.page_count() > 0

    def unload(self):
        """Drop parser state for this document and close the pdfium handle."""
        super().unload()
        self.parser.unload_document(self.document_hash)
        self._pdoc.close()
        self._pdoc = None
|
@@ -0,0 +1,429 @@
|
|
1
|
+
import logging
|
2
|
+
from io import BytesIO
|
3
|
+
from pathlib import Path
|
4
|
+
from typing import Set, Union
|
5
|
+
|
6
|
+
from bs4 import BeautifulSoup
|
7
|
+
from docling_core.types.doc import (
|
8
|
+
DocItemLabel,
|
9
|
+
DoclingDocument,
|
10
|
+
DocumentOrigin,
|
11
|
+
GroupLabel,
|
12
|
+
TableCell,
|
13
|
+
TableData,
|
14
|
+
)
|
15
|
+
|
16
|
+
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
17
|
+
from docling.datamodel.base_models import InputFormat
|
18
|
+
from docling.datamodel.document import InputDocument
|
19
|
+
|
20
|
+
_log = logging.getLogger(__name__)
|
21
|
+
|
22
|
+
|
23
|
+
class HTMLDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend converting an HTML file or stream into a DoclingDocument.

    The <body> is walked recursively with BeautifulSoup. Headings, paragraphs,
    lists, tables, figures and images are mapped onto DoclingDocument items.
    ``self.parents`` keeps the current ancestor item per hierarchy level and
    ``self.level`` points at the active level.
    """

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        super().__init__(in_doc, path_or_stream)
        _log.debug("About to init HTML backend...")
        self.soup = None  # parsed tree; stays None when init fails
        # HTML file:
        self.path_or_stream = path_or_stream
        # Initialise the parents for the hierarchy
        self.max_levels = 10
        self.level = 0
        self.parents = {}  # type: ignore
        for i in range(0, self.max_levels):
            self.parents[i] = None
        self.labels = {}  # type: ignore  # per-tag occurrence counters (debug aid)

        try:
            if isinstance(self.path_or_stream, BytesIO):
                text_stream = self.path_or_stream.getvalue().decode("utf-8")
                self.soup = BeautifulSoup(text_stream, "html.parser")
            if isinstance(self.path_or_stream, Path):
                with open(self.path_or_stream, "r", encoding="utf-8") as f:
                    html_content = f.read()
                    self.soup = BeautifulSoup(html_content, "html.parser")
        except Exception as e:
            raise RuntimeError(
                f"Could not initialize HTML backend for file with hash {self.document_hash}."
            ) from e

    def is_valid(self) -> bool:
        """The backend is valid when the HTML could be parsed."""
        return self.soup is not None

    @classmethod
    def supports_pagination(cls) -> bool:
        """HTML documents have no page structure."""
        return False

    def unload(self):
        """Release the input stream/path reference."""
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()

        self.path_or_stream = None

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """This backend handles HTML input only."""
        return {InputFormat.HTML}

    def convert(self) -> DoclingDocument:
        """Build and return the DoclingDocument for the loaded HTML.

        Raises:
            RuntimeError: if the backend failed to initialize.
        """
        # access self.path_or_stream to load stuff
        origin = DocumentOrigin(
            filename=self.file.name or "file",
            mimetype="text/html",
            binary_hash=self.document_hash,
        )

        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
        _log.debug("Trying to convert HTML...")

        if self.is_valid():
            assert self.soup is not None
            # Replace <br> tags with newline characters
            for br in self.soup.body.find_all("br"):
                br.replace_with("\n")
            doc = self.walk(self.soup.body, doc)
        else:
            raise RuntimeError(
                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
            )
        return doc

    def walk(self, element, doc):
        """Recursively analyse every child of *element*, best-effort.

        Errors inside a subtree are logged and then swallowed so that one
        broken element does not abort the whole conversion.
        """
        try:
            # Iterate over elements in the body of the document
            for idx, element in enumerate(element.children):
                try:
                    self.analyse_element(element, idx, doc)
                except Exception as exc_child:
                    # FIX: use %s placeholders — the previous calls passed the
                    # objects as extra positional args without a placeholder,
                    # which makes the logging module raise internally and
                    # drop the message.
                    _log.error(" -> error treating child: %s", exc_child)
                    _log.error(" => element: %s", element)
                    raise exc_child

        except Exception:
            # Deliberate best-effort: skip the whole subtree on failure.
            pass

        return doc

    def analyse_element(self, element, idx, doc):
        """Dispatch *element* to the matching handler based on its tag name."""
        # if element.name != None:
        #     _log.debug("\t" * self.level, idx, "\t", f"{element.name} ({self.level})")

        # Count tag occurrences (debug/statistics aid).
        if element.name in self.labels:
            self.labels[element.name] += 1
        else:
            self.labels[element.name] = 1

        if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            self.handle_header(element, idx, doc)
        elif element.name in ["p"]:
            self.handle_paragraph(element, idx, doc)
        elif element.name in ["ul", "ol"]:
            self.handle_list(element, idx, doc)
        elif element.name in ["li"]:
            self.handle_listitem(element, idx, doc)
        elif element.name == "table":
            self.handle_table(element, idx, doc)
        elif element.name == "figure":
            self.handle_figure(element, idx, doc)
        elif element.name == "img":
            self.handle_image(element, idx, doc)
        else:
            # Unknown container tag: recurse into its children.
            self.walk(element, doc)

    def get_direct_text(self, item):
        """Get the direct text of the <li> element (ignoring nested lists)."""
        text = item.find(string=True, recursive=False)
        if isinstance(text, str):
            return text.strip()

        return ""

    # Function to recursively extract text from all child nodes
    def extract_text_recursively(self, item):
        result = []

        if isinstance(item, str):
            return [item]

        if item.name not in ["ul", "ol"]:
            try:
                # Iterate over the children (and their text and tails)
                for child in item:
                    try:
                        # Recursively get the child's text content.
                        # NOTE: the recursive call returns a *string* for
                        # non-str children, which extend() splits into
                        # characters; the join below reassembles them.
                        result.extend(self.extract_text_recursively(child))
                    except Exception:
                        # FIX: narrowed from a bare except, which would also
                        # swallow KeyboardInterrupt/SystemExit.
                        pass
            except Exception:
                # FIX: narrowed bare except; _log.warn is a deprecated alias.
                _log.warning("item has no children")
                pass

        return "".join(result) + " "

    def handle_header(self, element, idx, doc):
        """Handles header tags (h1, h2, etc.)."""
        hlevel = int(element.name.replace("h", ""))

        text = element.text.strip()

        if hlevel == 1:
            # A new h1 resets the whole hierarchy and becomes the title.
            for key, val in self.parents.items():
                self.parents[key] = None

            self.level = 1
            self.parents[self.level] = doc.add_text(
                parent=self.parents[0], label=DocItemLabel.TITLE, text=text
            )
        else:
            if hlevel > self.level:

                # add invisible group
                for i in range(self.level + 1, hlevel):
                    self.parents[i] = doc.add_group(
                        name=f"header-{i}",
                        label=GroupLabel.SECTION,
                        parent=self.parents[i - 1],
                    )
                self.level = hlevel

            elif hlevel < self.level:

                # remove the tail
                for key, val in self.parents.items():
                    if key > hlevel:
                        self.parents[key] = None
                self.level = hlevel

            self.parents[hlevel] = doc.add_heading(
                parent=self.parents[hlevel - 1],
                text=text,
                level=hlevel,
            )

    def handle_paragraph(self, element, idx, doc):
        """Handles paragraph tags (p)."""
        if element.text is None:
            return
        text = element.text.strip()
        label = DocItemLabel.PARAGRAPH
        if len(text) == 0:
            # Skip empty paragraphs.
            return
        doc.add_text(parent=self.parents[self.level], label=label, text=text)

    def handle_list(self, element, idx, doc):
        """Handles list tags (ul, ol) and their list items."""

        if element.name == "ul":
            # create a list group
            self.parents[self.level + 1] = doc.add_group(
                parent=self.parents[self.level], name="list", label=GroupLabel.LIST
            )
        elif element.name == "ol":
            # create a list group
            self.parents[self.level + 1] = doc.add_group(
                parent=self.parents[self.level],
                name="ordered list",
                label=GroupLabel.ORDERED_LIST,
            )
        self.level += 1

        self.walk(element, doc)

        self.parents[self.level + 1] = None
        self.level -= 1

    def handle_listitem(self, element, idx, doc):
        """Handles listitem tags (li)."""
        nested_lists = element.find(["ul", "ol"])

        parent_list_label = self.parents[self.level].label
        index_in_list = len(self.parents[self.level].children) + 1

        if nested_lists:
            # Text in list item can be hidden within hierarchy, hence
            # we need to extract it recursively
            text = self.extract_text_recursively(element)
            # Flatten text, remove break lines:
            text = text.replace("\n", "").replace("\r", "")
            text = " ".join(text.split()).strip()

            marker = ""
            enumerated = False
            if parent_list_label == GroupLabel.ORDERED_LIST:
                marker = str(index_in_list)
                enumerated = True

            if len(text) > 0:
                # create a list-item
                self.parents[self.level + 1] = doc.add_list_item(
                    text=text,
                    enumerated=enumerated,
                    marker=marker,
                    parent=self.parents[self.level],
                )
                self.level += 1

            self.walk(element, doc)

            self.parents[self.level + 1] = None
            self.level -= 1

        elif isinstance(element.text, str):
            text = element.text.strip()

            marker = ""
            enumerated = False
            if parent_list_label == GroupLabel.ORDERED_LIST:
                marker = f"{str(index_in_list)}."
                enumerated = True
            doc.add_list_item(
                text=text,
                enumerated=enumerated,
                marker=marker,
                parent=self.parents[self.level],
            )
        else:
            # FIX: placeholder-style logging instead of a stray positional arg.
            _log.warning("list-item has no text: %s", element)

    def handle_table(self, element, idx, doc):
        """Handles table tags."""

        nested_tables = element.find("table")
        if nested_tables is not None:
            _log.warning("detected nested tables: skipping for now")
            return

        # Count the number of rows (number of <tr> elements)
        num_rows = len(element.find_all("tr"))

        # Find the number of columns (taking into account colspan)
        num_cols = 0
        for row in element.find_all("tr"):
            col_count = 0
            for cell in row.find_all(["td", "th"]):
                colspan = int(cell.get("colspan", 1))
                col_count += colspan
            num_cols = max(num_cols, col_count)

        # Occupancy grid used to place cells around row/col spans.
        grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]

        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])

        # Iterate over the rows in the table
        for row_idx, row in enumerate(element.find_all("tr")):

            # For each row, find all the column cells (both <td> and <th>)
            cells = row.find_all(["td", "th"])

            # Check if each cell in the row is a header -> means it is a column header
            col_header = True
            for j, html_cell in enumerate(cells):
                if html_cell.name == "td":
                    col_header = False

            col_idx = 0
            # Extract and print the text content of each cell
            for _, html_cell in enumerate(cells):

                text = html_cell.text
                try:
                    text = self.extract_table_cell_text(html_cell)
                except Exception as exc:
                    # FIX: log with a placeholder and re-raise instead of
                    # calling exit(-1); a conversion backend must never
                    # terminate the interpreter.
                    _log.warning("exception: %s", exc)
                    raise

                # label = html_cell.name

                col_span = int(html_cell.get("colspan", 1))
                row_span = int(html_cell.get("rowspan", 1))

                while grid[row_idx][col_idx] is not None:
                    col_idx += 1
                for r in range(row_span):
                    for c in range(col_span):
                        # FIX: guard against spans exceeding the announced grid
                        # size (malformed colspan/rowspan attributes would
                        # raise IndexError and silently truncate the document).
                        if row_idx + r < num_rows and col_idx + c < num_cols:
                            grid[row_idx + r][col_idx + c] = text

                cell = TableCell(
                    text=text,
                    row_span=row_span,
                    col_span=col_span,
                    start_row_offset_idx=row_idx,
                    end_row_offset_idx=row_idx + row_span,
                    start_col_offset_idx=col_idx,
                    end_col_offset_idx=col_idx + col_span,
                    col_header=col_header,
                    row_header=((not col_header) and html_cell.name == "th"),
                )
                data.table_cells.append(cell)

        doc.add_table(data=data, parent=self.parents[self.level])

    def get_list_text(self, list_element, level=0):
        """Recursively extract text from <ul> or <ol> with proper indentation."""
        result = []
        bullet_char = "*"  # Default bullet character for unordered lists

        if list_element.name == "ol":  # For ordered lists, use numbers
            for i, li in enumerate(list_element.find_all("li", recursive=False), 1):
                # Add numbering for ordered lists
                result.append(f"{' ' * level}{i}. {li.get_text(strip=True)}")
                # Handle nested lists
                nested_list = li.find(["ul", "ol"])
                if nested_list:
                    result.extend(self.get_list_text(nested_list, level + 1))
        elif list_element.name == "ul":  # For unordered lists, use bullet points
            for li in list_element.find_all("li", recursive=False):
                # Add bullet points for unordered lists
                result.append(
                    f"{' ' * level}{bullet_char} {li.get_text(strip=True)}"
                )
                # Handle nested lists
                nested_list = li.find(["ul", "ol"])
                if nested_list:
                    result.extend(self.get_list_text(nested_list, level + 1))

        return result

    def extract_table_cell_text(self, cell):
        """Extract text from a table cell, including lists with indents."""
        contains_lists = cell.find(["ul", "ol"])
        if contains_lists is None:
            return cell.text
        else:
            # TODO: lists inside table cells are currently flattened to plain text.
            _log.debug(
                "should extract the content correctly for table-cells with lists ..."
            )
            return cell.text

    def handle_figure(self, element, idx, doc):
        """Handles figure tags (figure), with an optional figcaption."""

        # Extract the image URI from the <img> tag
        # image_uri = root.xpath('//figure//img/@src')[0]

        contains_captions = element.find(["figcaption"])
        if contains_captions is None:
            doc.add_picture(parent=self.parents[self.level], caption=None)

        else:
            texts = []
            for item in contains_captions:
                texts.append(item.text)

            fig_caption = doc.add_text(
                label=DocItemLabel.CAPTION, text=("".join(texts)).strip()
            )
            doc.add_picture(
                parent=self.parents[self.level],
                caption=fig_caption,
            )

    def handle_image(self, element, idx, doc):
        """Handles image tags (img)."""
        doc.add_picture(parent=self.parents[self.level], caption=None)
|