docling 1.19.1__py3-none-any.whl → 2.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. docling/backend/abstract_backend.py +33 -37
  2. docling/backend/asciidoc_backend.py +431 -0
  3. docling/backend/docling_parse_backend.py +20 -16
  4. docling/backend/docling_parse_v2_backend.py +248 -0
  5. docling/backend/html_backend.py +429 -0
  6. docling/backend/md_backend.py +346 -0
  7. docling/backend/mspowerpoint_backend.py +398 -0
  8. docling/backend/msword_backend.py +496 -0
  9. docling/backend/pdf_backend.py +78 -0
  10. docling/backend/pypdfium2_backend.py +16 -11
  11. docling/cli/main.py +96 -65
  12. docling/datamodel/base_models.py +79 -193
  13. docling/datamodel/document.py +405 -320
  14. docling/datamodel/pipeline_options.py +19 -3
  15. docling/datamodel/settings.py +16 -1
  16. docling/document_converter.py +240 -251
  17. docling/models/base_model.py +28 -0
  18. docling/models/base_ocr_model.py +40 -10
  19. docling/models/ds_glm_model.py +244 -30
  20. docling/models/easyocr_model.py +57 -42
  21. docling/models/layout_model.py +158 -116
  22. docling/models/page_assemble_model.py +127 -101
  23. docling/models/page_preprocessing_model.py +79 -0
  24. docling/models/table_structure_model.py +162 -116
  25. docling/models/tesseract_ocr_cli_model.py +76 -59
  26. docling/models/tesseract_ocr_model.py +90 -58
  27. docling/pipeline/base_pipeline.py +189 -0
  28. docling/pipeline/simple_pipeline.py +56 -0
  29. docling/pipeline/standard_pdf_pipeline.py +201 -0
  30. docling/utils/export.py +4 -3
  31. docling/utils/layout_utils.py +17 -11
  32. docling/utils/profiling.py +62 -0
  33. docling-2.4.1.dist-info/METADATA +154 -0
  34. docling-2.4.1.dist-info/RECORD +45 -0
  35. docling/pipeline/base_model_pipeline.py +0 -18
  36. docling/pipeline/standard_model_pipeline.py +0 -66
  37. docling-1.19.1.dist-info/METADATA +0 -380
  38. docling-1.19.1.dist-info/RECORD +0 -34
  39. {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/LICENSE +0 -0
  40. {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/WHEEL +0 -0
  41. {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,248 @@
1
+ import logging
2
+ import random
3
+ from io import BytesIO
4
+ from pathlib import Path
5
+ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
6
+
7
+ import pypdfium2 as pdfium
8
+ from docling_core.types.doc import BoundingBox, CoordOrigin
9
+ from docling_parse.docling_parse import pdf_parser_v2
10
+ from PIL import Image, ImageDraw
11
+ from pypdfium2 import PdfPage
12
+
13
+ from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
14
+ from docling.datamodel.base_models import Cell, Size
15
+
16
+ if TYPE_CHECKING:
17
+ from docling.datamodel.document import InputDocument
18
+
19
+ _log = logging.getLogger(__name__)
20
+
21
+
22
class DoclingParseV2PageBackend(PdfPageBackend):
    """Page backend backed by the docling-parse v2 parser.

    Combines the pypdfium2 page object (used for rendering and page size)
    with the parsed cell/image payload returned by docling-parse for the
    same page. All geometry from the parser is rescaled from the parser's
    page dimensions to the pypdfium2 page size.
    """

    def __init__(
        self, parser: pdf_parser_v2, document_hash: str, page_no: int, page_obj: PdfPage
    ):
        self._ppage = page_obj
        # Initialize eagerly so accessors never hit an AttributeError when the
        # parse below fails (previously _dpage was unset on the invalid path).
        self._dpage = None
        parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)

        # A successful parse yields exactly one entry under "pages".
        self.valid = "pages" in parsed_page and len(parsed_page["pages"]) == 1
        if self.valid:
            self._dpage = parsed_page["pages"][0]
        else:
            _log.info(
                f"An error occurred when loading page {page_no} of document {document_hash}."
            )

    def is_valid(self) -> bool:
        return self.valid

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Concatenate the text of all cells overlapping *bbox* by more than 50%.

        Returns an empty string when the page failed to parse.
        """
        if not self.valid:
            return ""
        # Find intersecting cells on the page
        text_piece = ""
        page_size = self.get_size()

        parser_width = self._dpage["sanitized"]["dimension"]["width"]
        parser_height = self._dpage["sanitized"]["dimension"]["height"]

        scale = (
            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
        )

        cells_data = self._dpage["sanitized"]["cells"]["data"]
        cells_header = self._dpage["sanitized"]["cells"]["header"]

        # Hoist the column-index lookups out of the loop: list.index is O(n)
        # and the header layout is fixed for the whole page.
        ix0 = cells_header.index("x0")
        iy0 = cells_header.index("y0")
        ix1 = cells_header.index("x1")
        iy1 = cells_header.index("y1")
        itext = cells_header.index("text")

        for cell_data in cells_data:
            x0 = cell_data[ix0]
            y0 = cell_data[iy0]
            x1 = cell_data[ix1]
            y1 = cell_data[iy1]

            # Rescale from parser coordinates (bottom-left origin) to the
            # pypdfium2 page size, then flip to top-left origin.
            cell_bbox = BoundingBox(
                l=x0 * scale * page_size.width / parser_width,
                b=y0 * scale * page_size.height / parser_height,
                r=x1 * scale * page_size.width / parser_width,
                t=y1 * scale * page_size.height / parser_height,
                coord_origin=CoordOrigin.BOTTOMLEFT,
            ).to_top_left_origin(page_height=page_size.height * scale)

            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()

            if overlap_frac > 0.5:
                if len(text_piece) > 0:
                    text_piece += " "
                text_piece += cell_data[itext]

        return text_piece

    def get_text_cells(self) -> Iterable[Cell]:
        """Return every text cell of the page, rescaled to pypdfium2 page coordinates.

        Returns an empty list when the page failed to parse.
        """
        cells: List[Cell] = []
        cell_counter = 0

        if not self.valid:
            return cells

        page_size = self.get_size()

        parser_width = self._dpage["sanitized"]["dimension"]["width"]
        parser_height = self._dpage["sanitized"]["dimension"]["height"]

        cells_data = self._dpage["sanitized"]["cells"]["data"]
        cells_header = self._dpage["sanitized"]["cells"]["header"]

        # Loop-invariant column-index lookups, hoisted (see get_text_in_rect).
        ix0 = cells_header.index("x0")
        iy0 = cells_header.index("y0")
        ix1 = cells_header.index("x1")
        iy1 = cells_header.index("y1")
        itext = cells_header.index("text")

        for cell_data in cells_data:
            x0 = cell_data[ix0]
            y0 = cell_data[iy0]
            x1 = cell_data[ix1]
            y1 = cell_data[iy1]

            # Normalize inverted boxes occasionally emitted by the parser.
            if x1 < x0:
                x0, x1 = x1, x0
            if y1 < y0:
                y0, y1 = y1, y0

            text_piece = cell_data[itext]
            cells.append(
                Cell(
                    id=cell_counter,
                    text=text_piece,
                    bbox=BoundingBox(
                        l=x0 * page_size.width / parser_width,
                        b=y0 * page_size.height / parser_height,
                        r=x1 * page_size.width / parser_width,
                        t=y1 * page_size.height / parser_height,
                        coord_origin=CoordOrigin.BOTTOMLEFT,
                    ).to_top_left_origin(page_size.height),
                )
            )
            cell_counter += 1

        return cells

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        """Yield the bounding boxes of embedded bitmaps larger than 32x32 px.

        Boxes are converted to top-left origin and scaled by *scale*.
        Yields nothing when the page failed to parse (consistent with the
        other accessors, which all guard on self.valid).
        """
        AREA_THRESHOLD = 32 * 32

        if not self.valid:
            return

        images = self._dpage["sanitized"]["images"]["data"]
        images_header = self._dpage["sanitized"]["images"]["header"]

        # Hoist loop-invariant column-index lookups.
        ix0 = images_header.index("x0")
        iy0 = images_header.index("y0")
        ix1 = images_header.index("x1")
        iy1 = images_header.index("y1")

        for row in images:
            x0 = row[ix0]
            y0 = row[iy0]
            x1 = row[ix1]
            y1 = row[iy1]

            cropbox = BoundingBox.from_tuple(
                (x0, y0, x1, y1), origin=CoordOrigin.BOTTOMLEFT
            ).to_top_left_origin(self.get_size().height)

            if cropbox.area() > AREA_THRESHOLD:
                cropbox = cropbox.scaled(scale=scale)

                yield cropbox

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Render the page (or *cropbox* region) to a PIL image at *scale*.

        Rendering happens at 1.5x the requested scale and is downsampled to
        the target size to obtain a sharper image.
        """
        page_size = self.get_size()

        if not cropbox:
            # Default to the full page; padbox of zero on all sides.
            cropbox = BoundingBox(
                l=0,
                r=page_size.width,
                t=0,
                b=page_size.height,
                coord_origin=CoordOrigin.TOPLEFT,
            )
            padbox = BoundingBox(
                l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
            )
        else:
            # pdfium's crop is expressed as margins from each page edge.
            padbox = cropbox.to_bottom_left_origin(page_size.height)
            padbox.r = page_size.width - padbox.r
            padbox.t = page_size.height - padbox.t

        image = (
            self._ppage.render(
                scale=scale * 1.5,
                rotation=0,  # no additional rotation
                crop=padbox.as_tuple(),
            )
            .to_pil()
            .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
        )  # We resize the image from 1.5x the given scale to make it sharper.

        return image

    def get_size(self) -> Size:
        """Return the page dimensions as reported by pypdfium2."""
        return Size(width=self._ppage.get_width(), height=self._ppage.get_height())

    def unload(self):
        """Drop references to the pdfium page and parsed payload."""
        self._ppage = None
        self._dpage = None
203
+
204
+
205
class DoclingParseV2DocumentBackend(PdfDocumentBackend):
    """Document backend that loads a PDF in both pypdfium2 (for rendering)
    and the docling-parse v2 parser (for text/image extraction), keyed by
    the document hash.
    """

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Open the document in pdfium and register it with docling-parse.

        Raises:
            RuntimeError: if docling-parse fails to load the document.
        """
        super().__init__(in_doc, path_or_stream)

        self._pdoc = pdfium.PdfDocument(self.path_or_stream)
        # "fatal" selects the parser's logging verbosity.
        self.parser = pdf_parser_v2("fatal")

        success = False
        # The parser exposes separate entry points for in-memory and on-disk
        # documents; both register the document under self.document_hash.
        if isinstance(path_or_stream, BytesIO):
            success = self.parser.load_document_from_bytesio(
                self.document_hash, path_or_stream
            )
        elif isinstance(path_or_stream, Path):
            success = self.parser.load_document(self.document_hash, str(path_or_stream))

        if not success:
            raise RuntimeError(
                f"docling-parse v2 could not load document {self.document_hash}."
            )

    def page_count(self) -> int:
        """Return the page count, cross-checking pdfium against docling-parse.

        The docling-parse count is the one returned; a mismatch with pdfium
        is only logged.
        """
        # return len(self._pdoc)  # To be replaced with docling-parse API

        len_1 = len(self._pdoc)
        len_2 = self.parser.number_of_pages(self.document_hash)

        if len_1 != len_2:
            _log.error(f"Inconsistent number of pages: {len_1}!={len_2}")

        return len_2

    def load_page(self, page_no: int) -> DoclingParseV2PageBackend:
        """Create a page backend for 0-based page index *page_no*."""
        return DoclingParseV2PageBackend(
            self.parser, self.document_hash, page_no, self._pdoc[page_no]
        )

    def is_valid(self) -> bool:
        """A document is considered valid when it has at least one page."""
        return self.page_count() > 0

    def unload(self):
        """Release the document in both the parser and pdfium."""
        super().unload()
        self.parser.unload_document(self.document_hash)
        self._pdoc.close()
        self._pdoc = None
@@ -0,0 +1,429 @@
1
+ import logging
2
+ from io import BytesIO
3
+ from pathlib import Path
4
+ from typing import Set, Union
5
+
6
+ from bs4 import BeautifulSoup
7
+ from docling_core.types.doc import (
8
+ DocItemLabel,
9
+ DoclingDocument,
10
+ DocumentOrigin,
11
+ GroupLabel,
12
+ TableCell,
13
+ TableData,
14
+ )
15
+
16
+ from docling.backend.abstract_backend import DeclarativeDocumentBackend
17
+ from docling.datamodel.base_models import InputFormat
18
+ from docling.datamodel.document import InputDocument
19
+
20
+ _log = logging.getLogger(__name__)
21
+
22
+
23
class HTMLDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend converting an HTML file or stream into a DoclingDocument.

    The markup is parsed with BeautifulSoup and the tree is walked
    recursively; headings, paragraphs, lists, tables and figures are mapped
    onto the corresponding document items. ``self.parents`` is a
    level-indexed stack of parent items modeling the section hierarchy and
    ``self.level`` tracks the current depth.
    """

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        super().__init__(in_doc, path_or_stream)
        _log.debug("About to init HTML backend...")
        self.soup = None  # stays None when parsing fails; checked by is_valid()
        # HTML file:
        self.path_or_stream = path_or_stream
        # Initialise the parents for the hierarchy
        self.max_levels = 10
        self.level = 0
        self.parents = {}  # type: ignore
        for i in range(0, self.max_levels):
            self.parents[i] = None
        self.labels = {}  # type: ignore  # tag-name -> occurrence count (diagnostics only)

        try:
            if isinstance(self.path_or_stream, BytesIO):
                text_stream = self.path_or_stream.getvalue().decode("utf-8")
                self.soup = BeautifulSoup(text_stream, "html.parser")
            if isinstance(self.path_or_stream, Path):
                with open(self.path_or_stream, "r", encoding="utf-8") as f:
                    html_content = f.read()
                self.soup = BeautifulSoup(html_content, "html.parser")
        except Exception as e:
            raise RuntimeError(
                f"Could not initialize HTML backend for file with hash {self.document_hash}."
            ) from e

    def is_valid(self) -> bool:
        """The backend is usable once the soup has been built."""
        return self.soup is not None

    @classmethod
    def supports_pagination(cls) -> bool:
        return False

    def unload(self):
        """Close the input stream (if any) and drop the reference to it."""
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()

        self.path_or_stream = None

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        return {InputFormat.HTML}

    def convert(self) -> DoclingDocument:
        """Build and return the DoclingDocument for the parsed HTML.

        Raises:
            RuntimeError: if the backend failed to initialize.
        """
        origin = DocumentOrigin(
            filename=self.file.name or "file",
            mimetype="text/html",
            binary_hash=self.document_hash,
        )

        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
        _log.debug("Trying to convert HTML...")

        if self.is_valid():
            assert self.soup is not None
            # Fragments without an explicit <body> are walked from the root
            # (previously self.soup.body.find_all crashed on body-less input).
            content = self.soup.body or self.soup
            # Replace <br> tags with newline characters
            for br in content.find_all("br"):
                br.replace_with("\n")
            doc = self.walk(content, doc)
        else:
            raise RuntimeError(
                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
            )
        return doc

    def walk(self, element, doc):
        """Dispatch each child of *element* to its handler (best effort).

        A failing child aborts the remaining siblings of this subtree but
        never the overall conversion.
        """
        try:
            # Loop variable renamed so it no longer shadows the parameter.
            for idx, child in enumerate(element.children):
                try:
                    self.analyse_element(child, idx, doc)
                except Exception as exc_child:
                    # Previously passed data as extra positional args with no
                    # placeholders, which breaks %-style log formatting.
                    _log.error(
                        "error treating child: %s => element: %s", exc_child, child
                    )
                    raise exc_child

        except Exception:
            # Best effort: skip malformed subtrees, but leave a trace instead
            # of swallowing silently.
            _log.debug("walk: skipping malformed subtree", exc_info=True)

        return doc

    def analyse_element(self, element, idx, doc):
        """Route *element* to the handler matching its tag; recurse otherwise."""

        # Track per-tag occurrence counts (diagnostics only).
        if element.name in self.labels:
            self.labels[element.name] += 1
        else:
            self.labels[element.name] = 1

        if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            self.handle_header(element, idx, doc)
        elif element.name in ["p"]:
            self.handle_paragraph(element, idx, doc)
        elif element.name in ["ul", "ol"]:
            self.handle_list(element, idx, doc)
        elif element.name in ["li"]:
            self.handle_listitem(element, idx, doc)
        elif element.name == "table":
            self.handle_table(element, idx, doc)
        elif element.name == "figure":
            self.handle_figure(element, idx, doc)
        elif element.name == "img":
            self.handle_image(element, idx, doc)
        else:
            # Unknown/container tag: descend into its children.
            self.walk(element, doc)

    def get_direct_text(self, item):
        """Get the direct text of the <li> element (ignoring nested lists)."""
        text = item.find(string=True, recursive=False)
        if isinstance(text, str):
            return text.strip()

        return ""

    # Function to recursively extract text from all child nodes
    def extract_text_recursively(self, item):
        """Recursively collect the text of *item* and its descendants.

        Nested <ul>/<ol> subtrees are skipped (their items are emitted by the
        list handlers). Returns a string with a trailing space; bare string
        nodes are returned as a one-element list, which callers flatten via
        list.extend.
        """
        result = []

        if isinstance(item, str):
            return [item]

        if item.name not in ["ul", "ol"]:
            try:
                # Iterate over the children (and their text and tails)
                for child in item:
                    try:
                        # Recursively get the child's text content
                        result.extend(self.extract_text_recursively(child))
                    except Exception:  # narrowed from bare except
                        pass
            except Exception:
                # logger.warn is deprecated; use warning
                _log.warning("item has no children")

        return "".join(result) + " "

    def handle_header(self, element, idx, doc):
        """Handles header tags (h1, h2, etc.)."""
        hlevel = int(element.name.replace("h", ""))
        text = element.text.strip()

        if hlevel == 1:
            # A new h1 resets the whole parent stack and starts a title.
            for key in self.parents.keys():
                self.parents[key] = None

            self.level = 1
            self.parents[self.level] = doc.add_text(
                parent=self.parents[0], label=DocItemLabel.TITLE, text=text
            )
        else:
            if hlevel > self.level:

                # Moving deeper: add invisible groups for any skipped levels.
                for i in range(self.level + 1, hlevel):
                    self.parents[i] = doc.add_group(
                        name=f"header-{i}",
                        label=GroupLabel.SECTION,
                        parent=self.parents[i - 1],
                    )
                self.level = hlevel

            elif hlevel < self.level:

                # Moving up: drop the tail of the parent stack.
                for key in self.parents.keys():
                    if key > hlevel:
                        self.parents[key] = None
                self.level = hlevel

            self.parents[hlevel] = doc.add_heading(
                parent=self.parents[hlevel - 1],
                text=text,
                level=hlevel,
            )

    def handle_paragraph(self, element, idx, doc):
        """Handles paragraph tags (p)."""
        if element.text is None:
            return
        text = element.text.strip()
        label = DocItemLabel.PARAGRAPH
        if len(text) == 0:
            # Skip whitespace-only paragraphs.
            return
        doc.add_text(parent=self.parents[self.level], label=label, text=text)

    def handle_list(self, element, idx, doc):
        """Handles list tags (ul, ol) and their list items."""

        if element.name == "ul":
            # create a list group
            self.parents[self.level + 1] = doc.add_group(
                parent=self.parents[self.level], name="list", label=GroupLabel.LIST
            )
        elif element.name == "ol":
            # create a list group
            self.parents[self.level + 1] = doc.add_group(
                parent=self.parents[self.level],
                name="ordered list",
                label=GroupLabel.ORDERED_LIST,
            )
        self.level += 1

        self.walk(element, doc)

        # Pop the list group off the parent stack.
        self.parents[self.level + 1] = None
        self.level -= 1

    def handle_listitem(self, element, idx, doc):
        """Handles listitem tags (li)."""
        nested_lists = element.find(["ul", "ol"])

        parent_list_label = self.parents[self.level].label
        # 1-based ordinal of this item within its list (used for ol markers).
        index_in_list = len(self.parents[self.level].children) + 1

        if nested_lists:
            # Text in list item can be hidden within hierarchy, hence
            # we need to extract it recursively
            text = self.extract_text_recursively(element)
            # Flatten text, remove break lines:
            text = text.replace("\n", "").replace("\r", "")
            text = " ".join(text.split()).strip()

            marker = ""
            enumerated = False
            if parent_list_label == GroupLabel.ORDERED_LIST:
                marker = str(index_in_list)
                enumerated = True

            if len(text) > 0:
                # create a list-item
                self.parents[self.level + 1] = doc.add_list_item(
                    text=text,
                    enumerated=enumerated,
                    marker=marker,
                    parent=self.parents[self.level],
                )
                self.level += 1

            # Descend so the nested list(s) are emitted as children.
            self.walk(element, doc)

            self.parents[self.level + 1] = None
            self.level -= 1

        elif isinstance(element.text, str):
            text = element.text.strip()

            marker = ""
            enumerated = False
            if parent_list_label == GroupLabel.ORDERED_LIST:
                marker = f"{str(index_in_list)}."
                enumerated = True
            doc.add_list_item(
                text=text,
                enumerated=enumerated,
                marker=marker,
                parent=self.parents[self.level],
            )
        else:
            # Fixed %-style formatting and deprecated logger.warn.
            _log.warning("list-item has no text: %s", element)

    def handle_table(self, element, idx, doc):
        """Handles table tags."""

        nested_tables = element.find("table")
        if nested_tables is not None:
            _log.warning("detected nested tables: skipping for now")
            return

        # Count the number of rows (number of <tr> elements)
        num_rows = len(element.find_all("tr"))

        # Find the number of columns (taking into account colspan)
        num_cols = 0
        for row in element.find_all("tr"):
            col_count = 0
            for cell in row.find_all(["td", "th"]):
                colspan = int(cell.get("colspan", 1))
                col_count += colspan
            num_cols = max(num_cols, col_count)

        # Occupancy grid used to resolve row/col spans into start offsets.
        grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]

        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])

        # Iterate over the rows in the table
        for row_idx, row in enumerate(element.find_all("tr")):

            # For each row, find all the column cells (both <td> and <th>)
            cells = row.find_all(["td", "th"])

            # A row made exclusively of <th> cells is treated as a column header.
            col_header = not any(html_cell.name == "td" for html_cell in cells)

            col_idx = 0
            # Extract the text content of each cell
            for html_cell in cells:

                # Raw text as fallback; extract_table_cell_text may refine it.
                text = html_cell.text
                try:
                    text = self.extract_table_cell_text(html_cell)
                except Exception as exc:
                    # Was exit(-1): a parsing backend must never terminate the
                    # host process — keep the raw cell text instead.
                    _log.warning("could not extract table-cell text: %s", exc)

                # label = html_cell.name

                col_span = int(html_cell.get("colspan", 1))
                row_span = int(html_cell.get("rowspan", 1))

                # Skip grid positions already claimed by an earlier rowspan.
                while grid[row_idx][col_idx] is not None:
                    col_idx += 1
                for r in range(row_span):
                    for c in range(col_span):
                        grid[row_idx + r][col_idx + c] = text

                cell = TableCell(
                    text=text,
                    row_span=row_span,
                    col_span=col_span,
                    start_row_offset_idx=row_idx,
                    end_row_offset_idx=row_idx + row_span,
                    start_col_offset_idx=col_idx,
                    end_col_offset_idx=col_idx + col_span,
                    col_header=col_header,
                    row_header=((not col_header) and html_cell.name == "th"),
                )
                data.table_cells.append(cell)

        doc.add_table(data=data, parent=self.parents[self.level])

    def get_list_text(self, list_element, level=0):
        """Recursively extract text from <ul> or <ol> with proper indentation."""
        result = []
        bullet_char = "*"  # Default bullet character for unordered lists

        if list_element.name == "ol":  # For ordered lists, use numbers
            for i, li in enumerate(list_element.find_all("li", recursive=False), 1):
                # Add numbering for ordered lists
                result.append(f"{' ' * level}{i}. {li.get_text(strip=True)}")
                # Handle nested lists
                nested_list = li.find(["ul", "ol"])
                if nested_list:
                    result.extend(self.get_list_text(nested_list, level + 1))
        elif list_element.name == "ul":  # For unordered lists, use bullet points
            for li in list_element.find_all("li", recursive=False):
                # Add bullet points for unordered lists
                result.append(
                    f"{' ' * level}{bullet_char} {li.get_text(strip=True)}"
                )
                # Handle nested lists
                nested_list = li.find(["ul", "ol"])
                if nested_list:
                    result.extend(self.get_list_text(nested_list, level + 1))

        return result

    def extract_table_cell_text(self, cell):
        """Extract text from a table cell, including lists with indents."""
        contains_lists = cell.find(["ul", "ol"])
        if contains_lists is None:
            return cell.text
        else:
            # TODO: render nested lists with get_list_text instead of raw text.
            _log.debug(
                "should extract the content correctly for table-cells with lists ..."
            )
            return cell.text

    def handle_figure(self, element, idx, doc):
        """Handles figure tags, attaching a caption from <figcaption> if present."""

        # Extract the image URI from the <img> tag
        # image_uri = root.xpath('//figure//img/@src')[0]

        contains_captions = element.find(["figcaption"])
        if contains_captions is None:
            doc.add_picture(parent=self.parents[self.level], caption=None)

        else:
            texts = []
            for item in contains_captions:
                texts.append(item.text)

            fig_caption = doc.add_text(
                label=DocItemLabel.CAPTION, text=("".join(texts)).strip()
            )
            doc.add_picture(
                parent=self.parents[self.level],
                caption=fig_caption,
            )

    def handle_image(self, element, idx, doc):
        """Handles image tags (img): a picture item with no caption."""
        doc.add_picture(parent=self.parents[self.level], caption=None)