docling 1.20.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. docling/backend/abstract_backend.py +32 -37
  2. docling/backend/docling_parse_backend.py +16 -12
  3. docling/backend/docling_parse_v2_backend.py +15 -11
  4. docling/backend/html_backend.py +425 -0
  5. docling/backend/mspowerpoint_backend.py +375 -0
  6. docling/backend/msword_backend.py +509 -0
  7. docling/backend/pdf_backend.py +78 -0
  8. docling/backend/pypdfium2_backend.py +15 -10
  9. docling/cli/main.py +61 -60
  10. docling/datamodel/base_models.py +73 -193
  11. docling/datamodel/document.py +364 -318
  12. docling/datamodel/pipeline_options.py +13 -0
  13. docling/datamodel/settings.py +1 -0
  14. docling/document_converter.py +215 -252
  15. docling/models/base_model.py +25 -0
  16. docling/models/base_ocr_model.py +10 -5
  17. docling/models/ds_glm_model.py +209 -20
  18. docling/models/easyocr_model.py +4 -1
  19. docling/models/layout_model.py +73 -61
  20. docling/models/page_assemble_model.py +21 -5
  21. docling/models/page_preprocessing_model.py +57 -0
  22. docling/models/table_structure_model.py +34 -32
  23. docling/models/tesseract_ocr_cli_model.py +8 -5
  24. docling/models/tesseract_ocr_model.py +8 -5
  25. docling/pipeline/base_pipeline.py +190 -0
  26. docling/pipeline/simple_pipeline.py +59 -0
  27. docling/pipeline/standard_pdf_pipeline.py +198 -0
  28. docling/utils/export.py +4 -3
  29. docling/utils/layout_utils.py +17 -11
  30. docling-2.0.0.dist-info/METADATA +149 -0
  31. docling-2.0.0.dist-info/RECORD +42 -0
  32. docling/pipeline/base_model_pipeline.py +0 -18
  33. docling/pipeline/standard_model_pipeline.py +0 -66
  34. docling-1.20.0.dist-info/METADATA +0 -380
  35. docling-1.20.0.dist-info/RECORD +0 -35
  36. {docling-1.20.0.dist-info → docling-2.0.0.dist-info}/LICENSE +0 -0
  37. {docling-1.20.0.dist-info → docling-2.0.0.dist-info}/WHEEL +0 -0
  38. {docling-1.20.0.dist-info → docling-2.0.0.dist-info}/entry_points.txt +0 -0
@@ -1,68 +1,63 @@
1
1
  from abc import ABC, abstractmethod
2
2
  from io import BytesIO
3
3
  from pathlib import Path
4
- from typing import TYPE_CHECKING, Any, Iterable, Optional, Union
4
+ from typing import TYPE_CHECKING, Set, Union
5
5
 
6
- from PIL import Image
6
+ from docling_core.types.doc import DoclingDocument
7
7
 
8
8
  if TYPE_CHECKING:
9
- from docling.datamodel.base_models import BoundingBox, Cell, PageSize
9
+ from docling.datamodel.base_models import InputFormat
10
+ from docling.datamodel.document import InputDocument
10
11
 
11
12
 
12
- class PdfPageBackend(ABC):
13
-
13
+ class AbstractDocumentBackend(ABC):
14
14
  @abstractmethod
15
- def get_text_in_rect(self, bbox: "BoundingBox") -> str:
16
- pass
15
+ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
16
+ self.path_or_stream = path_or_stream
17
+ self.document_hash = in_doc.document_hash
18
+ self.input_format = in_doc.format
17
19
 
18
20
  @abstractmethod
19
- def get_text_cells(self) -> Iterable["Cell"]:
21
+ def is_valid(self) -> bool:
20
22
  pass
21
23
 
24
+ @classmethod
22
25
  @abstractmethod
23
- def get_bitmap_rects(self, float: int = 1) -> Iterable["BoundingBox"]:
26
+ def supports_pagination(cls) -> bool:
24
27
  pass
25
28
 
26
29
  @abstractmethod
27
- def get_page_image(
28
- self, scale: float = 1, cropbox: Optional["BoundingBox"] = None
29
- ) -> Image.Image:
30
- pass
30
+ def unload(self):
31
+ if isinstance(self.path_or_stream, BytesIO):
32
+ self.path_or_stream.close()
31
33
 
32
- @abstractmethod
33
- def get_size(self) -> "PageSize":
34
- pass
34
+ self.path_or_stream = None
35
35
 
36
+ @classmethod
36
37
  @abstractmethod
37
- def is_valid(self) -> bool:
38
+ def supported_formats(cls) -> Set["InputFormat"]:
38
39
  pass
39
40
 
40
- @abstractmethod
41
- def unload(self):
42
- pass
43
41
 
42
+ class PaginatedDocumentBackend(AbstractDocumentBackend):
43
+ """DeclarativeDocumentBackend.
44
44
 
45
- class PdfDocumentBackend(ABC):
46
- @abstractmethod
47
- def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
48
- self.path_or_stream = path_or_stream
49
- self.document_hash = document_hash
50
-
51
- @abstractmethod
52
- def load_page(self, page_no: int) -> PdfPageBackend:
53
- pass
45
+ A declarative document backend is a backend that can transform to DoclingDocument
46
+ straight without a recognition pipeline.
47
+ """
54
48
 
55
49
  @abstractmethod
56
50
  def page_count(self) -> int:
57
51
  pass
58
52
 
59
- @abstractmethod
60
- def is_valid(self) -> bool:
61
- pass
62
53
 
63
- @abstractmethod
64
- def unload(self):
65
- if isinstance(self.path_or_stream, BytesIO):
66
- self.path_or_stream.close()
54
+ class DeclarativeDocumentBackend(AbstractDocumentBackend):
55
+ """DeclarativeDocumentBackend.
67
56
 
68
- self.path_or_stream = None
57
+ A declarative document backend is a backend that can transform to DoclingDocument
58
+ straight without a recognition pipeline.
59
+ """
60
+
61
+ @abstractmethod
62
+ def convert(self) -> DoclingDocument:
63
+ pass
@@ -5,12 +5,14 @@ from pathlib import Path
5
5
  from typing import Iterable, List, Optional, Union
6
6
 
7
7
  import pypdfium2 as pdfium
8
+ from docling_core.types.doc import BoundingBox, CoordOrigin, Size
8
9
  from docling_parse.docling_parse import pdf_parser
9
10
  from PIL import Image, ImageDraw
10
11
  from pypdfium2 import PdfPage
11
12
 
12
- from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
13
- from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
13
+ from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
14
+ from docling.datamodel.base_models import Cell
15
+ from docling.datamodel.document import InputDocument
14
16
 
15
17
  _log = logging.getLogger(__name__)
16
18
 
@@ -177,8 +179,8 @@ class DoclingParsePageBackend(PdfPageBackend):
177
179
 
178
180
  return image
179
181
 
180
- def get_size(self) -> PageSize:
181
- return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
182
+ def get_size(self) -> Size:
183
+ return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
182
184
 
183
185
  def unload(self):
184
186
  self._ppage = None
@@ -186,23 +188,25 @@ class DoclingParsePageBackend(PdfPageBackend):
186
188
 
187
189
 
188
190
  class DoclingParseDocumentBackend(PdfDocumentBackend):
189
- def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
190
- super().__init__(path_or_stream, document_hash)
191
+ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
192
+ super().__init__(in_doc, path_or_stream)
191
193
 
192
- self._pdoc = pdfium.PdfDocument(path_or_stream)
194
+ self._pdoc = pdfium.PdfDocument(self.path_or_stream)
193
195
  self.parser = pdf_parser()
194
196
 
195
197
  success = False
196
- if isinstance(path_or_stream, BytesIO):
198
+ if isinstance(self.path_or_stream, BytesIO):
197
199
  success = self.parser.load_document_from_bytesio(
198
- document_hash, path_or_stream
200
+ self.document_hash, self.path_or_stream
201
+ )
202
+ elif isinstance(self.path_or_stream, Path):
203
+ success = self.parser.load_document(
204
+ self.document_hash, str(self.path_or_stream)
199
205
  )
200
- elif isinstance(path_or_stream, Path):
201
- success = self.parser.load_document(document_hash, str(path_or_stream))
202
206
 
203
207
  if not success:
204
208
  raise RuntimeError(
205
- f"docling-parse could not load document {document_hash}."
209
+ f"docling-parse could not load document with hash {self.document_hash}."
206
210
  )
207
211
 
208
212
  def page_count(self) -> int:
@@ -2,15 +2,19 @@ import logging
2
2
  import random
3
3
  from io import BytesIO
4
4
  from pathlib import Path
5
- from typing import Iterable, List, Optional, Union
5
+ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
6
6
 
7
7
  import pypdfium2 as pdfium
8
+ from docling_core.types.doc import BoundingBox, CoordOrigin
8
9
  from docling_parse.docling_parse import pdf_parser_v2
9
10
  from PIL import Image, ImageDraw
10
11
  from pypdfium2 import PdfPage
11
12
 
12
- from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
13
- from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
13
+ from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
14
+ from docling.datamodel.base_models import Cell, Size
15
+
16
+ if TYPE_CHECKING:
17
+ from docling.datamodel.document import InputDocument
14
18
 
15
19
  _log = logging.getLogger(__name__)
16
20
 
@@ -190,8 +194,8 @@ class DoclingParseV2PageBackend(PdfPageBackend):
190
194
 
191
195
  return image
192
196
 
193
- def get_size(self) -> PageSize:
194
- return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
197
+ def get_size(self) -> Size:
198
+ return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
195
199
 
196
200
  def unload(self):
197
201
  self._ppage = None
@@ -199,23 +203,23 @@ class DoclingParseV2PageBackend(PdfPageBackend):
199
203
 
200
204
 
201
205
  class DoclingParseV2DocumentBackend(PdfDocumentBackend):
202
- def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
203
- super().__init__(path_or_stream, document_hash)
206
+ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
207
+ super().__init__(in_doc, path_or_stream)
204
208
 
205
- self._pdoc = pdfium.PdfDocument(path_or_stream)
209
+ self._pdoc = pdfium.PdfDocument(self.path_or_stream)
206
210
  self.parser = pdf_parser_v2("fatal")
207
211
 
208
212
  success = False
209
213
  if isinstance(path_or_stream, BytesIO):
210
214
  success = self.parser.load_document_from_bytesio(
211
- document_hash, path_or_stream
215
+ self.document_hash, path_or_stream
212
216
  )
213
217
  elif isinstance(path_or_stream, Path):
214
- success = self.parser.load_document(document_hash, str(path_or_stream))
218
+ success = self.parser.load_document(self.document_hash, str(path_or_stream))
215
219
 
216
220
  if not success:
217
221
  raise RuntimeError(
218
- f"docling-parse could not load document {document_hash}."
222
+ f"docling-parse v2 could not load document {self.document_hash}."
219
223
  )
220
224
 
221
225
  def page_count(self) -> int:
@@ -0,0 +1,425 @@
1
+ import logging
2
+ from io import BytesIO
3
+ from pathlib import Path
4
+ from typing import Set, Union
5
+
6
+ from bs4 import BeautifulSoup
7
+ from docling_core.types.doc import (
8
+ DocItemLabel,
9
+ DoclingDocument,
10
+ GroupLabel,
11
+ TableCell,
12
+ TableData,
13
+ )
14
+
15
+ from docling.backend.abstract_backend import DeclarativeDocumentBackend
16
+ from docling.datamodel.base_models import InputFormat
17
+ from docling.datamodel.document import InputDocument
18
+
19
+ _log = logging.getLogger(__name__)
20
+
21
+
22
class HTMLDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend that converts an HTML document to a DoclingDocument.

    The markup is parsed with BeautifulSoup when the backend is constructed.
    ``convert`` then walks the parsed tree and maps headers, paragraphs, lists,
    tables, figures and images onto document items.

    Nesting is tracked with two pieces of state that the ``handle_*`` methods
    mutate while walking:

    * ``self.level``   -- the current depth in the output hierarchy.
    * ``self.parents`` -- maps a depth to the document node that acts as the
      parent for items created at that depth (``None`` when unset).
    """

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Parse the HTML held by ``path_or_stream``.

        Args:
            in_doc: Input document descriptor, forwarded to the base class.
            path_or_stream: In-memory bytes or a filesystem path holding
                UTF-8 encoded HTML.

        Raises:
            RuntimeError: If the content cannot be read, decoded or parsed.
        """
        super().__init__(in_doc, path_or_stream)
        _log.debug("About to init HTML backend...")
        self.soup = None
        # HTML file:
        self.path_or_stream = path_or_stream
        # Initialise the parents for the hierarchy
        self.max_levels = 10
        self.level = 0
        self.parents = {i: None for i in range(self.max_levels)}  # type: ignore
        # Number of times each tag name has been seen by analyse_element.
        self.labels = {}  # type: ignore

        try:
            if isinstance(self.path_or_stream, BytesIO):
                text_stream = self.path_or_stream.getvalue().decode("utf-8")
                self.soup = BeautifulSoup(text_stream, "html.parser")
            elif isinstance(self.path_or_stream, Path):
                with open(self.path_or_stream, "r", encoding="utf-8") as f:
                    html_content = f.read()
                self.soup = BeautifulSoup(html_content, "html.parser")
        except Exception as e:
            raise RuntimeError(
                f"Could not initialize HTML backend for file with hash {self.document_hash}."
            ) from e

    def is_valid(self) -> bool:
        """Return True when the HTML was parsed successfully."""
        return self.soup is not None

    @classmethod
    def supports_pagination(cls) -> bool:
        """HTML documents are not paginated."""
        return False

    def unload(self):
        """Close the underlying stream (if any) and drop the reference to it."""
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()

        self.path_or_stream = None

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return the input formats this backend handles."""
        return {InputFormat.HTML}

    def convert(self) -> DoclingDocument:
        """Convert the parsed HTML into a DoclingDocument.

        Returns:
            The populated document.

        Raises:
            RuntimeError: If the backend failed to initialise.
        """
        _log.debug("Trying to convert HTML...")

        if not self.is_valid():
            raise RuntimeError(
                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
            )
        assert self.soup is not None

        doc = DoclingDocument(name="dummy")

        # html.parser does not synthesise a <body>; fall back to the whole
        # tree so fragments without one do not fail on `None.find_all`.
        content = self.soup.body if self.soup.body is not None else self.soup

        # Replace <br> tags with newline characters
        for br in content.find_all("br"):
            br.replace_with("\n")

        return self.walk(content, doc)

    def walk(self, element, doc):
        """Dispatch every direct child of ``element`` to ``analyse_element``.

        Nodes without children (e.g. text nodes) are ignored. If handling a
        child raises, the error is logged and the remaining siblings of this
        element are skipped; the exception is not propagated.

        Returns:
            ``doc``, for convenience.
        """
        children = getattr(element, "children", None)
        if children is None:
            # Text nodes end up here via the fallback branch of analyse_element.
            return doc

        for idx, child in enumerate(children):
            try:
                self.analyse_element(child, idx, doc)
            except Exception as exc_child:
                # Lazy %-formatting: passing extra args without a placeholder
                # makes the logging module drop the message.
                _log.error(" -> error treating child: %s", exc_child)
                _log.error(" => element: %s", child)
                break

        return doc

    def analyse_element(self, element, idx, doc):
        """Count the tag name of ``element`` and route it to its handler.

        Tags without a dedicated handler are walked recursively so that their
        descendants are still processed.
        """
        self.labels[element.name] = self.labels.get(element.name, 0) + 1

        if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            self.handle_header(element, idx, doc)
        elif element.name in ["p"]:
            self.handle_paragraph(element, idx, doc)
        elif element.name in ["ul", "ol"]:
            self.handle_list(element, idx, doc)
        elif element.name in ["li"]:
            self.handle_listitem(element, idx, doc)
        elif element.name == "table":
            self.handle_table(element, idx, doc)
        elif element.name == "figure":
            self.handle_figure(element, idx, doc)
        elif element.name == "img":
            self.handle_image(element, idx, doc)
        else:
            self.walk(element, doc)

    def get_direct_text(self, item):
        """Return the first direct text child of ``item``, stripped.

        Nested tags are ignored; an empty string is returned when ``item``
        has no direct text.
        """
        text = item.find(string=True, recursive=False)

        if isinstance(text, str):
            return text.strip()

        return ""

    def extract_text_recursively(self, item):
        """Return the text of ``item`` and all its descendants, space-joined.

        For backward compatibility a plain string input is returned wrapped in
        a single-element list, exactly as before; any other node yields a
        ``str``.
        """
        if isinstance(item, str):
            return [item]

        return " ".join(self._collect_text_parts(item))

    def _collect_text_parts(self, item):
        """Collect the non-empty, stripped text fragments below ``item``."""
        parts = []

        children = getattr(item, "children", None)
        if children is None:
            _log.warning("item has no children")
            return parts

        for child in children:
            if isinstance(child, str):
                stripped = child.strip()
                if stripped:
                    parts.append(stripped)
            else:
                parts.extend(self._collect_text_parts(child))

        return parts

    def handle_header(self, element, idx, doc):
        """Handles header tags (h1, h2, etc.).

        An ``h1`` resets the whole parent hierarchy and is added as the title.
        Other headers are added as section headers below the parent of the
        previous level; when levels are skipped, intermediate section groups
        are inserted, and when moving back up, deeper parents are cleared.
        """
        hlevel = int(element.name.replace("h", ""))

        label = DocItemLabel.SECTION_HEADER
        text = element.text.strip()

        if hlevel == 1:
            for key in self.parents:
                self.parents[key] = None

            self.level = 1
            self.parents[self.level] = doc.add_text(
                parent=self.parents[0], label=DocItemLabel.TITLE, text=text
            )
            return

        if hlevel > self.level:
            # add invisible group
            for i in range(self.level + 1, hlevel):
                self.parents[i] = doc.add_group(
                    name=f"header-{i}",
                    label=GroupLabel.SECTION,
                    parent=self.parents[i - 1],
                )
        elif hlevel < self.level:
            # remove the tail
            for key in self.parents:
                if key > hlevel:
                    self.parents[key] = None

        self.parents[hlevel] = doc.add_text(
            parent=self.parents[hlevel - 1], label=label, text=text
        )
        self.level = hlevel

    def handle_paragraph(self, element, idx, doc):
        """Handles paragraph tags (p); empty paragraphs are skipped."""
        if element.text is None:
            return

        text = element.text.strip()
        if not text:
            return

        doc.add_text(
            parent=self.parents[self.level], label=DocItemLabel.PARAGRAPH, text=text
        )

    def handle_list(self, element, idx, doc):
        """Handles list tags (ul, ol) and their list items.

        A list group is opened one level below the current one, the children
        are walked at that level, and the level is restored afterwards.
        """
        if element.name == "ul":
            # create a list group
            self.parents[self.level + 1] = doc.add_group(
                parent=self.parents[self.level], name="list", label=GroupLabel.LIST
            )
        elif element.name == "ol":
            # create a list group
            self.parents[self.level + 1] = doc.add_group(
                parent=self.parents[self.level],
                name="ordered list",
                label=GroupLabel.ORDERED_LIST,
            )
        self.level += 1

        self.walk(element, doc)

        self.parents[self.level + 1] = None
        self.level -= 1

    def handle_listitem(self, element, idx, doc):
        """Handles listitem tags (li).

        Items that contain a nested list become a parent for that list; plain
        items are added as leaves. Items inside an ordered list group are
        enumerated using their 1-based position among the group's children.
        """
        nested_lists = element.find(["ul", "ol"])

        # An <li> can show up outside of any list; do not assume that the
        # current parent exists or is a list group.
        parent = self.parents[self.level]
        parent_list_label = getattr(parent, "label", None)
        index_in_list = len(getattr(parent, "children", ())) + 1
        enumerated = parent_list_label == GroupLabel.ORDERED_LIST

        if nested_lists:
            text = self.get_direct_text(element)

            # NOTE(review): nested items get "N" while leaf items below get
            # "N." as marker -- confirm whether the difference is intended.
            marker = str(index_in_list) if enumerated else ""

            # create a list-item
            self.parents[self.level + 1] = doc.add_list_item(
                text=text,
                enumerated=enumerated,
                marker=marker,
                parent=parent,
            )
            self.level += 1

            self.walk(element, doc)

            self.parents[self.level + 1] = None
            self.level -= 1

        elif isinstance(element.text, str):
            text = element.text.strip()

            marker = f"{str(index_in_list)}." if enumerated else ""
            doc.add_list_item(
                text=text,
                enumerated=enumerated,
                marker=marker,
                parent=parent,
            )
        else:
            _log.warning("list-item has no text: %s", element)

    @staticmethod
    def _parse_span(cell, attribute):
        """Return the ``colspan``/``rowspan`` value of ``cell`` as an int >= 1.

        Missing, non-numeric or non-positive values fall back to 1.
        """
        try:
            span = int(cell.get(attribute, 1))
        except (TypeError, ValueError):
            return 1
        return max(span, 1)

    def handle_table(self, element, idx, doc):
        """Handles table tags.

        Builds the table cells (honouring colspan/rowspan) and adds the table
        to ``doc``. Tables that contain nested tables are skipped.
        """
        if element.find("table") is not None:
            _log.warning("detected nested tables: skipping for now")
            return

        rows = element.find_all("tr")

        # Count the number of rows (number of <tr> elements)
        num_rows = len(rows)

        # Find the number of columns (taking into account colspan)
        num_cols = 0
        for row in rows:
            col_count = sum(
                self._parse_span(cell, "colspan") for cell in row.find_all(["td", "th"])
            )
            num_cols = max(num_cols, col_count)

        # Grid positions already taken by a cell (including spanned positions).
        occupied = set()
        table_cells = []

        # Iterate over the rows in the table
        for row_idx, row in enumerate(rows):
            # For each row, find all the column cells (both <td> and <th>)
            cells = row.find_all(["td", "th"])

            # A row is a column header only if it contains no <td> at all.
            col_header = all(html_cell.name != "td" for html_cell in cells)

            col_idx = 0
            for html_cell in cells:
                try:
                    text = self.extract_table_cell_text(html_cell)
                except Exception as exc:
                    # Fall back to the raw text rather than terminating the
                    # whole process over a single cell.
                    _log.warning("exception: %s", exc)
                    text = html_cell.text

                col_span = self._parse_span(html_cell, "colspan")
                # A rowspan never extends past the last row of the table.
                row_span = min(self._parse_span(html_cell, "rowspan"), num_rows - row_idx)

                # Skip positions claimed by spans of earlier cells.
                while (row_idx, col_idx) in occupied:
                    col_idx += 1
                for r in range(row_span):
                    for c in range(col_span):
                        occupied.add((row_idx + r, col_idx + c))

                # Row spans can push a row wider than its own colspan sum.
                num_cols = max(num_cols, col_idx + col_span)

                table_cells.append(
                    TableCell(
                        text=text,
                        row_span=row_span,
                        col_span=col_span,
                        start_row_offset_idx=row_idx,
                        end_row_offset_idx=row_idx + row_span,
                        start_col_offset_idx=col_idx,
                        end_col_offset_idx=col_idx + col_span,
                        col_header=col_header,
                        row_header=((not col_header) and html_cell.name == "th"),
                    )
                )

        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=table_cells)
        doc.add_table(data=data, parent=self.parents[self.level])

    def get_list_text(self, list_element, level=0):
        """Recursively extract text from <ul> or <ol> with proper indentation.

        Returns:
            One string per list item: numbered ("1.") for ordered lists and
            bulleted ("*") for unordered ones, indented by ``level``. Any
            other element yields an empty list.
        """
        result = []
        bullet_char = "*"  # Default bullet character for unordered lists

        if list_element.name not in ("ol", "ul"):
            return result

        ordered = list_element.name == "ol"
        indent = " " * level

        for i, li in enumerate(list_element.find_all("li", recursive=False), 1):
            prefix = f"{i}." if ordered else bullet_char
            result.append(f"{indent}{prefix} {li.get_text(strip=True)}")

            # Handle nested lists
            nested_list = li.find(["ul", "ol"])
            if nested_list:
                result.extend(self.get_list_text(nested_list, level + 1))

        return result

    def extract_table_cell_text(self, cell):
        """Extract text from a table cell.

        Cells containing lists are not treated specially yet; the raw text is
        returned in both cases.
        """
        if cell.find(["ul", "ol"]) is not None:
            _log.debug(
                "should extract the content correctly for table-cells with lists ..."
            )
        return cell.text

    def handle_figure(self, element, idx, doc):
        """Handles figure tags, attaching the <figcaption> text as caption."""

        # Extract the image URI from the <img> tag
        # image_uri = root.xpath('//figure//img/@src')[0]

        contains_captions = element.find(["figcaption"])
        if contains_captions is None:
            doc.add_picture(parent=self.parents[self.level], caption=None)
            return

        fig_caption = doc.add_text(
            label=DocItemLabel.CAPTION, text=contains_captions.get_text().strip()
        )
        doc.add_picture(
            parent=self.parents[self.level],
            caption=fig_caption,
        )

    def handle_image(self, element, idx, doc):
        """Handles image tags (img)."""
        doc.add_picture(parent=self.parents[self.level], caption=None)