docling 2.30.0__py3-none-any.whl → 2.31.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. docling/backend/asciidoc_backend.py +7 -15
  2. docling/backend/csv_backend.py +1 -1
  3. docling/backend/docling_parse_backend.py +2 -2
  4. docling/backend/docling_parse_v2_backend.py +2 -2
  5. docling/backend/docling_parse_v4_backend.py +3 -4
  6. docling/backend/docx/latex/latex_dict.py +0 -5
  7. docling/backend/docx/latex/omml.py +4 -7
  8. docling/backend/html_backend.py +66 -25
  9. docling/backend/md_backend.py +6 -8
  10. docling/backend/msexcel_backend.py +1 -7
  11. docling/backend/mspowerpoint_backend.py +4 -7
  12. docling/backend/msword_backend.py +5 -5
  13. docling/backend/pdf_backend.py +2 -1
  14. docling/backend/pypdfium2_backend.py +3 -3
  15. docling/backend/xml/jats_backend.py +11 -14
  16. docling/backend/xml/uspto_backend.py +19 -23
  17. docling/cli/main.py +8 -8
  18. docling/cli/models.py +6 -3
  19. docling/datamodel/base_models.py +7 -5
  20. docling/datamodel/document.py +19 -10
  21. docling/datamodel/pipeline_options.py +0 -1
  22. docling/document_converter.py +8 -6
  23. docling/models/api_vlm_model.py +1 -2
  24. docling/models/base_model.py +2 -4
  25. docling/models/base_ocr_model.py +2 -2
  26. docling/models/code_formula_model.py +2 -1
  27. docling/models/document_picture_classifier.py +2 -1
  28. docling/models/easyocr_model.py +10 -11
  29. docling/models/factories/__init__.py +2 -2
  30. docling/models/factories/base_factory.py +1 -1
  31. docling/models/hf_mlx_model.py +4 -6
  32. docling/models/hf_vlm_model.py +7 -5
  33. docling/models/layout_model.py +2 -2
  34. docling/models/ocr_mac_model.py +3 -4
  35. docling/models/page_assemble_model.py +7 -12
  36. docling/models/page_preprocessing_model.py +2 -1
  37. docling/models/picture_description_api_model.py +2 -1
  38. docling/models/picture_description_base_model.py +2 -3
  39. docling/models/picture_description_vlm_model.py +6 -4
  40. docling/models/rapid_ocr_model.py +2 -3
  41. docling/models/readingorder_model.py +9 -24
  42. docling/models/table_structure_model.py +4 -8
  43. docling/models/tesseract_ocr_cli_model.py +17 -16
  44. docling/models/tesseract_ocr_model.py +9 -5
  45. docling/pipeline/base_pipeline.py +4 -8
  46. docling/pipeline/simple_pipeline.py +0 -1
  47. docling/pipeline/standard_pdf_pipeline.py +0 -1
  48. docling/pipeline/vlm_pipeline.py +0 -3
  49. docling/utils/export.py +2 -4
  50. docling/utils/glm_utils.py +2 -2
  51. docling/utils/layout_postprocessor.py +4 -2
  52. docling/utils/model_downloader.py +31 -7
  53. docling/utils/utils.py +3 -3
  54. {docling-2.30.0.dist-info → docling-2.31.1.dist-info}/METADATA +2 -1
  55. docling-2.31.1.dist-info/RECORD +86 -0
  56. docling-2.30.0.dist-info/RECORD +0 -86
  57. {docling-2.30.0.dist-info → docling-2.31.1.dist-info}/LICENSE +0 -0
  58. {docling-2.30.0.dist-info → docling-2.31.1.dist-info}/WHEEL +0 -0
  59. {docling-2.30.0.dist-info → docling-2.31.1.dist-info}/entry_points.txt +0 -0
@@ -34,7 +34,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
34
34
  text_stream = self.path_or_stream.getvalue().decode("utf-8")
35
35
  self.lines = text_stream.split("\n")
36
36
  if isinstance(self.path_or_stream, Path):
37
- with open(self.path_or_stream, "r", encoding="utf-8") as f:
37
+ with open(self.path_or_stream, encoding="utf-8") as f:
38
38
  self.lines = f.readlines()
39
39
  self.valid = True
40
40
 
@@ -75,14 +75,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
75
75
 
76
76
  return doc
77
77
 
78
- def _parse(self, doc: DoclingDocument):
78
+ def _parse(self, doc: DoclingDocument): # noqa: C901
79
79
  """
80
80
  Main function that orchestrates the parsing by yielding components:
81
81
  title, section headers, text, lists, and tables.
82
82
  """
83
83
 
84
- content = ""
85
-
86
84
  in_list = False
87
85
  in_table = False
88
86
 
@@ -95,7 +93,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
95
93
  # indents: dict[int, Union[DocItem, GroupItem, None]] = {}
96
94
  indents: dict[int, Union[GroupItem, None]] = {}
97
95
 
98
- for i in range(0, 10):
96
+ for i in range(10):
99
97
  parents[i] = None
100
98
  indents[i] = None
101
99
 
@@ -125,7 +123,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
125
123
 
126
124
  # Lists
127
125
  elif self._is_list_item(line):
128
-
129
126
  _log.debug(f"line: {line}")
130
127
  item = self._parse_list_item(line)
131
128
  _log.debug(f"parsed list-item: {item}")
@@ -147,7 +144,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
147
144
  indents[level + 1] = item["indent"]
148
145
 
149
146
  elif in_list and item["indent"] < indents[level]:
150
-
151
147
  # print(item["indent"], " => ", indents[level])
152
148
  while item["indent"] < indents[level]:
153
149
  # print(item["indent"], " => ", indents[level])
@@ -176,7 +172,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
176
172
  elif in_table and (
177
173
  (not self._is_table_line(line)) or line.strip() == "|==="
178
174
  ): # end of table
179
-
180
175
  caption = None
181
176
  if len(caption_data) > 0:
182
177
  caption = doc.add_text(
@@ -195,7 +190,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
195
190
 
196
191
  # Picture
197
192
  elif self._is_picture(line):
198
-
199
193
  caption = None
200
194
  if len(caption_data) > 0:
201
195
  caption = doc.add_text(
@@ -250,7 +244,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
250
244
  text_data = []
251
245
 
252
246
  elif len(line.strip()) > 0: # allow multiline texts
253
-
254
247
  item = self._parse_text(line)
255
248
  text_data.append(item["text"])
256
249
 
@@ -273,14 +266,14 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
273
266
 
274
267
  def _get_current_level(self, parents):
275
268
  for k, v in parents.items():
276
- if v == None and k > 0:
269
+ if v is None and k > 0:
277
270
  return k - 1
278
271
 
279
272
  return 0
280
273
 
281
274
  def _get_current_parent(self, parents):
282
275
  for k, v in parents.items():
283
- if v == None and k > 0:
276
+ if v is None and k > 0:
284
277
  return parents[k - 1]
285
278
 
286
279
  return None
@@ -328,7 +321,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
328
321
  "marker": marker,
329
322
  "text": text.strip(),
330
323
  "numbered": False,
331
- "indent": 0 if indent == None else len(indent),
324
+ "indent": 0 if indent is None else len(indent),
332
325
  }
333
326
  else:
334
327
  return {
@@ -336,7 +329,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
336
329
  "marker": marker,
337
330
  "text": text.strip(),
338
331
  "numbered": True,
339
- "indent": 0 if indent == None else len(indent),
332
+ "indent": 0 if indent is None else len(indent),
340
333
  }
341
334
  else:
342
335
  # Fallback if no match
@@ -357,7 +350,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
357
350
  return [cell.strip() for cell in line.split("|") if cell.strip()]
358
351
 
359
352
  def _populate_table_as_grid(self, table_data):
360
-
361
353
  num_rows = len(table_data)
362
354
 
363
355
  # Adjust the table data into a grid format
@@ -58,7 +58,7 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
58
58
  head = self.content.readline()
59
59
  dialect = csv.Sniffer().sniff(head, ",;\t|:")
60
60
  _log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"')
61
- if not dialect.delimiter in {",", ";", "\t", "|", ":"}:
61
+ if dialect.delimiter not in {",", ";", "\t", "|", ":"}:
62
62
  raise RuntimeError(
63
63
  f"Cannot convert csv with unknown delimiter {dialect.delimiter}."
64
64
  )
@@ -1,8 +1,9 @@
1
1
  import logging
2
2
  import random
3
+ from collections.abc import Iterable
3
4
  from io import BytesIO
4
5
  from pathlib import Path
5
- from typing import Iterable, List, Optional, Union
6
+ from typing import List, Optional, Union
6
7
 
7
8
  import pypdfium2 as pdfium
8
9
  from docling_core.types.doc import BoundingBox, CoordOrigin, Size
@@ -156,7 +157,6 @@ class DoclingParsePageBackend(PdfPageBackend):
156
157
  def get_page_image(
157
158
  self, scale: float = 1, cropbox: Optional[BoundingBox] = None
158
159
  ) -> Image.Image:
159
-
160
160
  page_size = self.get_size()
161
161
 
162
162
  if not cropbox:
@@ -1,8 +1,9 @@
1
1
  import logging
2
2
  import random
3
+ from collections.abc import Iterable
3
4
  from io import BytesIO
4
5
  from pathlib import Path
5
- from typing import TYPE_CHECKING, Iterable, List, Optional, Union
6
+ from typing import TYPE_CHECKING, List, Optional, Union
6
7
 
7
8
  import pypdfium2 as pdfium
8
9
  from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -172,7 +173,6 @@ class DoclingParseV2PageBackend(PdfPageBackend):
172
173
  def get_page_image(
173
174
  self, scale: float = 1, cropbox: Optional[BoundingBox] = None
174
175
  ) -> Image.Image:
175
-
176
176
  page_size = self.get_size()
177
177
 
178
178
  if not cropbox:
@@ -1,14 +1,14 @@
1
1
  import logging
2
- import random
2
+ from collections.abc import Iterable
3
3
  from io import BytesIO
4
4
  from pathlib import Path
5
- from typing import TYPE_CHECKING, Iterable, List, Optional, Union
5
+ from typing import TYPE_CHECKING, Optional, Union
6
6
 
7
7
  import pypdfium2 as pdfium
8
8
  from docling_core.types.doc import BoundingBox, CoordOrigin
9
9
  from docling_core.types.doc.page import SegmentedPdfPage, TextCell
10
10
  from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
11
- from PIL import Image, ImageDraw
11
+ from PIL import Image
12
12
  from pypdfium2 import PdfPage
13
13
 
14
14
  from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
@@ -93,7 +93,6 @@ class DoclingParseV4PageBackend(PdfPageBackend):
93
93
  def get_page_image(
94
94
  self, scale: float = 1, cropbox: Optional[BoundingBox] = None
95
95
  ) -> Image.Image:
96
-
97
96
  page_size = self.get_size()
98
97
 
99
98
  if not cropbox:
@@ -1,12 +1,8 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """
4
2
  Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
5
3
  On 23/01/2025
6
4
  """
7
5
 
8
- from __future__ import unicode_literals
9
-
10
6
  CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~")
11
7
 
12
8
  BLANK = ""
@@ -79,7 +75,6 @@ CHR_BO = {
79
75
  }
80
76
 
81
77
  T = {
82
- "\u2192": "\\rightarrow ",
83
78
  # Greek letters
84
79
  "\U0001d6fc": "\\alpha ",
85
80
  "\U0001d6fd": "\\beta ",
@@ -76,8 +76,7 @@ def get_val(key, default=None, store=CHR):
76
76
  return default
77
77
 
78
78
 
79
- class Tag2Method(object):
80
-
79
+ class Tag2Method:
81
80
  def call_method(self, elm, stag=None):
82
81
  getmethod = self.tag2meth.get
83
82
  if stag is None:
@@ -130,7 +129,6 @@ class Tag2Method(object):
130
129
 
131
130
 
132
131
  class Pr(Tag2Method):
133
-
134
132
  text = ""
135
133
 
136
134
  __val_tags = ("chr", "pos", "begChr", "endChr", "type")
@@ -159,7 +157,7 @@ class Pr(Tag2Method):
159
157
  def do_common(self, elm):
160
158
  stag = elm.tag.replace(OMML_NS, "")
161
159
  if stag in self.__val_tags:
162
- t = elm.get("{0}val".format(OMML_NS))
160
+ t = elm.get(f"{OMML_NS}val")
163
161
  self.__innerdict[stag] = t
164
162
  return None
165
163
 
@@ -248,7 +246,6 @@ class oMath2Latex(Tag2Method):
248
246
  """
249
247
  the Pre-Sub-Superscript object -- Not support yet
250
248
  """
251
- pass
252
249
 
253
250
  def do_sub(self, elm):
254
251
  text = self.process_children(elm)
@@ -331,7 +328,7 @@ class oMath2Latex(Tag2Method):
331
328
  t_dict = self.process_children_dict(elm, include=("e", "lim"))
332
329
  latex_s = LIM_FUNC.get(t_dict["e"])
333
330
  if not latex_s:
334
- raise NotSupport("Not support lim %s" % t_dict["e"])
331
+ raise RuntimeError("Not support lim {}".format(t_dict["e"]))
335
332
  else:
336
333
  return latex_s.format(lim=t_dict.get("lim"))
337
334
 
@@ -413,7 +410,7 @@ class oMath2Latex(Tag2Method):
413
410
  """
414
411
  _str = []
415
412
  _base_str = []
416
- found_text = elm.findtext("./{0}t".format(OMML_NS))
413
+ found_text = elm.findtext(f"./{OMML_NS}t")
417
414
  if found_text:
418
415
  for s in found_text:
419
416
  out_latex_str = self.process_unicode(s)
@@ -1,4 +1,5 @@
1
1
  import logging
2
+ import traceback
2
3
  from io import BytesIO
3
4
  from pathlib import Path
4
5
  from typing import Final, Optional, Union, cast
@@ -26,6 +27,8 @@ _log = logging.getLogger(__name__)
26
27
 
27
28
  # tags that generate NodeItem elements
28
29
  TAGS_FOR_NODE_ITEMS: Final = [
30
+ "address",
31
+ "details",
29
32
  "h1",
30
33
  "h2",
31
34
  "h3",
@@ -38,6 +41,7 @@ TAGS_FOR_NODE_ITEMS: Final = [
38
41
  "ul",
39
42
  "ol",
40
43
  "li",
44
+ "summary",
41
45
  "table",
42
46
  "figure",
43
47
  "img",
@@ -55,7 +59,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
55
59
  self.max_levels = 10
56
60
  self.level = 0
57
61
  self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
58
- for i in range(0, self.max_levels):
62
+ for i in range(self.max_levels):
59
63
  self.parents[i] = None
60
64
 
61
65
  try:
@@ -126,7 +130,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
126
130
  return doc
127
131
 
128
132
  def walk(self, tag: Tag, doc: DoclingDocument) -> None:
129
-
130
133
  # Iterate over elements in the body of the document
131
134
  text: str = ""
132
135
  for element in tag.children:
@@ -135,7 +138,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
135
138
  self.analyze_tag(cast(Tag, element), doc)
136
139
  except Exception as exc_child:
137
140
  _log.error(
138
- f"Error processing child from tag {tag.name}: {repr(exc_child)}"
141
+ f"Error processing child from tag {tag.name}:\n{traceback.format_exc()}"
139
142
  )
140
143
  raise exc_child
141
144
  elif isinstance(element, NavigableString) and not isinstance(
@@ -147,7 +150,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
147
150
  item for item in element.next_siblings if isinstance(item, Tag)
148
151
  ]
149
152
  if element.next_sibling is None or any(
150
- [item.name in TAGS_FOR_NODE_ITEMS for item in siblings]
153
+ item.name in TAGS_FOR_NODE_ITEMS for item in siblings
151
154
  ):
152
155
  text = text.strip()
153
156
  if text and tag.name in ["div"]:
@@ -164,7 +167,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
164
167
  def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
165
168
  if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
166
169
  self.handle_header(tag, doc)
167
- elif tag.name in ["p"]:
170
+ elif tag.name in ["p", "address", "summary"]:
168
171
  self.handle_paragraph(tag, doc)
169
172
  elif tag.name in ["pre", "code"]:
170
173
  self.handle_code(tag, doc)
@@ -178,6 +181,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
178
181
  self.handle_figure(tag, doc)
179
182
  elif tag.name == "img":
180
183
  self.handle_image(tag, doc)
184
+ elif tag.name == "details":
185
+ self.handle_details(tag, doc)
181
186
  else:
182
187
  self.walk(tag, doc)
183
188
 
@@ -202,6 +207,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
202
207
 
203
208
  return ["".join(result) + " "]
204
209
 
210
+ def handle_details(self, element: Tag, doc: DoclingDocument) -> None:
211
+ """Handle details tag (details) and its content."""
212
+
213
+ self.parents[self.level + 1] = doc.add_group(
214
+ name="details",
215
+ label=GroupLabel.SECTION,
216
+ parent=self.parents[self.level],
217
+ content_layer=self.content_layer,
218
+ )
219
+
220
+ self.level += 1
221
+ self.walk(element, doc)
222
+ self.parents[self.level + 1] = None
223
+ self.level -= 1
224
+
205
225
  def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
206
226
  """Handles header tags (h1, h2, etc.)."""
207
227
  hlevel = int(element.name.replace("h", ""))
@@ -222,7 +242,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
222
242
  )
223
243
  else:
224
244
  if hlevel > self.level:
225
-
226
245
  # add invisible group
227
246
  for i in range(self.level + 1, hlevel):
228
247
  self.parents[i] = doc.add_group(
@@ -234,7 +253,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
234
253
  self.level = hlevel
235
254
 
236
255
  elif hlevel < self.level:
237
-
238
256
  # remove the tail
239
257
  for key in self.parents.keys():
240
258
  if key > hlevel:
@@ -261,7 +279,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
261
279
  )
262
280
 
263
281
  def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
264
- """Handles paragraph tags (p)."""
282
+ """Handles paragraph tags (p) or equivalent ones."""
265
283
  if element.text is None:
266
284
  return
267
285
  text = element.text.strip()
@@ -360,7 +378,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
360
378
  marker = ""
361
379
  enumerated = False
362
380
  if parent_label == GroupLabel.ORDERED_LIST:
363
- marker = f"{str(index_in_list)}."
381
+ marker = f"{index_in_list!s}."
364
382
  enumerated = True
365
383
  doc.add_list_item(
366
384
  text=text,
@@ -373,46 +391,64 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
373
391
  _log.debug(f"list-item has no text: {element}")
374
392
 
375
393
  @staticmethod
376
- def parse_table_data(element: Tag) -> Optional[TableData]:
394
+ def parse_table_data(element: Tag) -> Optional[TableData]: # noqa: C901
377
395
  nested_tables = element.find("table")
378
396
  if nested_tables is not None:
379
397
  _log.debug("Skipping nested table.")
380
398
  return None
381
399
 
382
- # Count the number of rows (number of <tr> elements)
383
- num_rows = len(element("tr"))
384
-
385
- # Find the number of columns (taking into account colspan)
400
+ # Find the number of rows and columns (taking into account spans)
401
+ num_rows = 0
386
402
  num_cols = 0
387
403
  for row in element("tr"):
388
404
  col_count = 0
405
+ is_row_header = True
389
406
  if not isinstance(row, Tag):
390
407
  continue
391
408
  for cell in row(["td", "th"]):
392
409
  if not isinstance(row, Tag):
393
410
  continue
394
- val = cast(Tag, cell).get("colspan", "1")
411
+ cell_tag = cast(Tag, cell)
412
+ val = cell_tag.get("colspan", "1")
395
413
  colspan = int(val) if (isinstance(val, str) and val.isnumeric()) else 1
396
414
  col_count += colspan
415
+ if cell_tag.name == "td" or cell_tag.get("rowspan") is None:
416
+ is_row_header = False
397
417
  num_cols = max(num_cols, col_count)
418
+ if not is_row_header:
419
+ num_rows += 1
420
+
421
+ _log.debug(f"The table has {num_rows} rows and {num_cols} cols.")
398
422
 
399
423
  grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]
400
424
 
401
425
  data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
402
426
 
403
427
  # Iterate over the rows in the table
404
- for row_idx, row in enumerate(element("tr")):
428
+ start_row_span = 0
429
+ row_idx = -1
430
+ for row in element("tr"):
405
431
  if not isinstance(row, Tag):
406
432
  continue
407
433
 
408
434
  # For each row, find all the column cells (both <td> and <th>)
409
435
  cells = row(["td", "th"])
410
436
 
411
- # Check if each cell in the row is a header -> means it is a column header
437
+ # Check if cell is in a column header or row header
412
438
  col_header = True
439
+ row_header = True
413
440
  for html_cell in cells:
414
- if isinstance(html_cell, Tag) and html_cell.name == "td":
415
- col_header = False
441
+ if isinstance(html_cell, Tag):
442
+ if html_cell.name == "td":
443
+ col_header = False
444
+ row_header = False
445
+ elif html_cell.get("rowspan") is None:
446
+ row_header = False
447
+ if not row_header:
448
+ row_idx += 1
449
+ start_row_span = 0
450
+ else:
451
+ start_row_span += 1
416
452
 
417
453
  # Extract the text content of each cell
418
454
  col_idx = 0
@@ -443,19 +479,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
443
479
  if isinstance(row_val, str) and row_val.isnumeric()
444
480
  else 1
445
481
  )
446
-
447
- while grid[row_idx][col_idx] is not None:
482
+ if row_header:
483
+ row_span -= 1
484
+ while (
485
+ col_idx < num_cols
486
+ and grid[row_idx + start_row_span][col_idx] is not None
487
+ ):
448
488
  col_idx += 1
449
- for r in range(row_span):
489
+ for r in range(start_row_span, start_row_span + row_span):
450
490
  for c in range(col_span):
451
- grid[row_idx + r][col_idx + c] = text
491
+ if row_idx + r < num_rows and col_idx + c < num_cols:
492
+ grid[row_idx + r][col_idx + c] = text
452
493
 
453
494
  table_cell = TableCell(
454
495
  text=text,
455
496
  row_span=row_span,
456
497
  col_span=col_span,
457
- start_row_offset_idx=row_idx,
458
- end_row_offset_idx=row_idx + row_span,
498
+ start_row_offset_idx=start_row_span + row_idx,
499
+ end_row_offset_idx=start_row_span + row_idx + row_span,
459
500
  start_col_offset_idx=col_idx,
460
501
  end_col_offset_idx=col_idx + col_span,
461
502
  column_header=col_header,
@@ -83,7 +83,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
83
83
  # otherwise they represent emphasis (bold or italic)
84
84
  self.markdown = self._shorten_underscore_sequences(text_stream)
85
85
  if isinstance(self.path_or_stream, Path):
86
- with open(self.path_or_stream, "r", encoding="utf-8") as f:
86
+ with open(self.path_or_stream, encoding="utf-8") as f:
87
87
  md_content = f.read()
88
88
  # remove invalid sequences
89
89
  # very long sequences of underscores will lead to unnecessary long processing times.
@@ -168,7 +168,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
168
168
  )
169
169
  self.inline_texts = []
170
170
 
171
- def _iterate_elements(
171
+ def _iterate_elements( # noqa: C901
172
172
  self,
173
173
  element: marko.element.Element,
174
174
  depth: int,
@@ -176,7 +176,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
176
176
  visited: Set[marko.element.Element],
177
177
  parent_item: Optional[NodeItem] = None,
178
178
  ):
179
-
180
179
  if element in visited:
181
180
  return
182
181
 
@@ -236,7 +235,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
236
235
  if has_non_empty_list_items:
237
236
  label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
238
237
  parent_item = doc.add_group(
239
- label=label, name=f"list", parent=parent_item
238
+ label=label, name="list", parent=parent_item
240
239
  )
241
240
 
242
241
  elif (
@@ -320,7 +319,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
320
319
  self._html_blocks += 1
321
320
  self._process_inline_text(parent_item, doc)
322
321
  self._close_table(doc)
323
- _log.debug("HTML Block: {}".format(element))
322
+ _log.debug(f"HTML Block: {element}")
324
323
  if (
325
324
  len(element.body) > 0
326
325
  ): # If Marko doesn't return any content for HTML block, skip it
@@ -332,7 +331,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
332
331
  else:
333
332
  if not isinstance(element, str):
334
333
  self._close_table(doc)
335
- _log.debug("Some other element: {}".format(element))
334
+ _log.debug(f"Some other element: {element}")
336
335
 
337
336
  processed_block_types = (
338
337
  marko.block.Heading,
@@ -398,7 +397,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
398
397
 
399
398
  # if HTML blocks were detected, export to HTML and delegate to HTML backend
400
399
  if self._html_blocks > 0:
401
-
402
400
  # export to HTML
403
401
  html_backend_cls = HTMLDocumentBackend
404
402
  html_str = doc.export_to_html()
@@ -411,7 +409,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
411
409
  )
412
410
  return _txt
413
411
 
414
- # restore original HTML by removing previouly added markers
412
+ # restore original HTML by removing previously added markers
415
413
  for regex in [
416
414
  rf"<pre>\s*<code>\s*{_START_MARKER}",
417
415
  rf"{_STOP_MARKER}\s*</code>\s*</pre>",
@@ -184,7 +184,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
184
184
  """
185
185
 
186
186
  if self.workbook is not None:
187
-
188
187
  # Iterate over all sheets
189
188
  for sheet_name in self.workbook.sheetnames:
190
189
  _log.info(f"Processing sheet: {sheet_name}")
@@ -253,7 +252,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
253
252
  )
254
253
 
255
254
  for excel_cell in excel_table.data:
256
-
257
255
  cell = TableCell(
258
256
  text=excel_cell.text,
259
257
  row_span=excel_cell.row_span,
@@ -303,7 +301,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
303
301
  # Iterate over all cells in the sheet
304
302
  for ri, row in enumerate(sheet.iter_rows(values_only=False)):
305
303
  for rj, cell in enumerate(row):
306
-
307
304
  # Skip empty or already visited cells
308
305
  if cell.value is None or (ri, rj) in visited:
309
306
  continue
@@ -342,7 +339,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
342
339
  visited_cells: set[tuple[int, int]] = set()
343
340
  for ri in range(start_row, max_row + 1):
344
341
  for rj in range(start_col, max_col + 1):
345
-
346
342
  cell = sheet.cell(row=ri + 1, column=rj + 1) # 1-based indexing
347
343
 
348
344
  # Check if the cell belongs to a merged range
@@ -350,14 +346,12 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
350
346
  col_span = 1
351
347
 
352
348
  for merged_range in sheet.merged_cells.ranges:
353
-
354
349
  if (
355
350
  merged_range.min_row <= ri + 1
356
351
  and ri + 1 <= merged_range.max_row
357
352
  and merged_range.min_col <= rj + 1
358
353
  and rj + 1 <= merged_range.max_col
359
354
  ):
360
-
361
355
  row_span = merged_range.max_row - merged_range.min_row + 1
362
356
  col_span = merged_range.max_col - merged_range.min_col + 1
363
357
  break
@@ -499,7 +493,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
499
493
  ),
500
494
  ),
501
495
  )
502
- except:
496
+ except Exception:
503
497
  _log.error("could not extract the image from excel sheets")
504
498
 
505
499
  return doc
@@ -120,13 +120,12 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
120
120
 
121
121
  return prov
122
122
 
123
- def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
123
+ def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size): # noqa: C901
124
124
  is_a_list = False
125
125
  is_list_group_created = False
126
126
  enum_list_item_value = 0
127
127
  new_list = None
128
128
  bullet_type = "None"
129
- list_text = ""
130
129
  list_label = GroupLabel.LIST
131
130
  doc_label = DocItemLabel.LIST_ITEM
132
131
  prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
@@ -243,7 +242,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
243
242
  enum_marker = str(enum_list_item_value) + "."
244
243
  if not is_list_group_created:
245
244
  new_list = doc.add_group(
246
- label=list_label, name=f"list", parent=parent_slide
245
+ label=list_label, name="list", parent=parent_slide
247
246
  )
248
247
  is_list_group_created = True
249
248
  doc.add_list_item(
@@ -368,11 +367,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
368
367
  slide_width = pptx_obj.slide_width
369
368
  slide_height = pptx_obj.slide_height
370
369
 
371
- text_content = [] # type: ignore
372
-
373
370
  max_levels = 10
374
371
  parents = {} # type: ignore
375
- for i in range(0, max_levels):
372
+ for i in range(max_levels):
376
373
  parents[i] = None
377
374
 
378
375
  # Loop through each slide
@@ -383,7 +380,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
383
380
  )
384
381
 
385
382
  slide_size = Size(width=slide_width, height=slide_height)
386
- parent_page = doc.add_page(page_no=slide_ind + 1, size=slide_size)
383
+ doc.add_page(page_no=slide_ind + 1, size=slide_size)
387
384
 
388
385
  def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size):
389
386
  handle_groups(shape, parent_slide, slide_ind, doc, slide_size)