docling 2.53.0__py3-none-any.whl → 2.55.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -78,7 +78,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
78
78
 
79
79
  return doc
80
80
 
81
- def _parse(self, doc: DoclingDocument): # noqa: C901
81
+ def _parse(self, doc: DoclingDocument):
82
82
  """
83
83
  Main function that orchestrates the parsing by yielding components:
84
84
  title, section headers, text, lists, and tables.
@@ -17,8 +17,11 @@ from docling_core.types.doc import (
17
17
  DocumentOrigin,
18
18
  GroupItem,
19
19
  GroupLabel,
20
+ RefItem,
21
+ RichTableCell,
20
22
  TableCell,
21
23
  TableData,
24
+ TableItem,
22
25
  TextItem,
23
26
  )
24
27
  from docling_core.types.doc.document import ContentLayer, Formatting, Script
@@ -276,10 +279,175 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
276
279
  # reset context
277
280
  self.ctx = _Context()
278
281
  self._walk(content, doc)
279
-
280
282
  return doc
281
283
 
282
- def _walk(self, element: Tag, doc: DoclingDocument) -> None:
284
+ @staticmethod
285
+ def group_cell_elements(
286
+ group_name: str,
287
+ doc: DoclingDocument,
288
+ provs_in_cell: list[RefItem],
289
+ docling_table: TableItem,
290
+ ) -> RefItem:
291
+ group_element = doc.add_group(
292
+ label=GroupLabel.UNSPECIFIED,
293
+ name=group_name,
294
+ parent=docling_table,
295
+ )
296
+ for prov in provs_in_cell:
297
+ group_element.children.append(prov)
298
+ pr_item = prov.resolve(doc)
299
+ item_parent = pr_item.parent.resolve(doc)
300
+ if pr_item.get_ref() in item_parent.children:
301
+ item_parent.children.remove(pr_item.get_ref())
302
+ pr_item.parent = group_element.get_ref()
303
+ ref_for_rich_cell = group_element.get_ref()
304
+ return ref_for_rich_cell
305
+
306
+ @staticmethod
307
+ def process_rich_table_cells(
308
+ provs_in_cell: list[RefItem],
309
+ group_name: str,
310
+ doc: DoclingDocument,
311
+ docling_table: TableItem,
312
+ ) -> tuple[bool, RefItem]:
313
+ rich_table_cell = False
314
+ ref_for_rich_cell = provs_in_cell[0]
315
+ if len(provs_in_cell) > 1:
316
+ # Cell has multiple elements, we need to group them
317
+ rich_table_cell = True
318
+ ref_for_rich_cell = HTMLDocumentBackend.group_cell_elements(
319
+ group_name, doc, provs_in_cell, docling_table
320
+ )
321
+ elif len(provs_in_cell) == 1:
322
+ item_ref = provs_in_cell[0]
323
+ pr_item = item_ref.resolve(doc)
324
+ if isinstance(pr_item, TextItem):
325
+ # Cell has only one element and it's just a text
326
+ rich_table_cell = False
327
+ doc.delete_items(node_items=[pr_item])
328
+ else:
329
+ rich_table_cell = True
330
+ ref_for_rich_cell = HTMLDocumentBackend.group_cell_elements(
331
+ group_name, doc, provs_in_cell, docling_table
332
+ )
333
+
334
+ return rich_table_cell, ref_for_rich_cell
335
+
336
+ def parse_table_data(
337
+ self,
338
+ element: Tag,
339
+ doc: DoclingDocument,
340
+ docling_table: TableItem,
341
+ num_rows: int,
342
+ num_cols: int,
343
+ ) -> Optional[TableData]:
344
+ for t in cast(list[Tag], element.find_all(["thead", "tbody"], recursive=False)):
345
+ t.unwrap()
346
+
347
+ _log.debug(f"The table has {num_rows} rows and {num_cols} cols.")
348
+ grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]
349
+ data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
350
+
351
+ # Iterate over the rows in the table
352
+ start_row_span = 0
353
+ row_idx = -1
354
+
355
+ # We don't want this recursive to support nested tables
356
+ for row in element("tr", recursive=False):
357
+ if not isinstance(row, Tag):
358
+ continue
359
+ # For each row, find all the column cells (both <td> and <th>)
360
+ # We don't want this recursive to support nested tables
361
+ cells = row(["td", "th"], recursive=False)
362
+ # Check if cell is in a column header or row header
363
+ col_header = True
364
+ row_header = True
365
+ for html_cell in cells:
366
+ if isinstance(html_cell, Tag):
367
+ _, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
368
+ if html_cell.name == "td":
369
+ col_header = False
370
+ row_header = False
371
+ elif row_span == 1:
372
+ row_header = False
373
+ if not row_header:
374
+ row_idx += 1
375
+ start_row_span = 0
376
+ else:
377
+ start_row_span += 1
378
+
379
+ # Extract the text content of each cell
380
+ col_idx = 0
381
+ for html_cell in cells:
382
+ if not isinstance(html_cell, Tag):
383
+ continue
384
+
385
+ # extract inline formulas
386
+ for formula in html_cell("inline-formula"):
387
+ math_parts = formula.text.split("$$")
388
+ if len(math_parts) == 3:
389
+ math_formula = f"$${math_parts[1]}$$"
390
+ formula.replace_with(NavigableString(math_formula))
391
+
392
+ provs_in_cell: list[RefItem] = []
393
+ # Parse table cell sub-tree for Rich Cells content:
394
+ provs_in_cell = self._walk(html_cell, doc)
395
+
396
+ rich_table_cell = False
397
+ ref_for_rich_cell = None
398
+ if len(provs_in_cell) > 0:
399
+ group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{start_row_span + row_idx}"
400
+ rich_table_cell, ref_for_rich_cell = (
401
+ HTMLDocumentBackend.process_rich_table_cells(
402
+ provs_in_cell, group_name, doc, docling_table
403
+ )
404
+ )
405
+
406
+ # Extracting text
407
+ text = self.get_text(html_cell).strip()
408
+ col_span, row_span = self._get_cell_spans(html_cell)
409
+ if row_header:
410
+ row_span -= 1
411
+ while (
412
+ col_idx < num_cols
413
+ and grid[row_idx + start_row_span][col_idx] is not None
414
+ ):
415
+ col_idx += 1
416
+ for r in range(start_row_span, start_row_span + row_span):
417
+ for c in range(col_span):
418
+ if row_idx + r < num_rows and col_idx + c < num_cols:
419
+ grid[row_idx + r][col_idx + c] = text
420
+
421
+ if rich_table_cell:
422
+ rich_cell = RichTableCell(
423
+ text=text,
424
+ row_span=row_span,
425
+ col_span=col_span,
426
+ start_row_offset_idx=start_row_span + row_idx,
427
+ end_row_offset_idx=start_row_span + row_idx + row_span,
428
+ start_col_offset_idx=col_idx,
429
+ end_col_offset_idx=col_idx + col_span,
430
+ column_header=col_header,
431
+ row_header=((not col_header) and html_cell.name == "th"),
432
+ ref=ref_for_rich_cell, # points to an artificial group around children
433
+ )
434
+ doc.add_table_cell(table_item=docling_table, cell=rich_cell)
435
+ else:
436
+ simple_cell = TableCell(
437
+ text=text,
438
+ row_span=row_span,
439
+ col_span=col_span,
440
+ start_row_offset_idx=start_row_span + row_idx,
441
+ end_row_offset_idx=start_row_span + row_idx + row_span,
442
+ start_col_offset_idx=col_idx,
443
+ end_col_offset_idx=col_idx + col_span,
444
+ column_header=col_header,
445
+ row_header=((not col_header) and html_cell.name == "th"),
446
+ )
447
+ doc.add_table_cell(table_item=docling_table, cell=simple_cell)
448
+ return data
449
+
450
+ def _walk(self, element: Tag, doc: DoclingDocument) -> list[RefItem]:
283
451
  """Parse an XML tag by recursively walking its content.
284
452
 
285
453
  While walking, the method buffers inline text across tags like <b> or <span>,
@@ -289,17 +457,18 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
289
457
  element: The XML tag to parse.
290
458
  doc: The Docling document to be updated with the parsed content.
291
459
  """
460
+ added_refs: list[RefItem] = []
292
461
  buffer: AnnotatedTextList = AnnotatedTextList()
293
462
 
294
463
  def flush_buffer():
295
464
  if not buffer:
296
- return
465
+ return added_refs
297
466
  annotated_text_list: AnnotatedTextList = buffer.simplify_text_elements()
298
467
  parts = annotated_text_list.split_by_newline()
299
468
  buffer.clear()
300
469
 
301
470
  if not "".join([el.text for el in annotated_text_list]):
302
- return
471
+ return added_refs
303
472
 
304
473
  for annotated_text_list in parts:
305
474
  with self._use_inline_group(annotated_text_list, doc):
@@ -309,15 +478,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
309
478
  annotated_text.text.strip()
310
479
  )
311
480
  if annotated_text.code:
312
- doc.add_code(
481
+ docling_code2 = doc.add_code(
313
482
  parent=self.parents[self.level],
314
483
  text=seg_clean,
315
484
  content_layer=self.content_layer,
316
485
  formatting=annotated_text.formatting,
317
486
  hyperlink=annotated_text.hyperlink,
318
487
  )
488
+ added_refs.append(docling_code2.get_ref())
319
489
  else:
320
- doc.add_text(
490
+ docling_text2 = doc.add_text(
321
491
  parent=self.parents[self.level],
322
492
  label=DocItemLabel.TEXT,
323
493
  text=seg_clean,
@@ -325,25 +495,31 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
325
495
  formatting=annotated_text.formatting,
326
496
  hyperlink=annotated_text.hyperlink,
327
497
  )
498
+ added_refs.append(docling_text2.get_ref())
328
499
 
329
500
  for node in element.contents:
330
501
  if isinstance(node, Tag):
331
502
  name = node.name.lower()
332
503
  if name == "img":
333
504
  flush_buffer()
334
- self._emit_image(node, doc)
505
+ im_ref3 = self._emit_image(node, doc)
506
+ added_refs.append(im_ref3)
335
507
  elif name in _FORMAT_TAG_MAP:
336
508
  with self._use_format([name]):
337
- self._walk(node, doc)
509
+ wk = self._walk(node, doc)
510
+ added_refs.extend(wk)
338
511
  elif name == "a":
339
512
  with self._use_hyperlink(node):
340
- self._walk(node, doc)
513
+ wk2 = self._walk(node, doc)
514
+ added_refs.extend(wk2)
341
515
  elif name in _BLOCK_TAGS:
342
516
  flush_buffer()
343
- self._handle_block(node, doc)
517
+ blk = self._handle_block(node, doc)
518
+ added_refs.extend(blk)
344
519
  elif node.find(_BLOCK_TAGS):
345
520
  flush_buffer()
346
- self._walk(node, doc)
521
+ wk3 = self._walk(node, doc)
522
+ added_refs.extend(wk3)
347
523
  else:
348
524
  buffer.extend(
349
525
  self._extract_text_and_hyperlink_recursively(
@@ -363,6 +539,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
363
539
  )
364
540
 
365
541
  flush_buffer()
542
+ return added_refs
366
543
 
367
544
  @staticmethod
368
545
  def _collect_parent_format_tags(item: PageElement) -> list[str]:
@@ -581,7 +758,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
581
758
  self.level -= 1
582
759
  self.content_layer = current_layer
583
760
 
584
- def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> None:
761
+ def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]:
762
+ added_ref = []
585
763
  tag_name = tag.name.lower()
586
764
  # set default content layer to BODY as soon as we encounter a heading
587
765
  self.content_layer = ContentLayer.BODY
@@ -596,12 +774,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
596
774
  for key in self.parents.keys():
597
775
  self.parents[key] = None
598
776
  self.level = 0
599
- self.parents[self.level + 1] = doc.add_title(
777
+ docling_title = self.parents[self.level + 1] = doc.add_title(
600
778
  text_clean,
601
779
  content_layer=self.content_layer,
602
780
  formatting=annotated_text.formatting,
603
781
  hyperlink=annotated_text.hyperlink,
604
782
  )
783
+ added_ref = [docling_title.get_ref()]
605
784
  # the other levels need to be lowered by 1 if a title was set
606
785
  else:
607
786
  level -= 1
@@ -623,7 +802,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
623
802
  _log.debug(f"Remove the tail of level {key}")
624
803
  self.parents[key] = None
625
804
  self.level = level
626
- self.parents[self.level + 1] = doc.add_heading(
805
+ docling_heading = self.parents[self.level + 1] = doc.add_heading(
627
806
  parent=self.parents[self.level],
628
807
  text=text_clean,
629
808
  orig=annotated_text.text,
@@ -632,12 +811,15 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
632
811
  formatting=annotated_text.formatting,
633
812
  hyperlink=annotated_text.hyperlink,
634
813
  )
814
+ added_ref = [docling_heading.get_ref()]
635
815
  self.level += 1
636
816
  for img_tag in tag("img"):
637
817
  if isinstance(img_tag, Tag):
638
- self._emit_image(img_tag, doc)
818
+ im_ref = self._emit_image(img_tag, doc)
819
+ added_ref.append(im_ref)
820
+ return added_ref
639
821
 
640
- def _handle_list(self, tag: Tag, doc: DoclingDocument) -> None:
822
+ def _handle_list(self, tag: Tag, doc: DoclingDocument) -> RefItem:
641
823
  tag_name = tag.name.lower()
642
824
  start: Optional[int] = None
643
825
  name: str = ""
@@ -765,20 +947,50 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
765
947
 
766
948
  self.parents[self.level + 1] = None
767
949
  self.level -= 1
950
+ return list_group.get_ref()
951
+
952
+ @staticmethod
953
+ def get_html_table_row_col(tag: Tag) -> tuple[int, int]:
954
+ for t in cast(list[Tag], tag.find_all(["thead", "tbody"], recursive=False)):
955
+ t.unwrap()
956
+ # Find the number of rows and columns (taking into account spans)
957
+ num_rows: int = 0
958
+ num_cols: int = 0
959
+ for row in tag("tr", recursive=False):
960
+ col_count = 0
961
+ is_row_header = True
962
+ if not isinstance(row, Tag):
963
+ continue
964
+ for cell in row(["td", "th"], recursive=False):
965
+ if not isinstance(row, Tag):
966
+ continue
967
+ cell_tag = cast(Tag, cell)
968
+ col_span, row_span = HTMLDocumentBackend._get_cell_spans(cell_tag)
969
+ col_count += col_span
970
+ if cell_tag.name == "td" or row_span == 1:
971
+ is_row_header = False
972
+ num_cols = max(num_cols, col_count)
973
+ if not is_row_header:
974
+ num_rows += 1
975
+ return num_rows, num_cols
768
976
 
769
- def _handle_block(self, tag: Tag, doc: DoclingDocument) -> None:
977
+ def _handle_block(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]:
978
+ added_refs = []
770
979
  tag_name = tag.name.lower()
771
980
 
772
981
  if tag_name == "figure":
773
982
  img_tag = tag.find("img")
774
983
  if isinstance(img_tag, Tag):
775
- self._emit_image(img_tag, doc)
984
+ im_ref = self._emit_image(img_tag, doc)
985
+ added_refs.append(im_ref)
776
986
 
777
987
  elif tag_name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
778
- self._handle_heading(tag, doc)
988
+ heading_refs = self._handle_heading(tag, doc)
989
+ added_refs.extend(heading_refs)
779
990
 
780
991
  elif tag_name in {"ul", "ol"}:
781
- self._handle_list(tag, doc)
992
+ list_ref = self._handle_list(tag, doc)
993
+ added_refs.append(list_ref)
782
994
 
783
995
  elif tag_name in {"p", "address", "summary"}:
784
996
  text_list = self._extract_text_and_hyperlink_recursively(
@@ -791,15 +1003,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
791
1003
  if seg := annotated_text.text.strip():
792
1004
  seg_clean = HTMLDocumentBackend._clean_unicode(seg)
793
1005
  if annotated_text.code:
794
- doc.add_code(
1006
+ docling_code = doc.add_code(
795
1007
  parent=self.parents[self.level],
796
1008
  text=seg_clean,
797
1009
  content_layer=self.content_layer,
798
1010
  formatting=annotated_text.formatting,
799
1011
  hyperlink=annotated_text.hyperlink,
800
1012
  )
1013
+ added_refs.append(docling_code.get_ref())
801
1014
  else:
802
- doc.add_text(
1015
+ docling_text = doc.add_text(
803
1016
  parent=self.parents[self.level],
804
1017
  label=DocItemLabel.TEXT,
805
1018
  text=seg_clean,
@@ -807,22 +1020,27 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
807
1020
  formatting=annotated_text.formatting,
808
1021
  hyperlink=annotated_text.hyperlink,
809
1022
  )
1023
+ added_refs.append(docling_text.get_ref())
810
1024
 
811
1025
  for img_tag in tag("img"):
812
1026
  if isinstance(img_tag, Tag):
813
1027
  self._emit_image(img_tag, doc)
814
1028
 
815
1029
  elif tag_name == "table":
816
- data = HTMLDocumentBackend.parse_table_data(tag)
1030
+ num_rows, num_cols = self.get_html_table_row_col(tag)
1031
+ data_e = TableData(num_rows=num_rows, num_cols=num_cols)
1032
+ docling_table = doc.add_table(
1033
+ data=data_e,
1034
+ parent=self.parents[self.level],
1035
+ content_layer=self.content_layer,
1036
+ )
1037
+ added_refs.append(docling_table.get_ref())
1038
+ self.parse_table_data(tag, doc, docling_table, num_rows, num_cols)
1039
+
817
1040
  for img_tag in tag("img"):
818
1041
  if isinstance(img_tag, Tag):
819
- self._emit_image(tag, doc)
820
- if data is not None:
821
- doc.add_table(
822
- data=data,
823
- parent=self.parents[self.level],
824
- content_layer=self.content_layer,
825
- )
1042
+ im_ref2 = self._emit_image(tag, doc)
1043
+ added_refs.append(im_ref2)
826
1044
 
827
1045
  elif tag_name in {"pre"}:
828
1046
  # handle monospace code snippets (pre).
@@ -835,13 +1053,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
835
1053
  text_clean = HTMLDocumentBackend._clean_unicode(
836
1054
  annotated_text.text.strip()
837
1055
  )
838
- doc.add_code(
1056
+ docling_code2 = doc.add_code(
839
1057
  parent=self.parents[self.level],
840
1058
  text=text_clean,
841
1059
  content_layer=self.content_layer,
842
1060
  formatting=annotated_text.formatting,
843
1061
  hyperlink=annotated_text.hyperlink,
844
1062
  )
1063
+ added_refs.append(docling_code2.get_ref())
845
1064
 
846
1065
  elif tag_name == "footer":
847
1066
  with self._use_footer(tag, doc):
@@ -850,8 +1069,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
850
1069
  elif tag_name == "details":
851
1070
  with self._use_details(tag, doc):
852
1071
  self._walk(tag, doc)
1072
+ return added_refs
853
1073
 
854
- def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
1074
+ def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> RefItem:
855
1075
  figure = img_tag.find_parent("figure")
856
1076
  caption: AnnotatedTextList = AnnotatedTextList()
857
1077
 
@@ -894,11 +1114,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
894
1114
  hyperlink=caption_anno_text.hyperlink,
895
1115
  )
896
1116
 
897
- doc.add_picture(
1117
+ docling_pic = doc.add_picture(
898
1118
  caption=caption_item,
899
1119
  parent=self.parents[self.level],
900
1120
  content_layer=self.content_layer,
901
1121
  )
1122
+ return docling_pic.get_ref()
902
1123
 
903
1124
  @staticmethod
904
1125
  def get_text(item: PageElement) -> str:
@@ -996,106 +1217,3 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
996
1217
  )
997
1218
 
998
1219
  return int_spans
999
-
1000
- @staticmethod
1001
- def parse_table_data(element: Tag) -> Optional[TableData]: # noqa: C901
1002
- nested_tables = element.find("table")
1003
- if nested_tables is not None:
1004
- _log.debug("Skipping nested table.")
1005
- return None
1006
-
1007
- # Find the number of rows and columns (taking into account spans)
1008
- num_rows = 0
1009
- num_cols = 0
1010
- for row in element("tr"):
1011
- col_count = 0
1012
- is_row_header = True
1013
- if not isinstance(row, Tag):
1014
- continue
1015
- for cell in row(["td", "th"]):
1016
- if not isinstance(row, Tag):
1017
- continue
1018
- cell_tag = cast(Tag, cell)
1019
- col_span, row_span = HTMLDocumentBackend._get_cell_spans(cell_tag)
1020
- col_count += col_span
1021
- if cell_tag.name == "td" or row_span == 1:
1022
- is_row_header = False
1023
- num_cols = max(num_cols, col_count)
1024
- if not is_row_header:
1025
- num_rows += 1
1026
-
1027
- _log.debug(f"The table has {num_rows} rows and {num_cols} cols.")
1028
-
1029
- grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]
1030
-
1031
- data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
1032
-
1033
- # Iterate over the rows in the table
1034
- start_row_span = 0
1035
- row_idx = -1
1036
- for row in element("tr"):
1037
- if not isinstance(row, Tag):
1038
- continue
1039
-
1040
- # For each row, find all the column cells (both <td> and <th>)
1041
- cells = row(["td", "th"])
1042
-
1043
- # Check if cell is in a column header or row header
1044
- col_header = True
1045
- row_header = True
1046
- for html_cell in cells:
1047
- if isinstance(html_cell, Tag):
1048
- _, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
1049
- if html_cell.name == "td":
1050
- col_header = False
1051
- row_header = False
1052
- elif row_span == 1:
1053
- row_header = False
1054
- if not row_header:
1055
- row_idx += 1
1056
- start_row_span = 0
1057
- else:
1058
- start_row_span += 1
1059
-
1060
- # Extract the text content of each cell
1061
- col_idx = 0
1062
- for html_cell in cells:
1063
- if not isinstance(html_cell, Tag):
1064
- continue
1065
-
1066
- # extract inline formulas
1067
- for formula in html_cell("inline-formula"):
1068
- math_parts = formula.text.split("$$")
1069
- if len(math_parts) == 3:
1070
- math_formula = f"$${math_parts[1]}$$"
1071
- formula.replace_with(NavigableString(math_formula))
1072
-
1073
- # TODO: extract content correctly from table-cells with lists
1074
- text = HTMLDocumentBackend.get_text(html_cell).strip()
1075
- col_span, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
1076
- if row_header:
1077
- row_span -= 1
1078
- while (
1079
- col_idx < num_cols
1080
- and grid[row_idx + start_row_span][col_idx] is not None
1081
- ):
1082
- col_idx += 1
1083
- for r in range(start_row_span, start_row_span + row_span):
1084
- for c in range(col_span):
1085
- if row_idx + r < num_rows and col_idx + c < num_cols:
1086
- grid[row_idx + r][col_idx + c] = text
1087
-
1088
- table_cell = TableCell(
1089
- text=text,
1090
- row_span=row_span,
1091
- col_span=col_span,
1092
- start_row_offset_idx=start_row_span + row_idx,
1093
- end_row_offset_idx=start_row_span + row_idx + row_span,
1094
- start_col_offset_idx=col_idx,
1095
- end_col_offset_idx=col_idx + col_span,
1096
- column_header=col_header,
1097
- row_header=((not col_header) and html_cell.name == "th"),
1098
- )
1099
- data.table_cells.append(table_cell)
1100
-
1101
- return data
@@ -3,6 +3,7 @@ import re
3
3
  import warnings
4
4
  from copy import deepcopy
5
5
  from enum import Enum
6
+ from html import unescape
6
7
  from io import BytesIO
7
8
  from pathlib import Path
8
9
  from typing import Literal, Optional, Union, cast
@@ -321,9 +322,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
321
322
 
322
323
  fig_caption: Optional[TextItem] = None
323
324
  if element.title is not None and element.title != "":
325
+ title = unescape(element.title)
324
326
  fig_caption = doc.add_text(
325
327
  label=DocItemLabel.CAPTION,
326
- text=element.title,
328
+ text=title,
327
329
  formatting=formatting,
328
330
  hyperlink=hyperlink,
329
331
  )
@@ -351,6 +353,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
351
353
  snippet_text = (
352
354
  element.children.strip() if isinstance(element.children, str) else ""
353
355
  )
356
+ snippet_text = unescape(snippet_text)
354
357
  # Detect start of the table:
355
358
  if "|" in snippet_text or self.in_table:
356
359
  # most likely part of the markdown table