html-to-markdown 1.11.0__py3-none-any.whl → 1.12.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

html_to_markdown/cli.py CHANGED
@@ -1,5 +1,6 @@
1
1
  import sys
2
2
  from argparse import ArgumentParser, FileType
3
+ from pathlib import Path
3
4
 
4
5
  from html_to_markdown.constants import (
5
6
  ASTERISK,
@@ -13,6 +14,7 @@ from html_to_markdown.constants import (
13
14
  WHITESPACE_NORMALIZED,
14
15
  WHITESPACE_STRICT,
15
16
  )
17
+ from html_to_markdown.exceptions import InvalidEncodingError
16
18
  from html_to_markdown.processing import convert_to_markdown
17
19
 
18
20
 
@@ -131,6 +133,12 @@ def main(argv: list[str]) -> str:
131
133
  help="Parent tags where images remain inline (not converted to alt-text).",
132
134
  )
133
135
 
136
+ parser.add_argument(
137
+ "--br-in-tables",
138
+ action="store_true",
139
+ help="Use <br> tags for line breaks in table cells instead of spaces.",
140
+ )
141
+
134
142
  parser.add_argument("-w", "--wrap", action="store_true", help="Enable text wrapping at --wrap-width characters.")
135
143
 
136
144
  parser.add_argument(
@@ -235,10 +243,18 @@ def main(argv: list[str]) -> str:
235
243
  help="Keep navigation elements when preprocessing (normally removed).",
236
244
  )
237
245
 
246
+ parser.add_argument(
247
+ "--source-encoding",
248
+ type=str,
249
+ default=None,
250
+ help="Source file encoding (e.g. 'utf-8', 'latin-1'). Defaults to system default.",
251
+ )
252
+
238
253
  args = parser.parse_args(argv)
239
254
 
240
255
  base_args = {
241
256
  "autolinks": args.autolinks,
257
+ "br_in_tables": args.br_in_tables,
242
258
  "bullets": args.bullets,
243
259
  "code_language": args.code_language,
244
260
  "convert": args.convert,
@@ -278,7 +294,7 @@ def main(argv: list[str]) -> str:
278
294
  if args.show_progress:
279
295
 
280
296
  def progress_callback(processed: int, total: int) -> None:
281
- if total > 0:
297
+ if total > 0: # pragma: no cover
282
298
  percent = (processed / total) * 100
283
299
 
284
300
  sys.stderr.write(f"\rProgress: {percent:.1f}% ({processed}/{total} bytes)")
@@ -286,4 +302,14 @@ def main(argv: list[str]) -> str:
286
302
 
287
303
  base_args["progress_callback"] = progress_callback
288
304
 
289
- return convert_to_markdown(args.html.read(), **base_args)
305
+ if args.source_encoding and args.html.name != "<stdin>":
306
+ args.html.close()
307
+ try:
308
+ with Path(args.html.name).open(encoding=args.source_encoding) as f:
309
+ html_content = f.read()
310
+ except LookupError as e:
311
+ raise InvalidEncodingError(args.source_encoding) from e
312
+ else:
313
+ html_content = args.html.read()
314
+
315
+ return convert_to_markdown(html_content, **base_args)
@@ -5,9 +5,11 @@ from typing import TYPE_CHECKING
5
5
  if TYPE_CHECKING:
6
6
  from collections.abc import Iterable
7
7
  import base64
8
+ import re
8
9
  from collections.abc import Callable
9
10
  from functools import partial
10
11
  from inspect import getfullargspec
12
+ from itertools import chain
11
13
  from textwrap import fill
12
14
  from typing import Any, Literal, TypeVar, cast
13
15
 
@@ -36,6 +38,18 @@ def _format_wrapped_block(text: str, start_marker: str, end_marker: str = "") ->
36
38
  return f"{start_marker}{text.strip()}{end_marker}\n\n" if text.strip() else ""
37
39
 
38
40
 
41
+ def _find_list_item_ancestor(tag: Tag) -> Tag | None:
42
+ parent = tag.parent
43
+ while parent and parent.name != "li":
44
+ parent = parent.parent
45
+ return parent
46
+
47
+
48
+ BLOCK_ELEMENTS = frozenset({"p", "blockquote", "pre", "ul", "ol", "div", "h1", "h2", "h3", "h4", "h5", "h6"})
49
+
50
+ _LIST_ITEM_PATTERN = re.compile(r"^\s*(\*|\+|-|\d+\.)\s")
51
+
52
+
39
53
  SupportedElements = Literal[
40
54
  "a",
41
55
  "abbr",
@@ -216,14 +230,15 @@ def _convert_blockquote(*, text: str, tag: Tag, convert_as_inline: bool, list_in
216
230
  return quote_text
217
231
 
218
232
 
219
- def _convert_br(*, convert_as_inline: bool, newline_style: str, tag: Tag) -> str:
233
+ def _convert_br(*, convert_as_inline: bool, newline_style: str, tag: Tag, text: str) -> str:
220
234
  from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
221
235
 
222
236
  if _has_ancestor(tag, ["h1", "h2", "h3", "h4", "h5", "h6"]):
223
- return " "
237
+ return " " + text.strip()
224
238
 
225
239
  _ = convert_as_inline
226
- return "\\\n" if newline_style.lower() == BACKSLASH else " \n"
240
+ newline = "\\\n" if newline_style.lower() == BACKSLASH else " \n"
241
+ return newline + text.strip() if text.strip() else newline
227
242
 
228
243
 
229
244
  def _convert_hn(
@@ -270,52 +285,87 @@ def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: It
270
285
  return f"![{alt}]({src}{title_part})"
271
286
 
272
287
 
288
+ def _has_block_list_items(tag: Tag) -> bool:
289
+ return any(
290
+ any(child.name in BLOCK_ELEMENTS for child in li.children if hasattr(child, "name"))
291
+ for li in tag.find_all("li", recursive=False)
292
+ )
293
+
294
+
295
+ def _handle_nested_list_indentation(text: str, list_indent_str: str, parent: Tag) -> str:
296
+ prev_p = None
297
+ for child in parent.children:
298
+ if hasattr(child, "name"):
299
+ if child.name == "p":
300
+ prev_p = child
301
+ break
302
+
303
+ if prev_p:
304
+ lines = text.strip().split("\n")
305
+ indented_lines = [f"{list_indent_str}{line}" if line.strip() else "" for line in lines]
306
+ return "\n" + "\n".join(indented_lines) + "\n"
307
+ return "\n" + indent(text=text, level=1, indent_str=list_indent_str).rstrip()
308
+
309
+
310
+ def _handle_direct_nested_list_indentation(text: str, list_indent_str: str) -> str:
311
+ lines = text.strip().split("\n")
312
+ indented_lines = [f"{list_indent_str}{line}" if line.strip() else "" for line in lines]
313
+ result = "\n".join(indented_lines)
314
+ return result + "\n" if not result.endswith("\n") else result
315
+
316
+
317
+ def _add_list_item_spacing(text: str) -> str:
318
+ lines = text.split("\n")
319
+ items_with_blocks = set()
320
+
321
+ i = 0
322
+ while i < len(lines):
323
+ line = lines[i]
324
+ if line.strip() and _LIST_ITEM_PATTERN.match(line.lstrip()):
325
+ j = i + 1
326
+ has_continuation = False
327
+ while j < len(lines):
328
+ next_line = lines[j]
329
+ if next_line.strip() and _LIST_ITEM_PATTERN.match(next_line.lstrip()):
330
+ break
331
+ if next_line.strip() and next_line.startswith((" ", " ", "\t")):
332
+ has_continuation = True
333
+ j += 1
334
+
335
+ if has_continuation and j < len(lines):
336
+ items_with_blocks.add(j - 1)
337
+
338
+ i += 1
339
+
340
+ if items_with_blocks:
341
+ processed_lines = list(
342
+ chain.from_iterable([line, ""] if i in items_with_blocks else [line] for i, line in enumerate(lines))
343
+ )
344
+ return "\n".join(processed_lines)
345
+
346
+ return text
347
+
348
+
273
349
  def _convert_list(*, tag: Tag, text: str, list_indent_str: str) -> str:
274
350
  from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
275
351
 
276
- before_paragraph = False
277
- if tag.next_sibling and getattr(tag.next_sibling, "name", None) not in {"ul", "ol"}:
278
- before_paragraph = True
352
+ before_paragraph = tag.next_sibling and getattr(tag.next_sibling, "name", None) not in {"ul", "ol"}
279
353
 
280
- if _has_ancestor(tag, "li"):
281
- parent = tag.parent
282
- while parent and parent.name != "li":
283
- parent = parent.parent
354
+ has_block_items = _has_block_list_items(tag)
284
355
 
356
+ if _has_ancestor(tag, "li"):
357
+ parent = _find_list_item_ancestor(tag)
285
358
  if parent:
286
- prev_p = None
287
- for child in parent.children:
288
- if hasattr(child, "name"):
289
- if child == tag:
290
- break
291
- if child.name == "p":
292
- prev_p = child
293
-
294
- if prev_p:
295
- lines = text.strip().split("\n")
296
- indented_lines = []
297
- for line in lines:
298
- if line.strip():
299
- indented_lines.append(f"{list_indent_str}{line}")
300
- else:
301
- indented_lines.append("")
302
- return "\n" + "\n".join(indented_lines) + "\n"
303
- return "\n" + indent(text=text, level=1, indent_str=list_indent_str).rstrip()
359
+ return _handle_nested_list_indentation(text, list_indent_str, parent)
304
360
 
305
361
  if tag.parent and tag.parent.name in {"ul", "ol"}:
306
- lines = text.strip().split("\n")
307
- indented_lines = []
308
- for line in lines:
309
- if line.strip():
310
- indented_lines.append(f"{list_indent_str}{line}")
311
- else:
312
- indented_lines.append("")
313
- result = "\n".join(indented_lines)
314
- if not result.endswith("\n"):
315
- result += "\n"
316
- return result
362
+ return _handle_direct_nested_list_indentation(text, list_indent_str)
317
363
 
318
- return text + ("\n" if before_paragraph else "")
364
+ if has_block_items:
365
+ text = _add_list_item_spacing(text)
366
+
367
+ trailing_newlines = "\n\n" if has_block_items else ("\n" if before_paragraph else "")
368
+ return text + trailing_newlines
319
369
 
320
370
 
321
371
  def _convert_li(*, tag: Tag, text: str, bullets: str, list_indent_str: str) -> str:
@@ -324,10 +374,8 @@ def _convert_li(*, tag: Tag, text: str, bullets: str, list_indent_str: str) -> s
324
374
  checked = checkbox.get("checked") is not None
325
375
  checkbox_symbol = "[x]" if checked else "[ ]"
326
376
 
327
- checkbox_text = text
328
- if checkbox.string:
329
- checkbox_text = text.replace(str(checkbox.string), "").strip()
330
- return f"- {checkbox_symbol} {checkbox_text.strip()}\n"
377
+ checkbox_text = text.strip()
378
+ return f"- {checkbox_symbol} {checkbox_text}\n"
331
379
 
332
380
  parent = tag.parent
333
381
  if parent is not None and parent.name == "ol":
@@ -349,11 +397,7 @@ def _convert_li(*, tag: Tag, text: str, bullets: str, list_indent_str: str) -> s
349
397
 
350
398
  bullet = bullets[depth % len(bullets)]
351
399
 
352
- has_block_children = any(
353
- child.name in {"p", "blockquote", "pre", "ul", "ol", "div", "h1", "h2", "h3", "h4", "h5", "h6"}
354
- for child in tag.children
355
- if hasattr(child, "name")
356
- )
400
+ has_block_children = "\n\n" in text
357
401
 
358
402
  if has_block_children:
359
403
  paragraphs = text.strip().split("\n\n")
@@ -390,20 +434,13 @@ def _convert_p(
390
434
  from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
391
435
 
392
436
  if _has_ancestor(tag, "li"):
393
- parent = tag.parent
394
- while parent and parent.name != "li":
395
- parent = parent.parent
437
+ parent = _find_list_item_ancestor(tag)
396
438
 
397
439
  if parent:
398
440
  p_children = [child for child in parent.children if hasattr(child, "name") and child.name == "p"]
399
441
 
400
442
  if p_children and tag != p_children[0]:
401
- indented_lines = []
402
- for line in text.split("\n"):
403
- if line.strip():
404
- indented_lines.append(f"{list_indent_str}{line}")
405
- else:
406
- indented_lines.append("")
443
+ indented_lines = [f"{list_indent_str}{line}" if line.strip() else "" for line in text.split("\n")]
407
444
  text = "\n".join(indented_lines)
408
445
 
409
446
  return f"{text}\n\n" if text else ""
@@ -440,66 +477,65 @@ def _convert_pre(
440
477
  return f"\n```{code_language}\n{text}\n```\n"
441
478
 
442
479
 
443
- def _convert_td(*, tag: Tag, text: str) -> str:
480
+ def _process_table_cell_content(*, tag: Tag, text: str, br_in_tables: bool) -> str:
481
+ if br_in_tables:
482
+ block_children = [child for child in tag.children if hasattr(child, "name") and child.name in BLOCK_ELEMENTS]
483
+
484
+ if len(block_children) > 1:
485
+ child_contents = []
486
+ for child in block_children:
487
+ child_text = child.get_text().strip()
488
+ if child_text:
489
+ child_contents.append(child_text)
490
+ return "<br>".join(child_contents)
491
+ return text.strip().replace("\n", "<br>")
492
+ return text.strip().replace("\n", " ")
493
+
494
+
495
+ def _convert_td(*, tag: Tag, text: str, br_in_tables: bool = False) -> str:
444
496
  colspan = _get_colspan(tag)
445
- return " " + text.strip().replace("\n", " ") + " |" * colspan
497
+ processed_text = _process_table_cell_content(tag=tag, text=text, br_in_tables=br_in_tables)
498
+ return " " + processed_text + " |" * colspan
446
499
 
447
500
 
448
- def _convert_th(*, tag: Tag, text: str) -> str:
501
+ def _convert_th(*, tag: Tag, text: str, br_in_tables: bool = False) -> str:
449
502
  colspan = _get_colspan(tag)
450
- return " " + text.strip().replace("\n", " ") + " |" * colspan
503
+ processed_text = _process_table_cell_content(tag=tag, text=text, br_in_tables=br_in_tables)
504
+ return " " + processed_text + " |" * colspan
451
505
 
452
506
 
453
- def _convert_tr(*, tag: Tag, text: str) -> str:
454
- cells = tag.find_all(["td", "th"])
455
- parent_name = tag.parent.name if tag.parent and hasattr(tag.parent, "name") else ""
456
- tag_grand_parent = tag.parent.parent if tag.parent else None
507
+ def _get_rowspan_positions(prev_cells: list[Tag]) -> tuple[list[int], int]:
508
+ rowspan_positions = []
509
+ col_pos = 0
457
510
 
458
- if tag.previous_sibling and hasattr(tag.previous_sibling, "name") and tag.previous_sibling.name == "tr":
459
- prev_cells = cast("Tag", tag.previous_sibling).find_all(["td", "th"])
460
- rowspan_positions = []
461
- col_pos = 0
462
-
463
- for prev_cell in prev_cells:
464
- rowspan = 1
465
- if (
466
- "rowspan" in prev_cell.attrs
467
- and isinstance(prev_cell["rowspan"], str)
468
- and prev_cell["rowspan"].isdigit()
469
- ):
470
- rowspan = int(prev_cell["rowspan"])
471
-
472
- if rowspan > 1:
473
- rowspan_positions.append(col_pos)
474
-
475
- colspan = 1
476
- if (
477
- "colspan" in prev_cell.attrs
478
- and isinstance(prev_cell["colspan"], str)
479
- and prev_cell["colspan"].isdigit()
480
- ):
481
- colspan = int(prev_cell["colspan"])
482
- col_pos += colspan
511
+ for prev_cell in prev_cells:
512
+ rowspan = 1
513
+ if "rowspan" in prev_cell.attrs and isinstance(prev_cell["rowspan"], str) and prev_cell["rowspan"].isdigit():
514
+ rowspan = int(prev_cell["rowspan"])
515
+
516
+ if rowspan > 1:
517
+ rowspan_positions.append(col_pos)
518
+
519
+ colspan = 1
520
+ if "colspan" in prev_cell.attrs and isinstance(prev_cell["colspan"], str) and prev_cell["colspan"].isdigit():
521
+ colspan = int(prev_cell["colspan"])
522
+ col_pos += colspan
523
+
524
+ return rowspan_positions, col_pos
483
525
 
484
- if rowspan_positions:
485
- converted_cells: list[str] = []
486
- if text.strip():
487
- parts = text.split("|")
488
- converted_cells.extend(part.rstrip() + " |" for part in parts[:-1] if part)
489
526
 
490
- new_cells: list[str] = []
491
- cell_index = 0
527
+ def _handle_rowspan_text(text: str, rowspan_positions: list[int], col_pos: int) -> str:
528
+ converted_cells = [part.rstrip() + " |" for part in text.split("|")[:-1] if part] if text.strip() else []
529
+ rowspan_set = set(rowspan_positions)
492
530
 
493
- for pos in range(col_pos):
494
- if pos in rowspan_positions:
495
- new_cells.append(" |")
496
- elif cell_index < len(converted_cells):
497
- new_cells.append(converted_cells[cell_index])
498
- cell_index += 1
531
+ cell_iter = iter(converted_cells)
532
+ new_cells = [" |" if pos in rowspan_set else next(cell_iter, "") for pos in range(col_pos)]
499
533
 
500
- text = "".join(new_cells)
534
+ return "".join(new_cells)
501
535
 
502
- is_headrow = (
536
+
537
+ def _is_header_row(tag: Tag, cells: list[Tag], parent_name: str, tag_grand_parent: Tag | None) -> bool:
538
+ return (
503
539
  all(hasattr(cell, "name") and cell.name == "th" for cell in cells)
504
540
  or (not tag.previous_sibling and parent_name != "tbody")
505
541
  or (
@@ -508,25 +544,47 @@ def _convert_tr(*, tag: Tag, text: str) -> str:
508
544
  and (not tag_grand_parent or len(tag_grand_parent.find_all(["thead"])) < 1)
509
545
  )
510
546
  )
547
+
548
+
549
+ def _calculate_total_colspan(cells: list[Tag]) -> int:
550
+ full_colspan = 0
551
+ for cell in cells:
552
+ if hasattr(cell, "attrs") and "colspan" in cell.attrs:
553
+ colspan_value = cell.attrs["colspan"]
554
+ if isinstance(colspan_value, str) and colspan_value.isdigit():
555
+ full_colspan += int(colspan_value)
556
+ else:
557
+ full_colspan += 1
558
+ else:
559
+ full_colspan += 1
560
+ return full_colspan
561
+
562
+
563
+ def _convert_tr(*, tag: Tag, text: str) -> str:
564
+ cells = tag.find_all(["td", "th"])
565
+ parent_name = tag.parent.name if tag.parent and hasattr(tag.parent, "name") else ""
566
+ tag_grand_parent = tag.parent.parent if tag.parent else None
567
+
568
+ if tag.previous_sibling and hasattr(tag.previous_sibling, "name") and tag.previous_sibling.name == "tr":
569
+ prev_cells = cast("Tag", tag.previous_sibling).find_all(["td", "th"])
570
+ rowspan_positions, col_pos = _get_rowspan_positions(prev_cells)
571
+
572
+ if rowspan_positions:
573
+ text = _handle_rowspan_text(text, rowspan_positions, col_pos)
574
+
575
+ is_headrow = _is_header_row(tag, cells, parent_name, tag_grand_parent)
511
576
  overline = ""
512
577
  underline = ""
578
+
513
579
  if is_headrow and not tag.previous_sibling:
514
- full_colspan = 0
515
- for cell in cells:
516
- if hasattr(cell, "attrs") and "colspan" in cell.attrs:
517
- colspan_value = cell.attrs["colspan"]
518
- if isinstance(colspan_value, str) and colspan_value.isdigit():
519
- full_colspan += int(colspan_value)
520
- else:
521
- full_colspan += 1
522
- else:
523
- full_colspan += 1
580
+ full_colspan = _calculate_total_colspan(cells)
524
581
  underline += "| " + " | ".join(["---"] * full_colspan) + " |" + "\n"
525
582
  elif not tag.previous_sibling and (
526
583
  parent_name == "table" or (parent_name == "tbody" and not cast("Tag", tag.parent).previous_sibling)
527
584
  ):
528
- overline += "| " + " | ".join([""] * len(cells)) + " |" + "\n"
529
- overline += "| " + " | ".join(["---"] * len(cells)) + " |" + "\n"
585
+ overline += "| " + " | ".join([""] * len(cells)) + " |" + "\n" # pragma: no cover
586
+ overline += "| " + " | ".join(["---"] * len(cells)) + " |" + "\n" # pragma: no cover
587
+
530
588
  return overline + "|" + text + "\n" + underline
531
589
 
532
590
 
@@ -578,10 +636,23 @@ def _convert_semantic_block(*, text: str, convert_as_inline: bool) -> str:
578
636
  return f"{text}\n\n" if text.strip() else ""
579
637
 
580
638
 
581
- def _convert_div(*, text: str, convert_as_inline: bool) -> str:
639
+ def _convert_div(*, text: str, convert_as_inline: bool, tag: Tag, list_indent_str: str) -> str:
582
640
  if convert_as_inline:
583
641
  return text
584
642
 
643
+ from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
644
+
645
+ if _has_ancestor(tag, "li"):
646
+ parent = _find_list_item_ancestor(tag)
647
+ if parent:
648
+ div_children = [child for child in parent.children if hasattr(child, "name") and child.name == "div"]
649
+
650
+ if div_children and tag != div_children[0]:
651
+ indented_lines = [f"{list_indent_str}{line}" if line.strip() else "" for line in text.split("\n")]
652
+ indented_text = "\n".join(indented_lines)
653
+
654
+ return f"{indented_text}\n\n" if indented_text.strip() else ""
655
+
585
656
  return _format_block_element(text)
586
657
 
587
658
 
@@ -603,7 +674,7 @@ def _convert_dl(*, text: str, convert_as_inline: bool) -> str:
603
674
  if convert_as_inline:
604
675
  return text
605
676
 
606
- return f"{text}\n" if text.strip() else ""
677
+ return _format_block_element(text)
607
678
 
608
679
 
609
680
  def _convert_dt(*, text: str, convert_as_inline: bool) -> str:
@@ -616,14 +687,21 @@ def _convert_dt(*, text: str, convert_as_inline: bool) -> str:
616
687
  return f"{text.strip()}\n"
617
688
 
618
689
 
619
- def _convert_dd(*, text: str, convert_as_inline: bool) -> str:
690
+ def _convert_dd(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
620
691
  if convert_as_inline:
621
692
  return text
622
693
 
623
- if not text.strip():
624
- return ""
694
+ has_dt_sibling = False
695
+ current = tag.previous_sibling
696
+ while current:
697
+ if hasattr(current, "name") and current.name and current.name == "dt":
698
+ has_dt_sibling = True
699
+ break
700
+ current = current.previous_sibling
625
701
 
626
- return f": {text.strip()}\n\n"
702
+ if has_dt_sibling:
703
+ return f": {text.strip()}\n\n" if text.strip() else ": \n\n"
704
+ return f"{text.strip()}\n\n" if text.strip() else ""
627
705
 
628
706
 
629
707
  def _convert_cite(*, text: str, convert_as_inline: bool) -> str:
@@ -648,9 +726,7 @@ def _convert_q(*, text: str, convert_as_inline: bool) -> str:
648
726
 
649
727
 
650
728
  def _convert_media_element(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
651
- src = tag.get("src", "")
652
-
653
- if not src and (source_tag := tag.find("source")) and isinstance(source_tag, Tag):
729
+ if not (src := tag.get("src", "")) and (source_tag := tag.find("source")) and isinstance(source_tag, Tag):
654
730
  src = source_tag.get("src", "")
655
731
 
656
732
  if src and isinstance(src, str) and src.strip():
@@ -670,9 +746,8 @@ def _convert_media_element(*, tag: Tag, text: str, convert_as_inline: bool) -> s
670
746
 
671
747
  def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
672
748
  _ = text
673
- src = tag.get("src", "")
674
749
 
675
- if src and isinstance(src, str) and src.strip():
750
+ if (src := tag.get("src", "")) and isinstance(src, str) and src.strip():
676
751
  link = f"[{src}]({src})"
677
752
  if convert_as_inline:
678
753
  return link
@@ -939,7 +1014,7 @@ def _convert_figure(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
939
1014
  content = text.strip()
940
1015
  if content and not content.endswith("\n\n"):
941
1016
  if content.endswith("\n"):
942
- content += "\n"
1017
+ content += "\n" # pragma: no cover
943
1018
  else:
944
1019
  content += "\n\n"
945
1020
  return content
@@ -997,6 +1072,7 @@ def _convert_math(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
997
1072
 
998
1073
  def create_converters_map(
999
1074
  autolinks: bool,
1075
+ br_in_tables: bool,
1000
1076
  bullets: str,
1001
1077
  code_language: str,
1002
1078
  code_language_callback: Callable[[Tag], str] | None,
@@ -1029,6 +1105,8 @@ def create_converters_map(
1029
1105
  kwargs["convert_as_inline"] = convert_as_inline
1030
1106
  if "list_indent_str" in spec.kwonlyargs:
1031
1107
  kwargs["list_indent_str"] = list_indent_str
1108
+ if "br_in_tables" in spec.kwonlyargs:
1109
+ kwargs["br_in_tables"] = br_in_tables
1032
1110
  return func(**kwargs)
1033
1111
  return func(text)
1034
1112
 
html_to_markdown/exceptions.py CHANGED
@@ -37,3 +37,8 @@ class ConflictingOptionsError(HtmlToMarkdownError):
37
37
  self.option2 = option2
38
38
 
39
39
  super().__init__(f"Only one of '{option1}' and '{option2}' can be specified.")
40
+
41
+
42
+ class InvalidEncodingError(HtmlToMarkdownError):
43
+ def __init__(self, encoding: str) -> None:
44
+ super().__init__(f"The specified encoding ({encoding}) is not valid.")