html-to-markdown 1.11.0__py3-none-any.whl → 1.12.0__py3-none-any.whl

This diff shows the contents of the publicly released package versions as they appear in their public registry. It is provided for informational purposes only and reflects the changes between the two published versions.

Potentially problematic release.



html_to_markdown/cli.py CHANGED
@@ -1,5 +1,6 @@
 import sys
 from argparse import ArgumentParser, FileType
+from pathlib import Path

 from html_to_markdown.constants import (
     ASTERISK,
@@ -13,6 +14,7 @@ from html_to_markdown.constants import (
     WHITESPACE_NORMALIZED,
     WHITESPACE_STRICT,
 )
+from html_to_markdown.exceptions import InvalidEncodingError
 from html_to_markdown.processing import convert_to_markdown


@@ -131,6 +133,12 @@ def main(argv: list[str]) -> str:
         help="Parent tags where images remain inline (not converted to alt-text).",
     )

+    parser.add_argument(
+        "--br-in-tables",
+        action="store_true",
+        help="Use <br> tags for line breaks in table cells instead of spaces.",
+    )
+
     parser.add_argument("-w", "--wrap", action="store_true", help="Enable text wrapping at --wrap-width characters.")

     parser.add_argument(
@@ -235,10 +243,18 @@ def main(argv: list[str]) -> str:
         help="Keep navigation elements when preprocessing (normally removed).",
     )

+    parser.add_argument(
+        "--source-encoding",
+        type=str,
+        default=None,
+        help="Source file encoding (e.g. 'utf-8', 'latin-1'). Defaults to system default.",
+    )
+
     args = parser.parse_args(argv)

     base_args = {
         "autolinks": args.autolinks,
+        "br_in_tables": args.br_in_tables,
         "bullets": args.bullets,
         "code_language": args.code_language,
         "convert": args.convert,
@@ -278,7 +294,7 @@ def main(argv: list[str]) -> str:
     if args.show_progress:

         def progress_callback(processed: int, total: int) -> None:
-            if total > 0:
+            if total > 0:  # pragma: no cover
                 percent = (processed / total) * 100

                 sys.stderr.write(f"\rProgress: {percent:.1f}% ({processed}/{total} bytes)")
@@ -286,4 +302,14 @@ def main(argv: list[str]) -> str:

             base_args["progress_callback"] = progress_callback

-    return convert_to_markdown(args.html.read(), **base_args)
+    if args.source_encoding and args.html.name != "<stdin>":
+        args.html.close()
+        try:
+            with Path(args.html.name).open(encoding=args.source_encoding) as f:
+                html_content = f.read()
+        except LookupError as e:
+            raise InvalidEncodingError(args.source_encoding) from e
+    else:
+        html_content = args.html.read()
+
+    return convert_to_markdown(html_content, **base_args)
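The CLI changes above add two user-facing flags, `--br-in-tables` and `--source-encoding`, the latter backed by the new `InvalidEncodingError`. A minimal sketch of the resulting behaviour, assuming a local `page.html` file (the file name is illustrative; `main()` returns the converted Markdown, as the diff shows):

```python
from html_to_markdown.cli import main
from html_to_markdown.exceptions import InvalidEncodingError

try:
    # --source-encoding makes the CLI re-open the input file with the given
    # codec instead of the interpreter default; --br-in-tables is forwarded
    # to convert_to_markdown() via base_args.
    markdown = main(["page.html", "--source-encoding", "latin-1", "--br-in-tables"])
except InvalidEncodingError as exc:
    # Raised when Python does not recognise the encoding name (LookupError).
    print(f"Invalid encoding: {exc}")
```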
html_to_markdown/converters.py CHANGED
@@ -5,9 +5,11 @@ from typing import TYPE_CHECKING
 if TYPE_CHECKING:
     from collections.abc import Iterable
 import base64
+import re
 from collections.abc import Callable
 from functools import partial
 from inspect import getfullargspec
+from itertools import chain
 from textwrap import fill
 from typing import Any, Literal, TypeVar, cast

@@ -36,6 +38,19 @@ def _format_wrapped_block(text: str, start_marker: str, end_marker: str = "") -> str:
     return f"{start_marker}{text.strip()}{end_marker}\n\n" if text.strip() else ""


+def _find_list_item_ancestor(tag: Tag) -> Tag | None:
+    """Find the nearest list item ancestor of a tag."""
+    parent = tag.parent
+    while parent and parent.name != "li":
+        parent = parent.parent
+    return parent
+
+
+BLOCK_ELEMENTS = frozenset({"p", "blockquote", "pre", "ul", "ol", "div", "h1", "h2", "h3", "h4", "h5", "h6"})
+
+_LIST_ITEM_PATTERN = re.compile(r"^\s*(\*|\+|-|\d+\.)\s")
+
+
 SupportedElements = Literal[
     "a",
     "abbr",
@@ -270,52 +285,91 @@ def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: Iterable[str]) -> str:
     return f"![{alt}]({src}{title_part})"


+def _has_block_list_items(tag: Tag) -> bool:
+    """Check if any list items contain block elements."""
+    return any(
+        any(child.name in BLOCK_ELEMENTS for child in li.children if hasattr(child, "name"))
+        for li in tag.find_all("li", recursive=False)
+    )
+
+
+def _handle_nested_list_indentation(text: str, list_indent_str: str, parent: Tag) -> str:
+    """Handle indentation for lists nested within list items."""
+    prev_p = None
+    for child in parent.children:
+        if hasattr(child, "name"):
+            if child.name == "p":
+                prev_p = child
+            break
+
+    if prev_p:
+        lines = text.strip().split("\n")
+        indented_lines = [f"{list_indent_str}{line}" if line.strip() else "" for line in lines]
+        return "\n" + "\n".join(indented_lines) + "\n"
+    return "\n" + indent(text=text, level=1, indent_str=list_indent_str).rstrip()
+
+
+def _handle_direct_nested_list_indentation(text: str, list_indent_str: str) -> str:
+    """Handle indentation for lists that are direct children of other lists."""
+    lines = text.strip().split("\n")
+    indented_lines = [f"{list_indent_str}{line}" if line.strip() else "" for line in lines]
+    result = "\n".join(indented_lines)
+    return result + "\n" if not result.endswith("\n") else result
+
+
+def _add_list_item_spacing(text: str) -> str:
+    """Add extra spacing between list items that contain block content."""
+    lines = text.split("\n")
+    items_with_blocks = set()
+
+    i = 0
+    while i < len(lines):
+        line = lines[i]
+        if line.strip() and _LIST_ITEM_PATTERN.match(line.lstrip()):
+            j = i + 1
+            has_continuation = False
+            while j < len(lines):
+                next_line = lines[j]
+                if next_line.strip() and _LIST_ITEM_PATTERN.match(next_line.lstrip()):
+                    break
+                if next_line.strip() and next_line.startswith((" ", " ", "\t")):
+                    has_continuation = True
+                j += 1
+
+            if has_continuation and j < len(lines):
+                items_with_blocks.add(j - 1)
+
+        i += 1
+
+    if items_with_blocks:
+        processed_lines = list(
+            chain.from_iterable([line, ""] if i in items_with_blocks else [line] for i, line in enumerate(lines))
+        )
+        return "\n".join(processed_lines)
+
+    return text
+
+
 def _convert_list(*, tag: Tag, text: str, list_indent_str: str) -> str:
     from html_to_markdown.processing import _has_ancestor  # noqa: PLC0415

-    before_paragraph = False
-    if tag.next_sibling and getattr(tag.next_sibling, "name", None) not in {"ul", "ol"}:
-        before_paragraph = True
+    before_paragraph = tag.next_sibling and getattr(tag.next_sibling, "name", None) not in {"ul", "ol"}

-    if _has_ancestor(tag, "li"):
-        parent = tag.parent
-        while parent and parent.name != "li":
-            parent = parent.parent
+    has_block_items = _has_block_list_items(tag)

+    if _has_ancestor(tag, "li"):
+        parent = _find_list_item_ancestor(tag)
         if parent:
-            prev_p = None
-            for child in parent.children:
-                if hasattr(child, "name"):
-                    if child == tag:
-                        break
-                    if child.name == "p":
-                        prev_p = child
-
-            if prev_p:
-                lines = text.strip().split("\n")
-                indented_lines = []
-                for line in lines:
-                    if line.strip():
-                        indented_lines.append(f"{list_indent_str}{line}")
-                    else:
-                        indented_lines.append("")
-                return "\n" + "\n".join(indented_lines) + "\n"
-            return "\n" + indent(text=text, level=1, indent_str=list_indent_str).rstrip()
+            return _handle_nested_list_indentation(text, list_indent_str, parent)

     if tag.parent and tag.parent.name in {"ul", "ol"}:
-        lines = text.strip().split("\n")
-        indented_lines = []
-        for line in lines:
-            if line.strip():
-                indented_lines.append(f"{list_indent_str}{line}")
-            else:
-                indented_lines.append("")
-        result = "\n".join(indented_lines)
-        if not result.endswith("\n"):
-            result += "\n"
-        return result
+        return _handle_direct_nested_list_indentation(text, list_indent_str)

-    return text + ("\n" if before_paragraph else "")
+    if has_block_items:
+        text = _add_list_item_spacing(text)
+
+    trailing_newlines = "\n\n" if has_block_items else ("\n" if before_paragraph else "")
+    return text + trailing_newlines


 def _convert_li(*, tag: Tag, text: str, bullets: str, list_indent_str: str) -> str:
@@ -324,10 +378,8 @@ def _convert_li(*, tag: Tag, text: str, bullets: str, list_indent_str: str) -> str:
         checked = checkbox.get("checked") is not None
         checkbox_symbol = "[x]" if checked else "[ ]"

-        checkbox_text = text
-        if checkbox.string:
-            checkbox_text = text.replace(str(checkbox.string), "").strip()
-        return f"- {checkbox_symbol} {checkbox_text.strip()}\n"
+        checkbox_text = text.strip()
+        return f"- {checkbox_symbol} {checkbox_text}\n"

     parent = tag.parent
     if parent is not None and parent.name == "ol":
@@ -349,11 +401,7 @@ def _convert_li(*, tag: Tag, text: str, bullets: str, list_indent_str: str) -> str:

     bullet = bullets[depth % len(bullets)]

-    has_block_children = any(
-        child.name in {"p", "blockquote", "pre", "ul", "ol", "div", "h1", "h2", "h3", "h4", "h5", "h6"}
-        for child in tag.children
-        if hasattr(child, "name")
-    )
+    has_block_children = "\n\n" in text

     if has_block_children:
         paragraphs = text.strip().split("\n\n")
@@ -390,20 +438,13 @@ def _convert_p(
     from html_to_markdown.processing import _has_ancestor  # noqa: PLC0415

     if _has_ancestor(tag, "li"):
-        parent = tag.parent
-        while parent and parent.name != "li":
-            parent = parent.parent
+        parent = _find_list_item_ancestor(tag)

         if parent:
             p_children = [child for child in parent.children if hasattr(child, "name") and child.name == "p"]

             if p_children and tag != p_children[0]:
-                indented_lines = []
-                for line in text.split("\n"):
-                    if line.strip():
-                        indented_lines.append(f"{list_indent_str}{line}")
-                    else:
-                        indented_lines.append("")
+                indented_lines = [f"{list_indent_str}{line}" if line.strip() else "" for line in text.split("\n")]
                 text = "\n".join(indented_lines)

     return f"{text}\n\n" if text else ""
@@ -440,66 +481,69 @@ def _convert_pre(
     return f"\n```{code_language}\n{text}\n```\n"


-def _convert_td(*, tag: Tag, text: str) -> str:
+def _process_table_cell_content(*, tag: Tag, text: str, br_in_tables: bool) -> str:
+    """Process table cell content, optionally using <br> tags for multi-line content."""
+    if br_in_tables:
+        block_children = [child for child in tag.children if hasattr(child, "name") and child.name in BLOCK_ELEMENTS]
+
+        if len(block_children) > 1:
+            child_contents = []
+            for child in block_children:
+                child_text = child.get_text().strip()
+                if child_text:
+                    child_contents.append(child_text)
+            return "<br>".join(child_contents)
+        return text.strip().replace("\n", "<br>")
+    return text.strip().replace("\n", " ")
+
+
+def _convert_td(*, tag: Tag, text: str, br_in_tables: bool = False) -> str:
     colspan = _get_colspan(tag)
-    return " " + text.strip().replace("\n", " ") + " |" * colspan
+    processed_text = _process_table_cell_content(tag=tag, text=text, br_in_tables=br_in_tables)
+    return " " + processed_text + " |" * colspan


-def _convert_th(*, tag: Tag, text: str) -> str:
+def _convert_th(*, tag: Tag, text: str, br_in_tables: bool = False) -> str:
     colspan = _get_colspan(tag)
-    return " " + text.strip().replace("\n", " ") + " |" * colspan
+    processed_text = _process_table_cell_content(tag=tag, text=text, br_in_tables=br_in_tables)
+    return " " + processed_text + " |" * colspan


-def _convert_tr(*, tag: Tag, text: str) -> str:
-    cells = tag.find_all(["td", "th"])
-    parent_name = tag.parent.name if tag.parent and hasattr(tag.parent, "name") else ""
-    tag_grand_parent = tag.parent.parent if tag.parent else None
+def _get_rowspan_positions(prev_cells: list[Tag]) -> tuple[list[int], int]:
+    """Get positions of cells with rowspan > 1 from previous row."""
+    rowspan_positions = []
+    col_pos = 0

-    if tag.previous_sibling and hasattr(tag.previous_sibling, "name") and tag.previous_sibling.name == "tr":
-        prev_cells = cast("Tag", tag.previous_sibling).find_all(["td", "th"])
-        rowspan_positions = []
-        col_pos = 0
-
-        for prev_cell in prev_cells:
-            rowspan = 1
-            if (
-                "rowspan" in prev_cell.attrs
-                and isinstance(prev_cell["rowspan"], str)
-                and prev_cell["rowspan"].isdigit()
-            ):
-                rowspan = int(prev_cell["rowspan"])
-
-            if rowspan > 1:
-                rowspan_positions.append(col_pos)
-
-            colspan = 1
-            if (
-                "colspan" in prev_cell.attrs
-                and isinstance(prev_cell["colspan"], str)
-                and prev_cell["colspan"].isdigit()
-            ):
-                colspan = int(prev_cell["colspan"])
-            col_pos += colspan
+    for prev_cell in prev_cells:
+        rowspan = 1
+        if "rowspan" in prev_cell.attrs and isinstance(prev_cell["rowspan"], str) and prev_cell["rowspan"].isdigit():
+            rowspan = int(prev_cell["rowspan"])
+
+        if rowspan > 1:
+            rowspan_positions.append(col_pos)
+
+        colspan = 1
+        if "colspan" in prev_cell.attrs and isinstance(prev_cell["colspan"], str) and prev_cell["colspan"].isdigit():
+            colspan = int(prev_cell["colspan"])
+        col_pos += colspan
+
+    return rowspan_positions, col_pos

-        if rowspan_positions:
-            converted_cells: list[str] = []
-            if text.strip():
-                parts = text.split("|")
-                converted_cells.extend(part.rstrip() + " |" for part in parts[:-1] if part)

-            new_cells: list[str] = []
-            cell_index = 0
+def _handle_rowspan_text(text: str, rowspan_positions: list[int], col_pos: int) -> str:
+    """Handle text adjustment for rows with rowspan cells."""
+    converted_cells = [part.rstrip() + " |" for part in text.split("|")[:-1] if part] if text.strip() else []
+    rowspan_set = set(rowspan_positions)

-            for pos in range(col_pos):
-                if pos in rowspan_positions:
-                    new_cells.append(" |")
-                elif cell_index < len(converted_cells):
-                    new_cells.append(converted_cells[cell_index])
-                    cell_index += 1
+    cell_iter = iter(converted_cells)
+    new_cells = [" |" if pos in rowspan_set else next(cell_iter, "") for pos in range(col_pos)]

-            text = "".join(new_cells)
+    return "".join(new_cells)

-    is_headrow = (
+
+def _is_header_row(tag: Tag, cells: list[Tag], parent_name: str, tag_grand_parent: Tag | None) -> bool:
+    """Determine if this table row should be treated as a header row."""
+    return (
         all(hasattr(cell, "name") and cell.name == "th" for cell in cells)
         or (not tag.previous_sibling and parent_name != "tbody")
         or (
@@ -508,25 +552,48 @@ def _convert_tr(*, tag: Tag, text: str) -> str:
             and (not tag_grand_parent or len(tag_grand_parent.find_all(["thead"])) < 1)
         )
     )
+
+
+def _calculate_total_colspan(cells: list[Tag]) -> int:
+    """Calculate total colspan for all cells in a row."""
+    full_colspan = 0
+    for cell in cells:
+        if hasattr(cell, "attrs") and "colspan" in cell.attrs:
+            colspan_value = cell.attrs["colspan"]
+            if isinstance(colspan_value, str) and colspan_value.isdigit():
+                full_colspan += int(colspan_value)
+            else:
+                full_colspan += 1
+        else:
+            full_colspan += 1
+    return full_colspan
+
+
+def _convert_tr(*, tag: Tag, text: str) -> str:
+    cells = tag.find_all(["td", "th"])
+    parent_name = tag.parent.name if tag.parent and hasattr(tag.parent, "name") else ""
+    tag_grand_parent = tag.parent.parent if tag.parent else None
+
+    if tag.previous_sibling and hasattr(tag.previous_sibling, "name") and tag.previous_sibling.name == "tr":
+        prev_cells = cast("Tag", tag.previous_sibling).find_all(["td", "th"])
+        rowspan_positions, col_pos = _get_rowspan_positions(prev_cells)
+
+        if rowspan_positions:
+            text = _handle_rowspan_text(text, rowspan_positions, col_pos)
+
+    is_headrow = _is_header_row(tag, cells, parent_name, tag_grand_parent)
     overline = ""
     underline = ""
+
     if is_headrow and not tag.previous_sibling:
-        full_colspan = 0
-        for cell in cells:
-            if hasattr(cell, "attrs") and "colspan" in cell.attrs:
-                colspan_value = cell.attrs["colspan"]
-                if isinstance(colspan_value, str) and colspan_value.isdigit():
-                    full_colspan += int(colspan_value)
-                else:
-                    full_colspan += 1
-            else:
-                full_colspan += 1
+        full_colspan = _calculate_total_colspan(cells)
         underline += "| " + " | ".join(["---"] * full_colspan) + " |" + "\n"
     elif not tag.previous_sibling and (
         parent_name == "table" or (parent_name == "tbody" and not cast("Tag", tag.parent).previous_sibling)
     ):
-        overline += "| " + " | ".join([""] * len(cells)) + " |" + "\n"
-        overline += "| " + " | ".join(["---"] * len(cells)) + " |" + "\n"
+        overline += "| " + " | ".join([""] * len(cells)) + " |" + "\n"  # pragma: no cover
+        overline += "| " + " | ".join(["---"] * len(cells)) + " |" + "\n"  # pragma: no cover
+
     return overline + "|" + text + "\n" + underline


@@ -578,10 +645,23 @@ def _convert_semantic_block(*, text: str, convert_as_inline: bool) -> str:
     return f"{text}\n\n" if text.strip() else ""


-def _convert_div(*, text: str, convert_as_inline: bool) -> str:
+def _convert_div(*, text: str, convert_as_inline: bool, tag: Tag, list_indent_str: str) -> str:
     if convert_as_inline:
         return text

+    from html_to_markdown.processing import _has_ancestor  # noqa: PLC0415
+
+    if _has_ancestor(tag, "li"):
+        parent = _find_list_item_ancestor(tag)
+        if parent:
+            div_children = [child for child in parent.children if hasattr(child, "name") and child.name == "div"]
+
+            if div_children and tag != div_children[0]:
+                indented_lines = [f"{list_indent_str}{line}" if line.strip() else "" for line in text.split("\n")]
+                indented_text = "\n".join(indented_lines)
+
+                return f"{indented_text}\n\n" if indented_text.strip() else ""
+
     return _format_block_element(text)


@@ -603,7 +683,7 @@ def _convert_dl(*, text: str, convert_as_inline: bool) -> str:
     if convert_as_inline:
         return text

-    return f"{text}\n" if text.strip() else ""
+    return _format_block_element(text)


 def _convert_dt(*, text: str, convert_as_inline: bool) -> str:
@@ -616,14 +696,21 @@ def _convert_dt(*, text: str, convert_as_inline: bool) -> str:
     return f"{text.strip()}\n"


-def _convert_dd(*, text: str, convert_as_inline: bool) -> str:
+def _convert_dd(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
     if convert_as_inline:
         return text

-    if not text.strip():
-        return ""
+    has_dt_sibling = False
+    current = tag.previous_sibling
+    while current:
+        if hasattr(current, "name") and current.name and current.name == "dt":
+            has_dt_sibling = True
+            break
+        current = current.previous_sibling

-    return f": {text.strip()}\n\n"
+    if has_dt_sibling:
+        return f": {text.strip()}\n\n" if text.strip() else ": \n\n"
+    return f"{text.strip()}\n\n" if text.strip() else ""


 def _convert_cite(*, text: str, convert_as_inline: bool) -> str:
@@ -648,9 +735,7 @@ def _convert_q(*, text: str, convert_as_inline: bool) -> str:


 def _convert_media_element(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
-    src = tag.get("src", "")
-
-    if not src and (source_tag := tag.find("source")) and isinstance(source_tag, Tag):
+    if not (src := tag.get("src", "")) and (source_tag := tag.find("source")) and isinstance(source_tag, Tag):
         src = source_tag.get("src", "")

     if src and isinstance(src, str) and src.strip():
@@ -670,9 +755,8 @@ def _convert_media_element(*, tag: Tag, text: str, convert_as_inline: bool) -> str:


 def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
     _ = text
-    src = tag.get("src", "")

-    if src and isinstance(src, str) and src.strip():
+    if (src := tag.get("src", "")) and isinstance(src, str) and src.strip():
         link = f"[{src}]({src})"
         if convert_as_inline:
             return link
@@ -939,7 +1023,7 @@ def _convert_figure(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
     content = text.strip()
     if content and not content.endswith("\n\n"):
         if content.endswith("\n"):
-            content += "\n"
+            content += "\n"  # pragma: no cover
         else:
             content += "\n\n"
     return content
@@ -997,6 +1081,7 @@ def _convert_math(*, text: str, convert_as_inline: bool, tag: Tag) -> str:

 def create_converters_map(
     autolinks: bool,
+    br_in_tables: bool,
     bullets: str,
     code_language: str,
     code_language_callback: Callable[[Tag], str] | None,
@@ -1029,6 +1114,8 @@ def create_converters_map(
                 kwargs["convert_as_inline"] = convert_as_inline
             if "list_indent_str" in spec.kwonlyargs:
                 kwargs["list_indent_str"] = list_indent_str
+            if "br_in_tables" in spec.kwonlyargs:
+                kwargs["br_in_tables"] = br_in_tables
             return func(**kwargs)
         return func(text)

html_to_markdown/exceptions.py CHANGED
@@ -37,3 +37,8 @@ class ConflictingOptionsError(HtmlToMarkdownError):
         self.option2 = option2

         super().__init__(f"Only one of '{option1}' and '{option2}' can be specified.")
+
+
+class InvalidEncodingError(HtmlToMarkdownError):
+    def __init__(self, encoding: str) -> None:
+        super().__init__(f"The specified encoding ({encoding}) is not valid.")
html_to_markdown/preprocessor.py CHANGED
@@ -5,6 +5,98 @@ from typing import Any

 import nh3

+BASE_ALLOWED_TAGS = frozenset(
+    {
+        "p",
+        "div",
+        "span",
+        "br",
+        "hr",
+        "h1",
+        "h2",
+        "h3",
+        "h4",
+        "h5",
+        "h6",
+        "ul",
+        "ol",
+        "li",
+        "dl",
+        "dt",
+        "dd",
+        "strong",
+        "b",
+        "em",
+        "i",
+        "u",
+        "s",
+        "del",
+        "ins",
+        "mark",
+        "small",
+        "sub",
+        "sup",
+        "code",
+        "pre",
+        "kbd",
+        "samp",
+        "var",
+        "abbr",
+        "cite",
+        "dfn",
+        "time",
+        "data",
+        "a",
+        "blockquote",
+        "q",
+    }
+)
+
+SEMANTIC_STRUCTURE_TAGS = frozenset(
+    {
+        "article",
+        "section",
+        "aside",
+        "header",
+        "footer",
+        "main",
+        "nav",
+        "figure",
+        "figcaption",
+        "details",
+        "summary",
+    }
+)
+
+TABLE_TAGS = frozenset(
+    {
+        "table",
+        "thead",
+        "tbody",
+        "tfoot",
+        "tr",
+        "td",
+        "th",
+        "caption",
+        "colgroup",
+        "col",
+    }
+)
+
+MEDIA_TAGS = frozenset(
+    {
+        "img",
+        "picture",
+        "source",
+        "audio",
+        "video",
+        "track",
+        "canvas",
+        "svg",
+        "iframe",
+    }
+)
+

 def preprocess_html(
     html: str,
@@ -63,98 +155,16 @@ def _configure_cleaning_rules(
     custom_tags_to_remove: set[str],
     custom_attributes_to_remove: set[str],
 ) -> dict[str, Any]:
-    allowed_tags = {
-        "p",
-        "div",
-        "span",
-        "br",
-        "hr",
-        "h1",
-        "h2",
-        "h3",
-        "h4",
-        "h5",
-        "h6",
-        "ul",
-        "ol",
-        "li",
-        "dl",
-        "dt",
-        "dd",
-        "strong",
-        "b",
-        "em",
-        "i",
-        "u",
-        "s",
-        "del",
-        "ins",
-        "mark",
-        "small",
-        "sub",
-        "sup",
-        "code",
-        "pre",
-        "kbd",
-        "samp",
-        "var",
-        "abbr",
-        "cite",
-        "dfn",
-        "time",
-        "data",
-        "a",
-        "blockquote",
-        "q",
-    }
+    allowed_tags = set(BASE_ALLOWED_TAGS)

     if preserve_semantic_structure:
-        allowed_tags.update(
-            {
-                "article",
-                "section",
-                "aside",
-                "header",
-                "footer",
-                "main",
-                "nav",
-                "figure",
-                "figcaption",
-                "details",
-                "summary",
-            }
-        )
+        allowed_tags.update(SEMANTIC_STRUCTURE_TAGS)

     if preserve_tables:
-        allowed_tags.update(
-            {
-                "table",
-                "thead",
-                "tbody",
-                "tfoot",
-                "tr",
-                "th",
-                "td",
-                "caption",
-                "col",
-                "colgroup",
-            }
-        )
+        allowed_tags.update(TABLE_TAGS)

     if preserve_media:
-        allowed_tags.update(
-            {
-                "img",
-                "picture",
-                "source",
-                "audio",
-                "video",
-                "track",
-                "canvas",
-                "svg",
-                "iframe",
-            }
-        )
+        allowed_tags.update(MEDIA_TAGS)

     allowed_tags -= custom_tags_to_remove
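The preset tag groups are now module-level frozensets that `_configure_cleaning_rules` combines according to the `preserve_*` flags. A small sketch of the preprocessing entry point that exercises this path, mirroring the call shape documented in the README later in this diff (the input string is illustrative):

```python
from html_to_markdown import preprocess_html

raw_html = "<nav>menu</nav><article><table><tr><td>kept</td></tr></table></article>"

# preserve_tables keeps TABLE_TAGS in the allow-list passed to nh3;
# preserve_semantic_structure adds article/section/etc., preserve_media the media tags.
cleaned = preprocess_html(raw_html, preserve_tables=True, preserve_semantic_structure=True)
print(cleaned)
```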
 
html_to_markdown/processing.py CHANGED
@@ -17,7 +17,7 @@ from bs4.element import NavigableString, PageElement
 try:
     from html_to_markdown.preprocessor import create_preprocessor
     from html_to_markdown.preprocessor import preprocess_html as preprocess_fn
-except ImportError:
+except ImportError:  # pragma: no cover
     create_preprocessor = None  # type: ignore[assignment]
     preprocess_fn = None  # type: ignore[assignment]

@@ -25,7 +25,7 @@ try:
     import importlib.util

     LXML_AVAILABLE = importlib.util.find_spec("lxml") is not None
-except ImportError:
+except ImportError:  # pragma: no cover
     LXML_AVAILABLE = False

 from html_to_markdown.constants import (
@@ -322,7 +322,7 @@ _ancestor_cache: ContextVar[dict[int, set[str]] | None] = ContextVar("ancestor_c
 def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
     elem_id = id(element)
     cache = _ancestor_cache.get()
-    if cache is None:
+    if cache is None:  # pragma: no cover
         cache = {}
         _ancestor_cache.set(cache)

@@ -338,7 +338,7 @@ def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
        ancestor_names.add(current.name)

        parent_id = id(current)
-       if parent_id in cache:
+       if parent_id in cache:  # pragma: no cover
            ancestor_names.update(cache[parent_id])
            break

@@ -386,36 +386,35 @@ def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
         metadata["base-href"] = base_tag["href"]

     for meta in soup.find_all("meta"):
-        if meta.get("name") and meta.get("content") is not None:
-            name = meta["name"]
-            content = meta["content"]
+        if (name := meta.get("name")) and (content := meta.get("content")) is not None:
            if isinstance(name, str) and isinstance(content, str):
-                key = f"meta-{name.lower()}"
-                metadata[key] = content
+                metadata[f"meta-{name.lower()}"] = content

-        elif meta.get("property") and meta.get("content") is not None:
-            prop = meta["property"]
-            content = meta["content"]
+        elif (prop := meta.get("property")) and (content := meta.get("content")) is not None:
            if isinstance(prop, str) and isinstance(content, str):
-                key = f"meta-{prop.lower().replace(':', '-')}"
-                metadata[key] = content
+                metadata[f"meta-{prop.lower().replace(':', '-')}"] = content

-        elif meta.get("http-equiv") and meta.get("content") is not None:
-            equiv = meta["http-equiv"]
-            content = meta["content"]
-            if isinstance(equiv, str) and isinstance(content, str):
-                key = f"meta-{equiv.lower()}"
-                metadata[key] = content
+        elif (
+            (equiv := meta.get("http-equiv"))
+            and (content := meta.get("content")) is not None
+            and isinstance(equiv, str)
+            and isinstance(content, str)
+        ):
+            metadata[f"meta-{equiv.lower()}"] = content

     canonical = soup.find("link", rel="canonical", href=True)
     if canonical and isinstance(canonical, Tag) and isinstance(canonical["href"], str):
         metadata["canonical"] = canonical["href"]

     link_relations = {"author", "license", "alternate"}
-    for rel_type in link_relations:
-        link = soup.find("link", rel=rel_type, href=True)
-        if link and isinstance(link, Tag) and isinstance(link["href"], str):
-            metadata[f"link-{rel_type}"] = link["href"]
+    link_metadata = {
+        f"link-{rel_type}": link["href"]
+        for rel_type in link_relations
+        if (link := soup.find("link", rel=rel_type, href=True))
+        and isinstance(link, Tag)
+        and isinstance(link["href"], str)
+    }
+    metadata.update(link_metadata)

     return metadata

@@ -424,11 +423,7 @@ def _format_metadata_comment(metadata: dict[str, str]) -> str:
     if not metadata:
         return ""

-    lines = ["<!--"]
-    for key, value in sorted(metadata.items()):
-        safe_value = value.replace("-->", "--&gt;")
-        lines.append(f"{key}: {safe_value}")
-    lines.append("-->")
+    lines = ["<!--", *[f"{key}: {value.replace('-->', '--&gt;')}" for key, value in sorted(metadata.items())], "-->"]

     return "\n".join(lines) + "\n\n"

@@ -442,6 +437,7 @@ def convert_to_markdown(
     progress_callback: Callable[[int, int], None] | None = None,
     parser: str | None = None,
     autolinks: bool = True,
+    br_in_tables: bool = False,
     bullets: str = "*+-",
     code_language: str = "",
     code_language_callback: Callable[[Any], str] | None = None,
@@ -485,6 +481,7 @@ def convert_to_markdown(
         progress_callback: Callback for progress updates (current, total).
         parser: HTML parser to use ('html.parser', 'lxml', 'html5lib').
         autolinks: Convert URLs to automatic links.
+        br_in_tables: Use <br> tags for line breaks in table cells instead of spaces.
         bullets: Characters to use for unordered list bullets.
         code_language: Default language for code blocks.
         code_language_callback: Callback to determine code language from element.
@@ -644,7 +641,7 @@ def convert_to_markdown(
         result = re.sub(r"\n{3,}", "\n\n", result)

         if convert_as_inline:
-            result = result.rstrip("\n")
+            result = result.rstrip("\n")  # pragma: no cover

         return result

@@ -658,6 +655,7 @@ def convert_to_markdown(
         whitespace_handler=whitespace_handler,
         parser=parser,
         autolinks=autolinks,
+        br_in_tables=br_in_tables,
         bullets=bullets,
         code_language=code_language,
         code_language_callback=code_language_callback,
@@ -819,6 +817,7 @@ def _process_html_core(
     whitespace_handler: WhitespaceHandler,
     parser: str | None = None,
     autolinks: bool,
+    br_in_tables: bool,
     bullets: str,
     code_language: str,
     code_language_callback: Callable[[Any], str] | None,
@@ -849,24 +848,25 @@ def _process_html_core(
     try:
         if isinstance(source, str):
             if strip_newlines:
-                source = source.replace("\n", " ").replace("\r", " ")
+                source = source.replace("\n", " ").replace("\r", " ")  # pragma: no cover

             if "".join(source.split("\n")):
                 if parser is None:
                     parser = "lxml" if LXML_AVAILABLE else "html.parser"

-                if parser == "lxml" and not LXML_AVAILABLE:
+                if parser == "lxml" and not LXML_AVAILABLE:  # pragma: no cover
                     raise MissingDependencyError("lxml", "pip install html-to-markdown[lxml]")

                 source = BeautifulSoup(source, parser)
             else:
                 raise EmptyHtmlError

-        if strip is not None and convert is not None:
+        if strip is not None and convert is not None:  # pragma: no cover
             raise ConflictingOptionsError("strip", "convert")

         converters_map = create_converters_map(
             autolinks=autolinks,
+            br_in_tables=br_in_tables,
             bullets=bullets,
             code_language=code_language,
             code_language_callback=code_language_callback,
@@ -935,6 +935,7 @@ def convert_to_markdown_stream(
     progress_callback: Callable[[int, int], None] | None = None,
     parser: str | None = None,
     autolinks: bool = True,
+    br_in_tables: bool = False,
     bullets: str = "*+-",
     code_language: str = "",
     code_language_callback: Callable[[Any], str] | None = None,
@@ -976,6 +977,7 @@ def convert_to_markdown_stream(
         whitespace_handler=whitespace_handler,
         parser=parser,
         autolinks=autolinks,
+        br_in_tables=br_in_tables,
         bullets=bullets,
         code_language=code_language,
         code_language_callback=code_language_callback,
@@ -1027,7 +1029,7 @@ def convert_to_markdown_stream(
            end_pos = search_start + newline_pos + 1

            chunk = combined_result[pos:end_pos]
-           if chunk:
+           if chunk:  # pragma: no cover
                yield chunk

            pos = end_pos
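`br_in_tables` is also threaded through `convert_to_markdown_stream` and `_process_html_core`, so the streaming path accepts the same option. A short sketch (chunk size and input are illustrative):

```python
from html_to_markdown import convert_to_markdown_stream

html = "<table><tr><th>A</th></tr><tr><td><p>x</p><p>y</p></td></tr></table>"

# The generator yields Markdown in chunks; br_in_tables is new in 1.12.0.
markdown = "".join(convert_to_markdown_stream(html, chunk_size=1024, br_in_tables=True))
print(markdown)
```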
html_to_markdown/utils.py CHANGED
@@ -12,9 +12,7 @@ def chomp(text: str) -> tuple[str, str, str]:
     prefix = " " if text.startswith((" ", "\t")) else ""
     suffix = " " if text.endswith((" ", "\t")) else ""

-    text = text.strip()
-
-    return prefix, suffix, text
+    return prefix, suffix, text.strip()


 def escape(*, text: str, escape_misc: bool, escape_asterisks: bool, escape_underscores: bool) -> str:
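`chomp` keeps its contract after this simplification; only the intermediate assignment was dropped. A quick sanity check based on the code shown above:

```python
from html_to_markdown.utils import chomp

prefix, suffix, stripped = chomp("  bold text ")
# prefix == " ", suffix == " ", stripped == "bold text"
print(repr(prefix), repr(suffix), repr(stripped))
```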
html_to_markdown/whitespace.py CHANGED
@@ -7,7 +7,7 @@ import unicodedata
 from typing import TYPE_CHECKING, Literal

 if TYPE_CHECKING:
-    from bs4 import NavigableString, PageElement, Tag
+    from bs4 import NavigableString, PageElement


 WhitespaceMode = Literal["normalized", "strict"]
@@ -132,7 +132,7 @@ class WhitespaceHandler:
         for char in text:
             if unicodedata.category(char) in ("Zs", "Zl", "Zp"):
                 normalized.append(" ")
-            elif char in ("\r\n", "\r"):
+            elif char == "\r":  # pragma: no cover
                 normalized.append("\n")
             else:
                 normalized.append(char)
@@ -168,15 +168,12 @@ class WhitespaceHandler:
         *,
         in_pre: bool = False,
     ) -> str:
-        if not text:
+        if not text:  # pragma: no cover
            return ""

        if in_pre or self.should_preserve_whitespace(element):
            return text

-        if self.mode == "strict":
-            return text
-
        text = self.normalize_unicode_spaces(text)
        return self._process_normalized(text, element)

@@ -204,8 +201,8 @@ class WhitespaceHandler:
    def _process_text_with_content(self, text: str, element: NavigableString) -> str:
        original = str(element)

-        has_lead_space = original and original[0] in " \t\n"
-        has_trail_space = original and original[-1] in " \t\n"
+        has_lead_space = bool(original and original[0] in " \t\n")
+        has_trail_space = bool(original and original[-1] in " \t\n")

        text = self._multiple_spaces.sub(" ", text.strip())

@@ -215,9 +212,9 @@ class WhitespaceHandler:
            return self._process_special_inline_containers(text, original)

        if parent and self.is_inline_element(parent):
-            return self._process_inline_element_text(text, original, bool(has_lead_space), bool(has_trail_space))
+            return self._process_inline_element_text(text, original, has_lead_space, has_trail_space)

-        return self._process_standalone_text(text, original, element, bool(has_lead_space), bool(has_trail_space))
+        return self._process_standalone_text(text, original, element, has_lead_space, has_trail_space)

    def _process_special_inline_containers(self, text: str, original: str) -> str:
        if original and "\n" not in original and "\t" not in original:
@@ -280,24 +277,3 @@ class WhitespaceHandler:
            text = text + "\n\n"

        return text
-
-    def get_block_spacing(self, tag: Tag, next_sibling: PageElement | None = None) -> str:
-        if self.mode == "strict":
-            return ""
-
-        tag_name = tag.name.lower() if hasattr(tag, "name") else ""
-
-        double_newline_elements = {"p", "div", "blockquote", "pre", "table", "ul", "ol", "dl"}
-
-        single_newline_elements = {"li", "dt", "dd", "tr", "td", "th"}
-
-        if tag_name in double_newline_elements:
-            if self.is_block_element(next_sibling):
-                return "\n\n"
-            return "\n"
-        if tag_name in single_newline_elements:
-            return "\n"
-        if tag_name.startswith("h") and len(tag_name) == 2 and tag_name[1].isdigit():
-            return "\n\n"
-
-        return ""
html_to_markdown-1.12.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: html-to-markdown
-Version: 1.11.0
+Version: 1.12.0
 Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
 Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
 License: MIT
@@ -320,6 +320,88 @@ def converter(*, tag: Tag, text: str, **kwargs) -> str:

 Custom converters take precedence over built-in converters and can be used alongside other configuration options.

+### Streaming API
+
+For processing large documents with memory constraints, use the streaming API:
+
+```python
+from html_to_markdown import convert_to_markdown_stream
+
+# Process large HTML in chunks
+with open("large_document.html", "r") as f:
+    html_content = f.read()
+
+# Returns a generator that yields markdown chunks
+for chunk in convert_to_markdown_stream(html_content, chunk_size=2048):
+    print(chunk, end="")
+```
+
+With progress tracking:
+
+```python
+def show_progress(processed: int, total: int):
+    if total > 0:
+        percent = (processed / total) * 100
+        print(f"\rProgress: {percent:.1f}%", end="")
+
+# Stream with progress callback
+markdown = convert_to_markdown(html_content, stream_processing=True, chunk_size=4096, progress_callback=show_progress)
+```
+
+### Preprocessing API
+
+The library provides functions for preprocessing HTML before conversion, useful for cleaning messy or complex HTML:
+
+```python
+from html_to_markdown import preprocess_html, create_preprocessor
+
+# Direct preprocessing with custom options
+cleaned_html = preprocess_html(
+    raw_html,
+    remove_navigation=True,
+    remove_forms=True,
+    remove_scripts=True,
+    remove_styles=True,
+    remove_comments=True,
+    preserve_semantic_structure=True,
+    preserve_tables=True,
+    preserve_media=True,
+)
+markdown = convert_to_markdown(cleaned_html)
+
+# Create a preprocessor configuration from presets
+config = create_preprocessor(preset="aggressive", preserve_tables=False)  # or "minimal", "standard"; override preset settings
+markdown = convert_to_markdown(html, **config)
+```
+
+### Exception Handling
+
+The library provides specific exception classes for better error handling:
+
+````python
+from html_to_markdown import (
+    convert_to_markdown,
+    HtmlToMarkdownError,
+    EmptyHtmlError,
+    InvalidParserError,
+    ConflictingOptionsError,
+    MissingDependencyError
+)
+
+try:
+    markdown = convert_to_markdown(html, parser='lxml')
+except MissingDependencyError:
+    # lxml not installed
+    markdown = convert_to_markdown(html, parser='html.parser')
+except EmptyHtmlError:
+    print("No HTML content to convert")
+except InvalidParserError as e:
+    print(f"Parser error: {e}")
+except ConflictingOptionsError as e:
+    print(f"Conflicting options: {e}")
+except HtmlToMarkdownError as e:
+    print(f"Conversion error: {e}")
+
 ## CLI Usage

 Convert HTML files directly from the command line with full access to all API options:
@@ -340,7 +422,7 @@ html_to_markdown \
   --preprocess-html \
   --preprocessing-preset aggressive \
   input.html > output.md
-```
+````

 ### Key CLI Options

@@ -353,6 +435,20 @@ html_to_markdown \
 --whitespace-mode {normalized,strict} # Whitespace handling (default: normalized)
 --heading-style {atx,atx_closed,underlined} # Header style
 --no-extract-metadata # Disable metadata extraction
+--br-in-tables # Use <br> tags for line breaks in table cells
+--source-encoding ENCODING # Override auto-detected encoding (rarely needed)
+```
+
+**File Encoding:**
+
+The CLI automatically detects file encoding in most cases. Use `--source-encoding` only when automatic detection fails (typically on some Windows systems or with unusual encodings):
+
+```shell
+# Override auto-detection for a Latin-1 encoded file
+html_to_markdown --source-encoding latin-1 input.html > output.md
+
+# Force UTF-16 encoding when auto-detection fails
+html_to_markdown --source-encoding utf-16 input.html > output.md
 ```

 **All Available Options:**
@@ -393,6 +489,7 @@ The `markdownify` function is an alias for `convert_to_markdown` and provides id
 - `newline_style` (str, default: `'spaces'`): Style for handling newlines (`'spaces'` or `'backslash'`)
 - `sub_symbol` (str, default: `''`): Custom symbol for subscript text
 - `sup_symbol` (str, default: `''`): Custom symbol for superscript text
+- `br_in_tables` (bool, default: `False`): Use `<br>` tags for line breaks in table cells instead of spaces

 ### Parser Options

html_to_markdown-1.12.0.dist-info/RECORD ADDED
@@ -0,0 +1,17 @@
+html_to_markdown/__init__.py,sha256=TzZzhZDJHeXW_3B9zceYehz2zlttqdLsDr5un8stZLM,653
+html_to_markdown/__main__.py,sha256=E9d62nVceR_5TUWgVu5L5CnSZxKcnT_7a6ScWZUGE-s,292
+html_to_markdown/cli.py,sha256=qB8-1jqJPW-YrOmlyOdJnLM6DpKSUIA3iyn1SJaJgKg,9418
+html_to_markdown/constants.py,sha256=CKFVHjUZKgi8-lgU6AHPic7X5ChlTkbZt4Jv6VaVjjs,665
+html_to_markdown/converters.py,sha256=4dikabmNVu8g7jnSpk_i_6CAKy7OehjcL0c8lmIJRSk,36414
+html_to_markdown/exceptions.py,sha256=ytUOIL0D8r0Jd59RzUPqzmk73i-Mg63zDQYo6S6DBg4,1389
+html_to_markdown/preprocessor.py,sha256=otnTOhoivJkxaip1Lb9xNMl8q-x9aGFXSYkSrxsTW8g,9591
+html_to_markdown/processing.py,sha256=RQbqkI3w_rm64uOvmO6-CrqCJXKNHtfKu2G6f59JSF0,34596
+html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+html_to_markdown/utils.py,sha256=s3A4ET_XyKC-WxzJtH4W0S7cIBGF5fTYIf4JJrqTX8Q,1069
+html_to_markdown/whitespace.py,sha256=a7M_u9JXh6cfjs4rz25hABIKKy3ax11ZXJhEID4YSV4,7397
+html_to_markdown-1.12.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
+html_to_markdown-1.12.0.dist-info/METADATA,sha256=y8bGQgaCogxjM7V3gldeZi0IIaiCC-H7NiPqQMwMgmY,20867
+html_to_markdown-1.12.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+html_to_markdown-1.12.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
+html_to_markdown-1.12.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
+html_to_markdown-1.12.0.dist-info/RECORD,,
@@ -1,17 +0,0 @@
1
- html_to_markdown/__init__.py,sha256=TzZzhZDJHeXW_3B9zceYehz2zlttqdLsDr5un8stZLM,653
2
- html_to_markdown/__main__.py,sha256=E9d62nVceR_5TUWgVu5L5CnSZxKcnT_7a6ScWZUGE-s,292
3
- html_to_markdown/cli.py,sha256=ilnrJN2XMhPDQ4UkkG4cjLXTvglu_ZJj-bBsohVF3fw,8541
4
- html_to_markdown/constants.py,sha256=CKFVHjUZKgi8-lgU6AHPic7X5ChlTkbZt4Jv6VaVjjs,665
5
- html_to_markdown/converters.py,sha256=CbChkRIlOPe0d1MK5-txDE56IG4Ea_dcCV6KRCTjeKY,32497
6
- html_to_markdown/exceptions.py,sha256=YjfwVCWE_oZakr9iy0E-_aPSYHNaocJZgWeQ9Enty7Q,1212
7
- html_to_markdown/preprocessor.py,sha256=acmuJJvx1RaXE3c0F_aWsartQE0cEpa3AOnJYGnPzqw,9708
8
- html_to_markdown/processing.py,sha256=sOIIFNyRkRYAH8Q4ehrh66RY71bkvttSuqzXYsMC5JM,34334
9
- html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
- html_to_markdown/utils.py,sha256=4Vzk2cCjxN0LAZ1DXQCufYtxE7a6739TYgPbje-VM_E,1086
11
- html_to_markdown/whitespace.py,sha256=EJ0gEsfLB_wZAk5d5qP4UPhPg0pJJ8LZLRRr_QoL01o,8186
12
- html_to_markdown-1.11.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
13
- html_to_markdown-1.11.0.dist-info/METADATA,sha256=Cej6bnqT9JVFzACZvND6Z5-kD0QoabiLi46opAaC11U,17814
14
- html_to_markdown-1.11.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
- html_to_markdown-1.11.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
16
- html_to_markdown-1.11.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
17
- html_to_markdown-1.11.0.dist-info/RECORD,,