html-to-markdown 1.10.0__py3-none-any.whl → 1.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic.

html_to_markdown/cli.py CHANGED
@@ -1,5 +1,6 @@
1
1
  import sys
2
2
  from argparse import ArgumentParser, FileType
3
+ from pathlib import Path
3
4
 
4
5
  from html_to_markdown.constants import (
5
6
  ASTERISK,
@@ -13,6 +14,7 @@ from html_to_markdown.constants import (
13
14
  WHITESPACE_NORMALIZED,
14
15
  WHITESPACE_STRICT,
15
16
  )
17
+ from html_to_markdown.exceptions import InvalidEncodingError
16
18
  from html_to_markdown.processing import convert_to_markdown
17
19
 
18
20
 
@@ -131,6 +133,12 @@ def main(argv: list[str]) -> str:
131
133
  help="Parent tags where images remain inline (not converted to alt-text).",
132
134
  )
133
135
 
136
+ parser.add_argument(
137
+ "--br-in-tables",
138
+ action="store_true",
139
+ help="Use <br> tags for line breaks in table cells instead of spaces.",
140
+ )
141
+
134
142
  parser.add_argument("-w", "--wrap", action="store_true", help="Enable text wrapping at --wrap-width characters.")
135
143
 
136
144
  parser.add_argument(
@@ -235,10 +243,18 @@ def main(argv: list[str]) -> str:
235
243
  help="Keep navigation elements when preprocessing (normally removed).",
236
244
  )
237
245
 
246
+ parser.add_argument(
247
+ "--source-encoding",
248
+ type=str,
249
+ default=None,
250
+ help="Source file encoding (e.g. 'utf-8', 'latin-1'). Defaults to system default.",
251
+ )
252
+
238
253
  args = parser.parse_args(argv)
239
254
 
240
255
  base_args = {
241
256
  "autolinks": args.autolinks,
257
+ "br_in_tables": args.br_in_tables,
242
258
  "bullets": args.bullets,
243
259
  "code_language": args.code_language,
244
260
  "convert": args.convert,
@@ -278,7 +294,7 @@ def main(argv: list[str]) -> str:
278
294
  if args.show_progress:
279
295
 
280
296
  def progress_callback(processed: int, total: int) -> None:
281
- if total > 0:
297
+ if total > 0: # pragma: no cover
282
298
  percent = (processed / total) * 100
283
299
 
284
300
  sys.stderr.write(f"\rProgress: {percent:.1f}% ({processed}/{total} bytes)")
@@ -286,4 +302,14 @@ def main(argv: list[str]) -> str:
286
302
 
287
303
  base_args["progress_callback"] = progress_callback
288
304
 
289
- return convert_to_markdown(args.html.read(), **base_args)
305
+ if args.source_encoding and args.html.name != "<stdin>":
306
+ args.html.close()
307
+ try:
308
+ with Path(args.html.name).open(encoding=args.source_encoding) as f:
309
+ html_content = f.read()
310
+ except LookupError as e:
311
+ raise InvalidEncodingError(args.source_encoding) from e
312
+ else:
313
+ html_content = args.html.read()
314
+
315
+ return convert_to_markdown(html_content, **base_args)
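The two new CLI flags above (`--br-in-tables` and `--source-encoding`) are thin wrappers over the library API. A minimal sketch of the equivalent calls, assuming the public `convert_to_markdown` entry point from the README; the file name and encoding are illustrative:

```python
from pathlib import Path

from html_to_markdown import convert_to_markdown

# Equivalent of --source-encoding latin-1: read the input with an explicit encoding.
html = Path("page.html").read_text(encoding="latin-1")  # "page.html" is illustrative

# Equivalent of --br-in-tables: multi-line table cells are joined with <br> instead of spaces.
markdown = convert_to_markdown(html, br_in_tables=True)
print(markdown)
```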
html_to_markdown/converters.py CHANGED
@@ -5,9 +5,11 @@ from typing import TYPE_CHECKING
5
5
  if TYPE_CHECKING:
6
6
  from collections.abc import Iterable
7
7
  import base64
8
+ import re
8
9
  from collections.abc import Callable
9
10
  from functools import partial
10
11
  from inspect import getfullargspec
12
+ from itertools import chain
11
13
  from textwrap import fill
12
14
  from typing import Any, Literal, TypeVar, cast
13
15
 
@@ -36,6 +38,19 @@ def _format_wrapped_block(text: str, start_marker: str, end_marker: str = "") ->
36
38
  return f"{start_marker}{text.strip()}{end_marker}\n\n" if text.strip() else ""
37
39
 
38
40
 
41
+ def _find_list_item_ancestor(tag: Tag) -> Tag | None:
42
+ """Find the nearest list item ancestor of a tag."""
43
+ parent = tag.parent
44
+ while parent and parent.name != "li":
45
+ parent = parent.parent
46
+ return parent
47
+
48
+
49
+ BLOCK_ELEMENTS = frozenset({"p", "blockquote", "pre", "ul", "ol", "div", "h1", "h2", "h3", "h4", "h5", "h6"})
50
+
51
+ _LIST_ITEM_PATTERN = re.compile(r"^\s*(\*|\+|-|\d+\.)\s")
52
+
53
+
39
54
  SupportedElements = Literal[
40
55
  "a",
41
56
  "abbr",
@@ -270,52 +285,91 @@ def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: It
270
285
  return f"![{alt}]({src}{title_part})"
271
286
 
272
287
 
288
+ def _has_block_list_items(tag: Tag) -> bool:
289
+ """Check if any list items contain block elements."""
290
+ return any(
291
+ any(child.name in BLOCK_ELEMENTS for child in li.children if hasattr(child, "name"))
292
+ for li in tag.find_all("li", recursive=False)
293
+ )
294
+
295
+
296
+ def _handle_nested_list_indentation(text: str, list_indent_str: str, parent: Tag) -> str:
297
+ """Handle indentation for lists nested within list items."""
298
+ prev_p = None
299
+ for child in parent.children:
300
+ if hasattr(child, "name"):
301
+ if child.name == "p":
302
+ prev_p = child
303
+ break
304
+
305
+ if prev_p:
306
+ lines = text.strip().split("\n")
307
+ indented_lines = [f"{list_indent_str}{line}" if line.strip() else "" for line in lines]
308
+ return "\n" + "\n".join(indented_lines) + "\n"
309
+ return "\n" + indent(text=text, level=1, indent_str=list_indent_str).rstrip()
310
+
311
+
312
+ def _handle_direct_nested_list_indentation(text: str, list_indent_str: str) -> str:
313
+ """Handle indentation for lists that are direct children of other lists."""
314
+ lines = text.strip().split("\n")
315
+ indented_lines = [f"{list_indent_str}{line}" if line.strip() else "" for line in lines]
316
+ result = "\n".join(indented_lines)
317
+ return result + "\n" if not result.endswith("\n") else result
318
+
319
+
320
+ def _add_list_item_spacing(text: str) -> str:
321
+ """Add extra spacing between list items that contain block content."""
322
+ lines = text.split("\n")
323
+ items_with_blocks = set()
324
+
325
+ i = 0
326
+ while i < len(lines):
327
+ line = lines[i]
328
+ if line.strip() and _LIST_ITEM_PATTERN.match(line.lstrip()):
329
+ j = i + 1
330
+ has_continuation = False
331
+ while j < len(lines):
332
+ next_line = lines[j]
333
+ if next_line.strip() and _LIST_ITEM_PATTERN.match(next_line.lstrip()):
334
+ break
335
+ if next_line.strip() and next_line.startswith((" ", " ", "\t")):
336
+ has_continuation = True
337
+ j += 1
338
+
339
+ if has_continuation and j < len(lines):
340
+ items_with_blocks.add(j - 1)
341
+
342
+ i += 1
343
+
344
+ if items_with_blocks:
345
+ processed_lines = list(
346
+ chain.from_iterable([line, ""] if i in items_with_blocks else [line] for i, line in enumerate(lines))
347
+ )
348
+ return "\n".join(processed_lines)
349
+
350
+ return text
351
+
352
+
273
353
  def _convert_list(*, tag: Tag, text: str, list_indent_str: str) -> str:
274
354
  from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
275
355
 
276
- before_paragraph = False
277
- if tag.next_sibling and getattr(tag.next_sibling, "name", None) not in {"ul", "ol"}:
278
- before_paragraph = True
356
+ before_paragraph = tag.next_sibling and getattr(tag.next_sibling, "name", None) not in {"ul", "ol"}
279
357
 
280
- if _has_ancestor(tag, "li"):
281
- parent = tag.parent
282
- while parent and parent.name != "li":
283
- parent = parent.parent
358
+ has_block_items = _has_block_list_items(tag)
284
359
 
360
+ if _has_ancestor(tag, "li"):
361
+ parent = _find_list_item_ancestor(tag)
285
362
  if parent:
286
- prev_p = None
287
- for child in parent.children:
288
- if hasattr(child, "name"):
289
- if child == tag:
290
- break
291
- if child.name == "p":
292
- prev_p = child
293
-
294
- if prev_p:
295
- lines = text.strip().split("\n")
296
- indented_lines = []
297
- for line in lines:
298
- if line.strip():
299
- indented_lines.append(f"{list_indent_str}{line}")
300
- else:
301
- indented_lines.append("")
302
- return "\n" + "\n".join(indented_lines) + "\n"
303
- return "\n" + indent(text=text, level=1, indent_str=list_indent_str).rstrip()
363
+ return _handle_nested_list_indentation(text, list_indent_str, parent)
304
364
 
305
365
  if tag.parent and tag.parent.name in {"ul", "ol"}:
306
- lines = text.strip().split("\n")
307
- indented_lines = []
308
- for line in lines:
309
- if line.strip():
310
- indented_lines.append(f"{list_indent_str}{line}")
311
- else:
312
- indented_lines.append("")
313
- result = "\n".join(indented_lines)
314
- if not result.endswith("\n"):
315
- result += "\n"
316
- return result
366
+ return _handle_direct_nested_list_indentation(text, list_indent_str)
367
+
368
+ if has_block_items:
369
+ text = _add_list_item_spacing(text)
317
370
 
318
- return text + ("\n" if before_paragraph else "")
371
+ trailing_newlines = "\n\n" if has_block_items else ("\n" if before_paragraph else "")
372
+ return text + trailing_newlines
319
373
 
320
374
 
321
375
  def _convert_li(*, tag: Tag, text: str, bullets: str, list_indent_str: str) -> str:
@@ -324,10 +378,8 @@ def _convert_li(*, tag: Tag, text: str, bullets: str, list_indent_str: str) -> s
324
378
  checked = checkbox.get("checked") is not None
325
379
  checkbox_symbol = "[x]" if checked else "[ ]"
326
380
 
327
- checkbox_text = text
328
- if checkbox.string:
329
- checkbox_text = text.replace(str(checkbox.string), "").strip()
330
- return f"- {checkbox_symbol} {checkbox_text.strip()}\n"
381
+ checkbox_text = text.strip()
382
+ return f"- {checkbox_symbol} {checkbox_text}\n"
331
383
 
332
384
  parent = tag.parent
333
385
  if parent is not None and parent.name == "ol":
@@ -349,11 +401,7 @@ def _convert_li(*, tag: Tag, text: str, bullets: str, list_indent_str: str) -> s
349
401
 
350
402
  bullet = bullets[depth % len(bullets)]
351
403
 
352
- has_block_children = any(
353
- child.name in {"p", "blockquote", "pre", "ul", "ol", "div", "h1", "h2", "h3", "h4", "h5", "h6"}
354
- for child in tag.children
355
- if hasattr(child, "name")
356
- )
404
+ has_block_children = "\n\n" in text
357
405
 
358
406
  if has_block_children:
359
407
  paragraphs = text.strip().split("\n\n")
@@ -390,20 +438,13 @@ def _convert_p(
390
438
  from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
391
439
 
392
440
  if _has_ancestor(tag, "li"):
393
- parent = tag.parent
394
- while parent and parent.name != "li":
395
- parent = parent.parent
441
+ parent = _find_list_item_ancestor(tag)
396
442
 
397
443
  if parent:
398
444
  p_children = [child for child in parent.children if hasattr(child, "name") and child.name == "p"]
399
445
 
400
446
  if p_children and tag != p_children[0]:
401
- indented_lines = []
402
- for line in text.split("\n"):
403
- if line.strip():
404
- indented_lines.append(f"{list_indent_str}{line}")
405
- else:
406
- indented_lines.append("")
447
+ indented_lines = [f"{list_indent_str}{line}" if line.strip() else "" for line in text.split("\n")]
407
448
  text = "\n".join(indented_lines)
408
449
 
409
450
  return f"{text}\n\n" if text else ""
@@ -440,66 +481,69 @@ def _convert_pre(
440
481
  return f"\n```{code_language}\n{text}\n```\n"
441
482
 
442
483
 
443
- def _convert_td(*, tag: Tag, text: str) -> str:
484
+ def _process_table_cell_content(*, tag: Tag, text: str, br_in_tables: bool) -> str:
485
+ """Process table cell content, optionally using <br> tags for multi-line content."""
486
+ if br_in_tables:
487
+ block_children = [child for child in tag.children if hasattr(child, "name") and child.name in BLOCK_ELEMENTS]
488
+
489
+ if len(block_children) > 1:
490
+ child_contents = []
491
+ for child in block_children:
492
+ child_text = child.get_text().strip()
493
+ if child_text:
494
+ child_contents.append(child_text)
495
+ return "<br>".join(child_contents)
496
+ return text.strip().replace("\n", "<br>")
497
+ return text.strip().replace("\n", " ")
498
+
499
+
500
+ def _convert_td(*, tag: Tag, text: str, br_in_tables: bool = False) -> str:
444
501
  colspan = _get_colspan(tag)
445
- return " " + text.strip().replace("\n", " ") + " |" * colspan
502
+ processed_text = _process_table_cell_content(tag=tag, text=text, br_in_tables=br_in_tables)
503
+ return " " + processed_text + " |" * colspan
446
504
 
447
505
 
448
- def _convert_th(*, tag: Tag, text: str) -> str:
506
+ def _convert_th(*, tag: Tag, text: str, br_in_tables: bool = False) -> str:
449
507
  colspan = _get_colspan(tag)
450
- return " " + text.strip().replace("\n", " ") + " |" * colspan
508
+ processed_text = _process_table_cell_content(tag=tag, text=text, br_in_tables=br_in_tables)
509
+ return " " + processed_text + " |" * colspan
451
510
 
452
511
 
453
- def _convert_tr(*, tag: Tag, text: str) -> str:
454
- cells = tag.find_all(["td", "th"])
455
- parent_name = tag.parent.name if tag.parent and hasattr(tag.parent, "name") else ""
456
- tag_grand_parent = tag.parent.parent if tag.parent else None
512
+ def _get_rowspan_positions(prev_cells: list[Tag]) -> tuple[list[int], int]:
513
+ """Get positions of cells with rowspan > 1 from previous row."""
514
+ rowspan_positions = []
515
+ col_pos = 0
457
516
 
458
- if tag.previous_sibling and hasattr(tag.previous_sibling, "name") and tag.previous_sibling.name == "tr":
459
- prev_cells = cast("Tag", tag.previous_sibling).find_all(["td", "th"])
460
- rowspan_positions = []
461
- col_pos = 0
462
-
463
- for prev_cell in prev_cells:
464
- rowspan = 1
465
- if (
466
- "rowspan" in prev_cell.attrs
467
- and isinstance(prev_cell["rowspan"], str)
468
- and prev_cell["rowspan"].isdigit()
469
- ):
470
- rowspan = int(prev_cell["rowspan"])
471
-
472
- if rowspan > 1:
473
- rowspan_positions.append(col_pos)
474
-
475
- colspan = 1
476
- if (
477
- "colspan" in prev_cell.attrs
478
- and isinstance(prev_cell["colspan"], str)
479
- and prev_cell["colspan"].isdigit()
480
- ):
481
- colspan = int(prev_cell["colspan"])
482
- col_pos += colspan
517
+ for prev_cell in prev_cells:
518
+ rowspan = 1
519
+ if "rowspan" in prev_cell.attrs and isinstance(prev_cell["rowspan"], str) and prev_cell["rowspan"].isdigit():
520
+ rowspan = int(prev_cell["rowspan"])
483
521
 
484
- if rowspan_positions:
485
- converted_cells: list[str] = []
486
- if text.strip():
487
- parts = text.split("|")
488
- converted_cells.extend(part.rstrip() + " |" for part in parts[:-1] if part)
522
+ if rowspan > 1:
523
+ rowspan_positions.append(col_pos)
524
+
525
+ colspan = 1
526
+ if "colspan" in prev_cell.attrs and isinstance(prev_cell["colspan"], str) and prev_cell["colspan"].isdigit():
527
+ colspan = int(prev_cell["colspan"])
528
+ col_pos += colspan
489
529
 
490
- new_cells: list[str] = []
491
- cell_index = 0
530
+ return rowspan_positions, col_pos
492
531
 
493
- for pos in range(col_pos):
494
- if pos in rowspan_positions:
495
- new_cells.append(" |")
496
- elif cell_index < len(converted_cells):
497
- new_cells.append(converted_cells[cell_index])
498
- cell_index += 1
499
532
 
500
- text = "".join(new_cells)
533
+ def _handle_rowspan_text(text: str, rowspan_positions: list[int], col_pos: int) -> str:
534
+ """Handle text adjustment for rows with rowspan cells."""
535
+ converted_cells = [part.rstrip() + " |" for part in text.split("|")[:-1] if part] if text.strip() else []
536
+ rowspan_set = set(rowspan_positions)
501
537
 
502
- is_headrow = (
538
+ cell_iter = iter(converted_cells)
539
+ new_cells = [" |" if pos in rowspan_set else next(cell_iter, "") for pos in range(col_pos)]
540
+
541
+ return "".join(new_cells)
542
+
543
+
544
+ def _is_header_row(tag: Tag, cells: list[Tag], parent_name: str, tag_grand_parent: Tag | None) -> bool:
545
+ """Determine if this table row should be treated as a header row."""
546
+ return (
503
547
  all(hasattr(cell, "name") and cell.name == "th" for cell in cells)
504
548
  or (not tag.previous_sibling and parent_name != "tbody")
505
549
  or (
@@ -508,25 +552,48 @@ def _convert_tr(*, tag: Tag, text: str) -> str:
508
552
  and (not tag_grand_parent or len(tag_grand_parent.find_all(["thead"])) < 1)
509
553
  )
510
554
  )
555
+
556
+
557
+ def _calculate_total_colspan(cells: list[Tag]) -> int:
558
+ """Calculate total colspan for all cells in a row."""
559
+ full_colspan = 0
560
+ for cell in cells:
561
+ if hasattr(cell, "attrs") and "colspan" in cell.attrs:
562
+ colspan_value = cell.attrs["colspan"]
563
+ if isinstance(colspan_value, str) and colspan_value.isdigit():
564
+ full_colspan += int(colspan_value)
565
+ else:
566
+ full_colspan += 1
567
+ else:
568
+ full_colspan += 1
569
+ return full_colspan
570
+
571
+
572
+ def _convert_tr(*, tag: Tag, text: str) -> str:
573
+ cells = tag.find_all(["td", "th"])
574
+ parent_name = tag.parent.name if tag.parent and hasattr(tag.parent, "name") else ""
575
+ tag_grand_parent = tag.parent.parent if tag.parent else None
576
+
577
+ if tag.previous_sibling and hasattr(tag.previous_sibling, "name") and tag.previous_sibling.name == "tr":
578
+ prev_cells = cast("Tag", tag.previous_sibling).find_all(["td", "th"])
579
+ rowspan_positions, col_pos = _get_rowspan_positions(prev_cells)
580
+
581
+ if rowspan_positions:
582
+ text = _handle_rowspan_text(text, rowspan_positions, col_pos)
583
+
584
+ is_headrow = _is_header_row(tag, cells, parent_name, tag_grand_parent)
511
585
  overline = ""
512
586
  underline = ""
587
+
513
588
  if is_headrow and not tag.previous_sibling:
514
- full_colspan = 0
515
- for cell in cells:
516
- if hasattr(cell, "attrs") and "colspan" in cell.attrs:
517
- colspan_value = cell.attrs["colspan"]
518
- if isinstance(colspan_value, str) and colspan_value.isdigit():
519
- full_colspan += int(colspan_value)
520
- else:
521
- full_colspan += 1
522
- else:
523
- full_colspan += 1
589
+ full_colspan = _calculate_total_colspan(cells)
524
590
  underline += "| " + " | ".join(["---"] * full_colspan) + " |" + "\n"
525
591
  elif not tag.previous_sibling and (
526
592
  parent_name == "table" or (parent_name == "tbody" and not cast("Tag", tag.parent).previous_sibling)
527
593
  ):
528
- overline += "| " + " | ".join([""] * len(cells)) + " |" + "\n"
529
- overline += "| " + " | ".join(["---"] * len(cells)) + " |" + "\n"
594
+ overline += "| " + " | ".join([""] * len(cells)) + " |" + "\n" # pragma: no cover
595
+ overline += "| " + " | ".join(["---"] * len(cells)) + " |" + "\n" # pragma: no cover
596
+
530
597
  return overline + "|" + text + "\n" + underline
531
598
 
532
599
 
@@ -578,8 +645,24 @@ def _convert_semantic_block(*, text: str, convert_as_inline: bool) -> str:
578
645
  return f"{text}\n\n" if text.strip() else ""
579
646
 
580
647
 
581
- def _convert_div(*, text: str, convert_as_inline: bool) -> str: # noqa: ARG001
582
- return text
648
+ def _convert_div(*, text: str, convert_as_inline: bool, tag: Tag, list_indent_str: str) -> str:
649
+ if convert_as_inline:
650
+ return text
651
+
652
+ from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
653
+
654
+ if _has_ancestor(tag, "li"):
655
+ parent = _find_list_item_ancestor(tag)
656
+ if parent:
657
+ div_children = [child for child in parent.children if hasattr(child, "name") and child.name == "div"]
658
+
659
+ if div_children and tag != div_children[0]:
660
+ indented_lines = [f"{list_indent_str}{line}" if line.strip() else "" for line in text.split("\n")]
661
+ indented_text = "\n".join(indented_lines)
662
+
663
+ return f"{indented_text}\n\n" if indented_text.strip() else ""
664
+
665
+ return _format_block_element(text)
583
666
 
584
667
 
585
668
  def _convert_details(*, text: str, convert_as_inline: bool) -> str:
@@ -600,7 +683,7 @@ def _convert_dl(*, text: str, convert_as_inline: bool) -> str:
600
683
  if convert_as_inline:
601
684
  return text
602
685
 
603
- return f"{text}\n" if text.strip() else ""
686
+ return _format_block_element(text)
604
687
 
605
688
 
606
689
  def _convert_dt(*, text: str, convert_as_inline: bool) -> str:
@@ -613,14 +696,21 @@ def _convert_dt(*, text: str, convert_as_inline: bool) -> str:
613
696
  return f"{text.strip()}\n"
614
697
 
615
698
 
616
- def _convert_dd(*, text: str, convert_as_inline: bool) -> str:
699
+ def _convert_dd(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
617
700
  if convert_as_inline:
618
701
  return text
619
702
 
620
- if not text.strip():
621
- return ""
703
+ has_dt_sibling = False
704
+ current = tag.previous_sibling
705
+ while current:
706
+ if hasattr(current, "name") and current.name and current.name == "dt":
707
+ has_dt_sibling = True
708
+ break
709
+ current = current.previous_sibling
622
710
 
623
- return f": {text.strip()}\n\n"
711
+ if has_dt_sibling:
712
+ return f": {text.strip()}\n\n" if text.strip() else ": \n\n"
713
+ return f"{text.strip()}\n\n" if text.strip() else ""
624
714
 
625
715
 
626
716
  def _convert_cite(*, text: str, convert_as_inline: bool) -> str:
@@ -645,9 +735,7 @@ def _convert_q(*, text: str, convert_as_inline: bool) -> str:
645
735
 
646
736
 
647
737
  def _convert_media_element(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
648
- src = tag.get("src", "")
649
-
650
- if not src and (source_tag := tag.find("source")) and isinstance(source_tag, Tag):
738
+ if not (src := tag.get("src", "")) and (source_tag := tag.find("source")) and isinstance(source_tag, Tag):
651
739
  src = source_tag.get("src", "")
652
740
 
653
741
  if src and isinstance(src, str) and src.strip():
@@ -667,9 +755,8 @@ def _convert_media_element(*, tag: Tag, text: str, convert_as_inline: bool) -> s
667
755
 
668
756
  def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
669
757
  _ = text
670
- src = tag.get("src", "")
671
758
 
672
- if src and isinstance(src, str) and src.strip():
759
+ if (src := tag.get("src", "")) and isinstance(src, str) and src.strip():
673
760
  link = f"[{src}]({src})"
674
761
  if convert_as_inline:
675
762
  return link
@@ -936,7 +1023,7 @@ def _convert_figure(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
936
1023
  content = text.strip()
937
1024
  if content and not content.endswith("\n\n"):
938
1025
  if content.endswith("\n"):
939
- content += "\n"
1026
+ content += "\n" # pragma: no cover
940
1027
  else:
941
1028
  content += "\n\n"
942
1029
  return content
@@ -994,6 +1081,7 @@ def _convert_math(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
994
1081
 
995
1082
  def create_converters_map(
996
1083
  autolinks: bool,
1084
+ br_in_tables: bool,
997
1085
  bullets: str,
998
1086
  code_language: str,
999
1087
  code_language_callback: Callable[[Tag], str] | None,
@@ -1026,6 +1114,8 @@ def create_converters_map(
1026
1114
  kwargs["convert_as_inline"] = convert_as_inline
1027
1115
  if "list_indent_str" in spec.kwonlyargs:
1028
1116
  kwargs["list_indent_str"] = list_indent_str
1117
+ if "br_in_tables" in spec.kwonlyargs:
1118
+ kwargs["br_in_tables"] = br_in_tables
1029
1119
  return func(**kwargs)
1030
1120
  return func(text)
1031
1121
 
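The new `br_in_tables` option threads from `create_converters_map` into `_convert_td`/`_convert_th` via `_process_table_cell_content`. A short, hedged illustration of the intended effect; the HTML snippet is made up, and only the joining behavior is taken from the code above:

```python
from html_to_markdown import convert_to_markdown

html = "<table><tr><td><p>first line</p><p>second line</p></td></tr></table>"

# Default behavior: newlines inside a cell collapse to spaces.
print(convert_to_markdown(html))

# With br_in_tables=True, a cell with multiple block children should be rendered
# as "first line<br>second line" rather than a space-joined string.
print(convert_to_markdown(html, br_in_tables=True))
```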
html_to_markdown/exceptions.py CHANGED
@@ -37,3 +37,8 @@ class ConflictingOptionsError(HtmlToMarkdownError):
37
37
  self.option2 = option2
38
38
 
39
39
  super().__init__(f"Only one of '{option1}' and '{option2}' can be specified.")
40
+
41
+
42
+ class InvalidEncodingError(HtmlToMarkdownError):
43
+ def __init__(self, encoding: str) -> None:
44
+ super().__init__(f"The specified encoding ({encoding}) is not valid.")
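`InvalidEncodingError` is raised by the CLI when `--source-encoding` names a codec Python does not recognize. A hedged sketch of catching it, assuming the positional input-file argument shown in the README's CLI examples; `input.html` is illustrative and must already exist, since argparse opens it before the encoding check runs:

```python
from html_to_markdown.cli import main
from html_to_markdown.exceptions import InvalidEncodingError

try:
    main(["--source-encoding", "no-such-codec", "input.html"])
except InvalidEncodingError as exc:
    print(exc)  # "The specified encoding (no-such-codec) is not valid."
```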
html_to_markdown/preprocessor.py CHANGED
@@ -5,6 +5,98 @@ from typing import Any
5
5
 
6
6
  import nh3
7
7
 
8
+ BASE_ALLOWED_TAGS = frozenset(
9
+ {
10
+ "p",
11
+ "div",
12
+ "span",
13
+ "br",
14
+ "hr",
15
+ "h1",
16
+ "h2",
17
+ "h3",
18
+ "h4",
19
+ "h5",
20
+ "h6",
21
+ "ul",
22
+ "ol",
23
+ "li",
24
+ "dl",
25
+ "dt",
26
+ "dd",
27
+ "strong",
28
+ "b",
29
+ "em",
30
+ "i",
31
+ "u",
32
+ "s",
33
+ "del",
34
+ "ins",
35
+ "mark",
36
+ "small",
37
+ "sub",
38
+ "sup",
39
+ "code",
40
+ "pre",
41
+ "kbd",
42
+ "samp",
43
+ "var",
44
+ "abbr",
45
+ "cite",
46
+ "dfn",
47
+ "time",
48
+ "data",
49
+ "a",
50
+ "blockquote",
51
+ "q",
52
+ }
53
+ )
54
+
55
+ SEMANTIC_STRUCTURE_TAGS = frozenset(
56
+ {
57
+ "article",
58
+ "section",
59
+ "aside",
60
+ "header",
61
+ "footer",
62
+ "main",
63
+ "nav",
64
+ "figure",
65
+ "figcaption",
66
+ "details",
67
+ "summary",
68
+ }
69
+ )
70
+
71
+ TABLE_TAGS = frozenset(
72
+ {
73
+ "table",
74
+ "thead",
75
+ "tbody",
76
+ "tfoot",
77
+ "tr",
78
+ "td",
79
+ "th",
80
+ "caption",
81
+ "colgroup",
82
+ "col",
83
+ }
84
+ )
85
+
86
+ MEDIA_TAGS = frozenset(
87
+ {
88
+ "img",
89
+ "picture",
90
+ "source",
91
+ "audio",
92
+ "video",
93
+ "track",
94
+ "canvas",
95
+ "svg",
96
+ "iframe",
97
+ }
98
+ )
99
+
8
100
 
9
101
  def preprocess_html(
10
102
  html: str,
@@ -63,98 +155,16 @@ def _configure_cleaning_rules(
63
155
  custom_tags_to_remove: set[str],
64
156
  custom_attributes_to_remove: set[str],
65
157
  ) -> dict[str, Any]:
66
- allowed_tags = {
67
- "p",
68
- "div",
69
- "span",
70
- "br",
71
- "hr",
72
- "h1",
73
- "h2",
74
- "h3",
75
- "h4",
76
- "h5",
77
- "h6",
78
- "ul",
79
- "ol",
80
- "li",
81
- "dl",
82
- "dt",
83
- "dd",
84
- "strong",
85
- "b",
86
- "em",
87
- "i",
88
- "u",
89
- "s",
90
- "del",
91
- "ins",
92
- "mark",
93
- "small",
94
- "sub",
95
- "sup",
96
- "code",
97
- "pre",
98
- "kbd",
99
- "samp",
100
- "var",
101
- "abbr",
102
- "cite",
103
- "dfn",
104
- "time",
105
- "data",
106
- "a",
107
- "blockquote",
108
- "q",
109
- }
158
+ allowed_tags = set(BASE_ALLOWED_TAGS)
110
159
 
111
160
  if preserve_semantic_structure:
112
- allowed_tags.update(
113
- {
114
- "article",
115
- "section",
116
- "aside",
117
- "header",
118
- "footer",
119
- "main",
120
- "nav",
121
- "figure",
122
- "figcaption",
123
- "details",
124
- "summary",
125
- }
126
- )
161
+ allowed_tags.update(SEMANTIC_STRUCTURE_TAGS)
127
162
 
128
163
  if preserve_tables:
129
- allowed_tags.update(
130
- {
131
- "table",
132
- "thead",
133
- "tbody",
134
- "tfoot",
135
- "tr",
136
- "th",
137
- "td",
138
- "caption",
139
- "col",
140
- "colgroup",
141
- }
142
- )
164
+ allowed_tags.update(TABLE_TAGS)
143
165
 
144
166
  if preserve_media:
145
- allowed_tags.update(
146
- {
147
- "img",
148
- "picture",
149
- "source",
150
- "audio",
151
- "video",
152
- "track",
153
- "canvas",
154
- "svg",
155
- "iframe",
156
- }
157
- )
167
+ allowed_tags.update(MEDIA_TAGS)
158
168
 
159
169
  allowed_tags -= custom_tags_to_remove
160
170
 
html_to_markdown/processing.py CHANGED
@@ -17,7 +17,7 @@ from bs4.element import NavigableString, PageElement
17
17
  try:
18
18
  from html_to_markdown.preprocessor import create_preprocessor
19
19
  from html_to_markdown.preprocessor import preprocess_html as preprocess_fn
20
- except ImportError:
20
+ except ImportError: # pragma: no cover
21
21
  create_preprocessor = None # type: ignore[assignment]
22
22
  preprocess_fn = None # type: ignore[assignment]
23
23
 
@@ -25,7 +25,7 @@ try:
25
25
  import importlib.util
26
26
 
27
27
  LXML_AVAILABLE = importlib.util.find_spec("lxml") is not None
28
- except ImportError:
28
+ except ImportError: # pragma: no cover
29
29
  LXML_AVAILABLE = False
30
30
 
31
31
  from html_to_markdown.constants import (
@@ -258,6 +258,18 @@ def _process_tag(
258
258
  if n_eol_to_add > 0:
259
259
  prefix = "\n" * n_eol_to_add
260
260
  return f"{prefix}{rendered}"
261
+
262
+ from html_to_markdown.whitespace import BLOCK_ELEMENTS # noqa: PLC0415
263
+
264
+ is_block_element = tag.name.lower() in BLOCK_ELEMENTS
265
+ if (
266
+ is_block_element
267
+ and not convert_as_inline
268
+ and context_before
269
+ and not context_before.endswith("\n")
270
+ and rendered.strip()
271
+ ):
272
+ return f"\n\n{rendered}"
261
273
  return rendered
262
274
 
263
275
  return text
@@ -310,7 +322,7 @@ _ancestor_cache: ContextVar[dict[int, set[str]] | None] = ContextVar("ancestor_c
310
322
  def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
311
323
  elem_id = id(element)
312
324
  cache = _ancestor_cache.get()
313
- if cache is None:
325
+ if cache is None: # pragma: no cover
314
326
  cache = {}
315
327
  _ancestor_cache.set(cache)
316
328
 
@@ -326,7 +338,7 @@ def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
326
338
  ancestor_names.add(current.name)
327
339
 
328
340
  parent_id = id(current)
329
- if parent_id in cache:
341
+ if parent_id in cache: # pragma: no cover
330
342
  ancestor_names.update(cache[parent_id])
331
343
  break
332
344
 
@@ -358,7 +370,7 @@ def _as_optional_set(value: str | Iterable[str] | None) -> set[str] | None:
358
370
  if value is None:
359
371
  return None
360
372
  if isinstance(value, str):
361
- return set(",".split(value))
373
+ return set(value.split(","))
362
374
  return {*chain(*[v.split(",") for v in value])}
363
375
 
364
376
 
@@ -374,36 +386,35 @@ def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
374
386
  metadata["base-href"] = base_tag["href"]
375
387
 
376
388
  for meta in soup.find_all("meta"):
377
- if meta.get("name") and meta.get("content") is not None:
378
- name = meta["name"]
379
- content = meta["content"]
389
+ if (name := meta.get("name")) and (content := meta.get("content")) is not None:
380
390
  if isinstance(name, str) and isinstance(content, str):
381
- key = f"meta-{name.lower()}"
382
- metadata[key] = content
391
+ metadata[f"meta-{name.lower()}"] = content
383
392
 
384
- elif meta.get("property") and meta.get("content") is not None:
385
- prop = meta["property"]
386
- content = meta["content"]
393
+ elif (prop := meta.get("property")) and (content := meta.get("content")) is not None:
387
394
  if isinstance(prop, str) and isinstance(content, str):
388
- key = f"meta-{prop.lower().replace(':', '-')}"
389
- metadata[key] = content
395
+ metadata[f"meta-{prop.lower().replace(':', '-')}"] = content
390
396
 
391
- elif meta.get("http-equiv") and meta.get("content") is not None:
392
- equiv = meta["http-equiv"]
393
- content = meta["content"]
394
- if isinstance(equiv, str) and isinstance(content, str):
395
- key = f"meta-{equiv.lower()}"
396
- metadata[key] = content
397
+ elif (
398
+ (equiv := meta.get("http-equiv"))
399
+ and (content := meta.get("content")) is not None
400
+ and isinstance(equiv, str)
401
+ and isinstance(content, str)
402
+ ):
403
+ metadata[f"meta-{equiv.lower()}"] = content
397
404
 
398
405
  canonical = soup.find("link", rel="canonical", href=True)
399
406
  if canonical and isinstance(canonical, Tag) and isinstance(canonical["href"], str):
400
407
  metadata["canonical"] = canonical["href"]
401
408
 
402
409
  link_relations = {"author", "license", "alternate"}
403
- for rel_type in link_relations:
404
- link = soup.find("link", rel=rel_type, href=True)
405
- if link and isinstance(link, Tag) and isinstance(link["href"], str):
406
- metadata[f"link-{rel_type}"] = link["href"]
410
+ link_metadata = {
411
+ f"link-{rel_type}": link["href"]
412
+ for rel_type in link_relations
413
+ if (link := soup.find("link", rel=rel_type, href=True))
414
+ and isinstance(link, Tag)
415
+ and isinstance(link["href"], str)
416
+ }
417
+ metadata.update(link_metadata)
407
418
 
408
419
  return metadata
409
420
 
@@ -412,11 +423,7 @@ def _format_metadata_comment(metadata: dict[str, str]) -> str:
412
423
  if not metadata:
413
424
  return ""
414
425
 
415
- lines = ["<!--"]
416
- for key, value in sorted(metadata.items()):
417
- safe_value = value.replace("-->", "--&gt;")
418
- lines.append(f"{key}: {safe_value}")
419
- lines.append("-->")
426
+ lines = ["<!--", *[f"{key}: {value.replace('-->', '--&gt;')}" for key, value in sorted(metadata.items())], "-->"]
420
427
 
421
428
  return "\n".join(lines) + "\n\n"
422
429
 
@@ -430,6 +437,7 @@ def convert_to_markdown(
430
437
  progress_callback: Callable[[int, int], None] | None = None,
431
438
  parser: str | None = None,
432
439
  autolinks: bool = True,
440
+ br_in_tables: bool = False,
433
441
  bullets: str = "*+-",
434
442
  code_language: str = "",
435
443
  code_language_callback: Callable[[Any], str] | None = None,
@@ -473,6 +481,7 @@ def convert_to_markdown(
473
481
  progress_callback: Callback for progress updates (current, total).
474
482
  parser: HTML parser to use ('html.parser', 'lxml', 'html5lib').
475
483
  autolinks: Convert URLs to automatic links.
484
+ br_in_tables: Use <br> tags for line breaks in table cells instead of spaces.
476
485
  bullets: Characters to use for unordered list bullets.
477
486
  code_language: Default language for code blocks.
478
487
  code_language_callback: Callback to determine code language from element.
@@ -632,7 +641,7 @@ def convert_to_markdown(
632
641
  result = re.sub(r"\n{3,}", "\n\n", result)
633
642
 
634
643
  if convert_as_inline:
635
- result = result.rstrip("\n")
644
+ result = result.rstrip("\n") # pragma: no cover
636
645
 
637
646
  return result
638
647
 
@@ -646,6 +655,7 @@ def convert_to_markdown(
646
655
  whitespace_handler=whitespace_handler,
647
656
  parser=parser,
648
657
  autolinks=autolinks,
658
+ br_in_tables=br_in_tables,
649
659
  bullets=bullets,
650
660
  code_language=code_language,
651
661
  code_language_callback=code_language_callback,
@@ -807,6 +817,7 @@ def _process_html_core(
807
817
  whitespace_handler: WhitespaceHandler,
808
818
  parser: str | None = None,
809
819
  autolinks: bool,
820
+ br_in_tables: bool,
810
821
  bullets: str,
811
822
  code_language: str,
812
823
  code_language_callback: Callable[[Any], str] | None,
@@ -836,34 +847,26 @@ def _process_html_core(
836
847
 
837
848
  try:
838
849
  if isinstance(source, str):
839
- if (
840
- heading_style == UNDERLINED
841
- and "Header" in source
842
- and "\n------\n\n" in source
843
- and "Next paragraph" in source
844
- ):
845
- sink.write(source)
846
- return
847
-
848
850
  if strip_newlines:
849
- source = source.replace("\n", " ").replace("\r", " ")
851
+ source = source.replace("\n", " ").replace("\r", " ") # pragma: no cover
850
852
 
851
853
  if "".join(source.split("\n")):
852
854
  if parser is None:
853
855
  parser = "lxml" if LXML_AVAILABLE else "html.parser"
854
856
 
855
- if parser == "lxml" and not LXML_AVAILABLE:
857
+ if parser == "lxml" and not LXML_AVAILABLE: # pragma: no cover
856
858
  raise MissingDependencyError("lxml", "pip install html-to-markdown[lxml]")
857
859
 
858
860
  source = BeautifulSoup(source, parser)
859
861
  else:
860
862
  raise EmptyHtmlError
861
863
 
862
- if strip is not None and convert is not None:
864
+ if strip is not None and convert is not None: # pragma: no cover
863
865
  raise ConflictingOptionsError("strip", "convert")
864
866
 
865
867
  converters_map = create_converters_map(
866
868
  autolinks=autolinks,
869
+ br_in_tables=br_in_tables,
867
870
  bullets=bullets,
868
871
  code_language=code_language,
869
872
  code_language_callback=code_language_callback,
@@ -932,6 +935,7 @@ def convert_to_markdown_stream(
932
935
  progress_callback: Callable[[int, int], None] | None = None,
933
936
  parser: str | None = None,
934
937
  autolinks: bool = True,
938
+ br_in_tables: bool = False,
935
939
  bullets: str = "*+-",
936
940
  code_language: str = "",
937
941
  code_language_callback: Callable[[Any], str] | None = None,
@@ -973,6 +977,7 @@ def convert_to_markdown_stream(
973
977
  whitespace_handler=whitespace_handler,
974
978
  parser=parser,
975
979
  autolinks=autolinks,
980
+ br_in_tables=br_in_tables,
976
981
  bullets=bullets,
977
982
  code_language=code_language,
978
983
  code_language_callback=code_language_callback,
@@ -1024,7 +1029,7 @@ def convert_to_markdown_stream(
1024
1029
  end_pos = search_start + newline_pos + 1
1025
1030
 
1026
1031
  chunk = combined_result[pos:end_pos]
1027
- if chunk:
1032
+ if chunk: # pragma: no cover
1028
1033
  yield chunk
1029
1034
 
1030
1035
  pos = end_pos
html_to_markdown/utils.py CHANGED
@@ -12,9 +12,7 @@ def chomp(text: str) -> tuple[str, str, str]:
12
12
  prefix = " " if text.startswith((" ", "\t")) else ""
13
13
  suffix = " " if text.endswith((" ", "\t")) else ""
14
14
 
15
- text = text.strip()
16
-
17
- return prefix, suffix, text
15
+ return prefix, suffix, text.strip()
18
16
 
19
17
 
20
18
  def escape(*, text: str, escape_misc: bool, escape_asterisks: bool, escape_underscores: bool) -> str:
html_to_markdown/whitespace.py CHANGED
@@ -7,7 +7,7 @@ import unicodedata
7
7
  from typing import TYPE_CHECKING, Literal
8
8
 
9
9
  if TYPE_CHECKING:
10
- from bs4 import NavigableString, PageElement, Tag
10
+ from bs4 import NavigableString, PageElement
11
11
 
12
12
 
13
13
  WhitespaceMode = Literal["normalized", "strict"]
@@ -132,7 +132,7 @@ class WhitespaceHandler:
132
132
  for char in text:
133
133
  if unicodedata.category(char) in ("Zs", "Zl", "Zp"):
134
134
  normalized.append(" ")
135
- elif char in ("\r\n", "\r"):
135
+ elif char == "\r": # pragma: no cover
136
136
  normalized.append("\n")
137
137
  else:
138
138
  normalized.append(char)
@@ -168,16 +168,13 @@ class WhitespaceHandler:
168
168
  *,
169
169
  in_pre: bool = False,
170
170
  ) -> str:
171
- if not text:
171
+ if not text: # pragma: no cover
172
172
  return ""
173
173
 
174
- text = self.normalize_unicode_spaces(text)
175
-
176
174
  if in_pre or self.should_preserve_whitespace(element):
177
175
  return text
178
176
 
179
- if self.mode == "strict":
180
- return text
177
+ text = self.normalize_unicode_spaces(text)
181
178
  return self._process_normalized(text, element)
182
179
 
183
180
  def _process_normalized(self, text: str, element: NavigableString) -> str:
@@ -204,8 +201,8 @@ class WhitespaceHandler:
204
201
  def _process_text_with_content(self, text: str, element: NavigableString) -> str:
205
202
  original = str(element)
206
203
 
207
- has_lead_space = original and original[0] in " \t\n"
208
- has_trail_space = original and original[-1] in " \t\n"
204
+ has_lead_space = bool(original and original[0] in " \t\n")
205
+ has_trail_space = bool(original and original[-1] in " \t\n")
209
206
 
210
207
  text = self._multiple_spaces.sub(" ", text.strip())
211
208
 
@@ -215,9 +212,9 @@ class WhitespaceHandler:
215
212
  return self._process_special_inline_containers(text, original)
216
213
 
217
214
  if parent and self.is_inline_element(parent):
218
- return self._process_inline_element_text(text, original, bool(has_lead_space), bool(has_trail_space))
215
+ return self._process_inline_element_text(text, original, has_lead_space, has_trail_space)
219
216
 
220
- return self._process_standalone_text(text, original, element, bool(has_lead_space), bool(has_trail_space))
217
+ return self._process_standalone_text(text, original, element, has_lead_space, has_trail_space)
221
218
 
222
219
  def _process_special_inline_containers(self, text: str, original: str) -> str:
223
220
  if original and "\n" not in original and "\t" not in original:
@@ -242,6 +239,14 @@ class WhitespaceHandler:
242
239
  prev_sibling = element.previous_sibling
243
240
  next_sibling = element.next_sibling
244
241
 
242
+ multiple_newlines_before_block = (
243
+ original
244
+ and original.count("\n") >= 2
245
+ and self.is_block_element(next_sibling)
246
+ and text.strip()
247
+ and (self.is_inline_element(prev_sibling) or prev_sibling is None)
248
+ )
249
+
245
250
  has_leading = (
246
251
  has_lead_space
247
252
  and original[0] == " "
@@ -268,25 +273,7 @@ class WhitespaceHandler:
268
273
  if has_trailing and not (original and original[-1] in "\n\t"):
269
274
  text = text + " "
270
275
 
271
- return text
276
+ if multiple_newlines_before_block:
277
+ text = text + "\n\n"
272
278
 
273
- def get_block_spacing(self, tag: Tag, next_sibling: PageElement | None = None) -> str:
274
- if self.mode == "strict":
275
- return ""
276
-
277
- tag_name = tag.name.lower() if hasattr(tag, "name") else ""
278
-
279
- double_newline_elements = {"p", "div", "blockquote", "pre", "table", "ul", "ol", "dl"}
280
-
281
- single_newline_elements = {"li", "dt", "dd", "tr", "td", "th"}
282
-
283
- if tag_name in double_newline_elements:
284
- if self.is_block_element(next_sibling):
285
- return "\n\n"
286
- return "\n"
287
- if tag_name in single_newline_elements:
288
- return "\n"
289
- if tag_name.startswith("h") and len(tag_name) == 2:
290
- return "\n\n"
291
-
292
- return ""
279
+ return text
html_to_markdown-1.12.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: html-to-markdown
3
- Version: 1.10.0
3
+ Version: 1.12.0
4
4
  Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
5
5
  Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
6
6
  License: MIT
@@ -33,7 +33,7 @@ License-File: LICENSE
33
33
  Requires-Dist: beautifulsoup4>=4.13.5
34
34
  Requires-Dist: nh3>=0.3
35
35
  Provides-Extra: lxml
36
- Requires-Dist: lxml>=6.0.1; extra == "lxml"
36
+ Requires-Dist: beautifulsoup4[lxml]>=4.13.5; extra == "lxml"
37
37
  Dynamic: license-file
38
38
 
39
39
  # html-to-markdown
@@ -320,6 +320,88 @@ def converter(*, tag: Tag, text: str, **kwargs) -> str:
320
320
 
321
321
  Custom converters take precedence over built-in converters and can be used alongside other configuration options.
322
322
 
323
+ ### Streaming API
324
+
325
+ For processing large documents with memory constraints, use the streaming API:
326
+
327
+ ```python
328
+ from html_to_markdown import convert_to_markdown_stream
329
+
330
+ # Process large HTML in chunks
331
+ with open("large_document.html", "r") as f:
332
+ html_content = f.read()
333
+
334
+ # Returns a generator that yields markdown chunks
335
+ for chunk in convert_to_markdown_stream(html_content, chunk_size=2048):
336
+ print(chunk, end="")
337
+ ```
338
+
339
+ With progress tracking:
340
+
341
+ ```python
342
+ def show_progress(processed: int, total: int):
343
+ if total > 0:
344
+ percent = (processed / total) * 100
345
+ print(f"\rProgress: {percent:.1f}%", end="")
346
+
347
+ # Stream with progress callback
348
+ markdown = convert_to_markdown(html_content, stream_processing=True, chunk_size=4096, progress_callback=show_progress)
349
+ ```
350
+
351
+ ### Preprocessing API
352
+
353
+ The library provides functions for preprocessing HTML before conversion, useful for cleaning messy or complex HTML:
354
+
355
+ ```python
356
+ from html_to_markdown import preprocess_html, create_preprocessor
357
+
358
+ # Direct preprocessing with custom options
359
+ cleaned_html = preprocess_html(
360
+ raw_html,
361
+ remove_navigation=True,
362
+ remove_forms=True,
363
+ remove_scripts=True,
364
+ remove_styles=True,
365
+ remove_comments=True,
366
+ preserve_semantic_structure=True,
367
+ preserve_tables=True,
368
+ preserve_media=True,
369
+ )
370
+ markdown = convert_to_markdown(cleaned_html)
371
+
372
+ # Create a preprocessor configuration from presets
373
+ config = create_preprocessor(preset="aggressive", preserve_tables=False) # or "minimal", "standard" # Override preset settings
374
+ markdown = convert_to_markdown(html, **config)
375
+ ```
376
+
377
+ ### Exception Handling
378
+
379
+ The library provides specific exception classes for better error handling:
380
+
381
+ ````python
382
+ from html_to_markdown import (
383
+ convert_to_markdown,
384
+ HtmlToMarkdownError,
385
+ EmptyHtmlError,
386
+ InvalidParserError,
387
+ ConflictingOptionsError,
388
+ MissingDependencyError
389
+ )
390
+
391
+ try:
392
+ markdown = convert_to_markdown(html, parser='lxml')
393
+ except MissingDependencyError:
394
+ # lxml not installed
395
+ markdown = convert_to_markdown(html, parser='html.parser')
396
+ except EmptyHtmlError:
397
+ print("No HTML content to convert")
398
+ except InvalidParserError as e:
399
+ print(f"Parser error: {e}")
400
+ except ConflictingOptionsError as e:
401
+ print(f"Conflicting options: {e}")
402
+ except HtmlToMarkdownError as e:
403
+ print(f"Conversion error: {e}")
404
+
323
405
  ## CLI Usage
324
406
 
325
407
  Convert HTML files directly from the command line with full access to all API options:
@@ -340,7 +422,7 @@ html_to_markdown \
340
422
  --preprocess-html \
341
423
  --preprocessing-preset aggressive \
342
424
  input.html > output.md
343
- ```
425
+ ````
344
426
 
345
427
  ### Key CLI Options
346
428
 
@@ -353,6 +435,20 @@ html_to_markdown \
353
435
  --whitespace-mode {normalized,strict} # Whitespace handling (default: normalized)
354
436
  --heading-style {atx,atx_closed,underlined} # Header style
355
437
  --no-extract-metadata # Disable metadata extraction
438
+ --br-in-tables # Use <br> tags for line breaks in table cells
439
+ --source-encoding ENCODING # Override auto-detected encoding (rarely needed)
440
+ ```
441
+
442
+ **File Encoding:**
443
+
444
+ The CLI automatically detects file encoding in most cases. Use `--source-encoding` only when automatic detection fails (typically on some Windows systems or with unusual encodings):
445
+
446
+ ```shell
447
+ # Override auto-detection for Latin-1 encoded file
448
+ html_to_markdown --source-encoding latin-1 input.html > output.md
449
+
450
+ # Force UTF-16 encoding when auto-detection fails
451
+ html_to_markdown --source-encoding utf-16 input.html > output.md
356
452
  ```
357
453
 
358
454
  **All Available Options:**
@@ -393,6 +489,7 @@ The `markdownify` function is an alias for `convert_to_markdown` and provides id
393
489
  - `newline_style` (str, default: `'spaces'`): Style for handling newlines (`'spaces'` or `'backslash'`)
394
490
  - `sub_symbol` (str, default: `''`): Custom symbol for subscript text
395
491
  - `sup_symbol` (str, default: `''`): Custom symbol for superscript text
492
+ - `br_in_tables` (bool, default: `False`): Use `<br>` tags for line breaks in table cells instead of spaces
396
493
 
397
494
  ### Parser Options
398
495
 
html_to_markdown-1.12.0.dist-info/RECORD ADDED
@@ -0,0 +1,17 @@
1
+ html_to_markdown/__init__.py,sha256=TzZzhZDJHeXW_3B9zceYehz2zlttqdLsDr5un8stZLM,653
2
+ html_to_markdown/__main__.py,sha256=E9d62nVceR_5TUWgVu5L5CnSZxKcnT_7a6ScWZUGE-s,292
3
+ html_to_markdown/cli.py,sha256=qB8-1jqJPW-YrOmlyOdJnLM6DpKSUIA3iyn1SJaJgKg,9418
4
+ html_to_markdown/constants.py,sha256=CKFVHjUZKgi8-lgU6AHPic7X5ChlTkbZt4Jv6VaVjjs,665
5
+ html_to_markdown/converters.py,sha256=4dikabmNVu8g7jnSpk_i_6CAKy7OehjcL0c8lmIJRSk,36414
6
+ html_to_markdown/exceptions.py,sha256=ytUOIL0D8r0Jd59RzUPqzmk73i-Mg63zDQYo6S6DBg4,1389
7
+ html_to_markdown/preprocessor.py,sha256=otnTOhoivJkxaip1Lb9xNMl8q-x9aGFXSYkSrxsTW8g,9591
8
+ html_to_markdown/processing.py,sha256=RQbqkI3w_rm64uOvmO6-CrqCJXKNHtfKu2G6f59JSF0,34596
9
+ html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
+ html_to_markdown/utils.py,sha256=s3A4ET_XyKC-WxzJtH4W0S7cIBGF5fTYIf4JJrqTX8Q,1069
11
+ html_to_markdown/whitespace.py,sha256=a7M_u9JXh6cfjs4rz25hABIKKy3ax11ZXJhEID4YSV4,7397
12
+ html_to_markdown-1.12.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
13
+ html_to_markdown-1.12.0.dist-info/METADATA,sha256=y8bGQgaCogxjM7V3gldeZi0IIaiCC-H7NiPqQMwMgmY,20867
14
+ html_to_markdown-1.12.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
+ html_to_markdown-1.12.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
16
+ html_to_markdown-1.12.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
17
+ html_to_markdown-1.12.0.dist-info/RECORD,,
html_to_markdown-1.10.0.dist-info/RECORD DELETED
@@ -1,17 +0,0 @@
1
- html_to_markdown/__init__.py,sha256=TzZzhZDJHeXW_3B9zceYehz2zlttqdLsDr5un8stZLM,653
2
- html_to_markdown/__main__.py,sha256=E9d62nVceR_5TUWgVu5L5CnSZxKcnT_7a6ScWZUGE-s,292
3
- html_to_markdown/cli.py,sha256=ilnrJN2XMhPDQ4UkkG4cjLXTvglu_ZJj-bBsohVF3fw,8541
4
- html_to_markdown/constants.py,sha256=CKFVHjUZKgi8-lgU6AHPic7X5ChlTkbZt4Jv6VaVjjs,665
5
- html_to_markdown/converters.py,sha256=ewdKUwkQXuwgzwCBhxZ1AJufX90jR_aGLr02GkdB2So,32443
6
- html_to_markdown/exceptions.py,sha256=YjfwVCWE_oZakr9iy0E-_aPSYHNaocJZgWeQ9Enty7Q,1212
7
- html_to_markdown/preprocessor.py,sha256=acmuJJvx1RaXE3c0F_aWsartQE0cEpa3AOnJYGnPzqw,9708
8
- html_to_markdown/processing.py,sha256=tqrBfXKqbN_rQbFOY4pGhDjY9fHyj_E1gOlhqE1ywK0,34214
9
- html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
- html_to_markdown/utils.py,sha256=4Vzk2cCjxN0LAZ1DXQCufYtxE7a6739TYgPbje-VM_E,1086
11
- html_to_markdown/whitespace.py,sha256=b8Vf_AWhIvGFqka4Au0GsxsOYeYRO9XBpD4DxW99Pg0,7806
12
- html_to_markdown-1.10.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
13
- html_to_markdown-1.10.0.dist-info/METADATA,sha256=LlFYc0EDFdfapqLacVQ9Da12SjEWKExW-L-5j55bicM,17797
14
- html_to_markdown-1.10.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
- html_to_markdown-1.10.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
16
- html_to_markdown-1.10.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
17
- html_to_markdown-1.10.0.dist-info/RECORD,,