html-to-markdown 1.11.0__py3-none-any.whl → 1.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- html_to_markdown/cli.py +28 -2
- html_to_markdown/converters.py +214 -127
- html_to_markdown/exceptions.py +5 -0
- html_to_markdown/preprocessor.py +96 -86
- html_to_markdown/processing.py +36 -34
- html_to_markdown/utils.py +1 -3
- html_to_markdown/whitespace.py +7 -31
- {html_to_markdown-1.11.0.dist-info → html_to_markdown-1.12.0.dist-info}/METADATA +99 -2
- html_to_markdown-1.12.0.dist-info/RECORD +17 -0
- html_to_markdown-1.11.0.dist-info/RECORD +0 -17
- {html_to_markdown-1.11.0.dist-info → html_to_markdown-1.12.0.dist-info}/WHEEL +0 -0
- {html_to_markdown-1.11.0.dist-info → html_to_markdown-1.12.0.dist-info}/entry_points.txt +0 -0
- {html_to_markdown-1.11.0.dist-info → html_to_markdown-1.12.0.dist-info}/licenses/LICENSE +0 -0
- {html_to_markdown-1.11.0.dist-info → html_to_markdown-1.12.0.dist-info}/top_level.txt +0 -0
html_to_markdown/cli.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import sys
|
|
2
2
|
from argparse import ArgumentParser, FileType
|
|
3
|
+
from pathlib import Path
|
|
3
4
|
|
|
4
5
|
from html_to_markdown.constants import (
|
|
5
6
|
ASTERISK,
|
|
@@ -13,6 +14,7 @@ from html_to_markdown.constants import (
|
|
|
13
14
|
WHITESPACE_NORMALIZED,
|
|
14
15
|
WHITESPACE_STRICT,
|
|
15
16
|
)
|
|
17
|
+
from html_to_markdown.exceptions import InvalidEncodingError
|
|
16
18
|
from html_to_markdown.processing import convert_to_markdown
|
|
17
19
|
|
|
18
20
|
|
|
@@ -131,6 +133,12 @@ def main(argv: list[str]) -> str:
|
|
|
131
133
|
help="Parent tags where images remain inline (not converted to alt-text).",
|
|
132
134
|
)
|
|
133
135
|
|
|
136
|
+
parser.add_argument(
|
|
137
|
+
"--br-in-tables",
|
|
138
|
+
action="store_true",
|
|
139
|
+
help="Use <br> tags for line breaks in table cells instead of spaces.",
|
|
140
|
+
)
|
|
141
|
+
|
|
134
142
|
parser.add_argument("-w", "--wrap", action="store_true", help="Enable text wrapping at --wrap-width characters.")
|
|
135
143
|
|
|
136
144
|
parser.add_argument(
|
|
@@ -235,10 +243,18 @@ def main(argv: list[str]) -> str:
|
|
|
235
243
|
help="Keep navigation elements when preprocessing (normally removed).",
|
|
236
244
|
)
|
|
237
245
|
|
|
246
|
+
parser.add_argument(
|
|
247
|
+
"--source-encoding",
|
|
248
|
+
type=str,
|
|
249
|
+
default=None,
|
|
250
|
+
help="Source file encoding (e.g. 'utf-8', 'latin-1'). Defaults to system default.",
|
|
251
|
+
)
|
|
252
|
+
|
|
238
253
|
args = parser.parse_args(argv)
|
|
239
254
|
|
|
240
255
|
base_args = {
|
|
241
256
|
"autolinks": args.autolinks,
|
|
257
|
+
"br_in_tables": args.br_in_tables,
|
|
242
258
|
"bullets": args.bullets,
|
|
243
259
|
"code_language": args.code_language,
|
|
244
260
|
"convert": args.convert,
|
|
@@ -278,7 +294,7 @@ def main(argv: list[str]) -> str:
|
|
|
278
294
|
if args.show_progress:
|
|
279
295
|
|
|
280
296
|
def progress_callback(processed: int, total: int) -> None:
|
|
281
|
-
if total > 0:
|
|
297
|
+
if total > 0: # pragma: no cover
|
|
282
298
|
percent = (processed / total) * 100
|
|
283
299
|
|
|
284
300
|
sys.stderr.write(f"\rProgress: {percent:.1f}% ({processed}/{total} bytes)")
|
|
@@ -286,4 +302,14 @@ def main(argv: list[str]) -> str:
|
|
|
286
302
|
|
|
287
303
|
base_args["progress_callback"] = progress_callback
|
|
288
304
|
|
|
289
|
-
|
|
305
|
+
if args.source_encoding and args.html.name != "<stdin>":
|
|
306
|
+
args.html.close()
|
|
307
|
+
try:
|
|
308
|
+
with Path(args.html.name).open(encoding=args.source_encoding) as f:
|
|
309
|
+
html_content = f.read()
|
|
310
|
+
except LookupError as e:
|
|
311
|
+
raise InvalidEncodingError(args.source_encoding) from e
|
|
312
|
+
else:
|
|
313
|
+
html_content = args.html.read()
|
|
314
|
+
|
|
315
|
+
return convert_to_markdown(html_content, **base_args)
|
html_to_markdown/converters.py
CHANGED
|
@@ -5,9 +5,11 @@ from typing import TYPE_CHECKING
|
|
|
5
5
|
if TYPE_CHECKING:
|
|
6
6
|
from collections.abc import Iterable
|
|
7
7
|
import base64
|
|
8
|
+
import re
|
|
8
9
|
from collections.abc import Callable
|
|
9
10
|
from functools import partial
|
|
10
11
|
from inspect import getfullargspec
|
|
12
|
+
from itertools import chain
|
|
11
13
|
from textwrap import fill
|
|
12
14
|
from typing import Any, Literal, TypeVar, cast
|
|
13
15
|
|
|
@@ -36,6 +38,19 @@ def _format_wrapped_block(text: str, start_marker: str, end_marker: str = "") ->
|
|
|
36
38
|
return f"{start_marker}{text.strip()}{end_marker}\n\n" if text.strip() else ""
|
|
37
39
|
|
|
38
40
|
|
|
41
|
+
def _find_list_item_ancestor(tag: Tag) -> Tag | None:
|
|
42
|
+
"""Find the nearest list item ancestor of a tag."""
|
|
43
|
+
parent = tag.parent
|
|
44
|
+
while parent and parent.name != "li":
|
|
45
|
+
parent = parent.parent
|
|
46
|
+
return parent
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
BLOCK_ELEMENTS = frozenset({"p", "blockquote", "pre", "ul", "ol", "div", "h1", "h2", "h3", "h4", "h5", "h6"})
|
|
50
|
+
|
|
51
|
+
_LIST_ITEM_PATTERN = re.compile(r"^\s*(\*|\+|-|\d+\.)\s")
|
|
52
|
+
|
|
53
|
+
|
|
39
54
|
SupportedElements = Literal[
|
|
40
55
|
"a",
|
|
41
56
|
"abbr",
|
|
@@ -270,52 +285,91 @@ def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: It
|
|
|
270
285
|
return f""
|
|
271
286
|
|
|
272
287
|
|
|
288
|
+
def _has_block_list_items(tag: Tag) -> bool:
|
|
289
|
+
"""Check if any list items contain block elements."""
|
|
290
|
+
return any(
|
|
291
|
+
any(child.name in BLOCK_ELEMENTS for child in li.children if hasattr(child, "name"))
|
|
292
|
+
for li in tag.find_all("li", recursive=False)
|
|
293
|
+
)
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def _handle_nested_list_indentation(text: str, list_indent_str: str, parent: Tag) -> str:
|
|
297
|
+
"""Handle indentation for lists nested within list items."""
|
|
298
|
+
prev_p = None
|
|
299
|
+
for child in parent.children:
|
|
300
|
+
if hasattr(child, "name"):
|
|
301
|
+
if child.name == "p":
|
|
302
|
+
prev_p = child
|
|
303
|
+
break
|
|
304
|
+
|
|
305
|
+
if prev_p:
|
|
306
|
+
lines = text.strip().split("\n")
|
|
307
|
+
indented_lines = [f"{list_indent_str}{line}" if line.strip() else "" for line in lines]
|
|
308
|
+
return "\n" + "\n".join(indented_lines) + "\n"
|
|
309
|
+
return "\n" + indent(text=text, level=1, indent_str=list_indent_str).rstrip()
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def _handle_direct_nested_list_indentation(text: str, list_indent_str: str) -> str:
|
|
313
|
+
"""Handle indentation for lists that are direct children of other lists."""
|
|
314
|
+
lines = text.strip().split("\n")
|
|
315
|
+
indented_lines = [f"{list_indent_str}{line}" if line.strip() else "" for line in lines]
|
|
316
|
+
result = "\n".join(indented_lines)
|
|
317
|
+
return result + "\n" if not result.endswith("\n") else result
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
def _add_list_item_spacing(text: str) -> str:
|
|
321
|
+
"""Add extra spacing between list items that contain block content."""
|
|
322
|
+
lines = text.split("\n")
|
|
323
|
+
items_with_blocks = set()
|
|
324
|
+
|
|
325
|
+
i = 0
|
|
326
|
+
while i < len(lines):
|
|
327
|
+
line = lines[i]
|
|
328
|
+
if line.strip() and _LIST_ITEM_PATTERN.match(line.lstrip()):
|
|
329
|
+
j = i + 1
|
|
330
|
+
has_continuation = False
|
|
331
|
+
while j < len(lines):
|
|
332
|
+
next_line = lines[j]
|
|
333
|
+
if next_line.strip() and _LIST_ITEM_PATTERN.match(next_line.lstrip()):
|
|
334
|
+
break
|
|
335
|
+
if next_line.strip() and next_line.startswith((" ", " ", "\t")):
|
|
336
|
+
has_continuation = True
|
|
337
|
+
j += 1
|
|
338
|
+
|
|
339
|
+
if has_continuation and j < len(lines):
|
|
340
|
+
items_with_blocks.add(j - 1)
|
|
341
|
+
|
|
342
|
+
i += 1
|
|
343
|
+
|
|
344
|
+
if items_with_blocks:
|
|
345
|
+
processed_lines = list(
|
|
346
|
+
chain.from_iterable([line, ""] if i in items_with_blocks else [line] for i, line in enumerate(lines))
|
|
347
|
+
)
|
|
348
|
+
return "\n".join(processed_lines)
|
|
349
|
+
|
|
350
|
+
return text
|
|
351
|
+
|
|
352
|
+
|
|
273
353
|
def _convert_list(*, tag: Tag, text: str, list_indent_str: str) -> str:
|
|
274
354
|
from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
|
|
275
355
|
|
|
276
|
-
before_paragraph =
|
|
277
|
-
if tag.next_sibling and getattr(tag.next_sibling, "name", None) not in {"ul", "ol"}:
|
|
278
|
-
before_paragraph = True
|
|
356
|
+
before_paragraph = tag.next_sibling and getattr(tag.next_sibling, "name", None) not in {"ul", "ol"}
|
|
279
357
|
|
|
280
|
-
|
|
281
|
-
parent = tag.parent
|
|
282
|
-
while parent and parent.name != "li":
|
|
283
|
-
parent = parent.parent
|
|
358
|
+
has_block_items = _has_block_list_items(tag)
|
|
284
359
|
|
|
360
|
+
if _has_ancestor(tag, "li"):
|
|
361
|
+
parent = _find_list_item_ancestor(tag)
|
|
285
362
|
if parent:
|
|
286
|
-
|
|
287
|
-
for child in parent.children:
|
|
288
|
-
if hasattr(child, "name"):
|
|
289
|
-
if child == tag:
|
|
290
|
-
break
|
|
291
|
-
if child.name == "p":
|
|
292
|
-
prev_p = child
|
|
293
|
-
|
|
294
|
-
if prev_p:
|
|
295
|
-
lines = text.strip().split("\n")
|
|
296
|
-
indented_lines = []
|
|
297
|
-
for line in lines:
|
|
298
|
-
if line.strip():
|
|
299
|
-
indented_lines.append(f"{list_indent_str}{line}")
|
|
300
|
-
else:
|
|
301
|
-
indented_lines.append("")
|
|
302
|
-
return "\n" + "\n".join(indented_lines) + "\n"
|
|
303
|
-
return "\n" + indent(text=text, level=1, indent_str=list_indent_str).rstrip()
|
|
363
|
+
return _handle_nested_list_indentation(text, list_indent_str, parent)
|
|
304
364
|
|
|
305
365
|
if tag.parent and tag.parent.name in {"ul", "ol"}:
|
|
306
|
-
|
|
307
|
-
indented_lines = []
|
|
308
|
-
for line in lines:
|
|
309
|
-
if line.strip():
|
|
310
|
-
indented_lines.append(f"{list_indent_str}{line}")
|
|
311
|
-
else:
|
|
312
|
-
indented_lines.append("")
|
|
313
|
-
result = "\n".join(indented_lines)
|
|
314
|
-
if not result.endswith("\n"):
|
|
315
|
-
result += "\n"
|
|
316
|
-
return result
|
|
366
|
+
return _handle_direct_nested_list_indentation(text, list_indent_str)
|
|
317
367
|
|
|
318
|
-
|
|
368
|
+
if has_block_items:
|
|
369
|
+
text = _add_list_item_spacing(text)
|
|
370
|
+
|
|
371
|
+
trailing_newlines = "\n\n" if has_block_items else ("\n" if before_paragraph else "")
|
|
372
|
+
return text + trailing_newlines
|
|
319
373
|
|
|
320
374
|
|
|
321
375
|
def _convert_li(*, tag: Tag, text: str, bullets: str, list_indent_str: str) -> str:
|
|
@@ -324,10 +378,8 @@ def _convert_li(*, tag: Tag, text: str, bullets: str, list_indent_str: str) -> s
|
|
|
324
378
|
checked = checkbox.get("checked") is not None
|
|
325
379
|
checkbox_symbol = "[x]" if checked else "[ ]"
|
|
326
380
|
|
|
327
|
-
checkbox_text = text
|
|
328
|
-
|
|
329
|
-
checkbox_text = text.replace(str(checkbox.string), "").strip()
|
|
330
|
-
return f"- {checkbox_symbol} {checkbox_text.strip()}\n"
|
|
381
|
+
checkbox_text = text.strip()
|
|
382
|
+
return f"- {checkbox_symbol} {checkbox_text}\n"
|
|
331
383
|
|
|
332
384
|
parent = tag.parent
|
|
333
385
|
if parent is not None and parent.name == "ol":
|
|
@@ -349,11 +401,7 @@ def _convert_li(*, tag: Tag, text: str, bullets: str, list_indent_str: str) -> s
|
|
|
349
401
|
|
|
350
402
|
bullet = bullets[depth % len(bullets)]
|
|
351
403
|
|
|
352
|
-
has_block_children =
|
|
353
|
-
child.name in {"p", "blockquote", "pre", "ul", "ol", "div", "h1", "h2", "h3", "h4", "h5", "h6"}
|
|
354
|
-
for child in tag.children
|
|
355
|
-
if hasattr(child, "name")
|
|
356
|
-
)
|
|
404
|
+
has_block_children = "\n\n" in text
|
|
357
405
|
|
|
358
406
|
if has_block_children:
|
|
359
407
|
paragraphs = text.strip().split("\n\n")
|
|
@@ -390,20 +438,13 @@ def _convert_p(
|
|
|
390
438
|
from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
|
|
391
439
|
|
|
392
440
|
if _has_ancestor(tag, "li"):
|
|
393
|
-
parent = tag
|
|
394
|
-
while parent and parent.name != "li":
|
|
395
|
-
parent = parent.parent
|
|
441
|
+
parent = _find_list_item_ancestor(tag)
|
|
396
442
|
|
|
397
443
|
if parent:
|
|
398
444
|
p_children = [child for child in parent.children if hasattr(child, "name") and child.name == "p"]
|
|
399
445
|
|
|
400
446
|
if p_children and tag != p_children[0]:
|
|
401
|
-
indented_lines = []
|
|
402
|
-
for line in text.split("\n"):
|
|
403
|
-
if line.strip():
|
|
404
|
-
indented_lines.append(f"{list_indent_str}{line}")
|
|
405
|
-
else:
|
|
406
|
-
indented_lines.append("")
|
|
447
|
+
indented_lines = [f"{list_indent_str}{line}" if line.strip() else "" for line in text.split("\n")]
|
|
407
448
|
text = "\n".join(indented_lines)
|
|
408
449
|
|
|
409
450
|
return f"{text}\n\n" if text else ""
|
|
@@ -440,66 +481,69 @@ def _convert_pre(
|
|
|
440
481
|
return f"\n```{code_language}\n{text}\n```\n"
|
|
441
482
|
|
|
442
483
|
|
|
443
|
-
def
|
|
484
|
+
def _process_table_cell_content(*, tag: Tag, text: str, br_in_tables: bool) -> str:
|
|
485
|
+
"""Process table cell content, optionally using <br> tags for multi-line content."""
|
|
486
|
+
if br_in_tables:
|
|
487
|
+
block_children = [child for child in tag.children if hasattr(child, "name") and child.name in BLOCK_ELEMENTS]
|
|
488
|
+
|
|
489
|
+
if len(block_children) > 1:
|
|
490
|
+
child_contents = []
|
|
491
|
+
for child in block_children:
|
|
492
|
+
child_text = child.get_text().strip()
|
|
493
|
+
if child_text:
|
|
494
|
+
child_contents.append(child_text)
|
|
495
|
+
return "<br>".join(child_contents)
|
|
496
|
+
return text.strip().replace("\n", "<br>")
|
|
497
|
+
return text.strip().replace("\n", " ")
|
|
498
|
+
|
|
499
|
+
|
|
500
|
+
def _convert_td(*, tag: Tag, text: str, br_in_tables: bool = False) -> str:
|
|
444
501
|
colspan = _get_colspan(tag)
|
|
445
|
-
|
|
502
|
+
processed_text = _process_table_cell_content(tag=tag, text=text, br_in_tables=br_in_tables)
|
|
503
|
+
return " " + processed_text + " |" * colspan
|
|
446
504
|
|
|
447
505
|
|
|
448
|
-
def _convert_th(*, tag: Tag, text: str) -> str:
|
|
506
|
+
def _convert_th(*, tag: Tag, text: str, br_in_tables: bool = False) -> str:
|
|
449
507
|
colspan = _get_colspan(tag)
|
|
450
|
-
|
|
508
|
+
processed_text = _process_table_cell_content(tag=tag, text=text, br_in_tables=br_in_tables)
|
|
509
|
+
return " " + processed_text + " |" * colspan
|
|
451
510
|
|
|
452
511
|
|
|
453
|
-
def
|
|
454
|
-
cells
|
|
455
|
-
|
|
456
|
-
|
|
512
|
+
def _get_rowspan_positions(prev_cells: list[Tag]) -> tuple[list[int], int]:
|
|
513
|
+
"""Get positions of cells with rowspan > 1 from previous row."""
|
|
514
|
+
rowspan_positions = []
|
|
515
|
+
col_pos = 0
|
|
457
516
|
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
if rowspan > 1:
|
|
473
|
-
rowspan_positions.append(col_pos)
|
|
474
|
-
|
|
475
|
-
colspan = 1
|
|
476
|
-
if (
|
|
477
|
-
"colspan" in prev_cell.attrs
|
|
478
|
-
and isinstance(prev_cell["colspan"], str)
|
|
479
|
-
and prev_cell["colspan"].isdigit()
|
|
480
|
-
):
|
|
481
|
-
colspan = int(prev_cell["colspan"])
|
|
482
|
-
col_pos += colspan
|
|
517
|
+
for prev_cell in prev_cells:
|
|
518
|
+
rowspan = 1
|
|
519
|
+
if "rowspan" in prev_cell.attrs and isinstance(prev_cell["rowspan"], str) and prev_cell["rowspan"].isdigit():
|
|
520
|
+
rowspan = int(prev_cell["rowspan"])
|
|
521
|
+
|
|
522
|
+
if rowspan > 1:
|
|
523
|
+
rowspan_positions.append(col_pos)
|
|
524
|
+
|
|
525
|
+
colspan = 1
|
|
526
|
+
if "colspan" in prev_cell.attrs and isinstance(prev_cell["colspan"], str) and prev_cell["colspan"].isdigit():
|
|
527
|
+
colspan = int(prev_cell["colspan"])
|
|
528
|
+
col_pos += colspan
|
|
529
|
+
|
|
530
|
+
return rowspan_positions, col_pos
|
|
483
531
|
|
|
484
|
-
if rowspan_positions:
|
|
485
|
-
converted_cells: list[str] = []
|
|
486
|
-
if text.strip():
|
|
487
|
-
parts = text.split("|")
|
|
488
|
-
converted_cells.extend(part.rstrip() + " |" for part in parts[:-1] if part)
|
|
489
532
|
|
|
490
|
-
|
|
491
|
-
|
|
533
|
+
def _handle_rowspan_text(text: str, rowspan_positions: list[int], col_pos: int) -> str:
|
|
534
|
+
"""Handle text adjustment for rows with rowspan cells."""
|
|
535
|
+
converted_cells = [part.rstrip() + " |" for part in text.split("|")[:-1] if part] if text.strip() else []
|
|
536
|
+
rowspan_set = set(rowspan_positions)
|
|
492
537
|
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
new_cells.append(" |")
|
|
496
|
-
elif cell_index < len(converted_cells):
|
|
497
|
-
new_cells.append(converted_cells[cell_index])
|
|
498
|
-
cell_index += 1
|
|
538
|
+
cell_iter = iter(converted_cells)
|
|
539
|
+
new_cells = [" |" if pos in rowspan_set else next(cell_iter, "") for pos in range(col_pos)]
|
|
499
540
|
|
|
500
|
-
|
|
541
|
+
return "".join(new_cells)
|
|
501
542
|
|
|
502
|
-
|
|
543
|
+
|
|
544
|
+
def _is_header_row(tag: Tag, cells: list[Tag], parent_name: str, tag_grand_parent: Tag | None) -> bool:
|
|
545
|
+
"""Determine if this table row should be treated as a header row."""
|
|
546
|
+
return (
|
|
503
547
|
all(hasattr(cell, "name") and cell.name == "th" for cell in cells)
|
|
504
548
|
or (not tag.previous_sibling and parent_name != "tbody")
|
|
505
549
|
or (
|
|
@@ -508,25 +552,48 @@ def _convert_tr(*, tag: Tag, text: str) -> str:
|
|
|
508
552
|
and (not tag_grand_parent or len(tag_grand_parent.find_all(["thead"])) < 1)
|
|
509
553
|
)
|
|
510
554
|
)
|
|
555
|
+
|
|
556
|
+
|
|
557
|
+
def _calculate_total_colspan(cells: list[Tag]) -> int:
|
|
558
|
+
"""Calculate total colspan for all cells in a row."""
|
|
559
|
+
full_colspan = 0
|
|
560
|
+
for cell in cells:
|
|
561
|
+
if hasattr(cell, "attrs") and "colspan" in cell.attrs:
|
|
562
|
+
colspan_value = cell.attrs["colspan"]
|
|
563
|
+
if isinstance(colspan_value, str) and colspan_value.isdigit():
|
|
564
|
+
full_colspan += int(colspan_value)
|
|
565
|
+
else:
|
|
566
|
+
full_colspan += 1
|
|
567
|
+
else:
|
|
568
|
+
full_colspan += 1
|
|
569
|
+
return full_colspan
|
|
570
|
+
|
|
571
|
+
|
|
572
|
+
def _convert_tr(*, tag: Tag, text: str) -> str:
|
|
573
|
+
cells = tag.find_all(["td", "th"])
|
|
574
|
+
parent_name = tag.parent.name if tag.parent and hasattr(tag.parent, "name") else ""
|
|
575
|
+
tag_grand_parent = tag.parent.parent if tag.parent else None
|
|
576
|
+
|
|
577
|
+
if tag.previous_sibling and hasattr(tag.previous_sibling, "name") and tag.previous_sibling.name == "tr":
|
|
578
|
+
prev_cells = cast("Tag", tag.previous_sibling).find_all(["td", "th"])
|
|
579
|
+
rowspan_positions, col_pos = _get_rowspan_positions(prev_cells)
|
|
580
|
+
|
|
581
|
+
if rowspan_positions:
|
|
582
|
+
text = _handle_rowspan_text(text, rowspan_positions, col_pos)
|
|
583
|
+
|
|
584
|
+
is_headrow = _is_header_row(tag, cells, parent_name, tag_grand_parent)
|
|
511
585
|
overline = ""
|
|
512
586
|
underline = ""
|
|
587
|
+
|
|
513
588
|
if is_headrow and not tag.previous_sibling:
|
|
514
|
-
full_colspan =
|
|
515
|
-
for cell in cells:
|
|
516
|
-
if hasattr(cell, "attrs") and "colspan" in cell.attrs:
|
|
517
|
-
colspan_value = cell.attrs["colspan"]
|
|
518
|
-
if isinstance(colspan_value, str) and colspan_value.isdigit():
|
|
519
|
-
full_colspan += int(colspan_value)
|
|
520
|
-
else:
|
|
521
|
-
full_colspan += 1
|
|
522
|
-
else:
|
|
523
|
-
full_colspan += 1
|
|
589
|
+
full_colspan = _calculate_total_colspan(cells)
|
|
524
590
|
underline += "| " + " | ".join(["---"] * full_colspan) + " |" + "\n"
|
|
525
591
|
elif not tag.previous_sibling and (
|
|
526
592
|
parent_name == "table" or (parent_name == "tbody" and not cast("Tag", tag.parent).previous_sibling)
|
|
527
593
|
):
|
|
528
|
-
overline += "| " + " | ".join([""] * len(cells)) + " |" + "\n"
|
|
529
|
-
overline += "| " + " | ".join(["---"] * len(cells)) + " |" + "\n"
|
|
594
|
+
overline += "| " + " | ".join([""] * len(cells)) + " |" + "\n" # pragma: no cover
|
|
595
|
+
overline += "| " + " | ".join(["---"] * len(cells)) + " |" + "\n" # pragma: no cover
|
|
596
|
+
|
|
530
597
|
return overline + "|" + text + "\n" + underline
|
|
531
598
|
|
|
532
599
|
|
|
@@ -578,10 +645,23 @@ def _convert_semantic_block(*, text: str, convert_as_inline: bool) -> str:
|
|
|
578
645
|
return f"{text}\n\n" if text.strip() else ""
|
|
579
646
|
|
|
580
647
|
|
|
581
|
-
def _convert_div(*, text: str, convert_as_inline: bool) -> str:
|
|
648
|
+
def _convert_div(*, text: str, convert_as_inline: bool, tag: Tag, list_indent_str: str) -> str:
|
|
582
649
|
if convert_as_inline:
|
|
583
650
|
return text
|
|
584
651
|
|
|
652
|
+
from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
|
|
653
|
+
|
|
654
|
+
if _has_ancestor(tag, "li"):
|
|
655
|
+
parent = _find_list_item_ancestor(tag)
|
|
656
|
+
if parent:
|
|
657
|
+
div_children = [child for child in parent.children if hasattr(child, "name") and child.name == "div"]
|
|
658
|
+
|
|
659
|
+
if div_children and tag != div_children[0]:
|
|
660
|
+
indented_lines = [f"{list_indent_str}{line}" if line.strip() else "" for line in text.split("\n")]
|
|
661
|
+
indented_text = "\n".join(indented_lines)
|
|
662
|
+
|
|
663
|
+
return f"{indented_text}\n\n" if indented_text.strip() else ""
|
|
664
|
+
|
|
585
665
|
return _format_block_element(text)
|
|
586
666
|
|
|
587
667
|
|
|
@@ -603,7 +683,7 @@ def _convert_dl(*, text: str, convert_as_inline: bool) -> str:
|
|
|
603
683
|
if convert_as_inline:
|
|
604
684
|
return text
|
|
605
685
|
|
|
606
|
-
return
|
|
686
|
+
return _format_block_element(text)
|
|
607
687
|
|
|
608
688
|
|
|
609
689
|
def _convert_dt(*, text: str, convert_as_inline: bool) -> str:
|
|
@@ -616,14 +696,21 @@ def _convert_dt(*, text: str, convert_as_inline: bool) -> str:
|
|
|
616
696
|
return f"{text.strip()}\n"
|
|
617
697
|
|
|
618
698
|
|
|
619
|
-
def _convert_dd(*, text: str, convert_as_inline: bool) -> str:
|
|
699
|
+
def _convert_dd(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
620
700
|
if convert_as_inline:
|
|
621
701
|
return text
|
|
622
702
|
|
|
623
|
-
|
|
624
|
-
|
|
703
|
+
has_dt_sibling = False
|
|
704
|
+
current = tag.previous_sibling
|
|
705
|
+
while current:
|
|
706
|
+
if hasattr(current, "name") and current.name and current.name == "dt":
|
|
707
|
+
has_dt_sibling = True
|
|
708
|
+
break
|
|
709
|
+
current = current.previous_sibling
|
|
625
710
|
|
|
626
|
-
|
|
711
|
+
if has_dt_sibling:
|
|
712
|
+
return f": {text.strip()}\n\n" if text.strip() else ": \n\n"
|
|
713
|
+
return f"{text.strip()}\n\n" if text.strip() else ""
|
|
627
714
|
|
|
628
715
|
|
|
629
716
|
def _convert_cite(*, text: str, convert_as_inline: bool) -> str:
|
|
@@ -648,9 +735,7 @@ def _convert_q(*, text: str, convert_as_inline: bool) -> str:
|
|
|
648
735
|
|
|
649
736
|
|
|
650
737
|
def _convert_media_element(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
651
|
-
src
|
|
652
|
-
|
|
653
|
-
if not src and (source_tag := tag.find("source")) and isinstance(source_tag, Tag):
|
|
738
|
+
if not (src := tag.get("src", "")) and (source_tag := tag.find("source")) and isinstance(source_tag, Tag):
|
|
654
739
|
src = source_tag.get("src", "")
|
|
655
740
|
|
|
656
741
|
if src and isinstance(src, str) and src.strip():
|
|
@@ -670,9 +755,8 @@ def _convert_media_element(*, tag: Tag, text: str, convert_as_inline: bool) -> s
|
|
|
670
755
|
|
|
671
756
|
def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
672
757
|
_ = text
|
|
673
|
-
src = tag.get("src", "")
|
|
674
758
|
|
|
675
|
-
if src and isinstance(src, str) and src.strip():
|
|
759
|
+
if (src := tag.get("src", "")) and isinstance(src, str) and src.strip():
|
|
676
760
|
link = f"[{src}]({src})"
|
|
677
761
|
if convert_as_inline:
|
|
678
762
|
return link
|
|
@@ -939,7 +1023,7 @@ def _convert_figure(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
939
1023
|
content = text.strip()
|
|
940
1024
|
if content and not content.endswith("\n\n"):
|
|
941
1025
|
if content.endswith("\n"):
|
|
942
|
-
content += "\n"
|
|
1026
|
+
content += "\n" # pragma: no cover
|
|
943
1027
|
else:
|
|
944
1028
|
content += "\n\n"
|
|
945
1029
|
return content
|
|
@@ -997,6 +1081,7 @@ def _convert_math(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
997
1081
|
|
|
998
1082
|
def create_converters_map(
|
|
999
1083
|
autolinks: bool,
|
|
1084
|
+
br_in_tables: bool,
|
|
1000
1085
|
bullets: str,
|
|
1001
1086
|
code_language: str,
|
|
1002
1087
|
code_language_callback: Callable[[Tag], str] | None,
|
|
@@ -1029,6 +1114,8 @@ def create_converters_map(
|
|
|
1029
1114
|
kwargs["convert_as_inline"] = convert_as_inline
|
|
1030
1115
|
if "list_indent_str" in spec.kwonlyargs:
|
|
1031
1116
|
kwargs["list_indent_str"] = list_indent_str
|
|
1117
|
+
if "br_in_tables" in spec.kwonlyargs:
|
|
1118
|
+
kwargs["br_in_tables"] = br_in_tables
|
|
1032
1119
|
return func(**kwargs)
|
|
1033
1120
|
return func(text)
|
|
1034
1121
|
|
html_to_markdown/exceptions.py
CHANGED
|
@@ -37,3 +37,8 @@ class ConflictingOptionsError(HtmlToMarkdownError):
|
|
|
37
37
|
self.option2 = option2
|
|
38
38
|
|
|
39
39
|
super().__init__(f"Only one of '{option1}' and '{option2}' can be specified.")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class InvalidEncodingError(HtmlToMarkdownError):
|
|
43
|
+
def __init__(self, encoding: str) -> None:
|
|
44
|
+
super().__init__(f"The specified encoding ({encoding}) is not valid.")
|
html_to_markdown/preprocessor.py
CHANGED
|
@@ -5,6 +5,98 @@ from typing import Any
|
|
|
5
5
|
|
|
6
6
|
import nh3
|
|
7
7
|
|
|
8
|
+
BASE_ALLOWED_TAGS = frozenset(
|
|
9
|
+
{
|
|
10
|
+
"p",
|
|
11
|
+
"div",
|
|
12
|
+
"span",
|
|
13
|
+
"br",
|
|
14
|
+
"hr",
|
|
15
|
+
"h1",
|
|
16
|
+
"h2",
|
|
17
|
+
"h3",
|
|
18
|
+
"h4",
|
|
19
|
+
"h5",
|
|
20
|
+
"h6",
|
|
21
|
+
"ul",
|
|
22
|
+
"ol",
|
|
23
|
+
"li",
|
|
24
|
+
"dl",
|
|
25
|
+
"dt",
|
|
26
|
+
"dd",
|
|
27
|
+
"strong",
|
|
28
|
+
"b",
|
|
29
|
+
"em",
|
|
30
|
+
"i",
|
|
31
|
+
"u",
|
|
32
|
+
"s",
|
|
33
|
+
"del",
|
|
34
|
+
"ins",
|
|
35
|
+
"mark",
|
|
36
|
+
"small",
|
|
37
|
+
"sub",
|
|
38
|
+
"sup",
|
|
39
|
+
"code",
|
|
40
|
+
"pre",
|
|
41
|
+
"kbd",
|
|
42
|
+
"samp",
|
|
43
|
+
"var",
|
|
44
|
+
"abbr",
|
|
45
|
+
"cite",
|
|
46
|
+
"dfn",
|
|
47
|
+
"time",
|
|
48
|
+
"data",
|
|
49
|
+
"a",
|
|
50
|
+
"blockquote",
|
|
51
|
+
"q",
|
|
52
|
+
}
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
SEMANTIC_STRUCTURE_TAGS = frozenset(
|
|
56
|
+
{
|
|
57
|
+
"article",
|
|
58
|
+
"section",
|
|
59
|
+
"aside",
|
|
60
|
+
"header",
|
|
61
|
+
"footer",
|
|
62
|
+
"main",
|
|
63
|
+
"nav",
|
|
64
|
+
"figure",
|
|
65
|
+
"figcaption",
|
|
66
|
+
"details",
|
|
67
|
+
"summary",
|
|
68
|
+
}
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
TABLE_TAGS = frozenset(
|
|
72
|
+
{
|
|
73
|
+
"table",
|
|
74
|
+
"thead",
|
|
75
|
+
"tbody",
|
|
76
|
+
"tfoot",
|
|
77
|
+
"tr",
|
|
78
|
+
"td",
|
|
79
|
+
"th",
|
|
80
|
+
"caption",
|
|
81
|
+
"colgroup",
|
|
82
|
+
"col",
|
|
83
|
+
}
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
MEDIA_TAGS = frozenset(
|
|
87
|
+
{
|
|
88
|
+
"img",
|
|
89
|
+
"picture",
|
|
90
|
+
"source",
|
|
91
|
+
"audio",
|
|
92
|
+
"video",
|
|
93
|
+
"track",
|
|
94
|
+
"canvas",
|
|
95
|
+
"svg",
|
|
96
|
+
"iframe",
|
|
97
|
+
}
|
|
98
|
+
)
|
|
99
|
+
|
|
8
100
|
|
|
9
101
|
def preprocess_html(
|
|
10
102
|
html: str,
|
|
@@ -63,98 +155,16 @@ def _configure_cleaning_rules(
|
|
|
63
155
|
custom_tags_to_remove: set[str],
|
|
64
156
|
custom_attributes_to_remove: set[str],
|
|
65
157
|
) -> dict[str, Any]:
|
|
66
|
-
allowed_tags =
|
|
67
|
-
"p",
|
|
68
|
-
"div",
|
|
69
|
-
"span",
|
|
70
|
-
"br",
|
|
71
|
-
"hr",
|
|
72
|
-
"h1",
|
|
73
|
-
"h2",
|
|
74
|
-
"h3",
|
|
75
|
-
"h4",
|
|
76
|
-
"h5",
|
|
77
|
-
"h6",
|
|
78
|
-
"ul",
|
|
79
|
-
"ol",
|
|
80
|
-
"li",
|
|
81
|
-
"dl",
|
|
82
|
-
"dt",
|
|
83
|
-
"dd",
|
|
84
|
-
"strong",
|
|
85
|
-
"b",
|
|
86
|
-
"em",
|
|
87
|
-
"i",
|
|
88
|
-
"u",
|
|
89
|
-
"s",
|
|
90
|
-
"del",
|
|
91
|
-
"ins",
|
|
92
|
-
"mark",
|
|
93
|
-
"small",
|
|
94
|
-
"sub",
|
|
95
|
-
"sup",
|
|
96
|
-
"code",
|
|
97
|
-
"pre",
|
|
98
|
-
"kbd",
|
|
99
|
-
"samp",
|
|
100
|
-
"var",
|
|
101
|
-
"abbr",
|
|
102
|
-
"cite",
|
|
103
|
-
"dfn",
|
|
104
|
-
"time",
|
|
105
|
-
"data",
|
|
106
|
-
"a",
|
|
107
|
-
"blockquote",
|
|
108
|
-
"q",
|
|
109
|
-
}
|
|
158
|
+
allowed_tags = set(BASE_ALLOWED_TAGS)
|
|
110
159
|
|
|
111
160
|
if preserve_semantic_structure:
|
|
112
|
-
allowed_tags.update(
|
|
113
|
-
{
|
|
114
|
-
"article",
|
|
115
|
-
"section",
|
|
116
|
-
"aside",
|
|
117
|
-
"header",
|
|
118
|
-
"footer",
|
|
119
|
-
"main",
|
|
120
|
-
"nav",
|
|
121
|
-
"figure",
|
|
122
|
-
"figcaption",
|
|
123
|
-
"details",
|
|
124
|
-
"summary",
|
|
125
|
-
}
|
|
126
|
-
)
|
|
161
|
+
allowed_tags.update(SEMANTIC_STRUCTURE_TAGS)
|
|
127
162
|
|
|
128
163
|
if preserve_tables:
|
|
129
|
-
allowed_tags.update(
|
|
130
|
-
{
|
|
131
|
-
"table",
|
|
132
|
-
"thead",
|
|
133
|
-
"tbody",
|
|
134
|
-
"tfoot",
|
|
135
|
-
"tr",
|
|
136
|
-
"th",
|
|
137
|
-
"td",
|
|
138
|
-
"caption",
|
|
139
|
-
"col",
|
|
140
|
-
"colgroup",
|
|
141
|
-
}
|
|
142
|
-
)
|
|
164
|
+
allowed_tags.update(TABLE_TAGS)
|
|
143
165
|
|
|
144
166
|
if preserve_media:
|
|
145
|
-
allowed_tags.update(
|
|
146
|
-
{
|
|
147
|
-
"img",
|
|
148
|
-
"picture",
|
|
149
|
-
"source",
|
|
150
|
-
"audio",
|
|
151
|
-
"video",
|
|
152
|
-
"track",
|
|
153
|
-
"canvas",
|
|
154
|
-
"svg",
|
|
155
|
-
"iframe",
|
|
156
|
-
}
|
|
157
|
-
)
|
|
167
|
+
allowed_tags.update(MEDIA_TAGS)
|
|
158
168
|
|
|
159
169
|
allowed_tags -= custom_tags_to_remove
|
|
160
170
|
|
html_to_markdown/processing.py
CHANGED
|
@@ -17,7 +17,7 @@ from bs4.element import NavigableString, PageElement
|
|
|
17
17
|
try:
|
|
18
18
|
from html_to_markdown.preprocessor import create_preprocessor
|
|
19
19
|
from html_to_markdown.preprocessor import preprocess_html as preprocess_fn
|
|
20
|
-
except ImportError:
|
|
20
|
+
except ImportError: # pragma: no cover
|
|
21
21
|
create_preprocessor = None # type: ignore[assignment]
|
|
22
22
|
preprocess_fn = None # type: ignore[assignment]
|
|
23
23
|
|
|
@@ -25,7 +25,7 @@ try:
|
|
|
25
25
|
import importlib.util
|
|
26
26
|
|
|
27
27
|
LXML_AVAILABLE = importlib.util.find_spec("lxml") is not None
|
|
28
|
-
except ImportError:
|
|
28
|
+
except ImportError: # pragma: no cover
|
|
29
29
|
LXML_AVAILABLE = False
|
|
30
30
|
|
|
31
31
|
from html_to_markdown.constants import (
|
|
@@ -322,7 +322,7 @@ _ancestor_cache: ContextVar[dict[int, set[str]] | None] = ContextVar("ancestor_c
|
|
|
322
322
|
def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
|
|
323
323
|
elem_id = id(element)
|
|
324
324
|
cache = _ancestor_cache.get()
|
|
325
|
-
if cache is None:
|
|
325
|
+
if cache is None: # pragma: no cover
|
|
326
326
|
cache = {}
|
|
327
327
|
_ancestor_cache.set(cache)
|
|
328
328
|
|
|
@@ -338,7 +338,7 @@ def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
|
|
|
338
338
|
ancestor_names.add(current.name)
|
|
339
339
|
|
|
340
340
|
parent_id = id(current)
|
|
341
|
-
if parent_id in cache:
|
|
341
|
+
if parent_id in cache: # pragma: no cover
|
|
342
342
|
ancestor_names.update(cache[parent_id])
|
|
343
343
|
break
|
|
344
344
|
|
|
@@ -386,36 +386,35 @@ def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
|
|
|
386
386
|
metadata["base-href"] = base_tag["href"]
|
|
387
387
|
|
|
388
388
|
for meta in soup.find_all("meta"):
|
|
389
|
-
if meta.get("name") and meta.get("content") is not None:
|
|
390
|
-
name = meta["name"]
|
|
391
|
-
content = meta["content"]
|
|
389
|
+
if (name := meta.get("name")) and (content := meta.get("content")) is not None:
|
|
392
390
|
if isinstance(name, str) and isinstance(content, str):
|
|
393
|
-
|
|
394
|
-
metadata[key] = content
|
|
391
|
+
metadata[f"meta-{name.lower()}"] = content
|
|
395
392
|
|
|
396
|
-
elif meta.get("property") and meta.get("content") is not None:
|
|
397
|
-
prop = meta["property"]
|
|
398
|
-
content = meta["content"]
|
|
393
|
+
elif (prop := meta.get("property")) and (content := meta.get("content")) is not None:
|
|
399
394
|
if isinstance(prop, str) and isinstance(content, str):
|
|
400
|
-
|
|
401
|
-
metadata[key] = content
|
|
395
|
+
metadata[f"meta-{prop.lower().replace(':', '-')}"] = content
|
|
402
396
|
|
|
403
|
-
elif
|
|
404
|
-
equiv
|
|
405
|
-
content
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
397
|
+
elif (
|
|
398
|
+
(equiv := meta.get("http-equiv"))
|
|
399
|
+
and (content := meta.get("content")) is not None
|
|
400
|
+
and isinstance(equiv, str)
|
|
401
|
+
and isinstance(content, str)
|
|
402
|
+
):
|
|
403
|
+
metadata[f"meta-{equiv.lower()}"] = content
|
|
409
404
|
|
|
410
405
|
canonical = soup.find("link", rel="canonical", href=True)
|
|
411
406
|
if canonical and isinstance(canonical, Tag) and isinstance(canonical["href"], str):
|
|
412
407
|
metadata["canonical"] = canonical["href"]
|
|
413
408
|
|
|
414
409
|
link_relations = {"author", "license", "alternate"}
|
|
415
|
-
|
|
416
|
-
link
|
|
417
|
-
|
|
418
|
-
|
|
410
|
+
link_metadata = {
|
|
411
|
+
f"link-{rel_type}": link["href"]
|
|
412
|
+
for rel_type in link_relations
|
|
413
|
+
if (link := soup.find("link", rel=rel_type, href=True))
|
|
414
|
+
and isinstance(link, Tag)
|
|
415
|
+
and isinstance(link["href"], str)
|
|
416
|
+
}
|
|
417
|
+
metadata.update(link_metadata)
|
|
419
418
|
|
|
420
419
|
return metadata
|
|
421
420
|
|
|
@@ -424,11 +423,7 @@ def _format_metadata_comment(metadata: dict[str, str]) -> str:
|
|
|
424
423
|
if not metadata:
|
|
425
424
|
return ""
|
|
426
425
|
|
|
427
|
-
lines = ["<!--"]
|
|
428
|
-
for key, value in sorted(metadata.items()):
|
|
429
|
-
safe_value = value.replace("-->", "--&gt;")
|
|
430
|
-
lines.append(f"{key}: {safe_value}")
|
|
431
|
-
lines.append("-->")
|
|
426
|
+
lines = ["<!--", *[f"{key}: {value.replace('-->', '--&gt;')}" for key, value in sorted(metadata.items())], "-->"]
|
|
432
427
|
|
|
433
428
|
return "\n".join(lines) + "\n\n"
|
|
434
429
|
|
|
@@ -442,6 +437,7 @@ def convert_to_markdown(
|
|
|
442
437
|
progress_callback: Callable[[int, int], None] | None = None,
|
|
443
438
|
parser: str | None = None,
|
|
444
439
|
autolinks: bool = True,
|
|
440
|
+
br_in_tables: bool = False,
|
|
445
441
|
bullets: str = "*+-",
|
|
446
442
|
code_language: str = "",
|
|
447
443
|
code_language_callback: Callable[[Any], str] | None = None,
|
|
@@ -485,6 +481,7 @@ def convert_to_markdown(
|
|
|
485
481
|
progress_callback: Callback for progress updates (current, total).
|
|
486
482
|
parser: HTML parser to use ('html.parser', 'lxml', 'html5lib').
|
|
487
483
|
autolinks: Convert URLs to automatic links.
|
|
484
|
+
br_in_tables: Use <br> tags for line breaks in table cells instead of spaces.
|
|
488
485
|
bullets: Characters to use for unordered list bullets.
|
|
489
486
|
code_language: Default language for code blocks.
|
|
490
487
|
code_language_callback: Callback to determine code language from element.
|
|
@@ -644,7 +641,7 @@ def convert_to_markdown(
|
|
|
644
641
|
result = re.sub(r"\n{3,}", "\n\n", result)
|
|
645
642
|
|
|
646
643
|
if convert_as_inline:
|
|
647
|
-
result = result.rstrip("\n")
|
|
644
|
+
result = result.rstrip("\n") # pragma: no cover
|
|
648
645
|
|
|
649
646
|
return result
|
|
650
647
|
|
|
@@ -658,6 +655,7 @@ def convert_to_markdown(
|
|
|
658
655
|
whitespace_handler=whitespace_handler,
|
|
659
656
|
parser=parser,
|
|
660
657
|
autolinks=autolinks,
|
|
658
|
+
br_in_tables=br_in_tables,
|
|
661
659
|
bullets=bullets,
|
|
662
660
|
code_language=code_language,
|
|
663
661
|
code_language_callback=code_language_callback,
|
|
@@ -819,6 +817,7 @@ def _process_html_core(
|
|
|
819
817
|
whitespace_handler: WhitespaceHandler,
|
|
820
818
|
parser: str | None = None,
|
|
821
819
|
autolinks: bool,
|
|
820
|
+
br_in_tables: bool,
|
|
822
821
|
bullets: str,
|
|
823
822
|
code_language: str,
|
|
824
823
|
code_language_callback: Callable[[Any], str] | None,
|
|
@@ -849,24 +848,25 @@ def _process_html_core(
|
|
|
849
848
|
try:
|
|
850
849
|
if isinstance(source, str):
|
|
851
850
|
if strip_newlines:
|
|
852
|
-
source = source.replace("\n", " ").replace("\r", " ")
|
|
851
|
+
source = source.replace("\n", " ").replace("\r", " ") # pragma: no cover
|
|
853
852
|
|
|
854
853
|
if "".join(source.split("\n")):
|
|
855
854
|
if parser is None:
|
|
856
855
|
parser = "lxml" if LXML_AVAILABLE else "html.parser"
|
|
857
856
|
|
|
858
|
-
if parser == "lxml" and not LXML_AVAILABLE:
|
|
857
|
+
if parser == "lxml" and not LXML_AVAILABLE: # pragma: no cover
|
|
859
858
|
raise MissingDependencyError("lxml", "pip install html-to-markdown[lxml]")
|
|
860
859
|
|
|
861
860
|
source = BeautifulSoup(source, parser)
|
|
862
861
|
else:
|
|
863
862
|
raise EmptyHtmlError
|
|
864
863
|
|
|
865
|
-
if strip is not None and convert is not None:
|
|
864
|
+
if strip is not None and convert is not None: # pragma: no cover
|
|
866
865
|
raise ConflictingOptionsError("strip", "convert")
|
|
867
866
|
|
|
868
867
|
converters_map = create_converters_map(
|
|
869
868
|
autolinks=autolinks,
|
|
869
|
+
br_in_tables=br_in_tables,
|
|
870
870
|
bullets=bullets,
|
|
871
871
|
code_language=code_language,
|
|
872
872
|
code_language_callback=code_language_callback,
|
|
@@ -935,6 +935,7 @@ def convert_to_markdown_stream(
|
|
|
935
935
|
progress_callback: Callable[[int, int], None] | None = None,
|
|
936
936
|
parser: str | None = None,
|
|
937
937
|
autolinks: bool = True,
|
|
938
|
+
br_in_tables: bool = False,
|
|
938
939
|
bullets: str = "*+-",
|
|
939
940
|
code_language: str = "",
|
|
940
941
|
code_language_callback: Callable[[Any], str] | None = None,
|
|
@@ -976,6 +977,7 @@ def convert_to_markdown_stream(
|
|
|
976
977
|
whitespace_handler=whitespace_handler,
|
|
977
978
|
parser=parser,
|
|
978
979
|
autolinks=autolinks,
|
|
980
|
+
br_in_tables=br_in_tables,
|
|
979
981
|
bullets=bullets,
|
|
980
982
|
code_language=code_language,
|
|
981
983
|
code_language_callback=code_language_callback,
|
|
@@ -1027,7 +1029,7 @@ def convert_to_markdown_stream(
|
|
|
1027
1029
|
end_pos = search_start + newline_pos + 1
|
|
1028
1030
|
|
|
1029
1031
|
chunk = combined_result[pos:end_pos]
|
|
1030
|
-
if chunk:
|
|
1032
|
+
if chunk: # pragma: no cover
|
|
1031
1033
|
yield chunk
|
|
1032
1034
|
|
|
1033
1035
|
pos = end_pos
|
html_to_markdown/utils.py
CHANGED
|
@@ -12,9 +12,7 @@ def chomp(text: str) -> tuple[str, str, str]:
|
|
|
12
12
|
prefix = " " if text.startswith((" ", "\t")) else ""
|
|
13
13
|
suffix = " " if text.endswith((" ", "\t")) else ""
|
|
14
14
|
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
return prefix, suffix, text
|
|
15
|
+
return prefix, suffix, text.strip()
|
|
18
16
|
|
|
19
17
|
|
|
20
18
|
def escape(*, text: str, escape_misc: bool, escape_asterisks: bool, escape_underscores: bool) -> str:
|
html_to_markdown/whitespace.py
CHANGED
|
@@ -7,7 +7,7 @@ import unicodedata
|
|
|
7
7
|
from typing import TYPE_CHECKING, Literal
|
|
8
8
|
|
|
9
9
|
if TYPE_CHECKING:
|
|
10
|
-
from bs4 import NavigableString, PageElement, Tag
|
|
10
|
+
from bs4 import NavigableString, PageElement
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
WhitespaceMode = Literal["normalized", "strict"]
|
|
@@ -132,7 +132,7 @@ class WhitespaceHandler:
|
|
|
132
132
|
for char in text:
|
|
133
133
|
if unicodedata.category(char) in ("Zs", "Zl", "Zp"):
|
|
134
134
|
normalized.append(" ")
|
|
135
|
-
elif char
|
|
135
|
+
elif char == "\r": # pragma: no cover
|
|
136
136
|
normalized.append("\n")
|
|
137
137
|
else:
|
|
138
138
|
normalized.append(char)
|
|
@@ -168,15 +168,12 @@ class WhitespaceHandler:
|
|
|
168
168
|
*,
|
|
169
169
|
in_pre: bool = False,
|
|
170
170
|
) -> str:
|
|
171
|
-
if not text:
|
|
171
|
+
if not text: # pragma: no cover
|
|
172
172
|
return ""
|
|
173
173
|
|
|
174
174
|
if in_pre or self.should_preserve_whitespace(element):
|
|
175
175
|
return text
|
|
176
176
|
|
|
177
|
-
if self.mode == "strict":
|
|
178
|
-
return text
|
|
179
|
-
|
|
180
177
|
text = self.normalize_unicode_spaces(text)
|
|
181
178
|
return self._process_normalized(text, element)
|
|
182
179
|
|
|
@@ -204,8 +201,8 @@ class WhitespaceHandler:
|
|
|
204
201
|
def _process_text_with_content(self, text: str, element: NavigableString) -> str:
|
|
205
202
|
original = str(element)
|
|
206
203
|
|
|
207
|
-
has_lead_space = original and original[0] in " \t\n"
|
|
208
|
-
has_trail_space = original and original[-1] in " \t\n"
|
|
204
|
+
has_lead_space = bool(original and original[0] in " \t\n")
|
|
205
|
+
has_trail_space = bool(original and original[-1] in " \t\n")
|
|
209
206
|
|
|
210
207
|
text = self._multiple_spaces.sub(" ", text.strip())
|
|
211
208
|
|
|
@@ -215,9 +212,9 @@ class WhitespaceHandler:
|
|
|
215
212
|
return self._process_special_inline_containers(text, original)
|
|
216
213
|
|
|
217
214
|
if parent and self.is_inline_element(parent):
|
|
218
|
-
return self._process_inline_element_text(text, original,
|
|
215
|
+
return self._process_inline_element_text(text, original, has_lead_space, has_trail_space)
|
|
219
216
|
|
|
220
|
-
return self._process_standalone_text(text, original, element,
|
|
217
|
+
return self._process_standalone_text(text, original, element, has_lead_space, has_trail_space)
|
|
221
218
|
|
|
222
219
|
def _process_special_inline_containers(self, text: str, original: str) -> str:
|
|
223
220
|
if original and "\n" not in original and "\t" not in original:
|
|
@@ -280,24 +277,3 @@ class WhitespaceHandler:
|
|
|
280
277
|
text = text + "\n\n"
|
|
281
278
|
|
|
282
279
|
return text
|
|
283
|
-
|
|
284
|
-
def get_block_spacing(self, tag: Tag, next_sibling: PageElement | None = None) -> str:
|
|
285
|
-
if self.mode == "strict":
|
|
286
|
-
return ""
|
|
287
|
-
|
|
288
|
-
tag_name = tag.name.lower() if hasattr(tag, "name") else ""
|
|
289
|
-
|
|
290
|
-
double_newline_elements = {"p", "div", "blockquote", "pre", "table", "ul", "ol", "dl"}
|
|
291
|
-
|
|
292
|
-
single_newline_elements = {"li", "dt", "dd", "tr", "td", "th"}
|
|
293
|
-
|
|
294
|
-
if tag_name in double_newline_elements:
|
|
295
|
-
if self.is_block_element(next_sibling):
|
|
296
|
-
return "\n\n"
|
|
297
|
-
return "\n"
|
|
298
|
-
if tag_name in single_newline_elements:
|
|
299
|
-
return "\n"
|
|
300
|
-
if tag_name.startswith("h") and len(tag_name) == 2 and tag_name[1].isdigit():
|
|
301
|
-
return "\n\n"
|
|
302
|
-
|
|
303
|
-
return ""
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 1.11.0
|
|
3
|
+
Version: 1.12.0
|
|
4
4
|
Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
|
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -320,6 +320,88 @@ def converter(*, tag: Tag, text: str, **kwargs) -> str:
|
|
|
320
320
|
|
|
321
321
|
Custom converters take precedence over built-in converters and can be used alongside other configuration options.
|
|
322
322
|
|
|
323
|
+
### Streaming API
|
|
324
|
+
|
|
325
|
+
For processing large documents with memory constraints, use the streaming API:
|
|
326
|
+
|
|
327
|
+
```python
|
|
328
|
+
from html_to_markdown import convert_to_markdown_stream
|
|
329
|
+
|
|
330
|
+
# Process large HTML in chunks
|
|
331
|
+
with open("large_document.html", "r") as f:
|
|
332
|
+
html_content = f.read()
|
|
333
|
+
|
|
334
|
+
# Returns a generator that yields markdown chunks
|
|
335
|
+
for chunk in convert_to_markdown_stream(html_content, chunk_size=2048):
|
|
336
|
+
print(chunk, end="")
|
|
337
|
+
```
|
|
338
|
+
|
|
339
|
+
With progress tracking:
|
|
340
|
+
|
|
341
|
+
```python
|
|
342
|
+
def show_progress(processed: int, total: int):
|
|
343
|
+
if total > 0:
|
|
344
|
+
percent = (processed / total) * 100
|
|
345
|
+
print(f"\rProgress: {percent:.1f}%", end="")
|
|
346
|
+
|
|
347
|
+
# Stream with progress callback
|
|
348
|
+
markdown = convert_to_markdown(html_content, stream_processing=True, chunk_size=4096, progress_callback=show_progress)
|
|
349
|
+
```
|
|
350
|
+
|
|
351
|
+
### Preprocessing API
|
|
352
|
+
|
|
353
|
+
The library provides functions for preprocessing HTML before conversion, useful for cleaning messy or complex HTML:
|
|
354
|
+
|
|
355
|
+
```python
|
|
356
|
+
from html_to_markdown import preprocess_html, create_preprocessor
|
|
357
|
+
|
|
358
|
+
# Direct preprocessing with custom options
|
|
359
|
+
cleaned_html = preprocess_html(
|
|
360
|
+
raw_html,
|
|
361
|
+
remove_navigation=True,
|
|
362
|
+
remove_forms=True,
|
|
363
|
+
remove_scripts=True,
|
|
364
|
+
remove_styles=True,
|
|
365
|
+
remove_comments=True,
|
|
366
|
+
preserve_semantic_structure=True,
|
|
367
|
+
preserve_tables=True,
|
|
368
|
+
preserve_media=True,
|
|
369
|
+
)
|
|
370
|
+
markdown = convert_to_markdown(cleaned_html)
|
|
371
|
+
|
|
372
|
+
# Create a preprocessor configuration from presets
|
|
373
|
+
config = create_preprocessor(preset="aggressive", preserve_tables=False)  # preset may also be "minimal" or "standard"; keyword arguments override preset settings
|
|
374
|
+
markdown = convert_to_markdown(html, **config)
|
|
375
|
+
```
|
|
376
|
+
|
|
377
|
+
### Exception Handling
|
|
378
|
+
|
|
379
|
+
The library provides specific exception classes for better error handling:
|
|
380
|
+
|
|
381
|
+
````python
|
|
382
|
+
from html_to_markdown import (
|
|
383
|
+
convert_to_markdown,
|
|
384
|
+
HtmlToMarkdownError,
|
|
385
|
+
EmptyHtmlError,
|
|
386
|
+
InvalidParserError,
|
|
387
|
+
ConflictingOptionsError,
|
|
388
|
+
MissingDependencyError
|
|
389
|
+
)
|
|
390
|
+
|
|
391
|
+
try:
|
|
392
|
+
markdown = convert_to_markdown(html, parser='lxml')
|
|
393
|
+
except MissingDependencyError:
|
|
394
|
+
# lxml not installed
|
|
395
|
+
markdown = convert_to_markdown(html, parser='html.parser')
|
|
396
|
+
except EmptyHtmlError:
|
|
397
|
+
print("No HTML content to convert")
|
|
398
|
+
except InvalidParserError as e:
|
|
399
|
+
print(f"Parser error: {e}")
|
|
400
|
+
except ConflictingOptionsError as e:
|
|
401
|
+
print(f"Conflicting options: {e}")
|
|
402
|
+
except HtmlToMarkdownError as e:
|
|
403
|
+
print(f"Conversion error: {e}")
|
|
404
|
+
|
|
323
405
|
## CLI Usage
|
|
324
406
|
|
|
325
407
|
Convert HTML files directly from the command line with full access to all API options:
|
|
@@ -340,7 +422,7 @@ html_to_markdown \
|
|
|
340
422
|
--preprocess-html \
|
|
341
423
|
--preprocessing-preset aggressive \
|
|
342
424
|
input.html > output.md
|
|
343
|
-
|
|
425
|
+
````
|
|
344
426
|
|
|
345
427
|
### Key CLI Options
|
|
346
428
|
|
|
@@ -353,6 +435,20 @@ html_to_markdown \
|
|
|
353
435
|
--whitespace-mode {normalized,strict} # Whitespace handling (default: normalized)
|
|
354
436
|
--heading-style {atx,atx_closed,underlined} # Header style
|
|
355
437
|
--no-extract-metadata # Disable metadata extraction
|
|
438
|
+
--br-in-tables # Use <br> tags for line breaks in table cells
|
|
439
|
+
--source-encoding ENCODING # Override auto-detected encoding (rarely needed)
|
|
440
|
+
```
|
|
441
|
+
|
|
442
|
+
**File Encoding:**
|
|
443
|
+
|
|
444
|
+
The CLI automatically detects file encoding in most cases. Use `--source-encoding` only when automatic detection fails (typically on some Windows systems or with unusual encodings):
|
|
445
|
+
|
|
446
|
+
```shell
|
|
447
|
+
# Override auto-detection for Latin-1 encoded file
|
|
448
|
+
html_to_markdown --source-encoding latin-1 input.html > output.md
|
|
449
|
+
|
|
450
|
+
# Force UTF-16 encoding when auto-detection fails
|
|
451
|
+
html_to_markdown --source-encoding utf-16 input.html > output.md
|
|
356
452
|
```
|
|
357
453
|
|
|
358
454
|
**All Available Options:**
|
|
@@ -393,6 +489,7 @@ The `markdownify` function is an alias for `convert_to_markdown` and provides id
|
|
|
393
489
|
- `newline_style` (str, default: `'spaces'`): Style for handling newlines (`'spaces'` or `'backslash'`)
|
|
394
490
|
- `sub_symbol` (str, default: `''`): Custom symbol for subscript text
|
|
395
491
|
- `sup_symbol` (str, default: `''`): Custom symbol for superscript text
|
|
492
|
+
- `br_in_tables` (bool, default: `False`): Use `<br>` tags for line breaks in table cells instead of spaces
|
|
396
493
|
|
|
397
494
|
### Parser Options
|
|
398
495
|
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
html_to_markdown/__init__.py,sha256=TzZzhZDJHeXW_3B9zceYehz2zlttqdLsDr5un8stZLM,653
|
|
2
|
+
html_to_markdown/__main__.py,sha256=E9d62nVceR_5TUWgVu5L5CnSZxKcnT_7a6ScWZUGE-s,292
|
|
3
|
+
html_to_markdown/cli.py,sha256=qB8-1jqJPW-YrOmlyOdJnLM6DpKSUIA3iyn1SJaJgKg,9418
|
|
4
|
+
html_to_markdown/constants.py,sha256=CKFVHjUZKgi8-lgU6AHPic7X5ChlTkbZt4Jv6VaVjjs,665
|
|
5
|
+
html_to_markdown/converters.py,sha256=4dikabmNVu8g7jnSpk_i_6CAKy7OehjcL0c8lmIJRSk,36414
|
|
6
|
+
html_to_markdown/exceptions.py,sha256=ytUOIL0D8r0Jd59RzUPqzmk73i-Mg63zDQYo6S6DBg4,1389
|
|
7
|
+
html_to_markdown/preprocessor.py,sha256=otnTOhoivJkxaip1Lb9xNMl8q-x9aGFXSYkSrxsTW8g,9591
|
|
8
|
+
html_to_markdown/processing.py,sha256=RQbqkI3w_rm64uOvmO6-CrqCJXKNHtfKu2G6f59JSF0,34596
|
|
9
|
+
html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
|
+
html_to_markdown/utils.py,sha256=s3A4ET_XyKC-WxzJtH4W0S7cIBGF5fTYIf4JJrqTX8Q,1069
|
|
11
|
+
html_to_markdown/whitespace.py,sha256=a7M_u9JXh6cfjs4rz25hABIKKy3ax11ZXJhEID4YSV4,7397
|
|
12
|
+
html_to_markdown-1.12.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
|
|
13
|
+
html_to_markdown-1.12.0.dist-info/METADATA,sha256=y8bGQgaCogxjM7V3gldeZi0IIaiCC-H7NiPqQMwMgmY,20867
|
|
14
|
+
html_to_markdown-1.12.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
15
|
+
html_to_markdown-1.12.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
|
|
16
|
+
html_to_markdown-1.12.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
|
|
17
|
+
html_to_markdown-1.12.0.dist-info/RECORD,,
|
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
html_to_markdown/__init__.py,sha256=TzZzhZDJHeXW_3B9zceYehz2zlttqdLsDr5un8stZLM,653
|
|
2
|
-
html_to_markdown/__main__.py,sha256=E9d62nVceR_5TUWgVu5L5CnSZxKcnT_7a6ScWZUGE-s,292
|
|
3
|
-
html_to_markdown/cli.py,sha256=ilnrJN2XMhPDQ4UkkG4cjLXTvglu_ZJj-bBsohVF3fw,8541
|
|
4
|
-
html_to_markdown/constants.py,sha256=CKFVHjUZKgi8-lgU6AHPic7X5ChlTkbZt4Jv6VaVjjs,665
|
|
5
|
-
html_to_markdown/converters.py,sha256=CbChkRIlOPe0d1MK5-txDE56IG4Ea_dcCV6KRCTjeKY,32497
|
|
6
|
-
html_to_markdown/exceptions.py,sha256=YjfwVCWE_oZakr9iy0E-_aPSYHNaocJZgWeQ9Enty7Q,1212
|
|
7
|
-
html_to_markdown/preprocessor.py,sha256=acmuJJvx1RaXE3c0F_aWsartQE0cEpa3AOnJYGnPzqw,9708
|
|
8
|
-
html_to_markdown/processing.py,sha256=sOIIFNyRkRYAH8Q4ehrh66RY71bkvttSuqzXYsMC5JM,34334
|
|
9
|
-
html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
|
-
html_to_markdown/utils.py,sha256=4Vzk2cCjxN0LAZ1DXQCufYtxE7a6739TYgPbje-VM_E,1086
|
|
11
|
-
html_to_markdown/whitespace.py,sha256=EJ0gEsfLB_wZAk5d5qP4UPhPg0pJJ8LZLRRr_QoL01o,8186
|
|
12
|
-
html_to_markdown-1.11.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
|
|
13
|
-
html_to_markdown-1.11.0.dist-info/METADATA,sha256=Cej6bnqT9JVFzACZvND6Z5-kD0QoabiLi46opAaC11U,17814
|
|
14
|
-
html_to_markdown-1.11.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
15
|
-
html_to_markdown-1.11.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
|
|
16
|
-
html_to_markdown-1.11.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
|
|
17
|
-
html_to_markdown-1.11.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|