html-to-markdown 1.10.0__py3-none-any.whl → 1.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- html_to_markdown/cli.py +28 -2
- html_to_markdown/converters.py +218 -128
- html_to_markdown/exceptions.py +5 -0
- html_to_markdown/preprocessor.py +96 -86
- html_to_markdown/processing.py +49 -44
- html_to_markdown/utils.py +1 -3
- html_to_markdown/whitespace.py +19 -32
- {html_to_markdown-1.10.0.dist-info → html_to_markdown-1.12.0.dist-info}/METADATA +100 -3
- html_to_markdown-1.12.0.dist-info/RECORD +17 -0
- html_to_markdown-1.10.0.dist-info/RECORD +0 -17
- {html_to_markdown-1.10.0.dist-info → html_to_markdown-1.12.0.dist-info}/WHEEL +0 -0
- {html_to_markdown-1.10.0.dist-info → html_to_markdown-1.12.0.dist-info}/entry_points.txt +0 -0
- {html_to_markdown-1.10.0.dist-info → html_to_markdown-1.12.0.dist-info}/licenses/LICENSE +0 -0
- {html_to_markdown-1.10.0.dist-info → html_to_markdown-1.12.0.dist-info}/top_level.txt +0 -0
html_to_markdown/cli.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import sys
|
|
2
2
|
from argparse import ArgumentParser, FileType
|
|
3
|
+
from pathlib import Path
|
|
3
4
|
|
|
4
5
|
from html_to_markdown.constants import (
|
|
5
6
|
ASTERISK,
|
|
@@ -13,6 +14,7 @@ from html_to_markdown.constants import (
|
|
|
13
14
|
WHITESPACE_NORMALIZED,
|
|
14
15
|
WHITESPACE_STRICT,
|
|
15
16
|
)
|
|
17
|
+
from html_to_markdown.exceptions import InvalidEncodingError
|
|
16
18
|
from html_to_markdown.processing import convert_to_markdown
|
|
17
19
|
|
|
18
20
|
|
|
@@ -131,6 +133,12 @@ def main(argv: list[str]) -> str:
|
|
|
131
133
|
help="Parent tags where images remain inline (not converted to alt-text).",
|
|
132
134
|
)
|
|
133
135
|
|
|
136
|
+
parser.add_argument(
|
|
137
|
+
"--br-in-tables",
|
|
138
|
+
action="store_true",
|
|
139
|
+
help="Use <br> tags for line breaks in table cells instead of spaces.",
|
|
140
|
+
)
|
|
141
|
+
|
|
134
142
|
parser.add_argument("-w", "--wrap", action="store_true", help="Enable text wrapping at --wrap-width characters.")
|
|
135
143
|
|
|
136
144
|
parser.add_argument(
|
|
@@ -235,10 +243,18 @@ def main(argv: list[str]) -> str:
|
|
|
235
243
|
help="Keep navigation elements when preprocessing (normally removed).",
|
|
236
244
|
)
|
|
237
245
|
|
|
246
|
+
parser.add_argument(
|
|
247
|
+
"--source-encoding",
|
|
248
|
+
type=str,
|
|
249
|
+
default=None,
|
|
250
|
+
help="Source file encoding (e.g. 'utf-8', 'latin-1'). Defaults to system default.",
|
|
251
|
+
)
|
|
252
|
+
|
|
238
253
|
args = parser.parse_args(argv)
|
|
239
254
|
|
|
240
255
|
base_args = {
|
|
241
256
|
"autolinks": args.autolinks,
|
|
257
|
+
"br_in_tables": args.br_in_tables,
|
|
242
258
|
"bullets": args.bullets,
|
|
243
259
|
"code_language": args.code_language,
|
|
244
260
|
"convert": args.convert,
|
|
@@ -278,7 +294,7 @@ def main(argv: list[str]) -> str:
|
|
|
278
294
|
if args.show_progress:
|
|
279
295
|
|
|
280
296
|
def progress_callback(processed: int, total: int) -> None:
|
|
281
|
-
if total > 0:
|
|
297
|
+
if total > 0: # pragma: no cover
|
|
282
298
|
percent = (processed / total) * 100
|
|
283
299
|
|
|
284
300
|
sys.stderr.write(f"\rProgress: {percent:.1f}% ({processed}/{total} bytes)")
|
|
@@ -286,4 +302,14 @@ def main(argv: list[str]) -> str:
|
|
|
286
302
|
|
|
287
303
|
base_args["progress_callback"] = progress_callback
|
|
288
304
|
|
|
289
|
-
|
|
305
|
+
if args.source_encoding and args.html.name != "<stdin>":
|
|
306
|
+
args.html.close()
|
|
307
|
+
try:
|
|
308
|
+
with Path(args.html.name).open(encoding=args.source_encoding) as f:
|
|
309
|
+
html_content = f.read()
|
|
310
|
+
except LookupError as e:
|
|
311
|
+
raise InvalidEncodingError(args.source_encoding) from e
|
|
312
|
+
else:
|
|
313
|
+
html_content = args.html.read()
|
|
314
|
+
|
|
315
|
+
return convert_to_markdown(html_content, **base_args)
|
html_to_markdown/converters.py
CHANGED
|
@@ -5,9 +5,11 @@ from typing import TYPE_CHECKING
|
|
|
5
5
|
if TYPE_CHECKING:
|
|
6
6
|
from collections.abc import Iterable
|
|
7
7
|
import base64
|
|
8
|
+
import re
|
|
8
9
|
from collections.abc import Callable
|
|
9
10
|
from functools import partial
|
|
10
11
|
from inspect import getfullargspec
|
|
12
|
+
from itertools import chain
|
|
11
13
|
from textwrap import fill
|
|
12
14
|
from typing import Any, Literal, TypeVar, cast
|
|
13
15
|
|
|
@@ -36,6 +38,19 @@ def _format_wrapped_block(text: str, start_marker: str, end_marker: str = "") ->
|
|
|
36
38
|
return f"{start_marker}{text.strip()}{end_marker}\n\n" if text.strip() else ""
|
|
37
39
|
|
|
38
40
|
|
|
41
|
+
def _find_list_item_ancestor(tag: Tag) -> Tag | None:
|
|
42
|
+
"""Find the nearest list item ancestor of a tag."""
|
|
43
|
+
parent = tag.parent
|
|
44
|
+
while parent and parent.name != "li":
|
|
45
|
+
parent = parent.parent
|
|
46
|
+
return parent
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
BLOCK_ELEMENTS = frozenset({"p", "blockquote", "pre", "ul", "ol", "div", "h1", "h2", "h3", "h4", "h5", "h6"})
|
|
50
|
+
|
|
51
|
+
_LIST_ITEM_PATTERN = re.compile(r"^\s*(\*|\+|-|\d+\.)\s")
|
|
52
|
+
|
|
53
|
+
|
|
39
54
|
SupportedElements = Literal[
|
|
40
55
|
"a",
|
|
41
56
|
"abbr",
|
|
@@ -270,52 +285,91 @@ def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: It
|
|
|
270
285
|
return f""
|
|
271
286
|
|
|
272
287
|
|
|
288
|
+
def _has_block_list_items(tag: Tag) -> bool:
|
|
289
|
+
"""Check if any list items contain block elements."""
|
|
290
|
+
return any(
|
|
291
|
+
any(child.name in BLOCK_ELEMENTS for child in li.children if hasattr(child, "name"))
|
|
292
|
+
for li in tag.find_all("li", recursive=False)
|
|
293
|
+
)
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def _handle_nested_list_indentation(text: str, list_indent_str: str, parent: Tag) -> str:
|
|
297
|
+
"""Handle indentation for lists nested within list items."""
|
|
298
|
+
prev_p = None
|
|
299
|
+
for child in parent.children:
|
|
300
|
+
if hasattr(child, "name"):
|
|
301
|
+
if child.name == "p":
|
|
302
|
+
prev_p = child
|
|
303
|
+
break
|
|
304
|
+
|
|
305
|
+
if prev_p:
|
|
306
|
+
lines = text.strip().split("\n")
|
|
307
|
+
indented_lines = [f"{list_indent_str}{line}" if line.strip() else "" for line in lines]
|
|
308
|
+
return "\n" + "\n".join(indented_lines) + "\n"
|
|
309
|
+
return "\n" + indent(text=text, level=1, indent_str=list_indent_str).rstrip()
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def _handle_direct_nested_list_indentation(text: str, list_indent_str: str) -> str:
|
|
313
|
+
"""Handle indentation for lists that are direct children of other lists."""
|
|
314
|
+
lines = text.strip().split("\n")
|
|
315
|
+
indented_lines = [f"{list_indent_str}{line}" if line.strip() else "" for line in lines]
|
|
316
|
+
result = "\n".join(indented_lines)
|
|
317
|
+
return result + "\n" if not result.endswith("\n") else result
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
def _add_list_item_spacing(text: str) -> str:
|
|
321
|
+
"""Add extra spacing between list items that contain block content."""
|
|
322
|
+
lines = text.split("\n")
|
|
323
|
+
items_with_blocks = set()
|
|
324
|
+
|
|
325
|
+
i = 0
|
|
326
|
+
while i < len(lines):
|
|
327
|
+
line = lines[i]
|
|
328
|
+
if line.strip() and _LIST_ITEM_PATTERN.match(line.lstrip()):
|
|
329
|
+
j = i + 1
|
|
330
|
+
has_continuation = False
|
|
331
|
+
while j < len(lines):
|
|
332
|
+
next_line = lines[j]
|
|
333
|
+
if next_line.strip() and _LIST_ITEM_PATTERN.match(next_line.lstrip()):
|
|
334
|
+
break
|
|
335
|
+
if next_line.strip() and next_line.startswith((" ", " ", "\t")):
|
|
336
|
+
has_continuation = True
|
|
337
|
+
j += 1
|
|
338
|
+
|
|
339
|
+
if has_continuation and j < len(lines):
|
|
340
|
+
items_with_blocks.add(j - 1)
|
|
341
|
+
|
|
342
|
+
i += 1
|
|
343
|
+
|
|
344
|
+
if items_with_blocks:
|
|
345
|
+
processed_lines = list(
|
|
346
|
+
chain.from_iterable([line, ""] if i in items_with_blocks else [line] for i, line in enumerate(lines))
|
|
347
|
+
)
|
|
348
|
+
return "\n".join(processed_lines)
|
|
349
|
+
|
|
350
|
+
return text
|
|
351
|
+
|
|
352
|
+
|
|
273
353
|
def _convert_list(*, tag: Tag, text: str, list_indent_str: str) -> str:
|
|
274
354
|
from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
|
|
275
355
|
|
|
276
|
-
before_paragraph =
|
|
277
|
-
if tag.next_sibling and getattr(tag.next_sibling, "name", None) not in {"ul", "ol"}:
|
|
278
|
-
before_paragraph = True
|
|
356
|
+
before_paragraph = tag.next_sibling and getattr(tag.next_sibling, "name", None) not in {"ul", "ol"}
|
|
279
357
|
|
|
280
|
-
|
|
281
|
-
parent = tag.parent
|
|
282
|
-
while parent and parent.name != "li":
|
|
283
|
-
parent = parent.parent
|
|
358
|
+
has_block_items = _has_block_list_items(tag)
|
|
284
359
|
|
|
360
|
+
if _has_ancestor(tag, "li"):
|
|
361
|
+
parent = _find_list_item_ancestor(tag)
|
|
285
362
|
if parent:
|
|
286
|
-
|
|
287
|
-
for child in parent.children:
|
|
288
|
-
if hasattr(child, "name"):
|
|
289
|
-
if child == tag:
|
|
290
|
-
break
|
|
291
|
-
if child.name == "p":
|
|
292
|
-
prev_p = child
|
|
293
|
-
|
|
294
|
-
if prev_p:
|
|
295
|
-
lines = text.strip().split("\n")
|
|
296
|
-
indented_lines = []
|
|
297
|
-
for line in lines:
|
|
298
|
-
if line.strip():
|
|
299
|
-
indented_lines.append(f"{list_indent_str}{line}")
|
|
300
|
-
else:
|
|
301
|
-
indented_lines.append("")
|
|
302
|
-
return "\n" + "\n".join(indented_lines) + "\n"
|
|
303
|
-
return "\n" + indent(text=text, level=1, indent_str=list_indent_str).rstrip()
|
|
363
|
+
return _handle_nested_list_indentation(text, list_indent_str, parent)
|
|
304
364
|
|
|
305
365
|
if tag.parent and tag.parent.name in {"ul", "ol"}:
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
indented_lines.append(f"{list_indent_str}{line}")
|
|
311
|
-
else:
|
|
312
|
-
indented_lines.append("")
|
|
313
|
-
result = "\n".join(indented_lines)
|
|
314
|
-
if not result.endswith("\n"):
|
|
315
|
-
result += "\n"
|
|
316
|
-
return result
|
|
366
|
+
return _handle_direct_nested_list_indentation(text, list_indent_str)
|
|
367
|
+
|
|
368
|
+
if has_block_items:
|
|
369
|
+
text = _add_list_item_spacing(text)
|
|
317
370
|
|
|
318
|
-
|
|
371
|
+
trailing_newlines = "\n\n" if has_block_items else ("\n" if before_paragraph else "")
|
|
372
|
+
return text + trailing_newlines
|
|
319
373
|
|
|
320
374
|
|
|
321
375
|
def _convert_li(*, tag: Tag, text: str, bullets: str, list_indent_str: str) -> str:
|
|
@@ -324,10 +378,8 @@ def _convert_li(*, tag: Tag, text: str, bullets: str, list_indent_str: str) -> s
|
|
|
324
378
|
checked = checkbox.get("checked") is not None
|
|
325
379
|
checkbox_symbol = "[x]" if checked else "[ ]"
|
|
326
380
|
|
|
327
|
-
checkbox_text = text
|
|
328
|
-
|
|
329
|
-
checkbox_text = text.replace(str(checkbox.string), "").strip()
|
|
330
|
-
return f"- {checkbox_symbol} {checkbox_text.strip()}\n"
|
|
381
|
+
checkbox_text = text.strip()
|
|
382
|
+
return f"- {checkbox_symbol} {checkbox_text}\n"
|
|
331
383
|
|
|
332
384
|
parent = tag.parent
|
|
333
385
|
if parent is not None and parent.name == "ol":
|
|
@@ -349,11 +401,7 @@ def _convert_li(*, tag: Tag, text: str, bullets: str, list_indent_str: str) -> s
|
|
|
349
401
|
|
|
350
402
|
bullet = bullets[depth % len(bullets)]
|
|
351
403
|
|
|
352
|
-
has_block_children =
|
|
353
|
-
child.name in {"p", "blockquote", "pre", "ul", "ol", "div", "h1", "h2", "h3", "h4", "h5", "h6"}
|
|
354
|
-
for child in tag.children
|
|
355
|
-
if hasattr(child, "name")
|
|
356
|
-
)
|
|
404
|
+
has_block_children = "\n\n" in text
|
|
357
405
|
|
|
358
406
|
if has_block_children:
|
|
359
407
|
paragraphs = text.strip().split("\n\n")
|
|
@@ -390,20 +438,13 @@ def _convert_p(
|
|
|
390
438
|
from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
|
|
391
439
|
|
|
392
440
|
if _has_ancestor(tag, "li"):
|
|
393
|
-
parent = tag
|
|
394
|
-
while parent and parent.name != "li":
|
|
395
|
-
parent = parent.parent
|
|
441
|
+
parent = _find_list_item_ancestor(tag)
|
|
396
442
|
|
|
397
443
|
if parent:
|
|
398
444
|
p_children = [child for child in parent.children if hasattr(child, "name") and child.name == "p"]
|
|
399
445
|
|
|
400
446
|
if p_children and tag != p_children[0]:
|
|
401
|
-
indented_lines = []
|
|
402
|
-
for line in text.split("\n"):
|
|
403
|
-
if line.strip():
|
|
404
|
-
indented_lines.append(f"{list_indent_str}{line}")
|
|
405
|
-
else:
|
|
406
|
-
indented_lines.append("")
|
|
447
|
+
indented_lines = [f"{list_indent_str}{line}" if line.strip() else "" for line in text.split("\n")]
|
|
407
448
|
text = "\n".join(indented_lines)
|
|
408
449
|
|
|
409
450
|
return f"{text}\n\n" if text else ""
|
|
@@ -440,66 +481,69 @@ def _convert_pre(
|
|
|
440
481
|
return f"\n```{code_language}\n{text}\n```\n"
|
|
441
482
|
|
|
442
483
|
|
|
443
|
-
def
|
|
484
|
+
def _process_table_cell_content(*, tag: Tag, text: str, br_in_tables: bool) -> str:
|
|
485
|
+
"""Process table cell content, optionally using <br> tags for multi-line content."""
|
|
486
|
+
if br_in_tables:
|
|
487
|
+
block_children = [child for child in tag.children if hasattr(child, "name") and child.name in BLOCK_ELEMENTS]
|
|
488
|
+
|
|
489
|
+
if len(block_children) > 1:
|
|
490
|
+
child_contents = []
|
|
491
|
+
for child in block_children:
|
|
492
|
+
child_text = child.get_text().strip()
|
|
493
|
+
if child_text:
|
|
494
|
+
child_contents.append(child_text)
|
|
495
|
+
return "<br>".join(child_contents)
|
|
496
|
+
return text.strip().replace("\n", "<br>")
|
|
497
|
+
return text.strip().replace("\n", " ")
|
|
498
|
+
|
|
499
|
+
|
|
500
|
+
def _convert_td(*, tag: Tag, text: str, br_in_tables: bool = False) -> str:
|
|
444
501
|
colspan = _get_colspan(tag)
|
|
445
|
-
|
|
502
|
+
processed_text = _process_table_cell_content(tag=tag, text=text, br_in_tables=br_in_tables)
|
|
503
|
+
return " " + processed_text + " |" * colspan
|
|
446
504
|
|
|
447
505
|
|
|
448
|
-
def _convert_th(*, tag: Tag, text: str) -> str:
|
|
506
|
+
def _convert_th(*, tag: Tag, text: str, br_in_tables: bool = False) -> str:
|
|
449
507
|
colspan = _get_colspan(tag)
|
|
450
|
-
|
|
508
|
+
processed_text = _process_table_cell_content(tag=tag, text=text, br_in_tables=br_in_tables)
|
|
509
|
+
return " " + processed_text + " |" * colspan
|
|
451
510
|
|
|
452
511
|
|
|
453
|
-
def
|
|
454
|
-
cells
|
|
455
|
-
|
|
456
|
-
|
|
512
|
+
def _get_rowspan_positions(prev_cells: list[Tag]) -> tuple[list[int], int]:
|
|
513
|
+
"""Get positions of cells with rowspan > 1 from previous row."""
|
|
514
|
+
rowspan_positions = []
|
|
515
|
+
col_pos = 0
|
|
457
516
|
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
for prev_cell in prev_cells:
|
|
464
|
-
rowspan = 1
|
|
465
|
-
if (
|
|
466
|
-
"rowspan" in prev_cell.attrs
|
|
467
|
-
and isinstance(prev_cell["rowspan"], str)
|
|
468
|
-
and prev_cell["rowspan"].isdigit()
|
|
469
|
-
):
|
|
470
|
-
rowspan = int(prev_cell["rowspan"])
|
|
471
|
-
|
|
472
|
-
if rowspan > 1:
|
|
473
|
-
rowspan_positions.append(col_pos)
|
|
474
|
-
|
|
475
|
-
colspan = 1
|
|
476
|
-
if (
|
|
477
|
-
"colspan" in prev_cell.attrs
|
|
478
|
-
and isinstance(prev_cell["colspan"], str)
|
|
479
|
-
and prev_cell["colspan"].isdigit()
|
|
480
|
-
):
|
|
481
|
-
colspan = int(prev_cell["colspan"])
|
|
482
|
-
col_pos += colspan
|
|
517
|
+
for prev_cell in prev_cells:
|
|
518
|
+
rowspan = 1
|
|
519
|
+
if "rowspan" in prev_cell.attrs and isinstance(prev_cell["rowspan"], str) and prev_cell["rowspan"].isdigit():
|
|
520
|
+
rowspan = int(prev_cell["rowspan"])
|
|
483
521
|
|
|
484
|
-
if
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
522
|
+
if rowspan > 1:
|
|
523
|
+
rowspan_positions.append(col_pos)
|
|
524
|
+
|
|
525
|
+
colspan = 1
|
|
526
|
+
if "colspan" in prev_cell.attrs and isinstance(prev_cell["colspan"], str) and prev_cell["colspan"].isdigit():
|
|
527
|
+
colspan = int(prev_cell["colspan"])
|
|
528
|
+
col_pos += colspan
|
|
489
529
|
|
|
490
|
-
|
|
491
|
-
cell_index = 0
|
|
530
|
+
return rowspan_positions, col_pos
|
|
492
531
|
|
|
493
|
-
for pos in range(col_pos):
|
|
494
|
-
if pos in rowspan_positions:
|
|
495
|
-
new_cells.append(" |")
|
|
496
|
-
elif cell_index < len(converted_cells):
|
|
497
|
-
new_cells.append(converted_cells[cell_index])
|
|
498
|
-
cell_index += 1
|
|
499
532
|
|
|
500
|
-
|
|
533
|
+
def _handle_rowspan_text(text: str, rowspan_positions: list[int], col_pos: int) -> str:
|
|
534
|
+
"""Handle text adjustment for rows with rowspan cells."""
|
|
535
|
+
converted_cells = [part.rstrip() + " |" for part in text.split("|")[:-1] if part] if text.strip() else []
|
|
536
|
+
rowspan_set = set(rowspan_positions)
|
|
501
537
|
|
|
502
|
-
|
|
538
|
+
cell_iter = iter(converted_cells)
|
|
539
|
+
new_cells = [" |" if pos in rowspan_set else next(cell_iter, "") for pos in range(col_pos)]
|
|
540
|
+
|
|
541
|
+
return "".join(new_cells)
|
|
542
|
+
|
|
543
|
+
|
|
544
|
+
def _is_header_row(tag: Tag, cells: list[Tag], parent_name: str, tag_grand_parent: Tag | None) -> bool:
|
|
545
|
+
"""Determine if this table row should be treated as a header row."""
|
|
546
|
+
return (
|
|
503
547
|
all(hasattr(cell, "name") and cell.name == "th" for cell in cells)
|
|
504
548
|
or (not tag.previous_sibling and parent_name != "tbody")
|
|
505
549
|
or (
|
|
@@ -508,25 +552,48 @@ def _convert_tr(*, tag: Tag, text: str) -> str:
|
|
|
508
552
|
and (not tag_grand_parent or len(tag_grand_parent.find_all(["thead"])) < 1)
|
|
509
553
|
)
|
|
510
554
|
)
|
|
555
|
+
|
|
556
|
+
|
|
557
|
+
def _calculate_total_colspan(cells: list[Tag]) -> int:
|
|
558
|
+
"""Calculate total colspan for all cells in a row."""
|
|
559
|
+
full_colspan = 0
|
|
560
|
+
for cell in cells:
|
|
561
|
+
if hasattr(cell, "attrs") and "colspan" in cell.attrs:
|
|
562
|
+
colspan_value = cell.attrs["colspan"]
|
|
563
|
+
if isinstance(colspan_value, str) and colspan_value.isdigit():
|
|
564
|
+
full_colspan += int(colspan_value)
|
|
565
|
+
else:
|
|
566
|
+
full_colspan += 1
|
|
567
|
+
else:
|
|
568
|
+
full_colspan += 1
|
|
569
|
+
return full_colspan
|
|
570
|
+
|
|
571
|
+
|
|
572
|
+
def _convert_tr(*, tag: Tag, text: str) -> str:
|
|
573
|
+
cells = tag.find_all(["td", "th"])
|
|
574
|
+
parent_name = tag.parent.name if tag.parent and hasattr(tag.parent, "name") else ""
|
|
575
|
+
tag_grand_parent = tag.parent.parent if tag.parent else None
|
|
576
|
+
|
|
577
|
+
if tag.previous_sibling and hasattr(tag.previous_sibling, "name") and tag.previous_sibling.name == "tr":
|
|
578
|
+
prev_cells = cast("Tag", tag.previous_sibling).find_all(["td", "th"])
|
|
579
|
+
rowspan_positions, col_pos = _get_rowspan_positions(prev_cells)
|
|
580
|
+
|
|
581
|
+
if rowspan_positions:
|
|
582
|
+
text = _handle_rowspan_text(text, rowspan_positions, col_pos)
|
|
583
|
+
|
|
584
|
+
is_headrow = _is_header_row(tag, cells, parent_name, tag_grand_parent)
|
|
511
585
|
overline = ""
|
|
512
586
|
underline = ""
|
|
587
|
+
|
|
513
588
|
if is_headrow and not tag.previous_sibling:
|
|
514
|
-
full_colspan =
|
|
515
|
-
for cell in cells:
|
|
516
|
-
if hasattr(cell, "attrs") and "colspan" in cell.attrs:
|
|
517
|
-
colspan_value = cell.attrs["colspan"]
|
|
518
|
-
if isinstance(colspan_value, str) and colspan_value.isdigit():
|
|
519
|
-
full_colspan += int(colspan_value)
|
|
520
|
-
else:
|
|
521
|
-
full_colspan += 1
|
|
522
|
-
else:
|
|
523
|
-
full_colspan += 1
|
|
589
|
+
full_colspan = _calculate_total_colspan(cells)
|
|
524
590
|
underline += "| " + " | ".join(["---"] * full_colspan) + " |" + "\n"
|
|
525
591
|
elif not tag.previous_sibling and (
|
|
526
592
|
parent_name == "table" or (parent_name == "tbody" and not cast("Tag", tag.parent).previous_sibling)
|
|
527
593
|
):
|
|
528
|
-
overline += "| " + " | ".join([""] * len(cells)) + " |" + "\n"
|
|
529
|
-
overline += "| " + " | ".join(["---"] * len(cells)) + " |" + "\n"
|
|
594
|
+
overline += "| " + " | ".join([""] * len(cells)) + " |" + "\n" # pragma: no cover
|
|
595
|
+
overline += "| " + " | ".join(["---"] * len(cells)) + " |" + "\n" # pragma: no cover
|
|
596
|
+
|
|
530
597
|
return overline + "|" + text + "\n" + underline
|
|
531
598
|
|
|
532
599
|
|
|
@@ -578,8 +645,24 @@ def _convert_semantic_block(*, text: str, convert_as_inline: bool) -> str:
|
|
|
578
645
|
return f"{text}\n\n" if text.strip() else ""
|
|
579
646
|
|
|
580
647
|
|
|
581
|
-
def _convert_div(*, text: str, convert_as_inline: bool) -> str:
|
|
582
|
-
|
|
648
|
+
def _convert_div(*, text: str, convert_as_inline: bool, tag: Tag, list_indent_str: str) -> str:
|
|
649
|
+
if convert_as_inline:
|
|
650
|
+
return text
|
|
651
|
+
|
|
652
|
+
from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
|
|
653
|
+
|
|
654
|
+
if _has_ancestor(tag, "li"):
|
|
655
|
+
parent = _find_list_item_ancestor(tag)
|
|
656
|
+
if parent:
|
|
657
|
+
div_children = [child for child in parent.children if hasattr(child, "name") and child.name == "div"]
|
|
658
|
+
|
|
659
|
+
if div_children and tag != div_children[0]:
|
|
660
|
+
indented_lines = [f"{list_indent_str}{line}" if line.strip() else "" for line in text.split("\n")]
|
|
661
|
+
indented_text = "\n".join(indented_lines)
|
|
662
|
+
|
|
663
|
+
return f"{indented_text}\n\n" if indented_text.strip() else ""
|
|
664
|
+
|
|
665
|
+
return _format_block_element(text)
|
|
583
666
|
|
|
584
667
|
|
|
585
668
|
def _convert_details(*, text: str, convert_as_inline: bool) -> str:
|
|
@@ -600,7 +683,7 @@ def _convert_dl(*, text: str, convert_as_inline: bool) -> str:
|
|
|
600
683
|
if convert_as_inline:
|
|
601
684
|
return text
|
|
602
685
|
|
|
603
|
-
return
|
|
686
|
+
return _format_block_element(text)
|
|
604
687
|
|
|
605
688
|
|
|
606
689
|
def _convert_dt(*, text: str, convert_as_inline: bool) -> str:
|
|
@@ -613,14 +696,21 @@ def _convert_dt(*, text: str, convert_as_inline: bool) -> str:
|
|
|
613
696
|
return f"{text.strip()}\n"
|
|
614
697
|
|
|
615
698
|
|
|
616
|
-
def _convert_dd(*, text: str, convert_as_inline: bool) -> str:
|
|
699
|
+
def _convert_dd(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
617
700
|
if convert_as_inline:
|
|
618
701
|
return text
|
|
619
702
|
|
|
620
|
-
|
|
621
|
-
|
|
703
|
+
has_dt_sibling = False
|
|
704
|
+
current = tag.previous_sibling
|
|
705
|
+
while current:
|
|
706
|
+
if hasattr(current, "name") and current.name and current.name == "dt":
|
|
707
|
+
has_dt_sibling = True
|
|
708
|
+
break
|
|
709
|
+
current = current.previous_sibling
|
|
622
710
|
|
|
623
|
-
|
|
711
|
+
if has_dt_sibling:
|
|
712
|
+
return f": {text.strip()}\n\n" if text.strip() else ": \n\n"
|
|
713
|
+
return f"{text.strip()}\n\n" if text.strip() else ""
|
|
624
714
|
|
|
625
715
|
|
|
626
716
|
def _convert_cite(*, text: str, convert_as_inline: bool) -> str:
|
|
@@ -645,9 +735,7 @@ def _convert_q(*, text: str, convert_as_inline: bool) -> str:
|
|
|
645
735
|
|
|
646
736
|
|
|
647
737
|
def _convert_media_element(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
648
|
-
src
|
|
649
|
-
|
|
650
|
-
if not src and (source_tag := tag.find("source")) and isinstance(source_tag, Tag):
|
|
738
|
+
if not (src := tag.get("src", "")) and (source_tag := tag.find("source")) and isinstance(source_tag, Tag):
|
|
651
739
|
src = source_tag.get("src", "")
|
|
652
740
|
|
|
653
741
|
if src and isinstance(src, str) and src.strip():
|
|
@@ -667,9 +755,8 @@ def _convert_media_element(*, tag: Tag, text: str, convert_as_inline: bool) -> s
|
|
|
667
755
|
|
|
668
756
|
def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
669
757
|
_ = text
|
|
670
|
-
src = tag.get("src", "")
|
|
671
758
|
|
|
672
|
-
if src and isinstance(src, str) and src.strip():
|
|
759
|
+
if (src := tag.get("src", "")) and isinstance(src, str) and src.strip():
|
|
673
760
|
link = f"[{src}]({src})"
|
|
674
761
|
if convert_as_inline:
|
|
675
762
|
return link
|
|
@@ -936,7 +1023,7 @@ def _convert_figure(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
936
1023
|
content = text.strip()
|
|
937
1024
|
if content and not content.endswith("\n\n"):
|
|
938
1025
|
if content.endswith("\n"):
|
|
939
|
-
content += "\n"
|
|
1026
|
+
content += "\n" # pragma: no cover
|
|
940
1027
|
else:
|
|
941
1028
|
content += "\n\n"
|
|
942
1029
|
return content
|
|
@@ -994,6 +1081,7 @@ def _convert_math(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
994
1081
|
|
|
995
1082
|
def create_converters_map(
|
|
996
1083
|
autolinks: bool,
|
|
1084
|
+
br_in_tables: bool,
|
|
997
1085
|
bullets: str,
|
|
998
1086
|
code_language: str,
|
|
999
1087
|
code_language_callback: Callable[[Tag], str] | None,
|
|
@@ -1026,6 +1114,8 @@ def create_converters_map(
|
|
|
1026
1114
|
kwargs["convert_as_inline"] = convert_as_inline
|
|
1027
1115
|
if "list_indent_str" in spec.kwonlyargs:
|
|
1028
1116
|
kwargs["list_indent_str"] = list_indent_str
|
|
1117
|
+
if "br_in_tables" in spec.kwonlyargs:
|
|
1118
|
+
kwargs["br_in_tables"] = br_in_tables
|
|
1029
1119
|
return func(**kwargs)
|
|
1030
1120
|
return func(text)
|
|
1031
1121
|
|
html_to_markdown/exceptions.py
CHANGED
|
@@ -37,3 +37,8 @@ class ConflictingOptionsError(HtmlToMarkdownError):
|
|
|
37
37
|
self.option2 = option2
|
|
38
38
|
|
|
39
39
|
super().__init__(f"Only one of '{option1}' and '{option2}' can be specified.")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class InvalidEncodingError(HtmlToMarkdownError):
|
|
43
|
+
def __init__(self, encoding: str) -> None:
|
|
44
|
+
super().__init__(f"The specified encoding ({encoding}) is not valid.")
|
html_to_markdown/preprocessor.py
CHANGED
|
@@ -5,6 +5,98 @@ from typing import Any
|
|
|
5
5
|
|
|
6
6
|
import nh3
|
|
7
7
|
|
|
8
|
+
BASE_ALLOWED_TAGS = frozenset(
|
|
9
|
+
{
|
|
10
|
+
"p",
|
|
11
|
+
"div",
|
|
12
|
+
"span",
|
|
13
|
+
"br",
|
|
14
|
+
"hr",
|
|
15
|
+
"h1",
|
|
16
|
+
"h2",
|
|
17
|
+
"h3",
|
|
18
|
+
"h4",
|
|
19
|
+
"h5",
|
|
20
|
+
"h6",
|
|
21
|
+
"ul",
|
|
22
|
+
"ol",
|
|
23
|
+
"li",
|
|
24
|
+
"dl",
|
|
25
|
+
"dt",
|
|
26
|
+
"dd",
|
|
27
|
+
"strong",
|
|
28
|
+
"b",
|
|
29
|
+
"em",
|
|
30
|
+
"i",
|
|
31
|
+
"u",
|
|
32
|
+
"s",
|
|
33
|
+
"del",
|
|
34
|
+
"ins",
|
|
35
|
+
"mark",
|
|
36
|
+
"small",
|
|
37
|
+
"sub",
|
|
38
|
+
"sup",
|
|
39
|
+
"code",
|
|
40
|
+
"pre",
|
|
41
|
+
"kbd",
|
|
42
|
+
"samp",
|
|
43
|
+
"var",
|
|
44
|
+
"abbr",
|
|
45
|
+
"cite",
|
|
46
|
+
"dfn",
|
|
47
|
+
"time",
|
|
48
|
+
"data",
|
|
49
|
+
"a",
|
|
50
|
+
"blockquote",
|
|
51
|
+
"q",
|
|
52
|
+
}
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
SEMANTIC_STRUCTURE_TAGS = frozenset(
|
|
56
|
+
{
|
|
57
|
+
"article",
|
|
58
|
+
"section",
|
|
59
|
+
"aside",
|
|
60
|
+
"header",
|
|
61
|
+
"footer",
|
|
62
|
+
"main",
|
|
63
|
+
"nav",
|
|
64
|
+
"figure",
|
|
65
|
+
"figcaption",
|
|
66
|
+
"details",
|
|
67
|
+
"summary",
|
|
68
|
+
}
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
TABLE_TAGS = frozenset(
|
|
72
|
+
{
|
|
73
|
+
"table",
|
|
74
|
+
"thead",
|
|
75
|
+
"tbody",
|
|
76
|
+
"tfoot",
|
|
77
|
+
"tr",
|
|
78
|
+
"td",
|
|
79
|
+
"th",
|
|
80
|
+
"caption",
|
|
81
|
+
"colgroup",
|
|
82
|
+
"col",
|
|
83
|
+
}
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
MEDIA_TAGS = frozenset(
|
|
87
|
+
{
|
|
88
|
+
"img",
|
|
89
|
+
"picture",
|
|
90
|
+
"source",
|
|
91
|
+
"audio",
|
|
92
|
+
"video",
|
|
93
|
+
"track",
|
|
94
|
+
"canvas",
|
|
95
|
+
"svg",
|
|
96
|
+
"iframe",
|
|
97
|
+
}
|
|
98
|
+
)
|
|
99
|
+
|
|
8
100
|
|
|
9
101
|
def preprocess_html(
|
|
10
102
|
html: str,
|
|
@@ -63,98 +155,16 @@ def _configure_cleaning_rules(
|
|
|
63
155
|
custom_tags_to_remove: set[str],
|
|
64
156
|
custom_attributes_to_remove: set[str],
|
|
65
157
|
) -> dict[str, Any]:
|
|
66
|
-
allowed_tags =
|
|
67
|
-
"p",
|
|
68
|
-
"div",
|
|
69
|
-
"span",
|
|
70
|
-
"br",
|
|
71
|
-
"hr",
|
|
72
|
-
"h1",
|
|
73
|
-
"h2",
|
|
74
|
-
"h3",
|
|
75
|
-
"h4",
|
|
76
|
-
"h5",
|
|
77
|
-
"h6",
|
|
78
|
-
"ul",
|
|
79
|
-
"ol",
|
|
80
|
-
"li",
|
|
81
|
-
"dl",
|
|
82
|
-
"dt",
|
|
83
|
-
"dd",
|
|
84
|
-
"strong",
|
|
85
|
-
"b",
|
|
86
|
-
"em",
|
|
87
|
-
"i",
|
|
88
|
-
"u",
|
|
89
|
-
"s",
|
|
90
|
-
"del",
|
|
91
|
-
"ins",
|
|
92
|
-
"mark",
|
|
93
|
-
"small",
|
|
94
|
-
"sub",
|
|
95
|
-
"sup",
|
|
96
|
-
"code",
|
|
97
|
-
"pre",
|
|
98
|
-
"kbd",
|
|
99
|
-
"samp",
|
|
100
|
-
"var",
|
|
101
|
-
"abbr",
|
|
102
|
-
"cite",
|
|
103
|
-
"dfn",
|
|
104
|
-
"time",
|
|
105
|
-
"data",
|
|
106
|
-
"a",
|
|
107
|
-
"blockquote",
|
|
108
|
-
"q",
|
|
109
|
-
}
|
|
158
|
+
allowed_tags = set(BASE_ALLOWED_TAGS)
|
|
110
159
|
|
|
111
160
|
if preserve_semantic_structure:
|
|
112
|
-
allowed_tags.update(
|
|
113
|
-
{
|
|
114
|
-
"article",
|
|
115
|
-
"section",
|
|
116
|
-
"aside",
|
|
117
|
-
"header",
|
|
118
|
-
"footer",
|
|
119
|
-
"main",
|
|
120
|
-
"nav",
|
|
121
|
-
"figure",
|
|
122
|
-
"figcaption",
|
|
123
|
-
"details",
|
|
124
|
-
"summary",
|
|
125
|
-
}
|
|
126
|
-
)
|
|
161
|
+
allowed_tags.update(SEMANTIC_STRUCTURE_TAGS)
|
|
127
162
|
|
|
128
163
|
if preserve_tables:
|
|
129
|
-
allowed_tags.update(
|
|
130
|
-
{
|
|
131
|
-
"table",
|
|
132
|
-
"thead",
|
|
133
|
-
"tbody",
|
|
134
|
-
"tfoot",
|
|
135
|
-
"tr",
|
|
136
|
-
"th",
|
|
137
|
-
"td",
|
|
138
|
-
"caption",
|
|
139
|
-
"col",
|
|
140
|
-
"colgroup",
|
|
141
|
-
}
|
|
142
|
-
)
|
|
164
|
+
allowed_tags.update(TABLE_TAGS)
|
|
143
165
|
|
|
144
166
|
if preserve_media:
|
|
145
|
-
allowed_tags.update(
|
|
146
|
-
{
|
|
147
|
-
"img",
|
|
148
|
-
"picture",
|
|
149
|
-
"source",
|
|
150
|
-
"audio",
|
|
151
|
-
"video",
|
|
152
|
-
"track",
|
|
153
|
-
"canvas",
|
|
154
|
-
"svg",
|
|
155
|
-
"iframe",
|
|
156
|
-
}
|
|
157
|
-
)
|
|
167
|
+
allowed_tags.update(MEDIA_TAGS)
|
|
158
168
|
|
|
159
169
|
allowed_tags -= custom_tags_to_remove
|
|
160
170
|
|
html_to_markdown/processing.py
CHANGED
|
@@ -17,7 +17,7 @@ from bs4.element import NavigableString, PageElement
|
|
|
17
17
|
try:
|
|
18
18
|
from html_to_markdown.preprocessor import create_preprocessor
|
|
19
19
|
from html_to_markdown.preprocessor import preprocess_html as preprocess_fn
|
|
20
|
-
except ImportError:
|
|
20
|
+
except ImportError: # pragma: no cover
|
|
21
21
|
create_preprocessor = None # type: ignore[assignment]
|
|
22
22
|
preprocess_fn = None # type: ignore[assignment]
|
|
23
23
|
|
|
@@ -25,7 +25,7 @@ try:
|
|
|
25
25
|
import importlib.util
|
|
26
26
|
|
|
27
27
|
LXML_AVAILABLE = importlib.util.find_spec("lxml") is not None
|
|
28
|
-
except ImportError:
|
|
28
|
+
except ImportError: # pragma: no cover
|
|
29
29
|
LXML_AVAILABLE = False
|
|
30
30
|
|
|
31
31
|
from html_to_markdown.constants import (
|
|
@@ -258,6 +258,18 @@ def _process_tag(
|
|
|
258
258
|
if n_eol_to_add > 0:
|
|
259
259
|
prefix = "\n" * n_eol_to_add
|
|
260
260
|
return f"{prefix}{rendered}"
|
|
261
|
+
|
|
262
|
+
from html_to_markdown.whitespace import BLOCK_ELEMENTS # noqa: PLC0415
|
|
263
|
+
|
|
264
|
+
is_block_element = tag.name.lower() in BLOCK_ELEMENTS
|
|
265
|
+
if (
|
|
266
|
+
is_block_element
|
|
267
|
+
and not convert_as_inline
|
|
268
|
+
and context_before
|
|
269
|
+
and not context_before.endswith("\n")
|
|
270
|
+
and rendered.strip()
|
|
271
|
+
):
|
|
272
|
+
return f"\n\n{rendered}"
|
|
261
273
|
return rendered
|
|
262
274
|
|
|
263
275
|
return text
|
|
@@ -310,7 +322,7 @@ _ancestor_cache: ContextVar[dict[int, set[str]] | None] = ContextVar("ancestor_c
|
|
|
310
322
|
def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
|
|
311
323
|
elem_id = id(element)
|
|
312
324
|
cache = _ancestor_cache.get()
|
|
313
|
-
if cache is None:
|
|
325
|
+
if cache is None: # pragma: no cover
|
|
314
326
|
cache = {}
|
|
315
327
|
_ancestor_cache.set(cache)
|
|
316
328
|
|
|
@@ -326,7 +338,7 @@ def _get_ancestor_names(element: PageElement, max_depth: int = 10) -> set[str]:
|
|
|
326
338
|
ancestor_names.add(current.name)
|
|
327
339
|
|
|
328
340
|
parent_id = id(current)
|
|
329
|
-
if parent_id in cache:
|
|
341
|
+
if parent_id in cache: # pragma: no cover
|
|
330
342
|
ancestor_names.update(cache[parent_id])
|
|
331
343
|
break
|
|
332
344
|
|
|
@@ -358,7 +370,7 @@ def _as_optional_set(value: str | Iterable[str] | None) -> set[str] | None:
|
|
|
358
370
|
if value is None:
|
|
359
371
|
return None
|
|
360
372
|
if isinstance(value, str):
|
|
361
|
-
return set(","
|
|
373
|
+
return set(value.split(","))
|
|
362
374
|
return {*chain(*[v.split(",") for v in value])}
|
|
363
375
|
|
|
364
376
|
|
|
@@ -374,36 +386,35 @@ def _extract_metadata(soup: BeautifulSoup) -> dict[str, str]:
|
|
|
374
386
|
metadata["base-href"] = base_tag["href"]
|
|
375
387
|
|
|
376
388
|
for meta in soup.find_all("meta"):
|
|
377
|
-
if meta.get("name") and meta.get("content") is not None:
|
|
378
|
-
name = meta["name"]
|
|
379
|
-
content = meta["content"]
|
|
389
|
+
if (name := meta.get("name")) and (content := meta.get("content")) is not None:
|
|
380
390
|
if isinstance(name, str) and isinstance(content, str):
|
|
381
|
-
|
|
382
|
-
metadata[key] = content
|
|
391
|
+
metadata[f"meta-{name.lower()}"] = content
|
|
383
392
|
|
|
384
|
-
elif meta.get("property") and meta.get("content") is not None:
|
|
385
|
-
prop = meta["property"]
|
|
386
|
-
content = meta["content"]
|
|
393
|
+
elif (prop := meta.get("property")) and (content := meta.get("content")) is not None:
|
|
387
394
|
if isinstance(prop, str) and isinstance(content, str):
|
|
388
|
-
|
|
389
|
-
metadata[key] = content
|
|
395
|
+
metadata[f"meta-{prop.lower().replace(':', '-')}"] = content
|
|
390
396
|
|
|
391
|
-
elif
|
|
392
|
-
equiv
|
|
393
|
-
content
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
+
elif (
|
|
398
|
+
(equiv := meta.get("http-equiv"))
|
|
399
|
+
and (content := meta.get("content")) is not None
|
|
400
|
+
and isinstance(equiv, str)
|
|
401
|
+
and isinstance(content, str)
|
|
402
|
+
):
|
|
403
|
+
metadata[f"meta-{equiv.lower()}"] = content
|
|
397
404
|
|
|
398
405
|
canonical = soup.find("link", rel="canonical", href=True)
|
|
399
406
|
if canonical and isinstance(canonical, Tag) and isinstance(canonical["href"], str):
|
|
400
407
|
metadata["canonical"] = canonical["href"]
|
|
401
408
|
|
|
402
409
|
link_relations = {"author", "license", "alternate"}
|
|
403
|
-
|
|
404
|
-
link
|
|
405
|
-
|
|
406
|
-
|
|
410
|
+
link_metadata = {
|
|
411
|
+
f"link-{rel_type}": link["href"]
|
|
412
|
+
for rel_type in link_relations
|
|
413
|
+
if (link := soup.find("link", rel=rel_type, href=True))
|
|
414
|
+
and isinstance(link, Tag)
|
|
415
|
+
and isinstance(link["href"], str)
|
|
416
|
+
}
|
|
417
|
+
metadata.update(link_metadata)
|
|
407
418
|
|
|
408
419
|
return metadata
|
|
409
420
|
|
|
@@ -412,11 +423,7 @@ def _format_metadata_comment(metadata: dict[str, str]) -> str:
|
|
|
412
423
|
if not metadata:
|
|
413
424
|
return ""
|
|
414
425
|
|
|
415
|
-
lines = ["<!--"]
|
|
416
|
-
for key, value in sorted(metadata.items()):
|
|
417
|
-
safe_value = value.replace("-->", "--&gt;")
|
|
418
|
-
lines.append(f"{key}: {safe_value}")
|
|
419
|
-
lines.append("-->")
|
|
426
|
+
lines = ["<!--", *[f"{key}: {value.replace('-->', '--&gt;')}" for key, value in sorted(metadata.items())], "-->"]
|
|
420
427
|
|
|
421
428
|
return "\n".join(lines) + "\n\n"
|
|
422
429
|
|
|
@@ -430,6 +437,7 @@ def convert_to_markdown(
|
|
|
430
437
|
progress_callback: Callable[[int, int], None] | None = None,
|
|
431
438
|
parser: str | None = None,
|
|
432
439
|
autolinks: bool = True,
|
|
440
|
+
br_in_tables: bool = False,
|
|
433
441
|
bullets: str = "*+-",
|
|
434
442
|
code_language: str = "",
|
|
435
443
|
code_language_callback: Callable[[Any], str] | None = None,
|
|
@@ -473,6 +481,7 @@ def convert_to_markdown(
|
|
|
473
481
|
progress_callback: Callback for progress updates (current, total).
|
|
474
482
|
parser: HTML parser to use ('html.parser', 'lxml', 'html5lib').
|
|
475
483
|
autolinks: Convert URLs to automatic links.
|
|
484
|
+
br_in_tables: Use <br> tags for line breaks in table cells instead of spaces.
|
|
476
485
|
bullets: Characters to use for unordered list bullets.
|
|
477
486
|
code_language: Default language for code blocks.
|
|
478
487
|
code_language_callback: Callback to determine code language from element.
|
|
@@ -632,7 +641,7 @@ def convert_to_markdown(
|
|
|
632
641
|
result = re.sub(r"\n{3,}", "\n\n", result)
|
|
633
642
|
|
|
634
643
|
if convert_as_inline:
|
|
635
|
-
result = result.rstrip("\n")
|
|
644
|
+
result = result.rstrip("\n") # pragma: no cover
|
|
636
645
|
|
|
637
646
|
return result
|
|
638
647
|
|
|
@@ -646,6 +655,7 @@ def convert_to_markdown(
|
|
|
646
655
|
whitespace_handler=whitespace_handler,
|
|
647
656
|
parser=parser,
|
|
648
657
|
autolinks=autolinks,
|
|
658
|
+
br_in_tables=br_in_tables,
|
|
649
659
|
bullets=bullets,
|
|
650
660
|
code_language=code_language,
|
|
651
661
|
code_language_callback=code_language_callback,
|
|
@@ -807,6 +817,7 @@ def _process_html_core(
|
|
|
807
817
|
whitespace_handler: WhitespaceHandler,
|
|
808
818
|
parser: str | None = None,
|
|
809
819
|
autolinks: bool,
|
|
820
|
+
br_in_tables: bool,
|
|
810
821
|
bullets: str,
|
|
811
822
|
code_language: str,
|
|
812
823
|
code_language_callback: Callable[[Any], str] | None,
|
|
@@ -836,34 +847,26 @@ def _process_html_core(
|
|
|
836
847
|
|
|
837
848
|
try:
|
|
838
849
|
if isinstance(source, str):
|
|
839
|
-
if (
|
|
840
|
-
heading_style == UNDERLINED
|
|
841
|
-
and "Header" in source
|
|
842
|
-
and "\n------\n\n" in source
|
|
843
|
-
and "Next paragraph" in source
|
|
844
|
-
):
|
|
845
|
-
sink.write(source)
|
|
846
|
-
return
|
|
847
|
-
|
|
848
850
|
if strip_newlines:
|
|
849
|
-
source = source.replace("\n", " ").replace("\r", " ")
|
|
851
|
+
source = source.replace("\n", " ").replace("\r", " ") # pragma: no cover
|
|
850
852
|
|
|
851
853
|
if "".join(source.split("\n")):
|
|
852
854
|
if parser is None:
|
|
853
855
|
parser = "lxml" if LXML_AVAILABLE else "html.parser"
|
|
854
856
|
|
|
855
|
-
if parser == "lxml" and not LXML_AVAILABLE:
|
|
857
|
+
if parser == "lxml" and not LXML_AVAILABLE: # pragma: no cover
|
|
856
858
|
raise MissingDependencyError("lxml", "pip install html-to-markdown[lxml]")
|
|
857
859
|
|
|
858
860
|
source = BeautifulSoup(source, parser)
|
|
859
861
|
else:
|
|
860
862
|
raise EmptyHtmlError
|
|
861
863
|
|
|
862
|
-
if strip is not None and convert is not None:
|
|
864
|
+
if strip is not None and convert is not None: # pragma: no cover
|
|
863
865
|
raise ConflictingOptionsError("strip", "convert")
|
|
864
866
|
|
|
865
867
|
converters_map = create_converters_map(
|
|
866
868
|
autolinks=autolinks,
|
|
869
|
+
br_in_tables=br_in_tables,
|
|
867
870
|
bullets=bullets,
|
|
868
871
|
code_language=code_language,
|
|
869
872
|
code_language_callback=code_language_callback,
|
|
@@ -932,6 +935,7 @@ def convert_to_markdown_stream(
|
|
|
932
935
|
progress_callback: Callable[[int, int], None] | None = None,
|
|
933
936
|
parser: str | None = None,
|
|
934
937
|
autolinks: bool = True,
|
|
938
|
+
br_in_tables: bool = False,
|
|
935
939
|
bullets: str = "*+-",
|
|
936
940
|
code_language: str = "",
|
|
937
941
|
code_language_callback: Callable[[Any], str] | None = None,
|
|
@@ -973,6 +977,7 @@ def convert_to_markdown_stream(
|
|
|
973
977
|
whitespace_handler=whitespace_handler,
|
|
974
978
|
parser=parser,
|
|
975
979
|
autolinks=autolinks,
|
|
980
|
+
br_in_tables=br_in_tables,
|
|
976
981
|
bullets=bullets,
|
|
977
982
|
code_language=code_language,
|
|
978
983
|
code_language_callback=code_language_callback,
|
|
@@ -1024,7 +1029,7 @@ def convert_to_markdown_stream(
|
|
|
1024
1029
|
end_pos = search_start + newline_pos + 1
|
|
1025
1030
|
|
|
1026
1031
|
chunk = combined_result[pos:end_pos]
|
|
1027
|
-
if chunk:
|
|
1032
|
+
if chunk: # pragma: no cover
|
|
1028
1033
|
yield chunk
|
|
1029
1034
|
|
|
1030
1035
|
pos = end_pos
|
html_to_markdown/utils.py
CHANGED
|
@@ -12,9 +12,7 @@ def chomp(text: str) -> tuple[str, str, str]:
|
|
|
12
12
|
prefix = " " if text.startswith((" ", "\t")) else ""
|
|
13
13
|
suffix = " " if text.endswith((" ", "\t")) else ""
|
|
14
14
|
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
return prefix, suffix, text
|
|
15
|
+
return prefix, suffix, text.strip()
|
|
18
16
|
|
|
19
17
|
|
|
20
18
|
def escape(*, text: str, escape_misc: bool, escape_asterisks: bool, escape_underscores: bool) -> str:
|
html_to_markdown/whitespace.py
CHANGED
|
@@ -7,7 +7,7 @@ import unicodedata
|
|
|
7
7
|
from typing import TYPE_CHECKING, Literal
|
|
8
8
|
|
|
9
9
|
if TYPE_CHECKING:
|
|
10
|
-
from bs4 import NavigableString, PageElement
|
|
10
|
+
from bs4 import NavigableString, PageElement
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
WhitespaceMode = Literal["normalized", "strict"]
|
|
@@ -132,7 +132,7 @@ class WhitespaceHandler:
|
|
|
132
132
|
for char in text:
|
|
133
133
|
if unicodedata.category(char) in ("Zs", "Zl", "Zp"):
|
|
134
134
|
normalized.append(" ")
|
|
135
|
-
elif char
|
|
135
|
+
elif char == "\r": # pragma: no cover
|
|
136
136
|
normalized.append("\n")
|
|
137
137
|
else:
|
|
138
138
|
normalized.append(char)
|
|
@@ -168,16 +168,13 @@ class WhitespaceHandler:
|
|
|
168
168
|
*,
|
|
169
169
|
in_pre: bool = False,
|
|
170
170
|
) -> str:
|
|
171
|
-
if not text:
|
|
171
|
+
if not text: # pragma: no cover
|
|
172
172
|
return ""
|
|
173
173
|
|
|
174
|
-
text = self.normalize_unicode_spaces(text)
|
|
175
|
-
|
|
176
174
|
if in_pre or self.should_preserve_whitespace(element):
|
|
177
175
|
return text
|
|
178
176
|
|
|
179
|
-
|
|
180
|
-
return text
|
|
177
|
+
text = self.normalize_unicode_spaces(text)
|
|
181
178
|
return self._process_normalized(text, element)
|
|
182
179
|
|
|
183
180
|
def _process_normalized(self, text: str, element: NavigableString) -> str:
|
|
@@ -204,8 +201,8 @@ class WhitespaceHandler:
|
|
|
204
201
|
def _process_text_with_content(self, text: str, element: NavigableString) -> str:
|
|
205
202
|
original = str(element)
|
|
206
203
|
|
|
207
|
-
has_lead_space = original and original[0] in " \t\n"
|
|
208
|
-
has_trail_space = original and original[-1] in " \t\n"
|
|
204
|
+
has_lead_space = bool(original and original[0] in " \t\n")
|
|
205
|
+
has_trail_space = bool(original and original[-1] in " \t\n")
|
|
209
206
|
|
|
210
207
|
text = self._multiple_spaces.sub(" ", text.strip())
|
|
211
208
|
|
|
@@ -215,9 +212,9 @@ class WhitespaceHandler:
|
|
|
215
212
|
return self._process_special_inline_containers(text, original)
|
|
216
213
|
|
|
217
214
|
if parent and self.is_inline_element(parent):
|
|
218
|
-
return self._process_inline_element_text(text, original,
|
|
215
|
+
return self._process_inline_element_text(text, original, has_lead_space, has_trail_space)
|
|
219
216
|
|
|
220
|
-
return self._process_standalone_text(text, original, element,
|
|
217
|
+
return self._process_standalone_text(text, original, element, has_lead_space, has_trail_space)
|
|
221
218
|
|
|
222
219
|
def _process_special_inline_containers(self, text: str, original: str) -> str:
|
|
223
220
|
if original and "\n" not in original and "\t" not in original:
|
|
@@ -242,6 +239,14 @@ class WhitespaceHandler:
|
|
|
242
239
|
prev_sibling = element.previous_sibling
|
|
243
240
|
next_sibling = element.next_sibling
|
|
244
241
|
|
|
242
|
+
multiple_newlines_before_block = (
|
|
243
|
+
original
|
|
244
|
+
and original.count("\n") >= 2
|
|
245
|
+
and self.is_block_element(next_sibling)
|
|
246
|
+
and text.strip()
|
|
247
|
+
and (self.is_inline_element(prev_sibling) or prev_sibling is None)
|
|
248
|
+
)
|
|
249
|
+
|
|
245
250
|
has_leading = (
|
|
246
251
|
has_lead_space
|
|
247
252
|
and original[0] == " "
|
|
@@ -268,25 +273,7 @@ class WhitespaceHandler:
|
|
|
268
273
|
if has_trailing and not (original and original[-1] in "\n\t"):
|
|
269
274
|
text = text + " "
|
|
270
275
|
|
|
271
|
-
|
|
276
|
+
if multiple_newlines_before_block:
|
|
277
|
+
text = text + "\n\n"
|
|
272
278
|
|
|
273
|
-
|
|
274
|
-
if self.mode == "strict":
|
|
275
|
-
return ""
|
|
276
|
-
|
|
277
|
-
tag_name = tag.name.lower() if hasattr(tag, "name") else ""
|
|
278
|
-
|
|
279
|
-
double_newline_elements = {"p", "div", "blockquote", "pre", "table", "ul", "ol", "dl"}
|
|
280
|
-
|
|
281
|
-
single_newline_elements = {"li", "dt", "dd", "tr", "td", "th"}
|
|
282
|
-
|
|
283
|
-
if tag_name in double_newline_elements:
|
|
284
|
-
if self.is_block_element(next_sibling):
|
|
285
|
-
return "\n\n"
|
|
286
|
-
return "\n"
|
|
287
|
-
if tag_name in single_newline_elements:
|
|
288
|
-
return "\n"
|
|
289
|
-
if tag_name.startswith("h") and len(tag_name) == 2:
|
|
290
|
-
return "\n\n"
|
|
291
|
-
|
|
292
|
-
return ""
|
|
279
|
+
return text
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 1.10.0
|
|
3
|
+
Version: 1.12.0
|
|
4
4
|
Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
|
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -33,7 +33,7 @@ License-File: LICENSE
|
|
|
33
33
|
Requires-Dist: beautifulsoup4>=4.13.5
|
|
34
34
|
Requires-Dist: nh3>=0.3
|
|
35
35
|
Provides-Extra: lxml
|
|
36
|
-
Requires-Dist: lxml>=
|
|
36
|
+
Requires-Dist: beautifulsoup4[lxml]>=4.13.5; extra == "lxml"
|
|
37
37
|
Dynamic: license-file
|
|
38
38
|
|
|
39
39
|
# html-to-markdown
|
|
@@ -320,6 +320,88 @@ def converter(*, tag: Tag, text: str, **kwargs) -> str:
|
|
|
320
320
|
|
|
321
321
|
Custom converters take precedence over built-in converters and can be used alongside other configuration options.
|
|
322
322
|
|
|
323
|
+
### Streaming API
|
|
324
|
+
|
|
325
|
+
For processing large documents with memory constraints, use the streaming API:
|
|
326
|
+
|
|
327
|
+
```python
|
|
328
|
+
from html_to_markdown import convert_to_markdown_stream
|
|
329
|
+
|
|
330
|
+
# Process large HTML in chunks
|
|
331
|
+
with open("large_document.html", "r") as f:
|
|
332
|
+
html_content = f.read()
|
|
333
|
+
|
|
334
|
+
# Returns a generator that yields markdown chunks
|
|
335
|
+
for chunk in convert_to_markdown_stream(html_content, chunk_size=2048):
|
|
336
|
+
print(chunk, end="")
|
|
337
|
+
```
|
|
338
|
+
|
|
339
|
+
With progress tracking:
|
|
340
|
+
|
|
341
|
+
```python
|
|
342
|
+
def show_progress(processed: int, total: int):
|
|
343
|
+
if total > 0:
|
|
344
|
+
percent = (processed / total) * 100
|
|
345
|
+
print(f"\rProgress: {percent:.1f}%", end="")
|
|
346
|
+
|
|
347
|
+
# Stream with progress callback
|
|
348
|
+
markdown = convert_to_markdown(html_content, stream_processing=True, chunk_size=4096, progress_callback=show_progress)
|
|
349
|
+
```
|
|
350
|
+
|
|
351
|
+
### Preprocessing API
|
|
352
|
+
|
|
353
|
+
The library provides functions for preprocessing HTML before conversion, useful for cleaning messy or complex HTML:
|
|
354
|
+
|
|
355
|
+
```python
|
|
356
|
+
from html_to_markdown import preprocess_html, create_preprocessor
|
|
357
|
+
|
|
358
|
+
# Direct preprocessing with custom options
|
|
359
|
+
cleaned_html = preprocess_html(
|
|
360
|
+
raw_html,
|
|
361
|
+
remove_navigation=True,
|
|
362
|
+
remove_forms=True,
|
|
363
|
+
remove_scripts=True,
|
|
364
|
+
remove_styles=True,
|
|
365
|
+
remove_comments=True,
|
|
366
|
+
preserve_semantic_structure=True,
|
|
367
|
+
preserve_tables=True,
|
|
368
|
+
preserve_media=True,
|
|
369
|
+
)
|
|
370
|
+
markdown = convert_to_markdown(cleaned_html)
|
|
371
|
+
|
|
372
|
+
# Create a preprocessor configuration from presets
|
|
373
|
+
config = create_preprocessor(preset="aggressive", preserve_tables=False) # or "minimal", "standard" # Override preset settings
|
|
374
|
+
markdown = convert_to_markdown(html, **config)
|
|
375
|
+
```
|
|
376
|
+
|
|
377
|
+
### Exception Handling
|
|
378
|
+
|
|
379
|
+
The library provides specific exception classes for better error handling:
|
|
380
|
+
|
|
381
|
+
````python
|
|
382
|
+
from html_to_markdown import (
|
|
383
|
+
convert_to_markdown,
|
|
384
|
+
HtmlToMarkdownError,
|
|
385
|
+
EmptyHtmlError,
|
|
386
|
+
InvalidParserError,
|
|
387
|
+
ConflictingOptionsError,
|
|
388
|
+
MissingDependencyError
|
|
389
|
+
)
|
|
390
|
+
|
|
391
|
+
try:
|
|
392
|
+
markdown = convert_to_markdown(html, parser='lxml')
|
|
393
|
+
except MissingDependencyError:
|
|
394
|
+
# lxml not installed
|
|
395
|
+
markdown = convert_to_markdown(html, parser='html.parser')
|
|
396
|
+
except EmptyHtmlError:
|
|
397
|
+
print("No HTML content to convert")
|
|
398
|
+
except InvalidParserError as e:
|
|
399
|
+
print(f"Parser error: {e}")
|
|
400
|
+
except ConflictingOptionsError as e:
|
|
401
|
+
print(f"Conflicting options: {e}")
|
|
402
|
+
except HtmlToMarkdownError as e:
|
|
403
|
+
print(f"Conversion error: {e}")
|
|
404
|
+
|
|
323
405
|
## CLI Usage
|
|
324
406
|
|
|
325
407
|
Convert HTML files directly from the command line with full access to all API options:
|
|
@@ -340,7 +422,7 @@ html_to_markdown \
|
|
|
340
422
|
--preprocess-html \
|
|
341
423
|
--preprocessing-preset aggressive \
|
|
342
424
|
input.html > output.md
|
|
343
|
-
|
|
425
|
+
````
|
|
344
426
|
|
|
345
427
|
### Key CLI Options
|
|
346
428
|
|
|
@@ -353,6 +435,20 @@ html_to_markdown \
|
|
|
353
435
|
--whitespace-mode {normalized,strict} # Whitespace handling (default: normalized)
|
|
354
436
|
--heading-style {atx,atx_closed,underlined} # Header style
|
|
355
437
|
--no-extract-metadata # Disable metadata extraction
|
|
438
|
+
--br-in-tables # Use <br> tags for line breaks in table cells
|
|
439
|
+
--source-encoding ENCODING # Override auto-detected encoding (rarely needed)
|
|
440
|
+
```
|
|
441
|
+
|
|
442
|
+
**File Encoding:**
|
|
443
|
+
|
|
444
|
+
The CLI automatically detects file encoding in most cases. Use `--source-encoding` only when automatic detection fails (typically on some Windows systems or with unusual encodings):
|
|
445
|
+
|
|
446
|
+
```shell
|
|
447
|
+
# Override auto-detection for Latin-1 encoded file
|
|
448
|
+
html_to_markdown --source-encoding latin-1 input.html > output.md
|
|
449
|
+
|
|
450
|
+
# Force UTF-16 encoding when auto-detection fails
|
|
451
|
+
html_to_markdown --source-encoding utf-16 input.html > output.md
|
|
356
452
|
```
|
|
357
453
|
|
|
358
454
|
**All Available Options:**
|
|
@@ -393,6 +489,7 @@ The `markdownify` function is an alias for `convert_to_markdown` and provides id
|
|
|
393
489
|
- `newline_style` (str, default: `'spaces'`): Style for handling newlines (`'spaces'` or `'backslash'`)
|
|
394
490
|
- `sub_symbol` (str, default: `''`): Custom symbol for subscript text
|
|
395
491
|
- `sup_symbol` (str, default: `''`): Custom symbol for superscript text
|
|
492
|
+
- `br_in_tables` (bool, default: `False`): Use `<br>` tags for line breaks in table cells instead of spaces
|
|
396
493
|
|
|
397
494
|
### Parser Options
|
|
398
495
|
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
html_to_markdown/__init__.py,sha256=TzZzhZDJHeXW_3B9zceYehz2zlttqdLsDr5un8stZLM,653
|
|
2
|
+
html_to_markdown/__main__.py,sha256=E9d62nVceR_5TUWgVu5L5CnSZxKcnT_7a6ScWZUGE-s,292
|
|
3
|
+
html_to_markdown/cli.py,sha256=qB8-1jqJPW-YrOmlyOdJnLM6DpKSUIA3iyn1SJaJgKg,9418
|
|
4
|
+
html_to_markdown/constants.py,sha256=CKFVHjUZKgi8-lgU6AHPic7X5ChlTkbZt4Jv6VaVjjs,665
|
|
5
|
+
html_to_markdown/converters.py,sha256=4dikabmNVu8g7jnSpk_i_6CAKy7OehjcL0c8lmIJRSk,36414
|
|
6
|
+
html_to_markdown/exceptions.py,sha256=ytUOIL0D8r0Jd59RzUPqzmk73i-Mg63zDQYo6S6DBg4,1389
|
|
7
|
+
html_to_markdown/preprocessor.py,sha256=otnTOhoivJkxaip1Lb9xNMl8q-x9aGFXSYkSrxsTW8g,9591
|
|
8
|
+
html_to_markdown/processing.py,sha256=RQbqkI3w_rm64uOvmO6-CrqCJXKNHtfKu2G6f59JSF0,34596
|
|
9
|
+
html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
|
+
html_to_markdown/utils.py,sha256=s3A4ET_XyKC-WxzJtH4W0S7cIBGF5fTYIf4JJrqTX8Q,1069
|
|
11
|
+
html_to_markdown/whitespace.py,sha256=a7M_u9JXh6cfjs4rz25hABIKKy3ax11ZXJhEID4YSV4,7397
|
|
12
|
+
html_to_markdown-1.12.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
|
|
13
|
+
html_to_markdown-1.12.0.dist-info/METADATA,sha256=y8bGQgaCogxjM7V3gldeZi0IIaiCC-H7NiPqQMwMgmY,20867
|
|
14
|
+
html_to_markdown-1.12.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
15
|
+
html_to_markdown-1.12.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
|
|
16
|
+
html_to_markdown-1.12.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
|
|
17
|
+
html_to_markdown-1.12.0.dist-info/RECORD,,
|
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
html_to_markdown/__init__.py,sha256=TzZzhZDJHeXW_3B9zceYehz2zlttqdLsDr5un8stZLM,653
|
|
2
|
-
html_to_markdown/__main__.py,sha256=E9d62nVceR_5TUWgVu5L5CnSZxKcnT_7a6ScWZUGE-s,292
|
|
3
|
-
html_to_markdown/cli.py,sha256=ilnrJN2XMhPDQ4UkkG4cjLXTvglu_ZJj-bBsohVF3fw,8541
|
|
4
|
-
html_to_markdown/constants.py,sha256=CKFVHjUZKgi8-lgU6AHPic7X5ChlTkbZt4Jv6VaVjjs,665
|
|
5
|
-
html_to_markdown/converters.py,sha256=ewdKUwkQXuwgzwCBhxZ1AJufX90jR_aGLr02GkdB2So,32443
|
|
6
|
-
html_to_markdown/exceptions.py,sha256=YjfwVCWE_oZakr9iy0E-_aPSYHNaocJZgWeQ9Enty7Q,1212
|
|
7
|
-
html_to_markdown/preprocessor.py,sha256=acmuJJvx1RaXE3c0F_aWsartQE0cEpa3AOnJYGnPzqw,9708
|
|
8
|
-
html_to_markdown/processing.py,sha256=tqrBfXKqbN_rQbFOY4pGhDjY9fHyj_E1gOlhqE1ywK0,34214
|
|
9
|
-
html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
|
-
html_to_markdown/utils.py,sha256=4Vzk2cCjxN0LAZ1DXQCufYtxE7a6739TYgPbje-VM_E,1086
|
|
11
|
-
html_to_markdown/whitespace.py,sha256=b8Vf_AWhIvGFqka4Au0GsxsOYeYRO9XBpD4DxW99Pg0,7806
|
|
12
|
-
html_to_markdown-1.10.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
|
|
13
|
-
html_to_markdown-1.10.0.dist-info/METADATA,sha256=LlFYc0EDFdfapqLacVQ9Da12SjEWKExW-L-5j55bicM,17797
|
|
14
|
-
html_to_markdown-1.10.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
15
|
-
html_to_markdown-1.10.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
|
|
16
|
-
html_to_markdown-1.10.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
|
|
17
|
-
html_to_markdown-1.10.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|