html-to-markdown 1.11.0__py3-none-any.whl → 1.12.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- html_to_markdown/cli.py +28 -2
- html_to_markdown/converters.py +208 -130
- html_to_markdown/exceptions.py +5 -0
- html_to_markdown/preprocessor.py +96 -86
- html_to_markdown/processing.py +63 -48
- html_to_markdown/utils.py +1 -3
- html_to_markdown/whitespace.py +23 -33
- {html_to_markdown-1.11.0.dist-info → html_to_markdown-1.12.1.dist-info}/METADATA +143 -2
- html_to_markdown-1.12.1.dist-info/RECORD +17 -0
- html_to_markdown-1.11.0.dist-info/RECORD +0 -17
- {html_to_markdown-1.11.0.dist-info → html_to_markdown-1.12.1.dist-info}/WHEEL +0 -0
- {html_to_markdown-1.11.0.dist-info → html_to_markdown-1.12.1.dist-info}/entry_points.txt +0 -0
- {html_to_markdown-1.11.0.dist-info → html_to_markdown-1.12.1.dist-info}/licenses/LICENSE +0 -0
- {html_to_markdown-1.11.0.dist-info → html_to_markdown-1.12.1.dist-info}/top_level.txt +0 -0
html_to_markdown/cli.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import sys
|
|
2
2
|
from argparse import ArgumentParser, FileType
|
|
3
|
+
from pathlib import Path
|
|
3
4
|
|
|
4
5
|
from html_to_markdown.constants import (
|
|
5
6
|
ASTERISK,
|
|
@@ -13,6 +14,7 @@ from html_to_markdown.constants import (
|
|
|
13
14
|
WHITESPACE_NORMALIZED,
|
|
14
15
|
WHITESPACE_STRICT,
|
|
15
16
|
)
|
|
17
|
+
from html_to_markdown.exceptions import InvalidEncodingError
|
|
16
18
|
from html_to_markdown.processing import convert_to_markdown
|
|
17
19
|
|
|
18
20
|
|
|
@@ -131,6 +133,12 @@ def main(argv: list[str]) -> str:
|
|
|
131
133
|
help="Parent tags where images remain inline (not converted to alt-text).",
|
|
132
134
|
)
|
|
133
135
|
|
|
136
|
+
parser.add_argument(
|
|
137
|
+
"--br-in-tables",
|
|
138
|
+
action="store_true",
|
|
139
|
+
help="Use <br> tags for line breaks in table cells instead of spaces.",
|
|
140
|
+
)
|
|
141
|
+
|
|
134
142
|
parser.add_argument("-w", "--wrap", action="store_true", help="Enable text wrapping at --wrap-width characters.")
|
|
135
143
|
|
|
136
144
|
parser.add_argument(
|
|
@@ -235,10 +243,18 @@ def main(argv: list[str]) -> str:
|
|
|
235
243
|
help="Keep navigation elements when preprocessing (normally removed).",
|
|
236
244
|
)
|
|
237
245
|
|
|
246
|
+
parser.add_argument(
|
|
247
|
+
"--source-encoding",
|
|
248
|
+
type=str,
|
|
249
|
+
default=None,
|
|
250
|
+
help="Source file encoding (e.g. 'utf-8', 'latin-1'). Defaults to system default.",
|
|
251
|
+
)
|
|
252
|
+
|
|
238
253
|
args = parser.parse_args(argv)
|
|
239
254
|
|
|
240
255
|
base_args = {
|
|
241
256
|
"autolinks": args.autolinks,
|
|
257
|
+
"br_in_tables": args.br_in_tables,
|
|
242
258
|
"bullets": args.bullets,
|
|
243
259
|
"code_language": args.code_language,
|
|
244
260
|
"convert": args.convert,
|
|
@@ -278,7 +294,7 @@ def main(argv: list[str]) -> str:
|
|
|
278
294
|
if args.show_progress:
|
|
279
295
|
|
|
280
296
|
def progress_callback(processed: int, total: int) -> None:
|
|
281
|
-
if total > 0:
|
|
297
|
+
if total > 0: # pragma: no cover
|
|
282
298
|
percent = (processed / total) * 100
|
|
283
299
|
|
|
284
300
|
sys.stderr.write(f"\rProgress: {percent:.1f}% ({processed}/{total} bytes)")
|
|
@@ -286,4 +302,14 @@ def main(argv: list[str]) -> str:
|
|
|
286
302
|
|
|
287
303
|
base_args["progress_callback"] = progress_callback
|
|
288
304
|
|
|
289
|
-
|
|
305
|
+
if args.source_encoding and args.html.name != "<stdin>":
|
|
306
|
+
args.html.close()
|
|
307
|
+
try:
|
|
308
|
+
with Path(args.html.name).open(encoding=args.source_encoding) as f:
|
|
309
|
+
html_content = f.read()
|
|
310
|
+
except LookupError as e:
|
|
311
|
+
raise InvalidEncodingError(args.source_encoding) from e
|
|
312
|
+
else:
|
|
313
|
+
html_content = args.html.read()
|
|
314
|
+
|
|
315
|
+
return convert_to_markdown(html_content, **base_args)
|
html_to_markdown/converters.py
CHANGED
|
@@ -5,9 +5,11 @@ from typing import TYPE_CHECKING
|
|
|
5
5
|
if TYPE_CHECKING:
|
|
6
6
|
from collections.abc import Iterable
|
|
7
7
|
import base64
|
|
8
|
+
import re
|
|
8
9
|
from collections.abc import Callable
|
|
9
10
|
from functools import partial
|
|
10
11
|
from inspect import getfullargspec
|
|
12
|
+
from itertools import chain
|
|
11
13
|
from textwrap import fill
|
|
12
14
|
from typing import Any, Literal, TypeVar, cast
|
|
13
15
|
|
|
@@ -36,6 +38,18 @@ def _format_wrapped_block(text: str, start_marker: str, end_marker: str = "") ->
|
|
|
36
38
|
return f"{start_marker}{text.strip()}{end_marker}\n\n" if text.strip() else ""
|
|
37
39
|
|
|
38
40
|
|
|
41
|
+
def _find_list_item_ancestor(tag: Tag) -> Tag | None:
|
|
42
|
+
parent = tag.parent
|
|
43
|
+
while parent and parent.name != "li":
|
|
44
|
+
parent = parent.parent
|
|
45
|
+
return parent
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
BLOCK_ELEMENTS = frozenset({"p", "blockquote", "pre", "ul", "ol", "div", "h1", "h2", "h3", "h4", "h5", "h6"})
|
|
49
|
+
|
|
50
|
+
_LIST_ITEM_PATTERN = re.compile(r"^\s*(\*|\+|-|\d+\.)\s")
|
|
51
|
+
|
|
52
|
+
|
|
39
53
|
SupportedElements = Literal[
|
|
40
54
|
"a",
|
|
41
55
|
"abbr",
|
|
@@ -216,14 +230,15 @@ def _convert_blockquote(*, text: str, tag: Tag, convert_as_inline: bool, list_in
|
|
|
216
230
|
return quote_text
|
|
217
231
|
|
|
218
232
|
|
|
219
|
-
def _convert_br(*, convert_as_inline: bool, newline_style: str, tag: Tag) -> str:
|
|
233
|
+
def _convert_br(*, convert_as_inline: bool, newline_style: str, tag: Tag, text: str) -> str:
|
|
220
234
|
from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
|
|
221
235
|
|
|
222
236
|
if _has_ancestor(tag, ["h1", "h2", "h3", "h4", "h5", "h6"]):
|
|
223
|
-
return " "
|
|
237
|
+
return " " + text.strip()
|
|
224
238
|
|
|
225
239
|
_ = convert_as_inline
|
|
226
|
-
|
|
240
|
+
newline = "\\\n" if newline_style.lower() == BACKSLASH else "  \n"
|
|
241
|
+
return newline + text.strip() if text.strip() else newline
|
|
227
242
|
|
|
228
243
|
|
|
229
244
|
def _convert_hn(
|
|
@@ -270,52 +285,87 @@ def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: It
|
|
|
270
285
|
return f""
|
|
271
286
|
|
|
272
287
|
|
|
288
|
+
def _has_block_list_items(tag: Tag) -> bool:
|
|
289
|
+
return any(
|
|
290
|
+
any(child.name in BLOCK_ELEMENTS for child in li.children if hasattr(child, "name"))
|
|
291
|
+
for li in tag.find_all("li", recursive=False)
|
|
292
|
+
)
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
def _handle_nested_list_indentation(text: str, list_indent_str: str, parent: Tag) -> str:
|
|
296
|
+
prev_p = None
|
|
297
|
+
for child in parent.children:
|
|
298
|
+
if hasattr(child, "name"):
|
|
299
|
+
if child.name == "p":
|
|
300
|
+
prev_p = child
|
|
301
|
+
break
|
|
302
|
+
|
|
303
|
+
if prev_p:
|
|
304
|
+
lines = text.strip().split("\n")
|
|
305
|
+
indented_lines = [f"{list_indent_str}{line}" if line.strip() else "" for line in lines]
|
|
306
|
+
return "\n" + "\n".join(indented_lines) + "\n"
|
|
307
|
+
return "\n" + indent(text=text, level=1, indent_str=list_indent_str).rstrip()
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
def _handle_direct_nested_list_indentation(text: str, list_indent_str: str) -> str:
|
|
311
|
+
lines = text.strip().split("\n")
|
|
312
|
+
indented_lines = [f"{list_indent_str}{line}" if line.strip() else "" for line in lines]
|
|
313
|
+
result = "\n".join(indented_lines)
|
|
314
|
+
return result + "\n" if not result.endswith("\n") else result
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def _add_list_item_spacing(text: str) -> str:
|
|
318
|
+
lines = text.split("\n")
|
|
319
|
+
items_with_blocks = set()
|
|
320
|
+
|
|
321
|
+
i = 0
|
|
322
|
+
while i < len(lines):
|
|
323
|
+
line = lines[i]
|
|
324
|
+
if line.strip() and _LIST_ITEM_PATTERN.match(line.lstrip()):
|
|
325
|
+
j = i + 1
|
|
326
|
+
has_continuation = False
|
|
327
|
+
while j < len(lines):
|
|
328
|
+
next_line = lines[j]
|
|
329
|
+
if next_line.strip() and _LIST_ITEM_PATTERN.match(next_line.lstrip()):
|
|
330
|
+
break
|
|
331
|
+
if next_line.strip() and next_line.startswith((" ", " ", "\t")):
|
|
332
|
+
has_continuation = True
|
|
333
|
+
j += 1
|
|
334
|
+
|
|
335
|
+
if has_continuation and j < len(lines):
|
|
336
|
+
items_with_blocks.add(j - 1)
|
|
337
|
+
|
|
338
|
+
i += 1
|
|
339
|
+
|
|
340
|
+
if items_with_blocks:
|
|
341
|
+
processed_lines = list(
|
|
342
|
+
chain.from_iterable([line, ""] if i in items_with_blocks else [line] for i, line in enumerate(lines))
|
|
343
|
+
)
|
|
344
|
+
return "\n".join(processed_lines)
|
|
345
|
+
|
|
346
|
+
return text
|
|
347
|
+
|
|
348
|
+
|
|
273
349
|
def _convert_list(*, tag: Tag, text: str, list_indent_str: str) -> str:
|
|
274
350
|
from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
|
|
275
351
|
|
|
276
|
-
before_paragraph = False
|
|
277
|
-
if tag.next_sibling and getattr(tag.next_sibling, "name", None) not in {"ul", "ol"}:
|
|
278
|
-
before_paragraph = True
|
|
352
|
+
before_paragraph = tag.next_sibling and getattr(tag.next_sibling, "name", None) not in {"ul", "ol"}
|
|
279
353
|
|
|
280
|
-
|
|
281
|
-
parent = tag.parent
|
|
282
|
-
while parent and parent.name != "li":
|
|
283
|
-
parent = parent.parent
|
|
354
|
+
has_block_items = _has_block_list_items(tag)
|
|
284
355
|
|
|
356
|
+
if _has_ancestor(tag, "li"):
|
|
357
|
+
parent = _find_list_item_ancestor(tag)
|
|
285
358
|
if parent:
|
|
286
|
-
|
|
287
|
-
for child in parent.children:
|
|
288
|
-
if hasattr(child, "name"):
|
|
289
|
-
if child == tag:
|
|
290
|
-
break
|
|
291
|
-
if child.name == "p":
|
|
292
|
-
prev_p = child
|
|
293
|
-
|
|
294
|
-
if prev_p:
|
|
295
|
-
lines = text.strip().split("\n")
|
|
296
|
-
indented_lines = []
|
|
297
|
-
for line in lines:
|
|
298
|
-
if line.strip():
|
|
299
|
-
indented_lines.append(f"{list_indent_str}{line}")
|
|
300
|
-
else:
|
|
301
|
-
indented_lines.append("")
|
|
302
|
-
return "\n" + "\n".join(indented_lines) + "\n"
|
|
303
|
-
return "\n" + indent(text=text, level=1, indent_str=list_indent_str).rstrip()
|
|
359
|
+
return _handle_nested_list_indentation(text, list_indent_str, parent)
|
|
304
360
|
|
|
305
361
|
if tag.parent and tag.parent.name in {"ul", "ol"}:
|
|
306
|
-
|
|
307
|
-
indented_lines = []
|
|
308
|
-
for line in lines:
|
|
309
|
-
if line.strip():
|
|
310
|
-
indented_lines.append(f"{list_indent_str}{line}")
|
|
311
|
-
else:
|
|
312
|
-
indented_lines.append("")
|
|
313
|
-
result = "\n".join(indented_lines)
|
|
314
|
-
if not result.endswith("\n"):
|
|
315
|
-
result += "\n"
|
|
316
|
-
return result
|
|
362
|
+
return _handle_direct_nested_list_indentation(text, list_indent_str)
|
|
317
363
|
|
|
318
|
-
|
|
364
|
+
if has_block_items:
|
|
365
|
+
text = _add_list_item_spacing(text)
|
|
366
|
+
|
|
367
|
+
trailing_newlines = "\n\n" if has_block_items else ("\n" if before_paragraph else "")
|
|
368
|
+
return text + trailing_newlines
|
|
319
369
|
|
|
320
370
|
|
|
321
371
|
def _convert_li(*, tag: Tag, text: str, bullets: str, list_indent_str: str) -> str:
|
|
@@ -324,10 +374,8 @@ def _convert_li(*, tag: Tag, text: str, bullets: str, list_indent_str: str) -> s
|
|
|
324
374
|
checked = checkbox.get("checked") is not None
|
|
325
375
|
checkbox_symbol = "[x]" if checked else "[ ]"
|
|
326
376
|
|
|
327
|
-
checkbox_text = text
|
|
328
|
-
|
|
329
|
-
checkbox_text = text.replace(str(checkbox.string), "").strip()
|
|
330
|
-
return f"- {checkbox_symbol} {checkbox_text.strip()}\n"
|
|
377
|
+
checkbox_text = text.strip()
|
|
378
|
+
return f"- {checkbox_symbol} {checkbox_text}\n"
|
|
331
379
|
|
|
332
380
|
parent = tag.parent
|
|
333
381
|
if parent is not None and parent.name == "ol":
|
|
@@ -349,11 +397,7 @@ def _convert_li(*, tag: Tag, text: str, bullets: str, list_indent_str: str) -> s
|
|
|
349
397
|
|
|
350
398
|
bullet = bullets[depth % len(bullets)]
|
|
351
399
|
|
|
352
|
-
has_block_children = any(
|
|
353
|
-
child.name in {"p", "blockquote", "pre", "ul", "ol", "div", "h1", "h2", "h3", "h4", "h5", "h6"}
|
|
354
|
-
for child in tag.children
|
|
355
|
-
if hasattr(child, "name")
|
|
356
|
-
)
|
|
400
|
+
has_block_children = "\n\n" in text
|
|
357
401
|
|
|
358
402
|
if has_block_children:
|
|
359
403
|
paragraphs = text.strip().split("\n\n")
|
|
@@ -390,20 +434,13 @@ def _convert_p(
|
|
|
390
434
|
from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
|
|
391
435
|
|
|
392
436
|
if _has_ancestor(tag, "li"):
|
|
393
|
-
parent = tag
|
|
394
|
-
while parent and parent.name != "li":
|
|
395
|
-
parent = parent.parent
|
|
437
|
+
parent = _find_list_item_ancestor(tag)
|
|
396
438
|
|
|
397
439
|
if parent:
|
|
398
440
|
p_children = [child for child in parent.children if hasattr(child, "name") and child.name == "p"]
|
|
399
441
|
|
|
400
442
|
if p_children and tag != p_children[0]:
|
|
401
|
-
indented_lines = []
|
|
402
|
-
for line in text.split("\n"):
|
|
403
|
-
if line.strip():
|
|
404
|
-
indented_lines.append(f"{list_indent_str}{line}")
|
|
405
|
-
else:
|
|
406
|
-
indented_lines.append("")
|
|
443
|
+
indented_lines = [f"{list_indent_str}{line}" if line.strip() else "" for line in text.split("\n")]
|
|
407
444
|
text = "\n".join(indented_lines)
|
|
408
445
|
|
|
409
446
|
return f"{text}\n\n" if text else ""
|
|
@@ -440,66 +477,65 @@ def _convert_pre(
|
|
|
440
477
|
return f"\n```{code_language}\n{text}\n```\n"
|
|
441
478
|
|
|
442
479
|
|
|
443
|
-
def _convert_td(*, tag: Tag, text: str) -> str:
|
|
480
|
+
def _process_table_cell_content(*, tag: Tag, text: str, br_in_tables: bool) -> str:
|
|
481
|
+
if br_in_tables:
|
|
482
|
+
block_children = [child for child in tag.children if hasattr(child, "name") and child.name in BLOCK_ELEMENTS]
|
|
483
|
+
|
|
484
|
+
if len(block_children) > 1:
|
|
485
|
+
child_contents = []
|
|
486
|
+
for child in block_children:
|
|
487
|
+
child_text = child.get_text().strip()
|
|
488
|
+
if child_text:
|
|
489
|
+
child_contents.append(child_text)
|
|
490
|
+
return "<br>".join(child_contents)
|
|
491
|
+
return text.strip().replace("\n", "<br>")
|
|
492
|
+
return text.strip().replace("\n", " ")
|
|
493
|
+
|
|
494
|
+
|
|
495
|
+
def _convert_td(*, tag: Tag, text: str, br_in_tables: bool = False) -> str:
|
|
444
496
|
colspan = _get_colspan(tag)
|
|
445
|
-
|
|
497
|
+
processed_text = _process_table_cell_content(tag=tag, text=text, br_in_tables=br_in_tables)
|
|
498
|
+
return " " + processed_text + " |" * colspan
|
|
446
499
|
|
|
447
500
|
|
|
448
|
-
def _convert_th(*, tag: Tag, text: str) -> str:
|
|
501
|
+
def _convert_th(*, tag: Tag, text: str, br_in_tables: bool = False) -> str:
|
|
449
502
|
colspan = _get_colspan(tag)
|
|
450
|
-
|
|
503
|
+
processed_text = _process_table_cell_content(tag=tag, text=text, br_in_tables=br_in_tables)
|
|
504
|
+
return " " + processed_text + " |" * colspan
|
|
451
505
|
|
|
452
506
|
|
|
453
|
-
def _convert_tr(*, tag: Tag, text: str) -> str:
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
tag_grand_parent = tag.parent.parent if tag.parent else None
|
|
507
|
+
def _get_rowspan_positions(prev_cells: list[Tag]) -> tuple[list[int], int]:
|
|
508
|
+
rowspan_positions = []
|
|
509
|
+
col_pos = 0
|
|
457
510
|
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
if rowspan > 1:
|
|
473
|
-
rowspan_positions.append(col_pos)
|
|
474
|
-
|
|
475
|
-
colspan = 1
|
|
476
|
-
if (
|
|
477
|
-
"colspan" in prev_cell.attrs
|
|
478
|
-
and isinstance(prev_cell["colspan"], str)
|
|
479
|
-
and prev_cell["colspan"].isdigit()
|
|
480
|
-
):
|
|
481
|
-
colspan = int(prev_cell["colspan"])
|
|
482
|
-
col_pos += colspan
|
|
511
|
+
for prev_cell in prev_cells:
|
|
512
|
+
rowspan = 1
|
|
513
|
+
if "rowspan" in prev_cell.attrs and isinstance(prev_cell["rowspan"], str) and prev_cell["rowspan"].isdigit():
|
|
514
|
+
rowspan = int(prev_cell["rowspan"])
|
|
515
|
+
|
|
516
|
+
if rowspan > 1:
|
|
517
|
+
rowspan_positions.append(col_pos)
|
|
518
|
+
|
|
519
|
+
colspan = 1
|
|
520
|
+
if "colspan" in prev_cell.attrs and isinstance(prev_cell["colspan"], str) and prev_cell["colspan"].isdigit():
|
|
521
|
+
colspan = int(prev_cell["colspan"])
|
|
522
|
+
col_pos += colspan
|
|
523
|
+
|
|
524
|
+
return rowspan_positions, col_pos
|
|
483
525
|
|
|
484
|
-
if rowspan_positions:
|
|
485
|
-
converted_cells: list[str] = []
|
|
486
|
-
if text.strip():
|
|
487
|
-
parts = text.split("|")
|
|
488
|
-
converted_cells.extend(part.rstrip() + " |" for part in parts[:-1] if part)
|
|
489
526
|
|
|
490
|
-
|
|
491
|
-
|
|
527
|
+
def _handle_rowspan_text(text: str, rowspan_positions: list[int], col_pos: int) -> str:
|
|
528
|
+
converted_cells = [part.rstrip() + " |" for part in text.split("|")[:-1] if part] if text.strip() else []
|
|
529
|
+
rowspan_set = set(rowspan_positions)
|
|
492
530
|
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
new_cells.append(" |")
|
|
496
|
-
elif cell_index < len(converted_cells):
|
|
497
|
-
new_cells.append(converted_cells[cell_index])
|
|
498
|
-
cell_index += 1
|
|
531
|
+
cell_iter = iter(converted_cells)
|
|
532
|
+
new_cells = [" |" if pos in rowspan_set else next(cell_iter, "") for pos in range(col_pos)]
|
|
499
533
|
|
|
500
|
-
|
|
534
|
+
return "".join(new_cells)
|
|
501
535
|
|
|
502
|
-
|
|
536
|
+
|
|
537
|
+
def _is_header_row(tag: Tag, cells: list[Tag], parent_name: str, tag_grand_parent: Tag | None) -> bool:
|
|
538
|
+
return (
|
|
503
539
|
all(hasattr(cell, "name") and cell.name == "th" for cell in cells)
|
|
504
540
|
or (not tag.previous_sibling and parent_name != "tbody")
|
|
505
541
|
or (
|
|
@@ -508,25 +544,47 @@ def _convert_tr(*, tag: Tag, text: str) -> str:
|
|
|
508
544
|
and (not tag_grand_parent or len(tag_grand_parent.find_all(["thead"])) < 1)
|
|
509
545
|
)
|
|
510
546
|
)
|
|
547
|
+
|
|
548
|
+
|
|
549
|
+
def _calculate_total_colspan(cells: list[Tag]) -> int:
|
|
550
|
+
full_colspan = 0
|
|
551
|
+
for cell in cells:
|
|
552
|
+
if hasattr(cell, "attrs") and "colspan" in cell.attrs:
|
|
553
|
+
colspan_value = cell.attrs["colspan"]
|
|
554
|
+
if isinstance(colspan_value, str) and colspan_value.isdigit():
|
|
555
|
+
full_colspan += int(colspan_value)
|
|
556
|
+
else:
|
|
557
|
+
full_colspan += 1
|
|
558
|
+
else:
|
|
559
|
+
full_colspan += 1
|
|
560
|
+
return full_colspan
|
|
561
|
+
|
|
562
|
+
|
|
563
|
+
def _convert_tr(*, tag: Tag, text: str) -> str:
|
|
564
|
+
cells = tag.find_all(["td", "th"])
|
|
565
|
+
parent_name = tag.parent.name if tag.parent and hasattr(tag.parent, "name") else ""
|
|
566
|
+
tag_grand_parent = tag.parent.parent if tag.parent else None
|
|
567
|
+
|
|
568
|
+
if tag.previous_sibling and hasattr(tag.previous_sibling, "name") and tag.previous_sibling.name == "tr":
|
|
569
|
+
prev_cells = cast("Tag", tag.previous_sibling).find_all(["td", "th"])
|
|
570
|
+
rowspan_positions, col_pos = _get_rowspan_positions(prev_cells)
|
|
571
|
+
|
|
572
|
+
if rowspan_positions:
|
|
573
|
+
text = _handle_rowspan_text(text, rowspan_positions, col_pos)
|
|
574
|
+
|
|
575
|
+
is_headrow = _is_header_row(tag, cells, parent_name, tag_grand_parent)
|
|
511
576
|
overline = ""
|
|
512
577
|
underline = ""
|
|
578
|
+
|
|
513
579
|
if is_headrow and not tag.previous_sibling:
|
|
514
|
-
full_colspan = 0
|
|
515
|
-
for cell in cells:
|
|
516
|
-
if hasattr(cell, "attrs") and "colspan" in cell.attrs:
|
|
517
|
-
colspan_value = cell.attrs["colspan"]
|
|
518
|
-
if isinstance(colspan_value, str) and colspan_value.isdigit():
|
|
519
|
-
full_colspan += int(colspan_value)
|
|
520
|
-
else:
|
|
521
|
-
full_colspan += 1
|
|
522
|
-
else:
|
|
523
|
-
full_colspan += 1
|
|
580
|
+
full_colspan = _calculate_total_colspan(cells)
|
|
524
581
|
underline += "| " + " | ".join(["---"] * full_colspan) + " |" + "\n"
|
|
525
582
|
elif not tag.previous_sibling and (
|
|
526
583
|
parent_name == "table" or (parent_name == "tbody" and not cast("Tag", tag.parent).previous_sibling)
|
|
527
584
|
):
|
|
528
|
-
overline += "| " + " | ".join([""] * len(cells)) + " |" + "\n"
|
|
529
|
-
overline += "| " + " | ".join(["---"] * len(cells)) + " |" + "\n"
|
|
585
|
+
overline += "| " + " | ".join([""] * len(cells)) + " |" + "\n" # pragma: no cover
|
|
586
|
+
overline += "| " + " | ".join(["---"] * len(cells)) + " |" + "\n" # pragma: no cover
|
|
587
|
+
|
|
530
588
|
return overline + "|" + text + "\n" + underline
|
|
531
589
|
|
|
532
590
|
|
|
@@ -578,10 +636,23 @@ def _convert_semantic_block(*, text: str, convert_as_inline: bool) -> str:
|
|
|
578
636
|
return f"{text}\n\n" if text.strip() else ""
|
|
579
637
|
|
|
580
638
|
|
|
581
|
-
def _convert_div(*, text: str, convert_as_inline: bool) -> str:
|
|
639
|
+
def _convert_div(*, text: str, convert_as_inline: bool, tag: Tag, list_indent_str: str) -> str:
|
|
582
640
|
if convert_as_inline:
|
|
583
641
|
return text
|
|
584
642
|
|
|
643
|
+
from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
|
|
644
|
+
|
|
645
|
+
if _has_ancestor(tag, "li"):
|
|
646
|
+
parent = _find_list_item_ancestor(tag)
|
|
647
|
+
if parent:
|
|
648
|
+
div_children = [child for child in parent.children if hasattr(child, "name") and child.name == "div"]
|
|
649
|
+
|
|
650
|
+
if div_children and tag != div_children[0]:
|
|
651
|
+
indented_lines = [f"{list_indent_str}{line}" if line.strip() else "" for line in text.split("\n")]
|
|
652
|
+
indented_text = "\n".join(indented_lines)
|
|
653
|
+
|
|
654
|
+
return f"{indented_text}\n\n" if indented_text.strip() else ""
|
|
655
|
+
|
|
585
656
|
return _format_block_element(text)
|
|
586
657
|
|
|
587
658
|
|
|
@@ -603,7 +674,7 @@ def _convert_dl(*, text: str, convert_as_inline: bool) -> str:
|
|
|
603
674
|
if convert_as_inline:
|
|
604
675
|
return text
|
|
605
676
|
|
|
606
|
-
return
|
|
677
|
+
return _format_block_element(text)
|
|
607
678
|
|
|
608
679
|
|
|
609
680
|
def _convert_dt(*, text: str, convert_as_inline: bool) -> str:
|
|
@@ -616,14 +687,21 @@ def _convert_dt(*, text: str, convert_as_inline: bool) -> str:
|
|
|
616
687
|
return f"{text.strip()}\n"
|
|
617
688
|
|
|
618
689
|
|
|
619
|
-
def _convert_dd(*, text: str, convert_as_inline: bool) -> str:
|
|
690
|
+
def _convert_dd(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
620
691
|
if convert_as_inline:
|
|
621
692
|
return text
|
|
622
693
|
|
|
623
|
-
|
|
624
|
-
|
|
694
|
+
has_dt_sibling = False
|
|
695
|
+
current = tag.previous_sibling
|
|
696
|
+
while current:
|
|
697
|
+
if hasattr(current, "name") and current.name and current.name == "dt":
|
|
698
|
+
has_dt_sibling = True
|
|
699
|
+
break
|
|
700
|
+
current = current.previous_sibling
|
|
625
701
|
|
|
626
|
-
|
|
702
|
+
if has_dt_sibling:
|
|
703
|
+
return f": {text.strip()}\n\n" if text.strip() else ": \n\n"
|
|
704
|
+
return f"{text.strip()}\n\n" if text.strip() else ""
|
|
627
705
|
|
|
628
706
|
|
|
629
707
|
def _convert_cite(*, text: str, convert_as_inline: bool) -> str:
|
|
@@ -648,9 +726,7 @@ def _convert_q(*, text: str, convert_as_inline: bool) -> str:
|
|
|
648
726
|
|
|
649
727
|
|
|
650
728
|
def _convert_media_element(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
651
|
-
src = tag.get("src", "")
|
|
652
|
-
|
|
653
|
-
if not src and (source_tag := tag.find("source")) and isinstance(source_tag, Tag):
|
|
729
|
+
if not (src := tag.get("src", "")) and (source_tag := tag.find("source")) and isinstance(source_tag, Tag):
|
|
654
730
|
src = source_tag.get("src", "")
|
|
655
731
|
|
|
656
732
|
if src and isinstance(src, str) and src.strip():
|
|
@@ -670,9 +746,8 @@ def _convert_media_element(*, tag: Tag, text: str, convert_as_inline: bool) -> s
|
|
|
670
746
|
|
|
671
747
|
def _convert_iframe(*, tag: Tag, text: str, convert_as_inline: bool) -> str:
|
|
672
748
|
_ = text
|
|
673
|
-
src = tag.get("src", "")
|
|
674
749
|
|
|
675
|
-
if src and isinstance(src, str) and src.strip():
|
|
750
|
+
if (src := tag.get("src", "")) and isinstance(src, str) and src.strip():
|
|
676
751
|
link = f"[{src}]({src})"
|
|
677
752
|
if convert_as_inline:
|
|
678
753
|
return link
|
|
@@ -939,7 +1014,7 @@ def _convert_figure(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
939
1014
|
content = text.strip()
|
|
940
1015
|
if content and not content.endswith("\n\n"):
|
|
941
1016
|
if content.endswith("\n"):
|
|
942
|
-
content += "\n"
|
|
1017
|
+
content += "\n" # pragma: no cover
|
|
943
1018
|
else:
|
|
944
1019
|
content += "\n\n"
|
|
945
1020
|
return content
|
|
@@ -997,6 +1072,7 @@ def _convert_math(*, text: str, convert_as_inline: bool, tag: Tag) -> str:
|
|
|
997
1072
|
|
|
998
1073
|
def create_converters_map(
|
|
999
1074
|
autolinks: bool,
|
|
1075
|
+
br_in_tables: bool,
|
|
1000
1076
|
bullets: str,
|
|
1001
1077
|
code_language: str,
|
|
1002
1078
|
code_language_callback: Callable[[Tag], str] | None,
|
|
@@ -1029,6 +1105,8 @@ def create_converters_map(
|
|
|
1029
1105
|
kwargs["convert_as_inline"] = convert_as_inline
|
|
1030
1106
|
if "list_indent_str" in spec.kwonlyargs:
|
|
1031
1107
|
kwargs["list_indent_str"] = list_indent_str
|
|
1108
|
+
if "br_in_tables" in spec.kwonlyargs:
|
|
1109
|
+
kwargs["br_in_tables"] = br_in_tables
|
|
1032
1110
|
return func(**kwargs)
|
|
1033
1111
|
return func(text)
|
|
1034
1112
|
|
html_to_markdown/exceptions.py
CHANGED
|
@@ -37,3 +37,8 @@ class ConflictingOptionsError(HtmlToMarkdownError):
|
|
|
37
37
|
self.option2 = option2
|
|
38
38
|
|
|
39
39
|
super().__init__(f"Only one of '{option1}' and '{option2}' can be specified.")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class InvalidEncodingError(HtmlToMarkdownError):
|
|
43
|
+
def __init__(self, encoding: str) -> None:
|
|
44
|
+
super().__init__(f"The specified encoding ({encoding}) is not valid.")
|