html-to-markdown 1.12.0__py3-none-any.whl → 1.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- html_to_markdown/converters.py +8 -14
- html_to_markdown/processing.py +98 -29
- html_to_markdown/whitespace.py +17 -3
- {html_to_markdown-1.12.0.dist-info → html_to_markdown-1.13.0.dist-info}/METADATA +45 -1
- {html_to_markdown-1.12.0.dist-info → html_to_markdown-1.13.0.dist-info}/RECORD +9 -9
- {html_to_markdown-1.12.0.dist-info → html_to_markdown-1.13.0.dist-info}/WHEEL +0 -0
- {html_to_markdown-1.12.0.dist-info → html_to_markdown-1.13.0.dist-info}/entry_points.txt +0 -0
- {html_to_markdown-1.12.0.dist-info → html_to_markdown-1.13.0.dist-info}/licenses/LICENSE +0 -0
- {html_to_markdown-1.12.0.dist-info → html_to_markdown-1.13.0.dist-info}/top_level.txt +0 -0
html_to_markdown/converters.py
CHANGED
|
@@ -39,7 +39,6 @@ def _format_wrapped_block(text: str, start_marker: str, end_marker: str = "") ->
|
|
|
39
39
|
|
|
40
40
|
|
|
41
41
|
def _find_list_item_ancestor(tag: Tag) -> Tag | None:
|
|
42
|
-
"""Find the nearest list item ancestor of a tag."""
|
|
43
42
|
parent = tag.parent
|
|
44
43
|
while parent and parent.name != "li":
|
|
45
44
|
parent = parent.parent
|
|
@@ -231,14 +230,15 @@ def _convert_blockquote(*, text: str, tag: Tag, convert_as_inline: bool, list_in
|
|
|
231
230
|
return quote_text
|
|
232
231
|
|
|
233
232
|
|
|
234
|
-
def _convert_br(*, convert_as_inline: bool, newline_style: str, tag: Tag) -> str:
|
|
233
|
+
def _convert_br(*, convert_as_inline: bool, newline_style: str, tag: Tag, text: str) -> str:
|
|
235
234
|
from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
|
|
236
235
|
|
|
237
236
|
if _has_ancestor(tag, ["h1", "h2", "h3", "h4", "h5", "h6"]):
|
|
238
|
-
return " "
|
|
237
|
+
return " " + text.strip()
|
|
239
238
|
|
|
240
239
|
_ = convert_as_inline
|
|
241
|
-
|
|
240
|
+
newline = "\\\n" if newline_style.lower() == BACKSLASH else " \n"
|
|
241
|
+
return newline + text.strip() if text.strip() else newline
|
|
242
242
|
|
|
243
243
|
|
|
244
244
|
def _convert_hn(
|
|
@@ -286,7 +286,6 @@ def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: It
|
|
|
286
286
|
|
|
287
287
|
|
|
288
288
|
def _has_block_list_items(tag: Tag) -> bool:
|
|
289
|
-
"""Check if any list items contain block elements."""
|
|
290
289
|
return any(
|
|
291
290
|
any(child.name in BLOCK_ELEMENTS for child in li.children if hasattr(child, "name"))
|
|
292
291
|
for li in tag.find_all("li", recursive=False)
|
|
@@ -294,7 +293,6 @@ def _has_block_list_items(tag: Tag) -> bool:
|
|
|
294
293
|
|
|
295
294
|
|
|
296
295
|
def _handle_nested_list_indentation(text: str, list_indent_str: str, parent: Tag) -> str:
|
|
297
|
-
"""Handle indentation for lists nested within list items."""
|
|
298
296
|
prev_p = None
|
|
299
297
|
for child in parent.children:
|
|
300
298
|
if hasattr(child, "name"):
|
|
@@ -310,7 +308,6 @@ def _handle_nested_list_indentation(text: str, list_indent_str: str, parent: Tag
|
|
|
310
308
|
|
|
311
309
|
|
|
312
310
|
def _handle_direct_nested_list_indentation(text: str, list_indent_str: str) -> str:
|
|
313
|
-
"""Handle indentation for lists that are direct children of other lists."""
|
|
314
311
|
lines = text.strip().split("\n")
|
|
315
312
|
indented_lines = [f"{list_indent_str}{line}" if line.strip() else "" for line in lines]
|
|
316
313
|
result = "\n".join(indented_lines)
|
|
@@ -318,7 +315,6 @@ def _handle_direct_nested_list_indentation(text: str, list_indent_str: str) -> s
|
|
|
318
315
|
|
|
319
316
|
|
|
320
317
|
def _add_list_item_spacing(text: str) -> str:
|
|
321
|
-
"""Add extra spacing between list items that contain block content."""
|
|
322
318
|
lines = text.split("\n")
|
|
323
319
|
items_with_blocks = set()
|
|
324
320
|
|
|
@@ -418,7 +414,10 @@ def _convert_li(*, tag: Tag, text: str, bullets: str, list_indent_str: str) -> s
|
|
|
418
414
|
|
|
419
415
|
return "".join(result_parts)
|
|
420
416
|
|
|
421
|
-
|
|
417
|
+
# Ensure consistent whitespace handling for list items, especially with strip_newlines=True
|
|
418
|
+
# Strip any leading whitespace that may have been inherited from parent containers
|
|
419
|
+
clean_text = (text or "").strip()
|
|
420
|
+
return f"{bullet} {clean_text}\n"
|
|
422
421
|
|
|
423
422
|
|
|
424
423
|
def _convert_p(
|
|
@@ -482,7 +481,6 @@ def _convert_pre(
|
|
|
482
481
|
|
|
483
482
|
|
|
484
483
|
def _process_table_cell_content(*, tag: Tag, text: str, br_in_tables: bool) -> str:
|
|
485
|
-
"""Process table cell content, optionally using <br> tags for multi-line content."""
|
|
486
484
|
if br_in_tables:
|
|
487
485
|
block_children = [child for child in tag.children if hasattr(child, "name") and child.name in BLOCK_ELEMENTS]
|
|
488
486
|
|
|
@@ -510,7 +508,6 @@ def _convert_th(*, tag: Tag, text: str, br_in_tables: bool = False) -> str:
|
|
|
510
508
|
|
|
511
509
|
|
|
512
510
|
def _get_rowspan_positions(prev_cells: list[Tag]) -> tuple[list[int], int]:
|
|
513
|
-
"""Get positions of cells with rowspan > 1 from previous row."""
|
|
514
511
|
rowspan_positions = []
|
|
515
512
|
col_pos = 0
|
|
516
513
|
|
|
@@ -531,7 +528,6 @@ def _get_rowspan_positions(prev_cells: list[Tag]) -> tuple[list[int], int]:
|
|
|
531
528
|
|
|
532
529
|
|
|
533
530
|
def _handle_rowspan_text(text: str, rowspan_positions: list[int], col_pos: int) -> str:
|
|
534
|
-
"""Handle text adjustment for rows with rowspan cells."""
|
|
535
531
|
converted_cells = [part.rstrip() + " |" for part in text.split("|")[:-1] if part] if text.strip() else []
|
|
536
532
|
rowspan_set = set(rowspan_positions)
|
|
537
533
|
|
|
@@ -542,7 +538,6 @@ def _handle_rowspan_text(text: str, rowspan_positions: list[int], col_pos: int)
|
|
|
542
538
|
|
|
543
539
|
|
|
544
540
|
def _is_header_row(tag: Tag, cells: list[Tag], parent_name: str, tag_grand_parent: Tag | None) -> bool:
|
|
545
|
-
"""Determine if this table row should be treated as a header row."""
|
|
546
541
|
return (
|
|
547
542
|
all(hasattr(cell, "name") and cell.name == "th" for cell in cells)
|
|
548
543
|
or (not tag.previous_sibling and parent_name != "tbody")
|
|
@@ -555,7 +550,6 @@ def _is_header_row(tag: Tag, cells: list[Tag], parent_name: str, tag_grand_paren
|
|
|
555
550
|
|
|
556
551
|
|
|
557
552
|
def _calculate_total_colspan(cells: list[Tag]) -> int:
|
|
558
|
-
"""Calculate total colspan for all cells in a row."""
|
|
559
553
|
full_colspan = 0
|
|
560
554
|
for cell in cells:
|
|
561
555
|
if hasattr(cell, "attrs") and "colspan" in cell.attrs:
|
html_to_markdown/processing.py
CHANGED
|
@@ -11,7 +11,7 @@ from io import StringIO
|
|
|
11
11
|
from itertools import chain
|
|
12
12
|
from typing import TYPE_CHECKING, Any, Literal, cast
|
|
13
13
|
|
|
14
|
-
from bs4 import BeautifulSoup, Comment, Doctype, Tag
|
|
14
|
+
from bs4 import BeautifulSoup, CData, Comment, Doctype, Tag
|
|
15
15
|
from bs4.element import NavigableString, PageElement
|
|
16
16
|
|
|
17
17
|
try:
|
|
@@ -179,6 +179,7 @@ def _process_tag(
|
|
|
179
179
|
strip: set[str] | None,
|
|
180
180
|
whitespace_handler: WhitespaceHandler,
|
|
181
181
|
context_before: str = "",
|
|
182
|
+
ancestor_names: set[str] | None = None,
|
|
182
183
|
) -> str:
|
|
183
184
|
should_convert_tag = _should_convert_tag(tag_name=tag.name, strip=strip, convert=convert)
|
|
184
185
|
tag_name: SupportedTag | None = (
|
|
@@ -186,6 +187,17 @@ def _process_tag(
|
|
|
186
187
|
)
|
|
187
188
|
text_parts: list[str] = []
|
|
188
189
|
|
|
190
|
+
if ancestor_names is None:
|
|
191
|
+
ancestor_names = set()
|
|
192
|
+
current: Tag | None = tag
|
|
193
|
+
while current and hasattr(current, "name"):
|
|
194
|
+
if current.name:
|
|
195
|
+
ancestor_names.add(current.name)
|
|
196
|
+
current = getattr(current, "parent", None)
|
|
197
|
+
|
|
198
|
+
if len(ancestor_names) > 10:
|
|
199
|
+
break
|
|
200
|
+
|
|
189
201
|
is_heading = html_heading_re.match(tag.name) is not None
|
|
190
202
|
is_cell = tag_name in {"td", "th"}
|
|
191
203
|
convert_children_as_inline = convert_as_inline or is_heading or is_cell
|
|
@@ -201,7 +213,7 @@ def _process_tag(
|
|
|
201
213
|
if can_extract and isinstance(el, NavigableString) and not el.strip():
|
|
202
214
|
el.extract()
|
|
203
215
|
|
|
204
|
-
children = list(filter(lambda value: not isinstance(value, (Comment, Doctype)), tag.children))
|
|
216
|
+
children = list(filter(lambda value: not isinstance(value, (Comment, Doctype, CData)), tag.children))
|
|
205
217
|
|
|
206
218
|
empty_when_no_content_tags = {"abbr", "var", "ins", "dfn", "time", "data", "cite", "q", "mark", "small", "u"}
|
|
207
219
|
|
|
@@ -227,6 +239,7 @@ def _process_tag(
|
|
|
227
239
|
escape_asterisks=escape_asterisks,
|
|
228
240
|
escape_underscores=escape_underscores,
|
|
229
241
|
whitespace_handler=whitespace_handler,
|
|
242
|
+
ancestor_names=ancestor_names,
|
|
230
243
|
)
|
|
231
244
|
)
|
|
232
245
|
elif isinstance(el, Tag):
|
|
@@ -243,6 +256,7 @@ def _process_tag(
|
|
|
243
256
|
strip=strip,
|
|
244
257
|
whitespace_handler=whitespace_handler,
|
|
245
258
|
context_before=(context_before + current_text)[-2:],
|
|
259
|
+
ancestor_names=ancestor_names,
|
|
246
260
|
)
|
|
247
261
|
)
|
|
248
262
|
|
|
@@ -282,21 +296,23 @@ def _process_text(
|
|
|
282
296
|
escape_asterisks: bool,
|
|
283
297
|
escape_underscores: bool,
|
|
284
298
|
whitespace_handler: WhitespaceHandler,
|
|
299
|
+
ancestor_names: set[str] | None = None,
|
|
285
300
|
) -> str:
|
|
286
301
|
text = str(el) or ""
|
|
287
302
|
|
|
288
303
|
parent = el.parent
|
|
289
304
|
parent_name = parent.name if parent else None
|
|
290
305
|
|
|
291
|
-
ancestor_names
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
306
|
+
if ancestor_names is None:
|
|
307
|
+
ancestor_names = set()
|
|
308
|
+
current = parent
|
|
309
|
+
while current and hasattr(current, "name"):
|
|
310
|
+
if current.name:
|
|
311
|
+
ancestor_names.add(current.name)
|
|
312
|
+
current = getattr(current, "parent", None)
|
|
297
313
|
|
|
298
|
-
|
|
299
|
-
|
|
314
|
+
if len(ancestor_names) > 10:
|
|
315
|
+
break
|
|
300
316
|
|
|
301
317
|
in_pre = bool(ancestor_names.intersection({"pre"}))
|
|
302
318
|
|
|
@@ -469,7 +485,6 @@ def convert_to_markdown(
|
|
|
469
485
|
wrap_width: int = 80,
|
|
470
486
|
) -> str:
|
|
471
487
|
"""Convert HTML content to Markdown format.
|
|
472
|
-
|
|
473
488
|
This is the main entry point for converting HTML to Markdown. It supports
|
|
474
489
|
various customization options for controlling the conversion behavior.
|
|
475
490
|
|
|
@@ -525,17 +540,21 @@ def convert_to_markdown(
|
|
|
525
540
|
>>> html = "<h1>Title</h1><p>Content</p>"
|
|
526
541
|
>>> convert_to_markdown(html)
|
|
527
542
|
'Title\\n=====\\n\\nContent\\n\\n'
|
|
528
|
-
|
|
529
543
|
With custom options:
|
|
530
544
|
>>> convert_to_markdown(html, heading_style="atx", list_indent_width=2)
|
|
531
545
|
'# Title\\n\\nContent\\n\\n'
|
|
532
|
-
|
|
533
546
|
Discord-compatible lists (2-space indent):
|
|
534
547
|
>>> html = "<ul><li>Item 1</li><li>Item 2</li></ul>"
|
|
535
548
|
>>> convert_to_markdown(html, list_indent_width=2)
|
|
536
549
|
'* Item 1\\n* Item 2\\n\\n'
|
|
537
550
|
"""
|
|
551
|
+
# Initialize original input string for Windows lxml fix
|
|
552
|
+
original_input_str = None
|
|
553
|
+
|
|
538
554
|
if isinstance(source, str):
|
|
555
|
+
# Store original string for plain text detection (Windows lxml fix)
|
|
556
|
+
original_input_str = source
|
|
557
|
+
|
|
539
558
|
if (
|
|
540
559
|
heading_style == UNDERLINED
|
|
541
560
|
and "Header" in source
|
|
@@ -684,23 +703,33 @@ def convert_to_markdown(
|
|
|
684
703
|
|
|
685
704
|
result = sink.get_result()
|
|
686
705
|
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
and not result.startswith((" ", "\t", "\n", "\r"))
|
|
691
|
-
):
|
|
706
|
+
# Parser-agnostic behavior: handle leading whitespace differences between parsers
|
|
707
|
+
# lxml may either add unwanted whitespace or strip meaningful whitespace compared to html.parser
|
|
708
|
+
if "needs_leading_whitespace_fix" in locals() and needs_leading_whitespace_fix:
|
|
692
709
|
original_input = sink.original_source if hasattr(sink, "original_source") else original_source
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
if any(tag in original_input for tag in list_heading_tags):
|
|
699
|
-
leading_newlines = re.match(r"^[\n\r]*", leading_whitespace)
|
|
700
|
-
leading_whitespace = leading_newlines.group(0) if leading_newlines else ""
|
|
710
|
+
if isinstance(original_input, str):
|
|
711
|
+
original_leading_whitespace_match = re.match(r"^[\s]*", original_input)
|
|
712
|
+
original_leading_whitespace = (
|
|
713
|
+
original_leading_whitespace_match.group(0) if original_leading_whitespace_match else ""
|
|
714
|
+
)
|
|
701
715
|
|
|
702
|
-
|
|
703
|
-
|
|
716
|
+
# Case 1: lxml added leading newlines (like "\n<figure>") - strip them
|
|
717
|
+
if result.startswith("\n") and not original_input.lstrip().startswith(result.strip()):
|
|
718
|
+
result = result.lstrip("\n\r")
|
|
719
|
+
|
|
720
|
+
# Case 2: lxml stripped meaningful leading whitespace (like " <b>") - restore it
|
|
721
|
+
# However, don't restore whitespace if strip_newlines=True was used, as the user
|
|
722
|
+
# explicitly requested to remove formatting whitespace
|
|
723
|
+
elif (
|
|
724
|
+
not strip_newlines
|
|
725
|
+
and not result.startswith((" ", "\t"))
|
|
726
|
+
and original_leading_whitespace.startswith((" ", "\t"))
|
|
727
|
+
):
|
|
728
|
+
# Only restore spaces/tabs, not newlines (which are usually formatting)
|
|
729
|
+
leading_spaces_tabs_match = re.match(r"^[ \t]*", original_leading_whitespace)
|
|
730
|
+
leading_spaces_tabs = leading_spaces_tabs_match.group(0) if leading_spaces_tabs_match else ""
|
|
731
|
+
if leading_spaces_tabs:
|
|
732
|
+
result = leading_spaces_tabs + result
|
|
704
733
|
|
|
705
734
|
result = re.sub(r"\n{3,}", "\n\n", result)
|
|
706
735
|
|
|
@@ -729,6 +758,46 @@ def convert_to_markdown(
|
|
|
729
758
|
if convert_as_inline:
|
|
730
759
|
result = result.rstrip("\n")
|
|
731
760
|
|
|
761
|
+
# Windows-specific fix: For plain text input (no HTML tags), lxml may add extra trailing newlines
|
|
762
|
+
# This ensures consistent behavior across platforms when processing plain text
|
|
763
|
+
# Only apply to cases where lxml adds extra newlines (\n\n) at the end
|
|
764
|
+
if (
|
|
765
|
+
"original_input_str" in locals()
|
|
766
|
+
and original_input_str
|
|
767
|
+
and not original_input_str.strip().startswith("<")
|
|
768
|
+
and not original_input_str.strip().endswith(">")
|
|
769
|
+
and result.endswith("\n\n")
|
|
770
|
+
):
|
|
771
|
+
# Input appears to be plain text, not HTML - normalize trailing newlines only
|
|
772
|
+
result = result.rstrip("\n")
|
|
773
|
+
|
|
774
|
+
# If the original input contained no block-level elements, normalize any
|
|
775
|
+
# accidental trailing newlines for cross-platform consistency.
|
|
776
|
+
# This guards cases like inline-only inputs (e.g., "text <strong>bold</strong>")
|
|
777
|
+
# and head-only documents (e.g., "<head>head</head>") where output should
|
|
778
|
+
# not end with extra blank lines.
|
|
779
|
+
if "original_input_str" in locals() and original_input_str:
|
|
780
|
+
from html_to_markdown.whitespace import BLOCK_ELEMENTS # noqa: PLC0415
|
|
781
|
+
|
|
782
|
+
# Treat additional tags as block-producing for trailing newline purposes.
|
|
783
|
+
# These may be inline in HTML spec but produce block output in our Markdown conversion.
|
|
784
|
+
blockish = set(BLOCK_ELEMENTS) | {
|
|
785
|
+
"textarea",
|
|
786
|
+
"dialog",
|
|
787
|
+
"label",
|
|
788
|
+
"button",
|
|
789
|
+
"progress",
|
|
790
|
+
"meter",
|
|
791
|
+
"output",
|
|
792
|
+
"math",
|
|
793
|
+
"audio",
|
|
794
|
+
"video",
|
|
795
|
+
"iframe",
|
|
796
|
+
}
|
|
797
|
+
block_pattern = r"<(?:" + "|".join(sorted(blockish)) + r")\b"
|
|
798
|
+
if not re.search(block_pattern, original_input_str, flags=re.IGNORECASE):
|
|
799
|
+
result = result.rstrip("\n")
|
|
800
|
+
|
|
732
801
|
return result
|
|
733
802
|
|
|
734
803
|
|
|
@@ -896,7 +965,7 @@ def _process_html_core(
|
|
|
896
965
|
elements_to_process = body.children if body and isinstance(body, Tag) else source.children
|
|
897
966
|
|
|
898
967
|
context = ""
|
|
899
|
-
for el in filter(lambda value: not isinstance(value, (Comment, Doctype)), elements_to_process):
|
|
968
|
+
for el in filter(lambda value: not isinstance(value, (Comment, Doctype, CData)), elements_to_process):
|
|
900
969
|
if isinstance(el, NavigableString):
|
|
901
970
|
text = _process_text(
|
|
902
971
|
el=el,
|
html_to_markdown/whitespace.py
CHANGED
|
@@ -6,8 +6,10 @@ import re
|
|
|
6
6
|
import unicodedata
|
|
7
7
|
from typing import TYPE_CHECKING, Literal
|
|
8
8
|
|
|
9
|
+
from bs4.element import NavigableString
|
|
10
|
+
|
|
9
11
|
if TYPE_CHECKING:
|
|
10
|
-
from bs4 import
|
|
12
|
+
from bs4 import PageElement
|
|
11
13
|
|
|
12
14
|
|
|
13
15
|
WhitespaceMode = Literal["normalized", "strict"]
|
|
@@ -128,6 +130,8 @@ class WhitespaceHandler:
|
|
|
128
130
|
def normalize_unicode_spaces(self, text: str) -> str:
|
|
129
131
|
text = self._unicode_spaces.sub(" ", text)
|
|
130
132
|
|
|
133
|
+
text = text.replace("\r\n", "\n")
|
|
134
|
+
|
|
131
135
|
normalized = []
|
|
132
136
|
for char in text:
|
|
133
137
|
if unicodedata.category(char) in ("Zs", "Zl", "Zp"):
|
|
@@ -250,12 +254,22 @@ class WhitespaceHandler:
|
|
|
250
254
|
has_leading = (
|
|
251
255
|
has_lead_space
|
|
252
256
|
and original[0] == " "
|
|
253
|
-
and (
|
|
257
|
+
and (
|
|
258
|
+
self.is_inline_element(prev_sibling)
|
|
259
|
+
or self.is_block_element(prev_sibling)
|
|
260
|
+
or prev_sibling is None
|
|
261
|
+
or isinstance(prev_sibling, NavigableString)
|
|
262
|
+
)
|
|
254
263
|
)
|
|
255
264
|
has_trailing = (
|
|
256
265
|
has_trail_space
|
|
257
266
|
and original[-1] == " "
|
|
258
|
-
and (
|
|
267
|
+
and (
|
|
268
|
+
self.is_inline_element(next_sibling)
|
|
269
|
+
or self.is_block_element(next_sibling)
|
|
270
|
+
or next_sibling is None
|
|
271
|
+
or isinstance(next_sibling, NavigableString)
|
|
272
|
+
)
|
|
259
273
|
)
|
|
260
274
|
|
|
261
275
|
if original and original[0] in "\n\t" and self.is_inline_element(prev_sibling):
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 1.12.0
|
|
3
|
+
Version: 1.13.0
|
|
4
4
|
Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
|
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -348,6 +348,50 @@ def show_progress(processed: int, total: int):
|
|
|
348
348
|
markdown = convert_to_markdown(html_content, stream_processing=True, chunk_size=4096, progress_callback=show_progress)
|
|
349
349
|
```
|
|
350
350
|
|
|
351
|
+
#### When to Use Streaming vs Regular Processing
|
|
352
|
+
|
|
353
|
+
Based on comprehensive performance analysis, here are our recommendations:
|
|
354
|
+
|
|
355
|
+
**📄 Use Regular Processing When:**
|
|
356
|
+
|
|
357
|
+
- Files < 100KB (simplicity preferred)
|
|
358
|
+
- Simple scripts and one-off conversions
|
|
359
|
+
- Memory is not a concern
|
|
360
|
+
- You want the simplest API
|
|
361
|
+
|
|
362
|
+
**🌊 Use Streaming Processing When:**
|
|
363
|
+
|
|
364
|
+
- Files > 100KB (memory efficiency)
|
|
365
|
+
- Processing many files in batch
|
|
366
|
+
- Memory is constrained
|
|
367
|
+
- You need progress reporting
|
|
368
|
+
- You want to process results incrementally
|
|
369
|
+
- Running in production environments
|
|
370
|
+
|
|
371
|
+
**📋 Specific Recommendations by File Size:**
|
|
372
|
+
|
|
373
|
+
| File Size | Recommendation | Reason |
|
|
374
|
+
| ---------- | ----------------------------------------------- | -------------------------------------- |
|
|
375
|
+
| < 50KB | Regular (simplicity) or Streaming (3-5% faster) | Either works well |
|
|
376
|
+
| 50KB-100KB | Either (streaming slightly preferred) | Minimal difference |
|
|
377
|
+
| 100KB-1MB | Streaming preferred | Better performance + memory efficiency |
|
|
378
|
+
| > 1MB | Streaming strongly recommended | Significant memory advantages |
|
|
379
|
+
|
|
380
|
+
**🔧 Configuration Recommendations:**
|
|
381
|
+
|
|
382
|
+
- **Default chunk_size: 2048 bytes** (optimal performance balance)
|
|
383
|
+
- **For very large files (>10MB)**: Consider `chunk_size=4096`
|
|
384
|
+
- **For memory-constrained environments**: Use smaller chunks `chunk_size=1024`
|
|
385
|
+
|
|
386
|
+
**📈 Performance Benefits:**
|
|
387
|
+
|
|
388
|
+
Streaming provides consistent **3-5% performance improvement** across all file sizes:
|
|
389
|
+
|
|
390
|
+
- **Streaming throughput**: ~0.47-0.48 MB/s
|
|
391
|
+
- **Regular throughput**: ~0.44-0.47 MB/s
|
|
392
|
+
- **Memory usage**: Streaming uses less peak memory for large files
|
|
393
|
+
- **Latency**: Streaming allows processing results before completion
|
|
394
|
+
|
|
351
395
|
### Preprocessing API
|
|
352
396
|
|
|
353
397
|
The library provides functions for preprocessing HTML before conversion, useful for cleaning messy or complex HTML:
|
|
@@ -2,16 +2,16 @@ html_to_markdown/__init__.py,sha256=TzZzhZDJHeXW_3B9zceYehz2zlttqdLsDr5un8stZLM,
|
|
|
2
2
|
html_to_markdown/__main__.py,sha256=E9d62nVceR_5TUWgVu5L5CnSZxKcnT_7a6ScWZUGE-s,292
|
|
3
3
|
html_to_markdown/cli.py,sha256=qB8-1jqJPW-YrOmlyOdJnLM6DpKSUIA3iyn1SJaJgKg,9418
|
|
4
4
|
html_to_markdown/constants.py,sha256=CKFVHjUZKgi8-lgU6AHPic7X5ChlTkbZt4Jv6VaVjjs,665
|
|
5
|
-
html_to_markdown/converters.py,sha256=
|
|
5
|
+
html_to_markdown/converters.py,sha256=l4ZtIhfOdemvaApRjH7qmzHrWNF3PDlBzsT1LRw3n0Y,36022
|
|
6
6
|
html_to_markdown/exceptions.py,sha256=ytUOIL0D8r0Jd59RzUPqzmk73i-Mg63zDQYo6S6DBg4,1389
|
|
7
7
|
html_to_markdown/preprocessor.py,sha256=otnTOhoivJkxaip1Lb9xNMl8q-x9aGFXSYkSrxsTW8g,9591
|
|
8
|
-
html_to_markdown/processing.py,sha256=
|
|
8
|
+
html_to_markdown/processing.py,sha256=SjVStbriaOb24ZwCcRp8eqOJ1p5bIVxpCXSMW3vQojs,38059
|
|
9
9
|
html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
10
|
html_to_markdown/utils.py,sha256=s3A4ET_XyKC-WxzJtH4W0S7cIBGF5fTYIf4JJrqTX8Q,1069
|
|
11
|
-
html_to_markdown/whitespace.py,sha256=
|
|
12
|
-
html_to_markdown-1.
|
|
13
|
-
html_to_markdown-1.
|
|
14
|
-
html_to_markdown-1.
|
|
15
|
-
html_to_markdown-1.
|
|
16
|
-
html_to_markdown-1.
|
|
17
|
-
html_to_markdown-1.
|
|
11
|
+
html_to_markdown/whitespace.py,sha256=rl3eEwqfMpNWx4FBmbkZ1RxO_Od45p3EZ_7UgKcDAtg,7710
|
|
12
|
+
html_to_markdown-1.13.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
|
|
13
|
+
html_to_markdown-1.13.0.dist-info/METADATA,sha256=CIfFx5C69D3lFg3wgajZnMRmQV-7C78ga2zbXKcxcsc,22694
|
|
14
|
+
html_to_markdown-1.13.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
15
|
+
html_to_markdown-1.13.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
|
|
16
|
+
html_to_markdown-1.13.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
|
|
17
|
+
html_to_markdown-1.13.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|