html-to-markdown 1.12.0__py3-none-any.whl → 1.12.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- html_to_markdown/converters.py +4 -13
- html_to_markdown/processing.py +27 -14
- html_to_markdown/whitespace.py +17 -3
- {html_to_markdown-1.12.0.dist-info → html_to_markdown-1.12.1.dist-info}/METADATA +45 -1
- {html_to_markdown-1.12.0.dist-info → html_to_markdown-1.12.1.dist-info}/RECORD +9 -9
- {html_to_markdown-1.12.0.dist-info → html_to_markdown-1.12.1.dist-info}/WHEEL +0 -0
- {html_to_markdown-1.12.0.dist-info → html_to_markdown-1.12.1.dist-info}/entry_points.txt +0 -0
- {html_to_markdown-1.12.0.dist-info → html_to_markdown-1.12.1.dist-info}/licenses/LICENSE +0 -0
- {html_to_markdown-1.12.0.dist-info → html_to_markdown-1.12.1.dist-info}/top_level.txt +0 -0
html_to_markdown/converters.py
CHANGED
|
@@ -39,7 +39,6 @@ def _format_wrapped_block(text: str, start_marker: str, end_marker: str = "") ->
|
|
|
39
39
|
|
|
40
40
|
|
|
41
41
|
def _find_list_item_ancestor(tag: Tag) -> Tag | None:
|
|
42
|
-
"""Find the nearest list item ancestor of a tag."""
|
|
43
42
|
parent = tag.parent
|
|
44
43
|
while parent and parent.name != "li":
|
|
45
44
|
parent = parent.parent
|
|
@@ -231,14 +230,15 @@ def _convert_blockquote(*, text: str, tag: Tag, convert_as_inline: bool, list_in
|
|
|
231
230
|
return quote_text
|
|
232
231
|
|
|
233
232
|
|
|
234
|
-
def _convert_br(*, convert_as_inline: bool, newline_style: str, tag: Tag) -> str:
|
|
233
|
+
def _convert_br(*, convert_as_inline: bool, newline_style: str, tag: Tag, text: str) -> str:
|
|
235
234
|
from html_to_markdown.processing import _has_ancestor # noqa: PLC0415
|
|
236
235
|
|
|
237
236
|
if _has_ancestor(tag, ["h1", "h2", "h3", "h4", "h5", "h6"]):
|
|
238
|
-
return " "
|
|
237
|
+
return " " + text.strip()
|
|
239
238
|
|
|
240
239
|
_ = convert_as_inline
|
|
241
|
-
|
|
240
|
+
newline = "\\\n" if newline_style.lower() == BACKSLASH else " \n"
|
|
241
|
+
return newline + text.strip() if text.strip() else newline
|
|
242
242
|
|
|
243
243
|
|
|
244
244
|
def _convert_hn(
|
|
@@ -286,7 +286,6 @@ def _convert_img(*, tag: Tag, convert_as_inline: bool, keep_inline_images_in: It
|
|
|
286
286
|
|
|
287
287
|
|
|
288
288
|
def _has_block_list_items(tag: Tag) -> bool:
|
|
289
|
-
"""Check if any list items contain block elements."""
|
|
290
289
|
return any(
|
|
291
290
|
any(child.name in BLOCK_ELEMENTS for child in li.children if hasattr(child, "name"))
|
|
292
291
|
for li in tag.find_all("li", recursive=False)
|
|
@@ -294,7 +293,6 @@ def _has_block_list_items(tag: Tag) -> bool:
|
|
|
294
293
|
|
|
295
294
|
|
|
296
295
|
def _handle_nested_list_indentation(text: str, list_indent_str: str, parent: Tag) -> str:
|
|
297
|
-
"""Handle indentation for lists nested within list items."""
|
|
298
296
|
prev_p = None
|
|
299
297
|
for child in parent.children:
|
|
300
298
|
if hasattr(child, "name"):
|
|
@@ -310,7 +308,6 @@ def _handle_nested_list_indentation(text: str, list_indent_str: str, parent: Tag
|
|
|
310
308
|
|
|
311
309
|
|
|
312
310
|
def _handle_direct_nested_list_indentation(text: str, list_indent_str: str) -> str:
|
|
313
|
-
"""Handle indentation for lists that are direct children of other lists."""
|
|
314
311
|
lines = text.strip().split("\n")
|
|
315
312
|
indented_lines = [f"{list_indent_str}{line}" if line.strip() else "" for line in lines]
|
|
316
313
|
result = "\n".join(indented_lines)
|
|
@@ -318,7 +315,6 @@ def _handle_direct_nested_list_indentation(text: str, list_indent_str: str) -> s
|
|
|
318
315
|
|
|
319
316
|
|
|
320
317
|
def _add_list_item_spacing(text: str) -> str:
|
|
321
|
-
"""Add extra spacing between list items that contain block content."""
|
|
322
318
|
lines = text.split("\n")
|
|
323
319
|
items_with_blocks = set()
|
|
324
320
|
|
|
@@ -482,7 +478,6 @@ def _convert_pre(
|
|
|
482
478
|
|
|
483
479
|
|
|
484
480
|
def _process_table_cell_content(*, tag: Tag, text: str, br_in_tables: bool) -> str:
|
|
485
|
-
"""Process table cell content, optionally using <br> tags for multi-line content."""
|
|
486
481
|
if br_in_tables:
|
|
487
482
|
block_children = [child for child in tag.children if hasattr(child, "name") and child.name in BLOCK_ELEMENTS]
|
|
488
483
|
|
|
@@ -510,7 +505,6 @@ def _convert_th(*, tag: Tag, text: str, br_in_tables: bool = False) -> str:
|
|
|
510
505
|
|
|
511
506
|
|
|
512
507
|
def _get_rowspan_positions(prev_cells: list[Tag]) -> tuple[list[int], int]:
|
|
513
|
-
"""Get positions of cells with rowspan > 1 from previous row."""
|
|
514
508
|
rowspan_positions = []
|
|
515
509
|
col_pos = 0
|
|
516
510
|
|
|
@@ -531,7 +525,6 @@ def _get_rowspan_positions(prev_cells: list[Tag]) -> tuple[list[int], int]:
|
|
|
531
525
|
|
|
532
526
|
|
|
533
527
|
def _handle_rowspan_text(text: str, rowspan_positions: list[int], col_pos: int) -> str:
|
|
534
|
-
"""Handle text adjustment for rows with rowspan cells."""
|
|
535
528
|
converted_cells = [part.rstrip() + " |" for part in text.split("|")[:-1] if part] if text.strip() else []
|
|
536
529
|
rowspan_set = set(rowspan_positions)
|
|
537
530
|
|
|
@@ -542,7 +535,6 @@ def _handle_rowspan_text(text: str, rowspan_positions: list[int], col_pos: int)
|
|
|
542
535
|
|
|
543
536
|
|
|
544
537
|
def _is_header_row(tag: Tag, cells: list[Tag], parent_name: str, tag_grand_parent: Tag | None) -> bool:
|
|
545
|
-
"""Determine if this table row should be treated as a header row."""
|
|
546
538
|
return (
|
|
547
539
|
all(hasattr(cell, "name") and cell.name == "th" for cell in cells)
|
|
548
540
|
or (not tag.previous_sibling and parent_name != "tbody")
|
|
@@ -555,7 +547,6 @@ def _is_header_row(tag: Tag, cells: list[Tag], parent_name: str, tag_grand_paren
|
|
|
555
547
|
|
|
556
548
|
|
|
557
549
|
def _calculate_total_colspan(cells: list[Tag]) -> int:
|
|
558
|
-
"""Calculate total colspan for all cells in a row."""
|
|
559
550
|
full_colspan = 0
|
|
560
551
|
for cell in cells:
|
|
561
552
|
if hasattr(cell, "attrs") and "colspan" in cell.attrs:
|
html_to_markdown/processing.py
CHANGED
|
@@ -11,7 +11,7 @@ from io import StringIO
|
|
|
11
11
|
from itertools import chain
|
|
12
12
|
from typing import TYPE_CHECKING, Any, Literal, cast
|
|
13
13
|
|
|
14
|
-
from bs4 import BeautifulSoup, Comment, Doctype, Tag
|
|
14
|
+
from bs4 import BeautifulSoup, CData, Comment, Doctype, Tag
|
|
15
15
|
from bs4.element import NavigableString, PageElement
|
|
16
16
|
|
|
17
17
|
try:
|
|
@@ -179,6 +179,7 @@ def _process_tag(
|
|
|
179
179
|
strip: set[str] | None,
|
|
180
180
|
whitespace_handler: WhitespaceHandler,
|
|
181
181
|
context_before: str = "",
|
|
182
|
+
ancestor_names: set[str] | None = None,
|
|
182
183
|
) -> str:
|
|
183
184
|
should_convert_tag = _should_convert_tag(tag_name=tag.name, strip=strip, convert=convert)
|
|
184
185
|
tag_name: SupportedTag | None = (
|
|
@@ -186,6 +187,17 @@ def _process_tag(
|
|
|
186
187
|
)
|
|
187
188
|
text_parts: list[str] = []
|
|
188
189
|
|
|
190
|
+
if ancestor_names is None:
|
|
191
|
+
ancestor_names = set()
|
|
192
|
+
current: Tag | None = tag
|
|
193
|
+
while current and hasattr(current, "name"):
|
|
194
|
+
if current.name:
|
|
195
|
+
ancestor_names.add(current.name)
|
|
196
|
+
current = getattr(current, "parent", None)
|
|
197
|
+
|
|
198
|
+
if len(ancestor_names) > 10:
|
|
199
|
+
break
|
|
200
|
+
|
|
189
201
|
is_heading = html_heading_re.match(tag.name) is not None
|
|
190
202
|
is_cell = tag_name in {"td", "th"}
|
|
191
203
|
convert_children_as_inline = convert_as_inline or is_heading or is_cell
|
|
@@ -201,7 +213,7 @@ def _process_tag(
|
|
|
201
213
|
if can_extract and isinstance(el, NavigableString) and not el.strip():
|
|
202
214
|
el.extract()
|
|
203
215
|
|
|
204
|
-
children = list(filter(lambda value: not isinstance(value, (Comment, Doctype)), tag.children))
|
|
216
|
+
children = list(filter(lambda value: not isinstance(value, (Comment, Doctype, CData)), tag.children))
|
|
205
217
|
|
|
206
218
|
empty_when_no_content_tags = {"abbr", "var", "ins", "dfn", "time", "data", "cite", "q", "mark", "small", "u"}
|
|
207
219
|
|
|
@@ -227,6 +239,7 @@ def _process_tag(
|
|
|
227
239
|
escape_asterisks=escape_asterisks,
|
|
228
240
|
escape_underscores=escape_underscores,
|
|
229
241
|
whitespace_handler=whitespace_handler,
|
|
242
|
+
ancestor_names=ancestor_names,
|
|
230
243
|
)
|
|
231
244
|
)
|
|
232
245
|
elif isinstance(el, Tag):
|
|
@@ -243,6 +256,7 @@ def _process_tag(
|
|
|
243
256
|
strip=strip,
|
|
244
257
|
whitespace_handler=whitespace_handler,
|
|
245
258
|
context_before=(context_before + current_text)[-2:],
|
|
259
|
+
ancestor_names=ancestor_names,
|
|
246
260
|
)
|
|
247
261
|
)
|
|
248
262
|
|
|
@@ -282,21 +296,23 @@ def _process_text(
|
|
|
282
296
|
escape_asterisks: bool,
|
|
283
297
|
escape_underscores: bool,
|
|
284
298
|
whitespace_handler: WhitespaceHandler,
|
|
299
|
+
ancestor_names: set[str] | None = None,
|
|
285
300
|
) -> str:
|
|
286
301
|
text = str(el) or ""
|
|
287
302
|
|
|
288
303
|
parent = el.parent
|
|
289
304
|
parent_name = parent.name if parent else None
|
|
290
305
|
|
|
291
|
-
ancestor_names
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
306
|
+
if ancestor_names is None:
|
|
307
|
+
ancestor_names = set()
|
|
308
|
+
current = parent
|
|
309
|
+
while current and hasattr(current, "name"):
|
|
310
|
+
if current.name:
|
|
311
|
+
ancestor_names.add(current.name)
|
|
312
|
+
current = getattr(current, "parent", None)
|
|
297
313
|
|
|
298
|
-
|
|
299
|
-
|
|
314
|
+
if len(ancestor_names) > 10:
|
|
315
|
+
break
|
|
300
316
|
|
|
301
317
|
in_pre = bool(ancestor_names.intersection({"pre"}))
|
|
302
318
|
|
|
@@ -469,7 +485,6 @@ def convert_to_markdown(
|
|
|
469
485
|
wrap_width: int = 80,
|
|
470
486
|
) -> str:
|
|
471
487
|
"""Convert HTML content to Markdown format.
|
|
472
|
-
|
|
473
488
|
This is the main entry point for converting HTML to Markdown. It supports
|
|
474
489
|
various customization options for controlling the conversion behavior.
|
|
475
490
|
|
|
@@ -525,11 +540,9 @@ def convert_to_markdown(
|
|
|
525
540
|
>>> html = "<h1>Title</h1><p>Content</p>"
|
|
526
541
|
>>> convert_to_markdown(html)
|
|
527
542
|
'Title\\n=====\\n\\nContent\\n\\n'
|
|
528
|
-
|
|
529
543
|
With custom options:
|
|
530
544
|
>>> convert_to_markdown(html, heading_style="atx", list_indent_width=2)
|
|
531
545
|
'# Title\\n\\nContent\\n\\n'
|
|
532
|
-
|
|
533
546
|
Discord-compatible lists (2-space indent):
|
|
534
547
|
>>> html = "<ul><li>Item 1</li><li>Item 2</li></ul>"
|
|
535
548
|
>>> convert_to_markdown(html, list_indent_width=2)
|
|
@@ -896,7 +909,7 @@ def _process_html_core(
|
|
|
896
909
|
elements_to_process = body.children if body and isinstance(body, Tag) else source.children
|
|
897
910
|
|
|
898
911
|
context = ""
|
|
899
|
-
for el in filter(lambda value: not isinstance(value, (Comment, Doctype)), elements_to_process):
|
|
912
|
+
for el in filter(lambda value: not isinstance(value, (Comment, Doctype, CData)), elements_to_process):
|
|
900
913
|
if isinstance(el, NavigableString):
|
|
901
914
|
text = _process_text(
|
|
902
915
|
el=el,
|
html_to_markdown/whitespace.py
CHANGED
|
@@ -6,8 +6,10 @@ import re
|
|
|
6
6
|
import unicodedata
|
|
7
7
|
from typing import TYPE_CHECKING, Literal
|
|
8
8
|
|
|
9
|
+
from bs4.element import NavigableString
|
|
10
|
+
|
|
9
11
|
if TYPE_CHECKING:
|
|
10
|
-
from bs4 import
|
|
12
|
+
from bs4 import PageElement
|
|
11
13
|
|
|
12
14
|
|
|
13
15
|
WhitespaceMode = Literal["normalized", "strict"]
|
|
@@ -128,6 +130,8 @@ class WhitespaceHandler:
|
|
|
128
130
|
def normalize_unicode_spaces(self, text: str) -> str:
|
|
129
131
|
text = self._unicode_spaces.sub(" ", text)
|
|
130
132
|
|
|
133
|
+
text = text.replace("\r\n", "\n")
|
|
134
|
+
|
|
131
135
|
normalized = []
|
|
132
136
|
for char in text:
|
|
133
137
|
if unicodedata.category(char) in ("Zs", "Zl", "Zp"):
|
|
@@ -250,12 +254,22 @@ class WhitespaceHandler:
|
|
|
250
254
|
has_leading = (
|
|
251
255
|
has_lead_space
|
|
252
256
|
and original[0] == " "
|
|
253
|
-
and (
|
|
257
|
+
and (
|
|
258
|
+
self.is_inline_element(prev_sibling)
|
|
259
|
+
or self.is_block_element(prev_sibling)
|
|
260
|
+
or prev_sibling is None
|
|
261
|
+
or isinstance(prev_sibling, NavigableString)
|
|
262
|
+
)
|
|
254
263
|
)
|
|
255
264
|
has_trailing = (
|
|
256
265
|
has_trail_space
|
|
257
266
|
and original[-1] == " "
|
|
258
|
-
and (
|
|
267
|
+
and (
|
|
268
|
+
self.is_inline_element(next_sibling)
|
|
269
|
+
or self.is_block_element(next_sibling)
|
|
270
|
+
or next_sibling is None
|
|
271
|
+
or isinstance(next_sibling, NavigableString)
|
|
272
|
+
)
|
|
259
273
|
)
|
|
260
274
|
|
|
261
275
|
if original and original[0] in "\n\t" and self.is_inline_element(prev_sibling):
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 1.12.0
|
|
3
|
+
Version: 1.12.1
|
|
4
4
|
Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
|
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -348,6 +348,50 @@ def show_progress(processed: int, total: int):
|
|
|
348
348
|
markdown = convert_to_markdown(html_content, stream_processing=True, chunk_size=4096, progress_callback=show_progress)
|
|
349
349
|
```
|
|
350
350
|
|
|
351
|
+
#### When to Use Streaming vs Regular Processing
|
|
352
|
+
|
|
353
|
+
Based on comprehensive performance analysis, here are our recommendations:
|
|
354
|
+
|
|
355
|
+
**📄 Use Regular Processing When:**
|
|
356
|
+
|
|
357
|
+
- Files < 100KB (simplicity preferred)
|
|
358
|
+
- Simple scripts and one-off conversions
|
|
359
|
+
- Memory is not a concern
|
|
360
|
+
- You want the simplest API
|
|
361
|
+
|
|
362
|
+
**🌊 Use Streaming Processing When:**
|
|
363
|
+
|
|
364
|
+
- Files > 100KB (memory efficiency)
|
|
365
|
+
- Processing many files in batch
|
|
366
|
+
- Memory is constrained
|
|
367
|
+
- You need progress reporting
|
|
368
|
+
- You want to process results incrementally
|
|
369
|
+
- Running in production environments
|
|
370
|
+
|
|
371
|
+
**📋 Specific Recommendations by File Size:**
|
|
372
|
+
|
|
373
|
+
| File Size | Recommendation | Reason |
|
|
374
|
+
| ---------- | ----------------------------------------------- | -------------------------------------- |
|
|
375
|
+
| < 50KB | Regular (simplicity) or Streaming (3-5% faster) | Either works well |
|
|
376
|
+
| 50KB-100KB | Either (streaming slightly preferred) | Minimal difference |
|
|
377
|
+
| 100KB-1MB | Streaming preferred | Better performance + memory efficiency |
|
|
378
|
+
| > 1MB | Streaming strongly recommended | Significant memory advantages |
|
|
379
|
+
|
|
380
|
+
**🔧 Configuration Recommendations:**
|
|
381
|
+
|
|
382
|
+
- **Default chunk_size: 2048 bytes** (optimal performance balance)
|
|
383
|
+
- **For very large files (>10MB)**: Consider `chunk_size=4096`
|
|
384
|
+
- **For memory-constrained environments**: Use smaller chunks `chunk_size=1024`
|
|
385
|
+
|
|
386
|
+
**📈 Performance Benefits:**
|
|
387
|
+
|
|
388
|
+
Streaming provides consistent **3-5% performance improvement** across all file sizes:
|
|
389
|
+
|
|
390
|
+
- **Streaming throughput**: ~0.47-0.48 MB/s
|
|
391
|
+
- **Regular throughput**: ~0.44-0.47 MB/s
|
|
392
|
+
- **Memory usage**: Streaming uses less peak memory for large files
|
|
393
|
+
- **Latency**: Streaming allows processing results before completion
|
|
394
|
+
|
|
351
395
|
### Preprocessing API
|
|
352
396
|
|
|
353
397
|
The library provides functions for preprocessing HTML before conversion, useful for cleaning messy or complex HTML:
|
|
@@ -2,16 +2,16 @@ html_to_markdown/__init__.py,sha256=TzZzhZDJHeXW_3B9zceYehz2zlttqdLsDr5un8stZLM,
|
|
|
2
2
|
html_to_markdown/__main__.py,sha256=E9d62nVceR_5TUWgVu5L5CnSZxKcnT_7a6ScWZUGE-s,292
|
|
3
3
|
html_to_markdown/cli.py,sha256=qB8-1jqJPW-YrOmlyOdJnLM6DpKSUIA3iyn1SJaJgKg,9418
|
|
4
4
|
html_to_markdown/constants.py,sha256=CKFVHjUZKgi8-lgU6AHPic7X5ChlTkbZt4Jv6VaVjjs,665
|
|
5
|
-
html_to_markdown/converters.py,sha256=
|
|
5
|
+
html_to_markdown/converters.py,sha256=fdFT9WwDd3hGpYn0jVbPDcB8OmLPvQUmanbM7aQmzms,35821
|
|
6
6
|
html_to_markdown/exceptions.py,sha256=ytUOIL0D8r0Jd59RzUPqzmk73i-Mg63zDQYo6S6DBg4,1389
|
|
7
7
|
html_to_markdown/preprocessor.py,sha256=otnTOhoivJkxaip1Lb9xNMl8q-x9aGFXSYkSrxsTW8g,9591
|
|
8
|
-
html_to_markdown/processing.py,sha256=
|
|
8
|
+
html_to_markdown/processing.py,sha256=xchoTwKZHQW8ejjwLAiMb_AY6XcgPQ6zhLShlduYVuY,35213
|
|
9
9
|
html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
10
|
html_to_markdown/utils.py,sha256=s3A4ET_XyKC-WxzJtH4W0S7cIBGF5fTYIf4JJrqTX8Q,1069
|
|
11
|
-
html_to_markdown/whitespace.py,sha256=
|
|
12
|
-
html_to_markdown-1.12.
|
|
13
|
-
html_to_markdown-1.12.
|
|
14
|
-
html_to_markdown-1.12.
|
|
15
|
-
html_to_markdown-1.12.
|
|
16
|
-
html_to_markdown-1.12.
|
|
17
|
-
html_to_markdown-1.12.
|
|
11
|
+
html_to_markdown/whitespace.py,sha256=rl3eEwqfMpNWx4FBmbkZ1RxO_Od45p3EZ_7UgKcDAtg,7710
|
|
12
|
+
html_to_markdown-1.12.1.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
|
|
13
|
+
html_to_markdown-1.12.1.dist-info/METADATA,sha256=5PoGUeYuGtGmh5q_XwxKlSzq7572CUw3yAVBNmVxDTc,22694
|
|
14
|
+
html_to_markdown-1.12.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
15
|
+
html_to_markdown-1.12.1.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
|
|
16
|
+
html_to_markdown-1.12.1.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
|
|
17
|
+
html_to_markdown-1.12.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|