html-to-markdown 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- html_to_markdown/__main__.py +0 -1
- html_to_markdown/cli.py +101 -45
- html_to_markdown/constants.py +3 -0
- html_to_markdown/converters.py +52 -573
- html_to_markdown/exceptions.py +1 -11
- html_to_markdown/preprocessor.py +0 -37
- html_to_markdown/processing.py +104 -202
- html_to_markdown/utils.py +2 -42
- html_to_markdown/whitespace.py +292 -0
- {html_to_markdown-1.9.0.dist-info → html_to_markdown-1.10.0.dist-info}/METADATA +204 -204
- html_to_markdown-1.10.0.dist-info/RECORD +17 -0
- html_to_markdown-1.9.0.dist-info/RECORD +0 -16
- {html_to_markdown-1.9.0.dist-info → html_to_markdown-1.10.0.dist-info}/WHEEL +0 -0
- {html_to_markdown-1.9.0.dist-info → html_to_markdown-1.10.0.dist-info}/entry_points.txt +0 -0
- {html_to_markdown-1.9.0.dist-info → html_to_markdown-1.10.0.dist-info}/licenses/LICENSE +0 -0
- {html_to_markdown-1.9.0.dist-info → html_to_markdown-1.10.0.dist-info}/top_level.txt +0 -0
html_to_markdown/exceptions.py
CHANGED
|
@@ -1,15 +1,11 @@
|
|
|
1
|
-
"""Custom exceptions for the html-to-markdown library."""
|
|
2
|
-
|
|
3
1
|
from __future__ import annotations
|
|
4
2
|
|
|
5
3
|
|
|
6
4
|
class HtmlToMarkdownError(Exception):
|
|
7
|
-
|
|
5
|
+
pass
|
|
8
6
|
|
|
9
7
|
|
|
10
8
|
class MissingDependencyError(HtmlToMarkdownError):
|
|
11
|
-
"""Raised when an optional dependency is required but not installed."""
|
|
12
|
-
|
|
13
9
|
def __init__(self, dependency: str, install_command: str | None = None) -> None:
|
|
14
10
|
self.dependency = dependency
|
|
15
11
|
self.install_command = install_command
|
|
@@ -22,8 +18,6 @@ class MissingDependencyError(HtmlToMarkdownError):
|
|
|
22
18
|
|
|
23
19
|
|
|
24
20
|
class InvalidParserError(HtmlToMarkdownError):
|
|
25
|
-
"""Raised when an invalid parser is specified."""
|
|
26
|
-
|
|
27
21
|
def __init__(self, parser: str, available_parsers: list[str]) -> None:
|
|
28
22
|
self.parser = parser
|
|
29
23
|
self.available_parsers = available_parsers
|
|
@@ -33,15 +27,11 @@ class InvalidParserError(HtmlToMarkdownError):
|
|
|
33
27
|
|
|
34
28
|
|
|
35
29
|
class EmptyHtmlError(HtmlToMarkdownError):
|
|
36
|
-
"""Raised when the input HTML is empty."""
|
|
37
|
-
|
|
38
30
|
def __init__(self) -> None:
|
|
39
31
|
super().__init__("The input HTML is empty.")
|
|
40
32
|
|
|
41
33
|
|
|
42
34
|
class ConflictingOptionsError(HtmlToMarkdownError):
|
|
43
|
-
"""Raised when conflicting options are specified."""
|
|
44
|
-
|
|
45
35
|
def __init__(self, option1: str, option2: str) -> None:
|
|
46
36
|
self.option1 = option1
|
|
47
37
|
self.option2 = option2
|
html_to_markdown/preprocessor.py
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
"""HTML preprocessing using nh3 (ammonia bindings) for improved quality and performance."""
|
|
2
|
-
|
|
3
1
|
from __future__ import annotations
|
|
4
2
|
|
|
5
3
|
import re
|
|
@@ -22,24 +20,6 @@ def preprocess_html(
|
|
|
22
20
|
custom_tags_to_remove: set[str] | None = None,
|
|
23
21
|
custom_attributes_to_remove: set[str] | None = None,
|
|
24
22
|
) -> str:
|
|
25
|
-
"""Preprocess HTML to remove unwanted elements and improve quality.
|
|
26
|
-
|
|
27
|
-
Args:
|
|
28
|
-
html: Raw HTML content to preprocess.
|
|
29
|
-
remove_navigation: Remove navigation elements and menus.
|
|
30
|
-
remove_forms: Remove form elements (input, button, select, etc.).
|
|
31
|
-
remove_scripts: Remove script tags and content.
|
|
32
|
-
remove_styles: Remove style tags and content.
|
|
33
|
-
remove_comments: Remove HTML comments.
|
|
34
|
-
preserve_semantic_structure: Preserve semantic HTML5 elements.
|
|
35
|
-
preserve_tables: Preserve table structure.
|
|
36
|
-
preserve_media: Preserve media elements (img, video, audio).
|
|
37
|
-
custom_tags_to_remove: Additional tags to remove.
|
|
38
|
-
custom_attributes_to_remove: Additional attributes to remove.
|
|
39
|
-
|
|
40
|
-
Returns:
|
|
41
|
-
Cleaned HTML ready for conversion to markdown.
|
|
42
|
-
"""
|
|
43
23
|
if not html or not html.strip(): # pragma: no cover
|
|
44
24
|
return html
|
|
45
25
|
|
|
@@ -83,7 +63,6 @@ def _configure_cleaning_rules(
|
|
|
83
63
|
custom_tags_to_remove: set[str],
|
|
84
64
|
custom_attributes_to_remove: set[str],
|
|
85
65
|
) -> dict[str, Any]:
|
|
86
|
-
"""Configure the cleaning rules for nh3."""
|
|
87
66
|
allowed_tags = {
|
|
88
67
|
"p",
|
|
89
68
|
"div",
|
|
@@ -254,7 +233,6 @@ def _configure_cleaning_rules(
|
|
|
254
233
|
|
|
255
234
|
|
|
256
235
|
def _remove_class_based_navigation(html: str, remove_navigation: bool) -> str:
|
|
257
|
-
"""Remove elements with navigation-related classes."""
|
|
258
236
|
if not remove_navigation:
|
|
259
237
|
return html
|
|
260
238
|
|
|
@@ -288,7 +266,6 @@ def _remove_class_based_navigation(html: str, remove_navigation: bool) -> str:
|
|
|
288
266
|
|
|
289
267
|
|
|
290
268
|
def _remove_navigation_patterns(html: str, remove_navigation: bool) -> str:
|
|
291
|
-
"""Remove common navigation patterns that nh3 might miss."""
|
|
292
269
|
if not remove_navigation:
|
|
293
270
|
return html
|
|
294
271
|
|
|
@@ -329,7 +306,6 @@ def _remove_navigation_patterns(html: str, remove_navigation: bool) -> str:
|
|
|
329
306
|
|
|
330
307
|
|
|
331
308
|
def _remove_wikipedia_navigation_lists(html: str) -> str:
|
|
332
|
-
"""Remove Wikipedia-style navigation lists that appear at the start."""
|
|
333
309
|
patterns = [
|
|
334
310
|
r"Main menu\s*\n\n(-\s*\[.*?\]\(.*?\).*?\n){3,}",
|
|
335
311
|
r"(-\s*\[[^\]]*\]\(/wiki/[^)]*\).*?\n){5,}",
|
|
@@ -342,7 +318,6 @@ def _remove_wikipedia_navigation_lists(html: str) -> str:
|
|
|
342
318
|
|
|
343
319
|
|
|
344
320
|
def _fix_whitespace_issues(html: str) -> str:
|
|
345
|
-
"""Fix common whitespace issues in HTML."""
|
|
346
321
|
html = re.sub(r"[ \t]{2,}", " ", html)
|
|
347
322
|
html = re.sub(r"\n\s*\n", "\n\n", html)
|
|
348
323
|
|
|
@@ -385,18 +360,6 @@ PRESETS: dict[str, dict[str, Any]] = {
|
|
|
385
360
|
|
|
386
361
|
|
|
387
362
|
def create_preprocessor(preset: str = "standard", **overrides: Any) -> dict[str, Any]:
|
|
388
|
-
"""Create preprocessor configuration with a preset.
|
|
389
|
-
|
|
390
|
-
Args:
|
|
391
|
-
preset: The preset configuration to use (minimal, standard, aggressive).
|
|
392
|
-
**overrides: Any configuration options to override.
|
|
393
|
-
|
|
394
|
-
Returns:
|
|
395
|
-
Configuration dict for preprocessor.
|
|
396
|
-
|
|
397
|
-
Raises:
|
|
398
|
-
ValueError: If preset is unknown.
|
|
399
|
-
"""
|
|
400
363
|
if preset not in PRESETS:
|
|
401
364
|
msg = f"Unknown preset '{preset}'. Available presets: {list(PRESETS.keys())}"
|
|
402
365
|
raise ValueError(msg)
|