html-to-markdown 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of each package version as it appears in its respective public registry.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

@@ -1,15 +1,11 @@
1
- """Custom exceptions for the html-to-markdown library."""
2
-
3
1
  from __future__ import annotations
4
2
 
5
3
 
6
4
  class HtmlToMarkdownError(Exception):
7
- """Base exception for all html-to-markdown errors."""
5
+ pass
8
6
 
9
7
 
10
8
  class MissingDependencyError(HtmlToMarkdownError):
11
- """Raised when an optional dependency is required but not installed."""
12
-
13
9
  def __init__(self, dependency: str, install_command: str | None = None) -> None:
14
10
  self.dependency = dependency
15
11
  self.install_command = install_command
@@ -22,8 +18,6 @@ class MissingDependencyError(HtmlToMarkdownError):
22
18
 
23
19
 
24
20
  class InvalidParserError(HtmlToMarkdownError):
25
- """Raised when an invalid parser is specified."""
26
-
27
21
  def __init__(self, parser: str, available_parsers: list[str]) -> None:
28
22
  self.parser = parser
29
23
  self.available_parsers = available_parsers
@@ -33,15 +27,11 @@ class InvalidParserError(HtmlToMarkdownError):
33
27
 
34
28
 
35
29
  class EmptyHtmlError(HtmlToMarkdownError):
36
- """Raised when the input HTML is empty."""
37
-
38
30
  def __init__(self) -> None:
39
31
  super().__init__("The input HTML is empty.")
40
32
 
41
33
 
42
34
  class ConflictingOptionsError(HtmlToMarkdownError):
43
- """Raised when conflicting options are specified."""
44
-
45
35
  def __init__(self, option1: str, option2: str) -> None:
46
36
  self.option1 = option1
47
37
  self.option2 = option2
@@ -1,5 +1,3 @@
1
- """HTML preprocessing using nh3 (ammonia bindings) for improved quality and performance."""
2
-
3
1
  from __future__ import annotations
4
2
 
5
3
  import re
@@ -22,24 +20,6 @@ def preprocess_html(
22
20
  custom_tags_to_remove: set[str] | None = None,
23
21
  custom_attributes_to_remove: set[str] | None = None,
24
22
  ) -> str:
25
- """Preprocess HTML to remove unwanted elements and improve quality.
26
-
27
- Args:
28
- html: Raw HTML content to preprocess.
29
- remove_navigation: Remove navigation elements and menus.
30
- remove_forms: Remove form elements (input, button, select, etc.).
31
- remove_scripts: Remove script tags and content.
32
- remove_styles: Remove style tags and content.
33
- remove_comments: Remove HTML comments.
34
- preserve_semantic_structure: Preserve semantic HTML5 elements.
35
- preserve_tables: Preserve table structure.
36
- preserve_media: Preserve media elements (img, video, audio).
37
- custom_tags_to_remove: Additional tags to remove.
38
- custom_attributes_to_remove: Additional attributes to remove.
39
-
40
- Returns:
41
- Cleaned HTML ready for conversion to markdown.
42
- """
43
23
  if not html or not html.strip(): # pragma: no cover
44
24
  return html
45
25
 
@@ -83,7 +63,6 @@ def _configure_cleaning_rules(
83
63
  custom_tags_to_remove: set[str],
84
64
  custom_attributes_to_remove: set[str],
85
65
  ) -> dict[str, Any]:
86
- """Configure the cleaning rules for nh3."""
87
66
  allowed_tags = {
88
67
  "p",
89
68
  "div",
@@ -254,7 +233,6 @@ def _configure_cleaning_rules(
254
233
 
255
234
 
256
235
  def _remove_class_based_navigation(html: str, remove_navigation: bool) -> str:
257
- """Remove elements with navigation-related classes."""
258
236
  if not remove_navigation:
259
237
  return html
260
238
 
@@ -288,7 +266,6 @@ def _remove_class_based_navigation(html: str, remove_navigation: bool) -> str:
288
266
 
289
267
 
290
268
  def _remove_navigation_patterns(html: str, remove_navigation: bool) -> str:
291
- """Remove common navigation patterns that nh3 might miss."""
292
269
  if not remove_navigation:
293
270
  return html
294
271
 
@@ -329,7 +306,6 @@ def _remove_navigation_patterns(html: str, remove_navigation: bool) -> str:
329
306
 
330
307
 
331
308
  def _remove_wikipedia_navigation_lists(html: str) -> str:
332
- """Remove Wikipedia-style navigation lists that appear at the start."""
333
309
  patterns = [
334
310
  r"Main menu\s*\n\n(-\s*\[.*?\]\(.*?\).*?\n){3,}",
335
311
  r"(-\s*\[[^\]]*\]\(/wiki/[^)]*\).*?\n){5,}",
@@ -342,7 +318,6 @@ def _remove_wikipedia_navigation_lists(html: str) -> str:
342
318
 
343
319
 
344
320
  def _fix_whitespace_issues(html: str) -> str:
345
- """Fix common whitespace issues in HTML."""
346
321
  html = re.sub(r"[ \t]{2,}", " ", html)
347
322
  html = re.sub(r"\n\s*\n", "\n\n", html)
348
323
 
@@ -385,18 +360,6 @@ PRESETS: dict[str, dict[str, Any]] = {
385
360
 
386
361
 
387
362
  def create_preprocessor(preset: str = "standard", **overrides: Any) -> dict[str, Any]:
388
- """Create preprocessor configuration with a preset.
389
-
390
- Args:
391
- preset: The preset configuration to use (minimal, standard, aggressive).
392
- **overrides: Any configuration options to override.
393
-
394
- Returns:
395
- Configuration dict for preprocessor.
396
-
397
- Raises:
398
- ValueError: If preset is unknown.
399
- """
400
363
  if preset not in PRESETS:
401
364
  msg = f"Unknown preset '{preset}'. Available presets: {list(PRESETS.keys())}"
402
365
  raise ValueError(msg)