PyPI - html-to-markdown - Versions diffs - 1.14.0__tar.gz → 1.15.0__tar.gz - Mend

html-to-markdown 1.14.0.tar.gz → 1.15.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

{html_to_markdown-1.14.0 → html_to_markdown-1.15.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: html-to-markdown
-Version: 1.14.0
+Version: 1.15.0
 Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
 Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
 License: MIT
@@ -627,6 +627,8 @@ The `markdownify` function is an alias for `convert_to_markdown` and provides id
 - `preprocessing_preset` (str, default: `'standard'`): Preprocessing aggressiveness (`'minimal'` for basic cleaning, `'standard'` for balanced, `'aggressive'` for heavy cleaning)
 - `remove_forms` (bool, default: `True`): Remove form elements during preprocessing
 - `remove_navigation` (bool, default: `True`): Remove navigation elements during preprocessing
+- `excluded_navigation_classes` (set[str], default: `None`): CSS class fragments to keep when navigation removal is enabled
+- `extra_navigation_classes` (set[str], default: `None`): Additional CSS class fragments to strip during navigation clean-up
 ## Contribution

{html_to_markdown-1.14.0 → html_to_markdown-1.15.0}/README.md RENAMED Viewed

@@ -587,6 +587,8 @@ The `markdownify` function is an alias for `convert_to_markdown` and provides id
 - `preprocessing_preset` (str, default: `'standard'`): Preprocessing aggressiveness (`'minimal'` for basic cleaning, `'standard'` for balanced, `'aggressive'` for heavy cleaning)
 - `remove_forms` (bool, default: `True`): Remove form elements during preprocessing
 - `remove_navigation` (bool, default: `True`): Remove navigation elements during preprocessing
+- `excluded_navigation_classes` (set[str], default: `None`): CSS class fragments to keep when navigation removal is enabled
+- `extra_navigation_classes` (set[str], default: `None`): Additional CSS class fragments to strip during navigation clean-up
 ## Contribution

{html_to_markdown-1.14.0 → html_to_markdown-1.15.0}/html_to_markdown/preprocessor.py RENAMED Viewed

@@ -97,6 +97,27 @@ MEDIA_TAGS = frozenset(
     }
 )
+DEFAULT_NAVIGATION_CLASSES: frozenset[str] = frozenset(
+    {
+        "vector-header",
+        "vector-main-menu",
+        "vector-page-tools",
+        "vector-toc",
+        "mw-jump-link",
+        "mw-navigation",
+        "navbox",
+        "navigation-box",
+        "sidebar",
+        "nav",
+        "header",
+        "footer",
+        "menu",
+        "breadcrumb",
+        "topbar",
+        "toolbar",
+    }
+)
 def preprocess_html(
     html: str,
@@ -111,11 +132,18 @@ def preprocess_html(
     preserve_media: bool = True,
     custom_tags_to_remove: set[str] | None = None,
     custom_attributes_to_remove: set[str] | None = None,
+    excluded_navigation_classes: set[str] | None = None,
+    extra_navigation_classes: set[str] | None = None,
 ) -> str:
     if not html or not html.strip():  # pragma: no cover
         return html
-    html = _remove_class_based_navigation(html, remove_navigation)
+    html = _remove_class_based_navigation(
+        html,
+        remove_navigation,
+        excluded_navigation_classes,
+        extra_navigation_classes,
+    )
     nh3_config = _configure_cleaning_rules(
         remove_navigation=remove_navigation,
@@ -242,35 +270,31 @@ def _configure_cleaning_rules(
     }
-def _remove_class_based_navigation(html: str, remove_navigation: bool) -> str:
+def _remove_class_based_navigation(
+    html: str,
+    remove_navigation: bool,
+    excluded_navigation_classes: set[str] | None,
+    extra_navigation_classes: set[str] | None,
+) -> str:
     if not remove_navigation:
         return html
-    navigation_classes = [
-        r'vector-header[^"]*',
-        r'vector-main-menu[^"]*',
-        r'vector-page-tools[^"]*',
-        r'vector-toc[^"]*',
-        r'mw-jump-link[^"]*',
-        r'mw-navigation[^"]*',
-        r'navbox[^"]*',
-        r'navigation-box[^"]*',
-        r'sidebar[^"]*',
-        r'nav[^"]*',
-        r'header[^"]*',
-        r'footer[^"]*',
-        r'menu[^"]*',
-        r'breadcrumb[^"]*',
-        r'topbar[^"]*',
-        r'toolbar[^"]*',
-    ]
+    class_names = set(DEFAULT_NAVIGATION_CLASSES)
+    if excluded_navigation_classes:
+        class_names.difference_update(excluded_navigation_classes)
+    if extra_navigation_classes:
+        class_names.update(extra_navigation_classes)
+    for class_name in class_names:
+        class_pattern = rf'{re.escape(class_name)}[^"]*'
-    for class_pattern in navigation_classes:
-        pattern = rf'<[^>]*class="[^"]*{class_pattern}[^"]*"[^>]*>.*?</[^>]*>'
-        html = re.sub(pattern, "", html, flags=re.DOTALL | re.IGNORECASE)
+        block_pattern = rf'<(?P<tag>[^>\s]+)[^>]*class="[^"]*{class_pattern}[^"]*"[^>]*>.*?</(?P=tag)>'
+        html = re.sub(block_pattern, "", html, flags=re.DOTALL | re.IGNORECASE)
-        pattern = rf'<[^>]*class="[^"]*{class_pattern}[^"]*"[^>]*/>'
-        html = re.sub(pattern, "", html, flags=re.IGNORECASE)
+        self_closing_pattern = rf'<[^>]*class="[^"]*{class_pattern}[^"]*"[^>]*/>'
+        html = re.sub(self_closing_pattern, "", html, flags=re.IGNORECASE)
     return html

{html_to_markdown-1.14.0 → html_to_markdown-1.15.0}/html_to_markdown/processing.py RENAMED Viewed

@@ -314,11 +314,12 @@ def _process_text(
             if len(ancestor_names) > 10:
                 break
-    in_pre = bool(ancestor_names.intersection({"pre"}))
+    in_pre = bool(ancestor_names.intersection({"pre"})) or parent_name == "pre"
     text = whitespace_handler.process_text_whitespace(text, el, in_pre=in_pre)
-    if not ancestor_names.intersection({"pre", "code", "kbd", "samp"}):
+    code_like_tags = {"pre", "code", "kbd", "samp"}
+    if not (ancestor_names.intersection(code_like_tags) or parent_name in code_like_tags):
         text = escape(
             text=text,
             escape_misc=escape_misc,
@@ -476,6 +477,8 @@ def convert_to_markdown(
     preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
     remove_forms: bool = True,
     remove_navigation: bool = True,
+    excluded_navigation_classes: set[str] | None = None,
+    extra_navigation_classes: set[str] | None = None,
     strip: str | Iterable[str] | None = None,
     strip_newlines: bool = False,
     strong_em_symbol: Literal["*", "_"] = ASTERISK,
@@ -520,6 +523,8 @@ def convert_to_markdown(
         preprocessing_preset: Preprocessing aggressiveness level.
         remove_forms: Remove form elements during preprocessing.
         remove_navigation: Remove navigation elements during preprocessing.
+        excluded_navigation_classes: Navigation class fragments to keep even when removing navigation.
+        extra_navigation_classes: Additional navigation class fragments to strip beyond the defaults.
         strip: HTML tags to strip from output.
         strip_newlines: Remove newlines from HTML before processing.
         strong_em_symbol: Symbol for strong/emphasis ('*' or '_').
@@ -575,6 +580,8 @@ def convert_to_markdown(
             config = create_preprocessor(
                 preset=preprocessing_preset,
                 remove_navigation=remove_navigation,
+                excluded_navigation_classes=excluded_navigation_classes,
+                extra_navigation_classes=extra_navigation_classes,
                 remove_forms=remove_forms,
             )
             source = preprocess_fn(source, **config)
@@ -617,7 +624,6 @@ def convert_to_markdown(
                         first_child.replace_with(new_text)
                         needs_leading_space_fix = False
-            # Fix html5lib whitespace handling to match other parsers
             if parser == "html5lib":
                 body = source.find("body")
                 if body and isinstance(body, Tag):
@@ -632,7 +638,6 @@ def convert_to_markdown(
                         first_child = children[0]
                         original_text = str(first_child)
-                        # Preserve leading whitespace from original if html5lib stripped it
                         leading_ws = ""
                         for char in original_source:
                             if char in " \t\n\r":
@@ -640,7 +645,6 @@ def convert_to_markdown(
                             else:
                                 break
-                        # Create normalized text: restore leading whitespace only
                         normalized_text = original_text
                         if leading_ws and not normalized_text.startswith(leading_ws):
                             normalized_text = leading_ws + normalized_text
@@ -1080,6 +1084,8 @@ def convert_to_markdown_stream(
     preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
     remove_forms: bool = True,
     remove_navigation: bool = True,
+    excluded_navigation_classes: set[str] | None = None,
+    extra_navigation_classes: set[str] | None = None,
     strip: str | Iterable[str] | None = None,
     strip_newlines: bool = False,
     strong_em_symbol: Literal["*", "_"] = ASTERISK,
@@ -1098,6 +1104,8 @@ def convert_to_markdown_stream(
         config = create_preprocessor(
             preset=preprocessing_preset,
             remove_navigation=remove_navigation,
+            excluded_navigation_classes=excluded_navigation_classes,
+            extra_navigation_classes=extra_navigation_classes,
             remove_forms=remove_forms,
         )
         source = preprocess_fn(source, **config)

{html_to_markdown-1.14.0 → html_to_markdown-1.15.0}/html_to_markdown.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: html-to-markdown
-Version: 1.14.0
+Version: 1.15.0
 Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
 Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
 License: MIT
@@ -627,6 +627,8 @@ The `markdownify` function is an alias for `convert_to_markdown` and provides id
 - `preprocessing_preset` (str, default: `'standard'`): Preprocessing aggressiveness (`'minimal'` for basic cleaning, `'standard'` for balanced, `'aggressive'` for heavy cleaning)
 - `remove_forms` (bool, default: `True`): Remove form elements during preprocessing
 - `remove_navigation` (bool, default: `True`): Remove navigation elements during preprocessing
+- `excluded_navigation_classes` (set[str], default: `None`): CSS class fragments to keep when navigation removal is enabled
+- `extra_navigation_classes` (set[str], default: `None`): Additional CSS class fragments to strip during navigation clean-up
 ## Contribution

{html_to_markdown-1.14.0 → html_to_markdown-1.15.0}/pyproject.toml RENAMED Viewed

@@ -5,7 +5,7 @@ requires = [ "setuptools>=78.1" ]
 [project]
 name = "html-to-markdown"
-version = "1.14.0"
+version = "1.15.0"
 description = "A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options"
 readme = "README.md"
 keywords = [
@@ -61,8 +61,10 @@ dev = [
   "beautifulsoup4[html5lib]>=4.13.5",
   "beautifulsoup4[lxml]>=4.13.5",
   "covdefaults>=2.3",
+  "memray>=1.18; sys_platform!='win32'",
   "mypy>=1.18.2",
   "pre-commit>=4.3",
+  "psutil>=7.1; sys_platform!='win32'",
   "pytest>=8.4.2",
   "pytest-benchmark>=5.1",
   "pytest-cov>=7",