PyPI - html-to-markdown - Versions diffs - 1.14.0__py3-none-any.whl → 1.15.0__py3-none-any.whl - Mend

html-to-markdown 1.14.0__py3-none-any.whl → 1.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of html-to-markdown might be problematic. Click here for more details.

Files changed (8) hide show

html_to_markdown/preprocessor.py CHANGED Viewed

@@ -97,6 +97,27 @@ MEDIA_TAGS = frozenset(
     }
 )
+DEFAULT_NAVIGATION_CLASSES: frozenset[str] = frozenset(
+    {
+        "vector-header",
+        "vector-main-menu",
+        "vector-page-tools",
+        "vector-toc",
+        "mw-jump-link",
+        "mw-navigation",
+        "navbox",
+        "navigation-box",
+        "sidebar",
+        "nav",
+        "header",
+        "footer",
+        "menu",
+        "breadcrumb",
+        "topbar",
+        "toolbar",
+    }
+)
 def preprocess_html(
     html: str,
@@ -111,11 +132,18 @@ def preprocess_html(
     preserve_media: bool = True,
     custom_tags_to_remove: set[str] | None = None,
     custom_attributes_to_remove: set[str] | None = None,
+    excluded_navigation_classes: set[str] | None = None,
+    extra_navigation_classes: set[str] | None = None,
 ) -> str:
     if not html or not html.strip():  # pragma: no cover
         return html
-    html = _remove_class_based_navigation(html, remove_navigation)
+    html = _remove_class_based_navigation(
+        html,
+        remove_navigation,
+        excluded_navigation_classes,
+        extra_navigation_classes,
+    )
     nh3_config = _configure_cleaning_rules(
         remove_navigation=remove_navigation,
@@ -242,35 +270,31 @@ def _configure_cleaning_rules(
     }
-def _remove_class_based_navigation(html: str, remove_navigation: bool) -> str:
+def _remove_class_based_navigation(
+    html: str,
+    remove_navigation: bool,
+    excluded_navigation_classes: set[str] | None,
+    extra_navigation_classes: set[str] | None,
+) -> str:
     if not remove_navigation:
         return html
-    navigation_classes = [
-        r'vector-header[^"]*',
-        r'vector-main-menu[^"]*',
-        r'vector-page-tools[^"]*',
-        r'vector-toc[^"]*',
-        r'mw-jump-link[^"]*',
-        r'mw-navigation[^"]*',
-        r'navbox[^"]*',
-        r'navigation-box[^"]*',
-        r'sidebar[^"]*',
-        r'nav[^"]*',
-        r'header[^"]*',
-        r'footer[^"]*',
-        r'menu[^"]*',
-        r'breadcrumb[^"]*',
-        r'topbar[^"]*',
-        r'toolbar[^"]*',
-    ]
+    class_names = set(DEFAULT_NAVIGATION_CLASSES)
+    if excluded_navigation_classes:
+        class_names.difference_update(excluded_navigation_classes)
+    if extra_navigation_classes:
+        class_names.update(extra_navigation_classes)
+    for class_name in class_names:
+        class_pattern = rf'{re.escape(class_name)}[^"]*'
-    for class_pattern in navigation_classes:
-        pattern = rf'<[^>]*class="[^"]*{class_pattern}[^"]*"[^>]*>.*?</[^>]*>'
-        html = re.sub(pattern, "", html, flags=re.DOTALL | re.IGNORECASE)
+        block_pattern = rf'<(?P<tag>[^>\s]+)[^>]*class="[^"]*{class_pattern}[^"]*"[^>]*>.*?</(?P=tag)>'
+        html = re.sub(block_pattern, "", html, flags=re.DOTALL | re.IGNORECASE)
-        pattern = rf'<[^>]*class="[^"]*{class_pattern}[^"]*"[^>]*/>'
-        html = re.sub(pattern, "", html, flags=re.IGNORECASE)
+        self_closing_pattern = rf'<[^>]*class="[^"]*{class_pattern}[^"]*"[^>]*/>'
+        html = re.sub(self_closing_pattern, "", html, flags=re.IGNORECASE)
     return html

html_to_markdown/processing.py CHANGED Viewed

@@ -314,11 +314,12 @@ def _process_text(
             if len(ancestor_names) > 10:
                 break
-    in_pre = bool(ancestor_names.intersection({"pre"}))
+    in_pre = bool(ancestor_names.intersection({"pre"})) or parent_name == "pre"
     text = whitespace_handler.process_text_whitespace(text, el, in_pre=in_pre)
-    if not ancestor_names.intersection({"pre", "code", "kbd", "samp"}):
+    code_like_tags = {"pre", "code", "kbd", "samp"}
+    if not (ancestor_names.intersection(code_like_tags) or parent_name in code_like_tags):
         text = escape(
             text=text,
             escape_misc=escape_misc,
@@ -476,6 +477,8 @@ def convert_to_markdown(
     preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
     remove_forms: bool = True,
     remove_navigation: bool = True,
+    excluded_navigation_classes: set[str] | None = None,
+    extra_navigation_classes: set[str] | None = None,
     strip: str | Iterable[str] | None = None,
     strip_newlines: bool = False,
     strong_em_symbol: Literal["*", "_"] = ASTERISK,
@@ -520,6 +523,8 @@ def convert_to_markdown(
         preprocessing_preset: Preprocessing aggressiveness level.
         remove_forms: Remove form elements during preprocessing.
         remove_navigation: Remove navigation elements during preprocessing.
+        excluded_navigation_classes: Navigation class fragments to keep even when removing navigation.
+        extra_navigation_classes: Additional navigation class fragments to strip beyond the defaults.
         strip: HTML tags to strip from output.
         strip_newlines: Remove newlines from HTML before processing.
         strong_em_symbol: Symbol for strong/emphasis ('*' or '_').
@@ -575,6 +580,8 @@ def convert_to_markdown(
             config = create_preprocessor(
                 preset=preprocessing_preset,
                 remove_navigation=remove_navigation,
+                excluded_navigation_classes=excluded_navigation_classes,
+                extra_navigation_classes=extra_navigation_classes,
                 remove_forms=remove_forms,
             )
             source = preprocess_fn(source, **config)
@@ -617,7 +624,6 @@ def convert_to_markdown(
                         first_child.replace_with(new_text)
                         needs_leading_space_fix = False
-            # Fix html5lib whitespace handling to match other parsers
             if parser == "html5lib":
                 body = source.find("body")
                 if body and isinstance(body, Tag):
@@ -632,7 +638,6 @@ def convert_to_markdown(
                         first_child = children[0]
                         original_text = str(first_child)
-                        # Preserve leading whitespace from original if html5lib stripped it
                         leading_ws = ""
                         for char in original_source:
                             if char in " \t\n\r":
@@ -640,7 +645,6 @@ def convert_to_markdown(
                             else:
                                 break
-                        # Create normalized text: restore leading whitespace only
                         normalized_text = original_text
                         if leading_ws and not normalized_text.startswith(leading_ws):
                             normalized_text = leading_ws + normalized_text
@@ -1080,6 +1084,8 @@ def convert_to_markdown_stream(
     preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
     remove_forms: bool = True,
     remove_navigation: bool = True,
+    excluded_navigation_classes: set[str] | None = None,
+    extra_navigation_classes: set[str] | None = None,
     strip: str | Iterable[str] | None = None,
     strip_newlines: bool = False,
     strong_em_symbol: Literal["*", "_"] = ASTERISK,
@@ -1098,6 +1104,8 @@ def convert_to_markdown_stream(
         config = create_preprocessor(
             preset=preprocessing_preset,
             remove_navigation=remove_navigation,
+            excluded_navigation_classes=excluded_navigation_classes,
+            extra_navigation_classes=extra_navigation_classes,
             remove_forms=remove_forms,
         )
         source = preprocess_fn(source, **config)

{html_to_markdown-1.14.0.dist-info → html_to_markdown-1.15.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: html-to-markdown
-Version: 1.14.0
+Version: 1.15.0
 Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
 Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
 License: MIT
@@ -627,6 +627,8 @@ The `markdownify` function is an alias for `convert_to_markdown` and provides id
 - `preprocessing_preset` (str, default: `'standard'`): Preprocessing aggressiveness (`'minimal'` for basic cleaning, `'standard'` for balanced, `'aggressive'` for heavy cleaning)
 - `remove_forms` (bool, default: `True`): Remove form elements during preprocessing
 - `remove_navigation` (bool, default: `True`): Remove navigation elements during preprocessing
+- `excluded_navigation_classes` (set[str], default: `None`): CSS class fragments to keep when navigation removal is enabled
+- `extra_navigation_classes` (set[str], default: `None`): Additional CSS class fragments to strip during navigation clean-up
 ## Contribution

{html_to_markdown-1.14.0.dist-info → html_to_markdown-1.15.0.dist-info}/RECORD RENAMED Viewed

@@ -4,14 +4,14 @@ html_to_markdown/cli.py,sha256=-rq1L64Ze-zxSdn0cta8HvUCJDGmWHDcZe2RlVZJFjI,9665
 html_to_markdown/constants.py,sha256=CKFVHjUZKgi8-lgU6AHPic7X5ChlTkbZt4Jv6VaVjjs,665
 html_to_markdown/converters.py,sha256=REuvFnP-D97VlG2kuCVTbb3exoZ87NQn9hUuiP5ISOU,35839
 html_to_markdown/exceptions.py,sha256=ytUOIL0D8r0Jd59RzUPqzmk73i-Mg63zDQYo6S6DBg4,1389
-html_to_markdown/preprocessor.py,sha256=otnTOhoivJkxaip1Lb9xNMl8q-x9aGFXSYkSrxsTW8g,9591
-html_to_markdown/processing.py,sha256=Nw68rKNRMV7BbDxGW5sDhebeyhO7SC_Tv4lMJF4TJfc,40697
+html_to_markdown/preprocessor.py,sha256=QBdZbQQAlJG6n3iv9xzvufnq1F0l9c9LOOVECtPiuM0,10192
+html_to_markdown/processing.py,sha256=s9o_7GTpK3UAmMS0PHFrlNZk0xhO-ITvFQx6a-fN-tI,41247
 html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 html_to_markdown/utils.py,sha256=s3A4ET_XyKC-WxzJtH4W0S7cIBGF5fTYIf4JJrqTX8Q,1069
 html_to_markdown/whitespace.py,sha256=rl3eEwqfMpNWx4FBmbkZ1RxO_Od45p3EZ_7UgKcDAtg,7710
-html_to_markdown-1.14.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
-html_to_markdown-1.14.0.dist-info/METADATA,sha256=vJeFvECsy8HFT8Ezd_ddc4__dHFxKgSH4wFHH8bDQtE,29421
-html_to_markdown-1.14.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-html_to_markdown-1.14.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
-html_to_markdown-1.14.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
-html_to_markdown-1.14.0.dist-info/RECORD,,
+html_to_markdown-1.15.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
+html_to_markdown-1.15.0.dist-info/METADATA,sha256=uEh0fiaSCEOvazsHCGRsjIyUO-9kMdOgWHM2og31xgs,29670
+html_to_markdown-1.15.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+html_to_markdown-1.15.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
+html_to_markdown-1.15.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
+html_to_markdown-1.15.0.dist-info/RECORD,,

{html_to_markdown-1.14.0.dist-info → html_to_markdown-1.15.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{html_to_markdown-1.14.0.dist-info → html_to_markdown-1.15.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{html_to_markdown-1.14.0.dist-info → html_to_markdown-1.15.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{html_to_markdown-1.14.0.dist-info → html_to_markdown-1.15.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

html-to-markdown 1.14.0__py3-none-any.whl → 1.15.0__py3-none-any.whl

Potentially problematic release.

html-to-markdown 1.14.0__py3-none-any.whl → 1.15.0__py3-none-any.whl