html-to-markdown 1.14.1__py3-none-any.whl → 1.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

@@ -97,6 +97,27 @@ MEDIA_TAGS = frozenset(
97
97
  }
98
98
  )
99
99
 
100
+ DEFAULT_NAVIGATION_CLASSES: frozenset[str] = frozenset(
101
+ {
102
+ "vector-header",
103
+ "vector-main-menu",
104
+ "vector-page-tools",
105
+ "vector-toc",
106
+ "mw-jump-link",
107
+ "mw-navigation",
108
+ "navbox",
109
+ "navigation-box",
110
+ "sidebar",
111
+ "nav",
112
+ "header",
113
+ "footer",
114
+ "menu",
115
+ "breadcrumb",
116
+ "topbar",
117
+ "toolbar",
118
+ }
119
+ )
120
+
100
121
 
101
122
  def preprocess_html(
102
123
  html: str,
@@ -111,11 +132,18 @@ def preprocess_html(
111
132
  preserve_media: bool = True,
112
133
  custom_tags_to_remove: set[str] | None = None,
113
134
  custom_attributes_to_remove: set[str] | None = None,
135
+ excluded_navigation_classes: set[str] | None = None,
136
+ extra_navigation_classes: set[str] | None = None,
114
137
  ) -> str:
115
138
  if not html or not html.strip(): # pragma: no cover
116
139
  return html
117
140
 
118
- html = _remove_class_based_navigation(html, remove_navigation)
141
+ html = _remove_class_based_navigation(
142
+ html,
143
+ remove_navigation,
144
+ excluded_navigation_classes,
145
+ extra_navigation_classes,
146
+ )
119
147
 
120
148
  nh3_config = _configure_cleaning_rules(
121
149
  remove_navigation=remove_navigation,
@@ -242,35 +270,31 @@ def _configure_cleaning_rules(
242
270
  }
243
271
 
244
272
 
245
- def _remove_class_based_navigation(html: str, remove_navigation: bool) -> str:
273
+ def _remove_class_based_navigation(
274
+ html: str,
275
+ remove_navigation: bool,
276
+ excluded_navigation_classes: set[str] | None,
277
+ extra_navigation_classes: set[str] | None,
278
+ ) -> str:
246
279
  if not remove_navigation:
247
280
  return html
248
281
 
249
- navigation_classes = [
250
- r'vector-header[^"]*',
251
- r'vector-main-menu[^"]*',
252
- r'vector-page-tools[^"]*',
253
- r'vector-toc[^"]*',
254
- r'mw-jump-link[^"]*',
255
- r'mw-navigation[^"]*',
256
- r'navbox[^"]*',
257
- r'navigation-box[^"]*',
258
- r'sidebar[^"]*',
259
- r'nav[^"]*',
260
- r'header[^"]*',
261
- r'footer[^"]*',
262
- r'menu[^"]*',
263
- r'breadcrumb[^"]*',
264
- r'topbar[^"]*',
265
- r'toolbar[^"]*',
266
- ]
282
+ class_names = set(DEFAULT_NAVIGATION_CLASSES)
283
+
284
+ if excluded_navigation_classes:
285
+ class_names.difference_update(excluded_navigation_classes)
286
+
287
+ if extra_navigation_classes:
288
+ class_names.update(extra_navigation_classes)
289
+
290
+ for class_name in class_names:
291
+ class_pattern = rf'{re.escape(class_name)}[^"]*'
267
292
 
268
- for class_pattern in navigation_classes:
269
- pattern = rf'<[^>]*class="[^"]*{class_pattern}[^"]*"[^>]*>.*?</[^>]*>'
270
- html = re.sub(pattern, "", html, flags=re.DOTALL | re.IGNORECASE)
293
+ block_pattern = rf'<(?P<tag>[^>\s]+)[^>]*class="[^"]*{class_pattern}[^"]*"[^>]*>.*?</(?P=tag)>'
294
+ html = re.sub(block_pattern, "", html, flags=re.DOTALL | re.IGNORECASE)
271
295
 
272
- pattern = rf'<[^>]*class="[^"]*{class_pattern}[^"]*"[^>]*/>'
273
- html = re.sub(pattern, "", html, flags=re.IGNORECASE)
296
+ self_closing_pattern = rf'<[^>]*class="[^"]*{class_pattern}[^"]*"[^>]*/>'
297
+ html = re.sub(self_closing_pattern, "", html, flags=re.IGNORECASE)
274
298
 
275
299
  return html
276
300
 
@@ -477,6 +477,8 @@ def convert_to_markdown(
477
477
  preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
478
478
  remove_forms: bool = True,
479
479
  remove_navigation: bool = True,
480
+ excluded_navigation_classes: set[str] | None = None,
481
+ extra_navigation_classes: set[str] | None = None,
480
482
  strip: str | Iterable[str] | None = None,
481
483
  strip_newlines: bool = False,
482
484
  strong_em_symbol: Literal["*", "_"] = ASTERISK,
@@ -521,6 +523,8 @@ def convert_to_markdown(
521
523
  preprocessing_preset: Preprocessing aggressiveness level.
522
524
  remove_forms: Remove form elements during preprocessing.
523
525
  remove_navigation: Remove navigation elements during preprocessing.
526
+ excluded_navigation_classes: Navigation class fragments to keep even when removing navigation.
527
+ extra_navigation_classes: Additional navigation class fragments to strip beyond the defaults.
524
528
  strip: HTML tags to strip from output.
525
529
  strip_newlines: Remove newlines from HTML before processing.
526
530
  strong_em_symbol: Symbol for strong/emphasis ('*' or '_').
@@ -576,6 +580,8 @@ def convert_to_markdown(
576
580
  config = create_preprocessor(
577
581
  preset=preprocessing_preset,
578
582
  remove_navigation=remove_navigation,
583
+ excluded_navigation_classes=excluded_navigation_classes,
584
+ extra_navigation_classes=extra_navigation_classes,
579
585
  remove_forms=remove_forms,
580
586
  )
581
587
  source = preprocess_fn(source, **config)
@@ -1078,6 +1084,8 @@ def convert_to_markdown_stream(
1078
1084
  preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
1079
1085
  remove_forms: bool = True,
1080
1086
  remove_navigation: bool = True,
1087
+ excluded_navigation_classes: set[str] | None = None,
1088
+ extra_navigation_classes: set[str] | None = None,
1081
1089
  strip: str | Iterable[str] | None = None,
1082
1090
  strip_newlines: bool = False,
1083
1091
  strong_em_symbol: Literal["*", "_"] = ASTERISK,
@@ -1096,6 +1104,8 @@ def convert_to_markdown_stream(
1096
1104
  config = create_preprocessor(
1097
1105
  preset=preprocessing_preset,
1098
1106
  remove_navigation=remove_navigation,
1107
+ excluded_navigation_classes=excluded_navigation_classes,
1108
+ extra_navigation_classes=extra_navigation_classes,
1099
1109
  remove_forms=remove_forms,
1100
1110
  )
1101
1111
  source = preprocess_fn(source, **config)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: html-to-markdown
3
- Version: 1.14.1
3
+ Version: 1.15.0
4
4
  Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
5
5
  Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
6
6
  License: MIT
@@ -627,6 +627,8 @@ The `markdownify` function is an alias for `convert_to_markdown` and provides id
627
627
  - `preprocessing_preset` (str, default: `'standard'`): Preprocessing aggressiveness (`'minimal'` for basic cleaning, `'standard'` for balanced, `'aggressive'` for heavy cleaning)
628
628
  - `remove_forms` (bool, default: `True`): Remove form elements during preprocessing
629
629
  - `remove_navigation` (bool, default: `True`): Remove navigation elements during preprocessing
630
+ - `excluded_navigation_classes` (set[str], default: `None`): CSS class fragments to keep when navigation removal is enabled
631
+ - `extra_navigation_classes` (set[str], default: `None`): Additional CSS class fragments to strip during navigation clean-up
630
632
 
631
633
  ## Contribution
632
634
 
@@ -4,14 +4,14 @@ html_to_markdown/cli.py,sha256=-rq1L64Ze-zxSdn0cta8HvUCJDGmWHDcZe2RlVZJFjI,9665
4
4
  html_to_markdown/constants.py,sha256=CKFVHjUZKgi8-lgU6AHPic7X5ChlTkbZt4Jv6VaVjjs,665
5
5
  html_to_markdown/converters.py,sha256=REuvFnP-D97VlG2kuCVTbb3exoZ87NQn9hUuiP5ISOU,35839
6
6
  html_to_markdown/exceptions.py,sha256=ytUOIL0D8r0Jd59RzUPqzmk73i-Mg63zDQYo6S6DBg4,1389
7
- html_to_markdown/preprocessor.py,sha256=otnTOhoivJkxaip1Lb9xNMl8q-x9aGFXSYkSrxsTW8g,9591
8
- html_to_markdown/processing.py,sha256=WFXwHNOK_wdNtiRjubt_MC19Q3FScR0j5eohWmBRSmU,40548
7
+ html_to_markdown/preprocessor.py,sha256=QBdZbQQAlJG6n3iv9xzvufnq1F0l9c9LOOVECtPiuM0,10192
8
+ html_to_markdown/processing.py,sha256=s9o_7GTpK3UAmMS0PHFrlNZk0xhO-ITvFQx6a-fN-tI,41247
9
9
  html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  html_to_markdown/utils.py,sha256=s3A4ET_XyKC-WxzJtH4W0S7cIBGF5fTYIf4JJrqTX8Q,1069
11
11
  html_to_markdown/whitespace.py,sha256=rl3eEwqfMpNWx4FBmbkZ1RxO_Od45p3EZ_7UgKcDAtg,7710
12
- html_to_markdown-1.14.1.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
13
- html_to_markdown-1.14.1.dist-info/METADATA,sha256=m9fQ28oyBQXrKoXB6Sd-tH7-NhB0RsaikBH2wBvn1LA,29421
14
- html_to_markdown-1.14.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
- html_to_markdown-1.14.1.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
16
- html_to_markdown-1.14.1.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
17
- html_to_markdown-1.14.1.dist-info/RECORD,,
12
+ html_to_markdown-1.15.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
13
+ html_to_markdown-1.15.0.dist-info/METADATA,sha256=uEh0fiaSCEOvazsHCGRsjIyUO-9kMdOgWHM2og31xgs,29670
14
+ html_to_markdown-1.15.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
+ html_to_markdown-1.15.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
16
+ html_to_markdown-1.15.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
17
+ html_to_markdown-1.15.0.dist-info/RECORD,,