html-to-markdown 1.14.0__tar.gz → 1.15.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22)
  1. {html_to_markdown-1.14.0 → html_to_markdown-1.15.0}/PKG-INFO +3 -1
  2. {html_to_markdown-1.14.0 → html_to_markdown-1.15.0}/README.md +2 -0
  3. {html_to_markdown-1.14.0 → html_to_markdown-1.15.0}/html_to_markdown/preprocessor.py +49 -25
  4. {html_to_markdown-1.14.0 → html_to_markdown-1.15.0}/html_to_markdown/processing.py +13 -5
  5. {html_to_markdown-1.14.0 → html_to_markdown-1.15.0}/html_to_markdown.egg-info/PKG-INFO +3 -1
  6. {html_to_markdown-1.14.0 → html_to_markdown-1.15.0}/pyproject.toml +3 -1
  7. {html_to_markdown-1.14.0 → html_to_markdown-1.15.0}/LICENSE +0 -0
  8. {html_to_markdown-1.14.0 → html_to_markdown-1.15.0}/html_to_markdown/__init__.py +0 -0
  9. {html_to_markdown-1.14.0 → html_to_markdown-1.15.0}/html_to_markdown/__main__.py +0 -0
  10. {html_to_markdown-1.14.0 → html_to_markdown-1.15.0}/html_to_markdown/cli.py +0 -0
  11. {html_to_markdown-1.14.0 → html_to_markdown-1.15.0}/html_to_markdown/constants.py +0 -0
  12. {html_to_markdown-1.14.0 → html_to_markdown-1.15.0}/html_to_markdown/converters.py +0 -0
  13. {html_to_markdown-1.14.0 → html_to_markdown-1.15.0}/html_to_markdown/exceptions.py +0 -0
  14. {html_to_markdown-1.14.0 → html_to_markdown-1.15.0}/html_to_markdown/py.typed +0 -0
  15. {html_to_markdown-1.14.0 → html_to_markdown-1.15.0}/html_to_markdown/utils.py +0 -0
  16. {html_to_markdown-1.14.0 → html_to_markdown-1.15.0}/html_to_markdown/whitespace.py +0 -0
  17. {html_to_markdown-1.14.0 → html_to_markdown-1.15.0}/html_to_markdown.egg-info/SOURCES.txt +0 -0
  18. {html_to_markdown-1.14.0 → html_to_markdown-1.15.0}/html_to_markdown.egg-info/dependency_links.txt +0 -0
  19. {html_to_markdown-1.14.0 → html_to_markdown-1.15.0}/html_to_markdown.egg-info/entry_points.txt +0 -0
  20. {html_to_markdown-1.14.0 → html_to_markdown-1.15.0}/html_to_markdown.egg-info/requires.txt +0 -0
  21. {html_to_markdown-1.14.0 → html_to_markdown-1.15.0}/html_to_markdown.egg-info/top_level.txt +0 -0
  22. {html_to_markdown-1.14.0 → html_to_markdown-1.15.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: html-to-markdown
3
- Version: 1.14.0
3
+ Version: 1.15.0
4
4
  Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
5
5
  Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
6
6
  License: MIT
@@ -627,6 +627,8 @@ The `markdownify` function is an alias for `convert_to_markdown` and provides id
627
627
  - `preprocessing_preset` (str, default: `'standard'`): Preprocessing aggressiveness (`'minimal'` for basic cleaning, `'standard'` for balanced, `'aggressive'` for heavy cleaning)
628
628
  - `remove_forms` (bool, default: `True`): Remove form elements during preprocessing
629
629
  - `remove_navigation` (bool, default: `True`): Remove navigation elements during preprocessing
630
+ - `excluded_navigation_classes` (set[str], default: `None`): CSS class fragments to keep when navigation removal is enabled
631
+ - `extra_navigation_classes` (set[str], default: `None`): Additional CSS class fragments to strip during navigation clean-up
630
632
 
631
633
  ## Contribution
632
634
 
@@ -587,6 +587,8 @@ The `markdownify` function is an alias for `convert_to_markdown` and provides id
587
587
  - `preprocessing_preset` (str, default: `'standard'`): Preprocessing aggressiveness (`'minimal'` for basic cleaning, `'standard'` for balanced, `'aggressive'` for heavy cleaning)
588
588
  - `remove_forms` (bool, default: `True`): Remove form elements during preprocessing
589
589
  - `remove_navigation` (bool, default: `True`): Remove navigation elements during preprocessing
590
+ - `excluded_navigation_classes` (set[str], default: `None`): CSS class fragments to keep when navigation removal is enabled
591
+ - `extra_navigation_classes` (set[str], default: `None`): Additional CSS class fragments to strip during navigation clean-up
590
592
 
591
593
  ## Contribution
592
594
 
@@ -97,6 +97,27 @@ MEDIA_TAGS = frozenset(
97
97
  }
98
98
  )
99
99
 
100
+ DEFAULT_NAVIGATION_CLASSES: frozenset[str] = frozenset(
101
+ {
102
+ "vector-header",
103
+ "vector-main-menu",
104
+ "vector-page-tools",
105
+ "vector-toc",
106
+ "mw-jump-link",
107
+ "mw-navigation",
108
+ "navbox",
109
+ "navigation-box",
110
+ "sidebar",
111
+ "nav",
112
+ "header",
113
+ "footer",
114
+ "menu",
115
+ "breadcrumb",
116
+ "topbar",
117
+ "toolbar",
118
+ }
119
+ )
120
+
100
121
 
101
122
  def preprocess_html(
102
123
  html: str,
@@ -111,11 +132,18 @@ def preprocess_html(
111
132
  preserve_media: bool = True,
112
133
  custom_tags_to_remove: set[str] | None = None,
113
134
  custom_attributes_to_remove: set[str] | None = None,
135
+ excluded_navigation_classes: set[str] | None = None,
136
+ extra_navigation_classes: set[str] | None = None,
114
137
  ) -> str:
115
138
  if not html or not html.strip(): # pragma: no cover
116
139
  return html
117
140
 
118
- html = _remove_class_based_navigation(html, remove_navigation)
141
+ html = _remove_class_based_navigation(
142
+ html,
143
+ remove_navigation,
144
+ excluded_navigation_classes,
145
+ extra_navigation_classes,
146
+ )
119
147
 
120
148
  nh3_config = _configure_cleaning_rules(
121
149
  remove_navigation=remove_navigation,
@@ -242,35 +270,31 @@ def _configure_cleaning_rules(
242
270
  }
243
271
 
244
272
 
245
- def _remove_class_based_navigation(html: str, remove_navigation: bool) -> str:
273
+ def _remove_class_based_navigation(
274
+ html: str,
275
+ remove_navigation: bool,
276
+ excluded_navigation_classes: set[str] | None,
277
+ extra_navigation_classes: set[str] | None,
278
+ ) -> str:
246
279
  if not remove_navigation:
247
280
  return html
248
281
 
249
- navigation_classes = [
250
- r'vector-header[^"]*',
251
- r'vector-main-menu[^"]*',
252
- r'vector-page-tools[^"]*',
253
- r'vector-toc[^"]*',
254
- r'mw-jump-link[^"]*',
255
- r'mw-navigation[^"]*',
256
- r'navbox[^"]*',
257
- r'navigation-box[^"]*',
258
- r'sidebar[^"]*',
259
- r'nav[^"]*',
260
- r'header[^"]*',
261
- r'footer[^"]*',
262
- r'menu[^"]*',
263
- r'breadcrumb[^"]*',
264
- r'topbar[^"]*',
265
- r'toolbar[^"]*',
266
- ]
282
+ class_names = set(DEFAULT_NAVIGATION_CLASSES)
283
+
284
+ if excluded_navigation_classes:
285
+ class_names.difference_update(excluded_navigation_classes)
286
+
287
+ if extra_navigation_classes:
288
+ class_names.update(extra_navigation_classes)
289
+
290
+ for class_name in class_names:
291
+ class_pattern = rf'{re.escape(class_name)}[^"]*'
267
292
 
268
- for class_pattern in navigation_classes:
269
- pattern = rf'<[^>]*class="[^"]*{class_pattern}[^"]*"[^>]*>.*?</[^>]*>'
270
- html = re.sub(pattern, "", html, flags=re.DOTALL | re.IGNORECASE)
293
+ block_pattern = rf'<(?P<tag>[^>\s]+)[^>]*class="[^"]*{class_pattern}[^"]*"[^>]*>.*?</(?P=tag)>'
294
+ html = re.sub(block_pattern, "", html, flags=re.DOTALL | re.IGNORECASE)
271
295
 
272
- pattern = rf'<[^>]*class="[^"]*{class_pattern}[^"]*"[^>]*/>'
273
- html = re.sub(pattern, "", html, flags=re.IGNORECASE)
296
+ self_closing_pattern = rf'<[^>]*class="[^"]*{class_pattern}[^"]*"[^>]*/>'
297
+ html = re.sub(self_closing_pattern, "", html, flags=re.IGNORECASE)
274
298
 
275
299
  return html
276
300
 
@@ -314,11 +314,12 @@ def _process_text(
314
314
  if len(ancestor_names) > 10:
315
315
  break
316
316
 
317
- in_pre = bool(ancestor_names.intersection({"pre"}))
317
+ in_pre = bool(ancestor_names.intersection({"pre"})) or parent_name == "pre"
318
318
 
319
319
  text = whitespace_handler.process_text_whitespace(text, el, in_pre=in_pre)
320
320
 
321
- if not ancestor_names.intersection({"pre", "code", "kbd", "samp"}):
321
+ code_like_tags = {"pre", "code", "kbd", "samp"}
322
+ if not (ancestor_names.intersection(code_like_tags) or parent_name in code_like_tags):
322
323
  text = escape(
323
324
  text=text,
324
325
  escape_misc=escape_misc,
@@ -476,6 +477,8 @@ def convert_to_markdown(
476
477
  preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
477
478
  remove_forms: bool = True,
478
479
  remove_navigation: bool = True,
480
+ excluded_navigation_classes: set[str] | None = None,
481
+ extra_navigation_classes: set[str] | None = None,
479
482
  strip: str | Iterable[str] | None = None,
480
483
  strip_newlines: bool = False,
481
484
  strong_em_symbol: Literal["*", "_"] = ASTERISK,
@@ -520,6 +523,8 @@ def convert_to_markdown(
520
523
  preprocessing_preset: Preprocessing aggressiveness level.
521
524
  remove_forms: Remove form elements during preprocessing.
522
525
  remove_navigation: Remove navigation elements during preprocessing.
526
+ excluded_navigation_classes: Navigation class fragments to keep even when removing navigation.
527
+ extra_navigation_classes: Additional navigation class fragments to strip beyond the defaults.
523
528
  strip: HTML tags to strip from output.
524
529
  strip_newlines: Remove newlines from HTML before processing.
525
530
  strong_em_symbol: Symbol for strong/emphasis ('*' or '_').
@@ -575,6 +580,8 @@ def convert_to_markdown(
575
580
  config = create_preprocessor(
576
581
  preset=preprocessing_preset,
577
582
  remove_navigation=remove_navigation,
583
+ excluded_navigation_classes=excluded_navigation_classes,
584
+ extra_navigation_classes=extra_navigation_classes,
578
585
  remove_forms=remove_forms,
579
586
  )
580
587
  source = preprocess_fn(source, **config)
@@ -617,7 +624,6 @@ def convert_to_markdown(
617
624
  first_child.replace_with(new_text)
618
625
  needs_leading_space_fix = False
619
626
 
620
- # Fix html5lib whitespace handling to match other parsers
621
627
  if parser == "html5lib":
622
628
  body = source.find("body")
623
629
  if body and isinstance(body, Tag):
@@ -632,7 +638,6 @@ def convert_to_markdown(
632
638
  first_child = children[0]
633
639
  original_text = str(first_child)
634
640
 
635
- # Preserve leading whitespace from original if html5lib stripped it
636
641
  leading_ws = ""
637
642
  for char in original_source:
638
643
  if char in " \t\n\r":
@@ -640,7 +645,6 @@ def convert_to_markdown(
640
645
  else:
641
646
  break
642
647
 
643
- # Create normalized text: restore leading whitespace only
644
648
  normalized_text = original_text
645
649
  if leading_ws and not normalized_text.startswith(leading_ws):
646
650
  normalized_text = leading_ws + normalized_text
@@ -1080,6 +1084,8 @@ def convert_to_markdown_stream(
1080
1084
  preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
1081
1085
  remove_forms: bool = True,
1082
1086
  remove_navigation: bool = True,
1087
+ excluded_navigation_classes: set[str] | None = None,
1088
+ extra_navigation_classes: set[str] | None = None,
1083
1089
  strip: str | Iterable[str] | None = None,
1084
1090
  strip_newlines: bool = False,
1085
1091
  strong_em_symbol: Literal["*", "_"] = ASTERISK,
@@ -1098,6 +1104,8 @@ def convert_to_markdown_stream(
1098
1104
  config = create_preprocessor(
1099
1105
  preset=preprocessing_preset,
1100
1106
  remove_navigation=remove_navigation,
1107
+ excluded_navigation_classes=excluded_navigation_classes,
1108
+ extra_navigation_classes=extra_navigation_classes,
1101
1109
  remove_forms=remove_forms,
1102
1110
  )
1103
1111
  source = preprocess_fn(source, **config)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: html-to-markdown
3
- Version: 1.14.0
3
+ Version: 1.15.0
4
4
  Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
5
5
  Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
6
6
  License: MIT
@@ -627,6 +627,8 @@ The `markdownify` function is an alias for `convert_to_markdown` and provides id
627
627
  - `preprocessing_preset` (str, default: `'standard'`): Preprocessing aggressiveness (`'minimal'` for basic cleaning, `'standard'` for balanced, `'aggressive'` for heavy cleaning)
628
628
  - `remove_forms` (bool, default: `True`): Remove form elements during preprocessing
629
629
  - `remove_navigation` (bool, default: `True`): Remove navigation elements during preprocessing
630
+ - `excluded_navigation_classes` (set[str], default: `None`): CSS class fragments to keep when navigation removal is enabled
631
+ - `extra_navigation_classes` (set[str], default: `None`): Additional CSS class fragments to strip during navigation clean-up
630
632
 
631
633
  ## Contribution
632
634
 
@@ -5,7 +5,7 @@ requires = [ "setuptools>=78.1" ]
5
5
 
6
6
  [project]
7
7
  name = "html-to-markdown"
8
- version = "1.14.0"
8
+ version = "1.15.0"
9
9
  description = "A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options"
10
10
  readme = "README.md"
11
11
  keywords = [
@@ -61,8 +61,10 @@ dev = [
61
61
  "beautifulsoup4[html5lib]>=4.13.5",
62
62
  "beautifulsoup4[lxml]>=4.13.5",
63
63
  "covdefaults>=2.3",
64
+ "memray>=1.18; sys_platform!='win32'",
64
65
  "mypy>=1.18.2",
65
66
  "pre-commit>=4.3",
67
+ "psutil>=7.1; sys_platform!='win32'",
66
68
  "pytest>=8.4.2",
67
69
  "pytest-benchmark>=5.1",
68
70
  "pytest-cov>=7",