html-to-markdown 1.14.0__py3-none-any.whl → 1.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- html_to_markdown/preprocessor.py +49 -25
- html_to_markdown/processing.py +13 -5
- {html_to_markdown-1.14.0.dist-info → html_to_markdown-1.15.0.dist-info}/METADATA +3 -1
- {html_to_markdown-1.14.0.dist-info → html_to_markdown-1.15.0.dist-info}/RECORD +8 -8
- {html_to_markdown-1.14.0.dist-info → html_to_markdown-1.15.0.dist-info}/WHEEL +0 -0
- {html_to_markdown-1.14.0.dist-info → html_to_markdown-1.15.0.dist-info}/entry_points.txt +0 -0
- {html_to_markdown-1.14.0.dist-info → html_to_markdown-1.15.0.dist-info}/licenses/LICENSE +0 -0
- {html_to_markdown-1.14.0.dist-info → html_to_markdown-1.15.0.dist-info}/top_level.txt +0 -0
html_to_markdown/preprocessor.py
CHANGED
|
@@ -97,6 +97,27 @@ MEDIA_TAGS = frozenset(
|
|
|
97
97
|
}
|
|
98
98
|
)
|
|
99
99
|
|
|
100
|
+
DEFAULT_NAVIGATION_CLASSES: frozenset[str] = frozenset(
|
|
101
|
+
{
|
|
102
|
+
"vector-header",
|
|
103
|
+
"vector-main-menu",
|
|
104
|
+
"vector-page-tools",
|
|
105
|
+
"vector-toc",
|
|
106
|
+
"mw-jump-link",
|
|
107
|
+
"mw-navigation",
|
|
108
|
+
"navbox",
|
|
109
|
+
"navigation-box",
|
|
110
|
+
"sidebar",
|
|
111
|
+
"nav",
|
|
112
|
+
"header",
|
|
113
|
+
"footer",
|
|
114
|
+
"menu",
|
|
115
|
+
"breadcrumb",
|
|
116
|
+
"topbar",
|
|
117
|
+
"toolbar",
|
|
118
|
+
}
|
|
119
|
+
)
|
|
120
|
+
|
|
100
121
|
|
|
101
122
|
def preprocess_html(
|
|
102
123
|
html: str,
|
|
@@ -111,11 +132,18 @@ def preprocess_html(
|
|
|
111
132
|
preserve_media: bool = True,
|
|
112
133
|
custom_tags_to_remove: set[str] | None = None,
|
|
113
134
|
custom_attributes_to_remove: set[str] | None = None,
|
|
135
|
+
excluded_navigation_classes: set[str] | None = None,
|
|
136
|
+
extra_navigation_classes: set[str] | None = None,
|
|
114
137
|
) -> str:
|
|
115
138
|
if not html or not html.strip(): # pragma: no cover
|
|
116
139
|
return html
|
|
117
140
|
|
|
118
|
-
html = _remove_class_based_navigation(html, remove_navigation)
|
|
141
|
+
html = _remove_class_based_navigation(
|
|
142
|
+
html,
|
|
143
|
+
remove_navigation,
|
|
144
|
+
excluded_navigation_classes,
|
|
145
|
+
extra_navigation_classes,
|
|
146
|
+
)
|
|
119
147
|
|
|
120
148
|
nh3_config = _configure_cleaning_rules(
|
|
121
149
|
remove_navigation=remove_navigation,
|
|
@@ -242,35 +270,31 @@ def _configure_cleaning_rules(
|
|
|
242
270
|
}
|
|
243
271
|
|
|
244
272
|
|
|
245
|
-
def _remove_class_based_navigation(html: str, remove_navigation: bool) -> str:
|
|
273
|
+
def _remove_class_based_navigation(
|
|
274
|
+
html: str,
|
|
275
|
+
remove_navigation: bool,
|
|
276
|
+
excluded_navigation_classes: set[str] | None,
|
|
277
|
+
extra_navigation_classes: set[str] | None,
|
|
278
|
+
) -> str:
|
|
246
279
|
if not remove_navigation:
|
|
247
280
|
return html
|
|
248
281
|
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
r'nav[^"]*',
|
|
260
|
-
r'header[^"]*',
|
|
261
|
-
r'footer[^"]*',
|
|
262
|
-
r'menu[^"]*',
|
|
263
|
-
r'breadcrumb[^"]*',
|
|
264
|
-
r'topbar[^"]*',
|
|
265
|
-
r'toolbar[^"]*',
|
|
266
|
-
]
|
|
282
|
+
class_names = set(DEFAULT_NAVIGATION_CLASSES)
|
|
283
|
+
|
|
284
|
+
if excluded_navigation_classes:
|
|
285
|
+
class_names.difference_update(excluded_navigation_classes)
|
|
286
|
+
|
|
287
|
+
if extra_navigation_classes:
|
|
288
|
+
class_names.update(extra_navigation_classes)
|
|
289
|
+
|
|
290
|
+
for class_name in class_names:
|
|
291
|
+
class_pattern = rf'{re.escape(class_name)}[^"]*'
|
|
267
292
|
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
html = re.sub(pattern, "", html, flags=re.DOTALL | re.IGNORECASE)
|
|
293
|
+
block_pattern = rf'<(?P<tag>[^>\s]+)[^>]*class="[^"]*{class_pattern}[^"]*"[^>]*>.*?</(?P=tag)>'
|
|
294
|
+
html = re.sub(block_pattern, "", html, flags=re.DOTALL | re.IGNORECASE)
|
|
271
295
|
|
|
272
|
-
|
|
273
|
-
html = re.sub(
|
|
296
|
+
self_closing_pattern = rf'<[^>]*class="[^"]*{class_pattern}[^"]*"[^>]*/>'
|
|
297
|
+
html = re.sub(self_closing_pattern, "", html, flags=re.IGNORECASE)
|
|
274
298
|
|
|
275
299
|
return html
|
|
276
300
|
|
html_to_markdown/processing.py
CHANGED
|
@@ -314,11 +314,12 @@ def _process_text(
|
|
|
314
314
|
if len(ancestor_names) > 10:
|
|
315
315
|
break
|
|
316
316
|
|
|
317
|
-
in_pre = bool(ancestor_names.intersection({"pre"}))
|
|
317
|
+
in_pre = bool(ancestor_names.intersection({"pre"})) or parent_name == "pre"
|
|
318
318
|
|
|
319
319
|
text = whitespace_handler.process_text_whitespace(text, el, in_pre=in_pre)
|
|
320
320
|
|
|
321
|
-
|
|
321
|
+
code_like_tags = {"pre", "code", "kbd", "samp"}
|
|
322
|
+
if not (ancestor_names.intersection(code_like_tags) or parent_name in code_like_tags):
|
|
322
323
|
text = escape(
|
|
323
324
|
text=text,
|
|
324
325
|
escape_misc=escape_misc,
|
|
@@ -476,6 +477,8 @@ def convert_to_markdown(
|
|
|
476
477
|
preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
|
|
477
478
|
remove_forms: bool = True,
|
|
478
479
|
remove_navigation: bool = True,
|
|
480
|
+
excluded_navigation_classes: set[str] | None = None,
|
|
481
|
+
extra_navigation_classes: set[str] | None = None,
|
|
479
482
|
strip: str | Iterable[str] | None = None,
|
|
480
483
|
strip_newlines: bool = False,
|
|
481
484
|
strong_em_symbol: Literal["*", "_"] = ASTERISK,
|
|
@@ -520,6 +523,8 @@ def convert_to_markdown(
|
|
|
520
523
|
preprocessing_preset: Preprocessing aggressiveness level.
|
|
521
524
|
remove_forms: Remove form elements during preprocessing.
|
|
522
525
|
remove_navigation: Remove navigation elements during preprocessing.
|
|
526
|
+
excluded_navigation_classes: Navigation class fragments to keep even when removing navigation.
|
|
527
|
+
extra_navigation_classes: Additional navigation class fragments to strip beyond the defaults.
|
|
523
528
|
strip: HTML tags to strip from output.
|
|
524
529
|
strip_newlines: Remove newlines from HTML before processing.
|
|
525
530
|
strong_em_symbol: Symbol for strong/emphasis ('*' or '_').
|
|
@@ -575,6 +580,8 @@ def convert_to_markdown(
|
|
|
575
580
|
config = create_preprocessor(
|
|
576
581
|
preset=preprocessing_preset,
|
|
577
582
|
remove_navigation=remove_navigation,
|
|
583
|
+
excluded_navigation_classes=excluded_navigation_classes,
|
|
584
|
+
extra_navigation_classes=extra_navigation_classes,
|
|
578
585
|
remove_forms=remove_forms,
|
|
579
586
|
)
|
|
580
587
|
source = preprocess_fn(source, **config)
|
|
@@ -617,7 +624,6 @@ def convert_to_markdown(
|
|
|
617
624
|
first_child.replace_with(new_text)
|
|
618
625
|
needs_leading_space_fix = False
|
|
619
626
|
|
|
620
|
-
# Fix html5lib whitespace handling to match other parsers
|
|
621
627
|
if parser == "html5lib":
|
|
622
628
|
body = source.find("body")
|
|
623
629
|
if body and isinstance(body, Tag):
|
|
@@ -632,7 +638,6 @@ def convert_to_markdown(
|
|
|
632
638
|
first_child = children[0]
|
|
633
639
|
original_text = str(first_child)
|
|
634
640
|
|
|
635
|
-
# Preserve leading whitespace from original if html5lib stripped it
|
|
636
641
|
leading_ws = ""
|
|
637
642
|
for char in original_source:
|
|
638
643
|
if char in " \t\n\r":
|
|
@@ -640,7 +645,6 @@ def convert_to_markdown(
|
|
|
640
645
|
else:
|
|
641
646
|
break
|
|
642
647
|
|
|
643
|
-
# Create normalized text: restore leading whitespace only
|
|
644
648
|
normalized_text = original_text
|
|
645
649
|
if leading_ws and not normalized_text.startswith(leading_ws):
|
|
646
650
|
normalized_text = leading_ws + normalized_text
|
|
@@ -1080,6 +1084,8 @@ def convert_to_markdown_stream(
|
|
|
1080
1084
|
preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
|
|
1081
1085
|
remove_forms: bool = True,
|
|
1082
1086
|
remove_navigation: bool = True,
|
|
1087
|
+
excluded_navigation_classes: set[str] | None = None,
|
|
1088
|
+
extra_navigation_classes: set[str] | None = None,
|
|
1083
1089
|
strip: str | Iterable[str] | None = None,
|
|
1084
1090
|
strip_newlines: bool = False,
|
|
1085
1091
|
strong_em_symbol: Literal["*", "_"] = ASTERISK,
|
|
@@ -1098,6 +1104,8 @@ def convert_to_markdown_stream(
|
|
|
1098
1104
|
config = create_preprocessor(
|
|
1099
1105
|
preset=preprocessing_preset,
|
|
1100
1106
|
remove_navigation=remove_navigation,
|
|
1107
|
+
excluded_navigation_classes=excluded_navigation_classes,
|
|
1108
|
+
extra_navigation_classes=extra_navigation_classes,
|
|
1101
1109
|
remove_forms=remove_forms,
|
|
1102
1110
|
)
|
|
1103
1111
|
source = preprocess_fn(source, **config)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 1.14.0
|
|
3
|
+
Version: 1.15.0
|
|
4
4
|
Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
|
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -627,6 +627,8 @@ The `markdownify` function is an alias for `convert_to_markdown` and provides id
|
|
|
627
627
|
- `preprocessing_preset` (str, default: `'standard'`): Preprocessing aggressiveness (`'minimal'` for basic cleaning, `'standard'` for balanced, `'aggressive'` for heavy cleaning)
|
|
628
628
|
- `remove_forms` (bool, default: `True`): Remove form elements during preprocessing
|
|
629
629
|
- `remove_navigation` (bool, default: `True`): Remove navigation elements during preprocessing
|
|
630
|
+
- `excluded_navigation_classes` (set[str], default: `None`): CSS class fragments to keep when navigation removal is enabled
|
|
631
|
+
- `extra_navigation_classes` (set[str], default: `None`): Additional CSS class fragments to strip during navigation clean-up
|
|
630
632
|
|
|
631
633
|
## Contribution
|
|
632
634
|
|
|
@@ -4,14 +4,14 @@ html_to_markdown/cli.py,sha256=-rq1L64Ze-zxSdn0cta8HvUCJDGmWHDcZe2RlVZJFjI,9665
|
|
|
4
4
|
html_to_markdown/constants.py,sha256=CKFVHjUZKgi8-lgU6AHPic7X5ChlTkbZt4Jv6VaVjjs,665
|
|
5
5
|
html_to_markdown/converters.py,sha256=REuvFnP-D97VlG2kuCVTbb3exoZ87NQn9hUuiP5ISOU,35839
|
|
6
6
|
html_to_markdown/exceptions.py,sha256=ytUOIL0D8r0Jd59RzUPqzmk73i-Mg63zDQYo6S6DBg4,1389
|
|
7
|
-
html_to_markdown/preprocessor.py,sha256=
|
|
8
|
-
html_to_markdown/processing.py,sha256=
|
|
7
|
+
html_to_markdown/preprocessor.py,sha256=QBdZbQQAlJG6n3iv9xzvufnq1F0l9c9LOOVECtPiuM0,10192
|
|
8
|
+
html_to_markdown/processing.py,sha256=s9o_7GTpK3UAmMS0PHFrlNZk0xhO-ITvFQx6a-fN-tI,41247
|
|
9
9
|
html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
10
|
html_to_markdown/utils.py,sha256=s3A4ET_XyKC-WxzJtH4W0S7cIBGF5fTYIf4JJrqTX8Q,1069
|
|
11
11
|
html_to_markdown/whitespace.py,sha256=rl3eEwqfMpNWx4FBmbkZ1RxO_Od45p3EZ_7UgKcDAtg,7710
|
|
12
|
-
html_to_markdown-1.14.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
|
|
13
|
-
html_to_markdown-1.
|
|
14
|
-
html_to_markdown-1.14.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
15
|
-
html_to_markdown-1.14.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
|
|
16
|
-
html_to_markdown-1.14.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
|
|
17
|
-
html_to_markdown-1.14.0.dist-info/RECORD,,
|
|
12
|
+
html_to_markdown-1.15.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
|
|
13
|
+
html_to_markdown-1.15.0.dist-info/METADATA,sha256=uEh0fiaSCEOvazsHCGRsjIyUO-9kMdOgWHM2og31xgs,29670
|
|
14
|
+
html_to_markdown-1.15.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
15
|
+
html_to_markdown-1.15.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
|
|
16
|
+
html_to_markdown-1.15.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
|
|
17
|
+
html_to_markdown-1.15.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|