html-to-markdown 1.14.1__py3-none-any.whl → 1.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of html-to-markdown might be problematic. Click here for more details.
- html_to_markdown/preprocessor.py +49 -25
- html_to_markdown/processing.py +10 -0
- {html_to_markdown-1.14.1.dist-info → html_to_markdown-1.15.0.dist-info}/METADATA +3 -1
- {html_to_markdown-1.14.1.dist-info → html_to_markdown-1.15.0.dist-info}/RECORD +8 -8
- {html_to_markdown-1.14.1.dist-info → html_to_markdown-1.15.0.dist-info}/WHEEL +0 -0
- {html_to_markdown-1.14.1.dist-info → html_to_markdown-1.15.0.dist-info}/entry_points.txt +0 -0
- {html_to_markdown-1.14.1.dist-info → html_to_markdown-1.15.0.dist-info}/licenses/LICENSE +0 -0
- {html_to_markdown-1.14.1.dist-info → html_to_markdown-1.15.0.dist-info}/top_level.txt +0 -0
html_to_markdown/preprocessor.py
CHANGED
|
@@ -97,6 +97,27 @@ MEDIA_TAGS = frozenset(
|
|
|
97
97
|
}
|
|
98
98
|
)
|
|
99
99
|
|
|
100
|
+
DEFAULT_NAVIGATION_CLASSES: frozenset[str] = frozenset(
|
|
101
|
+
{
|
|
102
|
+
"vector-header",
|
|
103
|
+
"vector-main-menu",
|
|
104
|
+
"vector-page-tools",
|
|
105
|
+
"vector-toc",
|
|
106
|
+
"mw-jump-link",
|
|
107
|
+
"mw-navigation",
|
|
108
|
+
"navbox",
|
|
109
|
+
"navigation-box",
|
|
110
|
+
"sidebar",
|
|
111
|
+
"nav",
|
|
112
|
+
"header",
|
|
113
|
+
"footer",
|
|
114
|
+
"menu",
|
|
115
|
+
"breadcrumb",
|
|
116
|
+
"topbar",
|
|
117
|
+
"toolbar",
|
|
118
|
+
}
|
|
119
|
+
)
|
|
120
|
+
|
|
100
121
|
|
|
101
122
|
def preprocess_html(
|
|
102
123
|
html: str,
|
|
@@ -111,11 +132,18 @@ def preprocess_html(
|
|
|
111
132
|
preserve_media: bool = True,
|
|
112
133
|
custom_tags_to_remove: set[str] | None = None,
|
|
113
134
|
custom_attributes_to_remove: set[str] | None = None,
|
|
135
|
+
excluded_navigation_classes: set[str] | None = None,
|
|
136
|
+
extra_navigation_classes: set[str] | None = None,
|
|
114
137
|
) -> str:
|
|
115
138
|
if not html or not html.strip(): # pragma: no cover
|
|
116
139
|
return html
|
|
117
140
|
|
|
118
|
-
html = _remove_class_based_navigation(html, remove_navigation)
|
|
141
|
+
html = _remove_class_based_navigation(
|
|
142
|
+
html,
|
|
143
|
+
remove_navigation,
|
|
144
|
+
excluded_navigation_classes,
|
|
145
|
+
extra_navigation_classes,
|
|
146
|
+
)
|
|
119
147
|
|
|
120
148
|
nh3_config = _configure_cleaning_rules(
|
|
121
149
|
remove_navigation=remove_navigation,
|
|
@@ -242,35 +270,31 @@ def _configure_cleaning_rules(
|
|
|
242
270
|
}
|
|
243
271
|
|
|
244
272
|
|
|
245
|
-
def _remove_class_based_navigation(html: str, remove_navigation: bool) -> str:
|
|
273
|
+
def _remove_class_based_navigation(
|
|
274
|
+
html: str,
|
|
275
|
+
remove_navigation: bool,
|
|
276
|
+
excluded_navigation_classes: set[str] | None,
|
|
277
|
+
extra_navigation_classes: set[str] | None,
|
|
278
|
+
) -> str:
|
|
246
279
|
if not remove_navigation:
|
|
247
280
|
return html
|
|
248
281
|
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
r'nav[^"]*',
|
|
260
|
-
r'header[^"]*',
|
|
261
|
-
r'footer[^"]*',
|
|
262
|
-
r'menu[^"]*',
|
|
263
|
-
r'breadcrumb[^"]*',
|
|
264
|
-
r'topbar[^"]*',
|
|
265
|
-
r'toolbar[^"]*',
|
|
266
|
-
]
|
|
282
|
+
class_names = set(DEFAULT_NAVIGATION_CLASSES)
|
|
283
|
+
|
|
284
|
+
if excluded_navigation_classes:
|
|
285
|
+
class_names.difference_update(excluded_navigation_classes)
|
|
286
|
+
|
|
287
|
+
if extra_navigation_classes:
|
|
288
|
+
class_names.update(extra_navigation_classes)
|
|
289
|
+
|
|
290
|
+
for class_name in class_names:
|
|
291
|
+
class_pattern = rf'{re.escape(class_name)}[^"]*'
|
|
267
292
|
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
html = re.sub(pattern, "", html, flags=re.DOTALL | re.IGNORECASE)
|
|
293
|
+
block_pattern = rf'<(?P<tag>[^>\s]+)[^>]*class="[^"]*{class_pattern}[^"]*"[^>]*>.*?</(?P=tag)>'
|
|
294
|
+
html = re.sub(block_pattern, "", html, flags=re.DOTALL | re.IGNORECASE)
|
|
271
295
|
|
|
272
|
-
|
|
273
|
-
html = re.sub(
|
|
296
|
+
self_closing_pattern = rf'<[^>]*class="[^"]*{class_pattern}[^"]*"[^>]*/>'
|
|
297
|
+
html = re.sub(self_closing_pattern, "", html, flags=re.IGNORECASE)
|
|
274
298
|
|
|
275
299
|
return html
|
|
276
300
|
|
html_to_markdown/processing.py
CHANGED
|
@@ -477,6 +477,8 @@ def convert_to_markdown(
|
|
|
477
477
|
preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
|
|
478
478
|
remove_forms: bool = True,
|
|
479
479
|
remove_navigation: bool = True,
|
|
480
|
+
excluded_navigation_classes: set[str] | None = None,
|
|
481
|
+
extra_navigation_classes: set[str] | None = None,
|
|
480
482
|
strip: str | Iterable[str] | None = None,
|
|
481
483
|
strip_newlines: bool = False,
|
|
482
484
|
strong_em_symbol: Literal["*", "_"] = ASTERISK,
|
|
@@ -521,6 +523,8 @@ def convert_to_markdown(
|
|
|
521
523
|
preprocessing_preset: Preprocessing aggressiveness level.
|
|
522
524
|
remove_forms: Remove form elements during preprocessing.
|
|
523
525
|
remove_navigation: Remove navigation elements during preprocessing.
|
|
526
|
+
excluded_navigation_classes: Navigation class fragments to keep even when removing navigation.
|
|
527
|
+
extra_navigation_classes: Additional navigation class fragments to strip beyond the defaults.
|
|
524
528
|
strip: HTML tags to strip from output.
|
|
525
529
|
strip_newlines: Remove newlines from HTML before processing.
|
|
526
530
|
strong_em_symbol: Symbol for strong/emphasis ('*' or '_').
|
|
@@ -576,6 +580,8 @@ def convert_to_markdown(
|
|
|
576
580
|
config = create_preprocessor(
|
|
577
581
|
preset=preprocessing_preset,
|
|
578
582
|
remove_navigation=remove_navigation,
|
|
583
|
+
excluded_navigation_classes=excluded_navigation_classes,
|
|
584
|
+
extra_navigation_classes=extra_navigation_classes,
|
|
579
585
|
remove_forms=remove_forms,
|
|
580
586
|
)
|
|
581
587
|
source = preprocess_fn(source, **config)
|
|
@@ -1078,6 +1084,8 @@ def convert_to_markdown_stream(
|
|
|
1078
1084
|
preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard",
|
|
1079
1085
|
remove_forms: bool = True,
|
|
1080
1086
|
remove_navigation: bool = True,
|
|
1087
|
+
excluded_navigation_classes: set[str] | None = None,
|
|
1088
|
+
extra_navigation_classes: set[str] | None = None,
|
|
1081
1089
|
strip: str | Iterable[str] | None = None,
|
|
1082
1090
|
strip_newlines: bool = False,
|
|
1083
1091
|
strong_em_symbol: Literal["*", "_"] = ASTERISK,
|
|
@@ -1096,6 +1104,8 @@ def convert_to_markdown_stream(
|
|
|
1096
1104
|
config = create_preprocessor(
|
|
1097
1105
|
preset=preprocessing_preset,
|
|
1098
1106
|
remove_navigation=remove_navigation,
|
|
1107
|
+
excluded_navigation_classes=excluded_navigation_classes,
|
|
1108
|
+
extra_navigation_classes=extra_navigation_classes,
|
|
1099
1109
|
remove_forms=remove_forms,
|
|
1100
1110
|
)
|
|
1101
1111
|
source = preprocess_fn(source, **config)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: html-to-markdown
|
|
3
|
-
Version: 1.14.1
|
|
3
|
+
Version: 1.15.0
|
|
4
4
|
Summary: A modern, type-safe Python library for converting HTML to Markdown with comprehensive tag support and customizable options
|
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfeld@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -627,6 +627,8 @@ The `markdownify` function is an alias for `convert_to_markdown` and provides id
|
|
|
627
627
|
- `preprocessing_preset` (str, default: `'standard'`): Preprocessing aggressiveness (`'minimal'` for basic cleaning, `'standard'` for balanced, `'aggressive'` for heavy cleaning)
|
|
628
628
|
- `remove_forms` (bool, default: `True`): Remove form elements during preprocessing
|
|
629
629
|
- `remove_navigation` (bool, default: `True`): Remove navigation elements during preprocessing
|
|
630
|
+
- `excluded_navigation_classes` (set[str], default: `None`): CSS class fragments to keep when navigation removal is enabled
|
|
631
|
+
- `extra_navigation_classes` (set[str], default: `None`): Additional CSS class fragments to strip during navigation clean-up
|
|
630
632
|
|
|
631
633
|
## Contribution
|
|
632
634
|
|
|
@@ -4,14 +4,14 @@ html_to_markdown/cli.py,sha256=-rq1L64Ze-zxSdn0cta8HvUCJDGmWHDcZe2RlVZJFjI,9665
|
|
|
4
4
|
html_to_markdown/constants.py,sha256=CKFVHjUZKgi8-lgU6AHPic7X5ChlTkbZt4Jv6VaVjjs,665
|
|
5
5
|
html_to_markdown/converters.py,sha256=REuvFnP-D97VlG2kuCVTbb3exoZ87NQn9hUuiP5ISOU,35839
|
|
6
6
|
html_to_markdown/exceptions.py,sha256=ytUOIL0D8r0Jd59RzUPqzmk73i-Mg63zDQYo6S6DBg4,1389
|
|
7
|
-
html_to_markdown/preprocessor.py,sha256=
|
|
8
|
-
html_to_markdown/processing.py,sha256=
|
|
7
|
+
html_to_markdown/preprocessor.py,sha256=QBdZbQQAlJG6n3iv9xzvufnq1F0l9c9LOOVECtPiuM0,10192
|
|
8
|
+
html_to_markdown/processing.py,sha256=s9o_7GTpK3UAmMS0PHFrlNZk0xhO-ITvFQx6a-fN-tI,41247
|
|
9
9
|
html_to_markdown/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
10
|
html_to_markdown/utils.py,sha256=s3A4ET_XyKC-WxzJtH4W0S7cIBGF5fTYIf4JJrqTX8Q,1069
|
|
11
11
|
html_to_markdown/whitespace.py,sha256=rl3eEwqfMpNWx4FBmbkZ1RxO_Od45p3EZ_7UgKcDAtg,7710
|
|
12
|
-
html_to_markdown-1.
|
|
13
|
-
html_to_markdown-1.
|
|
14
|
-
html_to_markdown-1.
|
|
15
|
-
html_to_markdown-1.
|
|
16
|
-
html_to_markdown-1.
|
|
17
|
-
html_to_markdown-1.
|
|
12
|
+
html_to_markdown-1.15.0.dist-info/licenses/LICENSE,sha256=3J_HR5BWvUM1mlIrlkF32-uC1FM64gy8JfG17LBuheQ,1122
|
|
13
|
+
html_to_markdown-1.15.0.dist-info/METADATA,sha256=uEh0fiaSCEOvazsHCGRsjIyUO-9kMdOgWHM2og31xgs,29670
|
|
14
|
+
html_to_markdown-1.15.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
15
|
+
html_to_markdown-1.15.0.dist-info/entry_points.txt,sha256=xmFijrTfgYW7lOrZxZGRPciicQHa5KiXKkUhBCmICtQ,116
|
|
16
|
+
html_to_markdown-1.15.0.dist-info/top_level.txt,sha256=Ev6djb1c4dSKr_-n4K-FpEGDkzBigXY6LuZ5onqS7AE,17
|
|
17
|
+
html_to_markdown-1.15.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|