html-to-markdown 1.6.0__py3-none-any.whl → 1.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of html-to-markdown might be problematic. Click here for more details.

@@ -0,0 +1,407 @@
1
+ """HTML preprocessing using nh3 (ammonia bindings) for improved quality and performance."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from typing import Any
7
+
8
+ import nh3
9
+
10
+
11
def preprocess_html(
    html: str,
    *,
    remove_navigation: bool = True,
    remove_forms: bool = True,
    remove_scripts: bool = True,
    remove_styles: bool = True,
    remove_comments: bool = True,
    preserve_semantic_structure: bool = True,
    preserve_tables: bool = True,
    preserve_media: bool = True,
    custom_tags_to_remove: set[str] | None = None,
    custom_attributes_to_remove: set[str] | None = None,
) -> str:
    """Clean raw HTML before it is converted to markdown.

    The input goes through four stages, in this order:

    1. a regex pre-pass that drops elements by navigation-like class names,
    2. ``nh3.clean`` with the rules built by ``_configure_cleaning_rules``,
    3. a regex post-pass that removes leftover navigation text,
    4. whitespace normalization.

    Args:
        html: Raw HTML content to preprocess.
        remove_navigation: Remove navigation elements and menus. This flag
            also gates stages 1 and 3.
        remove_forms: Remove form elements (input, button, select, etc.).
        remove_scripts: Remove script tags and their content.
        remove_styles: Remove style tags and their content.
        remove_comments: Remove HTML comments.
        preserve_semantic_structure: Keep semantic HTML5 elements.
        preserve_tables: Keep table structure.
        preserve_media: Keep media elements (img, video, audio, ...).
        custom_tags_to_remove: Additional tags to remove; ``None`` means none.
        custom_attributes_to_remove: Additional attributes to remove; ``None``
            means none.

    Returns:
        The cleaned HTML. Empty or whitespace-only input is returned as is.
    """
    if not (html and html.strip()):  # pragma: no cover
        return html

    nav_stripped = _remove_class_based_navigation(html, remove_navigation)

    rules = _configure_cleaning_rules(
        remove_navigation=remove_navigation,
        remove_forms=remove_forms,
        remove_scripts=remove_scripts,
        remove_styles=remove_styles,
        remove_comments=remove_comments,
        preserve_semantic_structure=preserve_semantic_structure,
        preserve_tables=preserve_tables,
        preserve_media=preserve_media,
        custom_tags_to_remove=custom_tags_to_remove or set(),
        custom_attributes_to_remove=custom_attributes_to_remove or set(),
    )

    # Forward exactly the four rule entries that nh3.clean is given here.
    nh3_options = {
        option: rules[option]
        for option in ("tags", "attributes", "clean_content_tags", "strip_comments")
    }
    sanitized = nh3.clean(nav_stripped, **nh3_options)

    without_nav_text = _remove_navigation_patterns(sanitized, remove_navigation)
    return _fix_whitespace_issues(without_nav_text)
71
+
72
+
73
+ def _configure_cleaning_rules(
74
+ *,
75
+ remove_navigation: bool,
76
+ remove_forms: bool,
77
+ remove_scripts: bool,
78
+ remove_styles: bool,
79
+ remove_comments: bool,
80
+ preserve_semantic_structure: bool,
81
+ preserve_tables: bool,
82
+ preserve_media: bool,
83
+ custom_tags_to_remove: set[str],
84
+ custom_attributes_to_remove: set[str],
85
+ ) -> dict[str, Any]:
86
+ """Configure the cleaning rules for nh3."""
87
+ allowed_tags = {
88
+ "p",
89
+ "div",
90
+ "span",
91
+ "br",
92
+ "hr",
93
+ "h1",
94
+ "h2",
95
+ "h3",
96
+ "h4",
97
+ "h5",
98
+ "h6",
99
+ "ul",
100
+ "ol",
101
+ "li",
102
+ "dl",
103
+ "dt",
104
+ "dd",
105
+ "strong",
106
+ "b",
107
+ "em",
108
+ "i",
109
+ "u",
110
+ "s",
111
+ "del",
112
+ "ins",
113
+ "mark",
114
+ "small",
115
+ "sub",
116
+ "sup",
117
+ "code",
118
+ "pre",
119
+ "kbd",
120
+ "samp",
121
+ "var",
122
+ "abbr",
123
+ "cite",
124
+ "dfn",
125
+ "time",
126
+ "data",
127
+ "a",
128
+ "blockquote",
129
+ "q",
130
+ }
131
+
132
+ if preserve_semantic_structure:
133
+ allowed_tags.update(
134
+ {
135
+ "article",
136
+ "section",
137
+ "aside",
138
+ "header",
139
+ "footer",
140
+ "main",
141
+ "nav",
142
+ "figure",
143
+ "figcaption",
144
+ "details",
145
+ "summary",
146
+ }
147
+ )
148
+
149
+ if preserve_tables:
150
+ allowed_tags.update(
151
+ {
152
+ "table",
153
+ "thead",
154
+ "tbody",
155
+ "tfoot",
156
+ "tr",
157
+ "th",
158
+ "td",
159
+ "caption",
160
+ "col",
161
+ "colgroup",
162
+ }
163
+ )
164
+
165
+ if preserve_media:
166
+ allowed_tags.update(
167
+ {
168
+ "img",
169
+ "picture",
170
+ "source",
171
+ "audio",
172
+ "video",
173
+ "track",
174
+ "canvas",
175
+ "svg",
176
+ "iframe",
177
+ }
178
+ )
179
+
180
+ allowed_tags -= custom_tags_to_remove
181
+
182
+ clean_content_tags = set()
183
+
184
+ if remove_navigation:
185
+ clean_content_tags.update(
186
+ {
187
+ "nav",
188
+ "menu",
189
+ "menuitem",
190
+ "header",
191
+ "footer",
192
+ "mw-jump-link",
193
+ "vector-header",
194
+ "vector-header-container",
195
+ "vector-main-menu",
196
+ "vector-page-tools",
197
+ "vector-toc",
198
+ "mw-navigation",
199
+ "navbox",
200
+ "navigation-box",
201
+ "sidebar",
202
+ }
203
+ )
204
+
205
+ if remove_forms:
206
+ clean_content_tags.update(
207
+ {
208
+ "form",
209
+ "input",
210
+ "button",
211
+ "select",
212
+ "option",
213
+ "optgroup",
214
+ "textarea",
215
+ "fieldset",
216
+ "legend",
217
+ "label",
218
+ "output",
219
+ "progress",
220
+ "meter",
221
+ "datalist",
222
+ }
223
+ )
224
+
225
+ if remove_scripts:
226
+ clean_content_tags.update({"script", "noscript"})
227
+
228
+ if remove_styles:
229
+ clean_content_tags.update({"style"})
230
+
231
+ clean_content_tags.update(custom_tags_to_remove)
232
+
233
+ allowed_tags -= clean_content_tags
234
+
235
+ allowed_attributes = {
236
+ "*": {"id", "class", "lang", "dir", "title"},
237
+ "a": {"href"},
238
+ "img": {"src", "alt", "width", "height"},
239
+ "th": {"scope", "colspan", "rowspan"},
240
+ "td": {"colspan", "rowspan"},
241
+ }
242
+
243
+ if custom_attributes_to_remove:
244
+ for attrs in allowed_attributes.values():
245
+ if isinstance(attrs, set):
246
+ attrs.difference_update(custom_attributes_to_remove)
247
+
248
+ return {
249
+ "tags": allowed_tags,
250
+ "attributes": allowed_attributes,
251
+ "clean_content_tags": clean_content_tags,
252
+ "strip_comments": remove_comments,
253
+ }
254
+
255
+
256
+ def _remove_class_based_navigation(html: str, remove_navigation: bool) -> str:
257
+ """Remove elements with navigation-related classes."""
258
+ if not remove_navigation:
259
+ return html
260
+
261
+ navigation_classes = [
262
+ r'vector-header[^"]*',
263
+ r'vector-main-menu[^"]*',
264
+ r'vector-page-tools[^"]*',
265
+ r'vector-toc[^"]*',
266
+ r'mw-jump-link[^"]*',
267
+ r'mw-navigation[^"]*',
268
+ r'navbox[^"]*',
269
+ r'navigation-box[^"]*',
270
+ r'sidebar[^"]*',
271
+ r'nav[^"]*',
272
+ r'header[^"]*',
273
+ r'footer[^"]*',
274
+ r'menu[^"]*',
275
+ r'breadcrumb[^"]*',
276
+ r'topbar[^"]*',
277
+ r'toolbar[^"]*',
278
+ ]
279
+
280
+ for class_pattern in navigation_classes:
281
+ pattern = rf'<[^>]*class="[^"]*{class_pattern}[^"]*"[^>]*>.*?</[^>]*>'
282
+ html = re.sub(pattern, "", html, flags=re.DOTALL | re.IGNORECASE)
283
+
284
+ pattern = rf'<[^>]*class="[^"]*{class_pattern}[^"]*"[^>]*/>'
285
+ html = re.sub(pattern, "", html, flags=re.IGNORECASE)
286
+
287
+ return html
288
+
289
+
290
def _remove_navigation_patterns(html: str, remove_navigation: bool) -> str:
    """Delete leftover navigation text and markup fragments.

    Wikipedia-style link lists are removed first via
    ``_remove_wikipedia_navigation_lists``. After that a fixed list of regexes
    is applied one after another, each with ``IGNORECASE | MULTILINE |
    DOTALL``. The list covers skip/jump links, menu labels and buttons,
    breadcrumb trails, Wikipedia boilerplate, images with an empty ``alt`` or
    ``src``, stray escaped tag fragments and a few standalone heading lines.

    Args:
        html: Markup to filter.
        remove_navigation: When false, ``html`` is returned untouched.

    Returns:
        The markup with every matching span removed.
    """
    leftover_patterns = (
        # Skip / jump links and menu chrome.
        r"\[Jump to content\]\(#[^)]*\)",
        r"\[Jump to content\]",
        r"Jump to content",
        r"Main menu.*?hide.*?Navigation",
        r"move to sidebar.*?hide",
        r"Home\s*[>»]\s*[^<]*[>»]",
        r"\[Skip to [^]]*\]",
        r"\[Skip [^]]*\]",
        r"<label[^>]*>.*?menu.*?</label>",
        r"<button[^>]*>.*?(menu|toggle|expand|collapse|show|hide).*?</button>",
        # Wikipedia boilerplate.
        r"The Free Encyclopedia[^a-zA-Z]*",
        r"<img[^>]*wikipedia[^>]*>",
        r"\[Wikipedia\]\([^)]*\)",
        r'\[Search\]\([^)]*"Search[^)]*"\)',
        r"\[Add links\]\([^)]*\)",
        r"This is a good article\. Click here for more information\.",
        r"From Wikipedia, the free encyclopedia",
        # Images with an empty alt or src attribute.
        r'<img[^>]*alt=[\'"][\'"][^>]*>',
        r'<img[^>]*src=[\'"][\'"][^>]*>',
        # Tag fragments ending in a literal backslash before ">".
        r"div\\>",
        r"</?\w+\\>",
        # Lines consisting only of a navigation heading.
        r"^Main menu\s*$",
        r"^Search\s*$",
        r"^History\s*$",
        r"^ProgrammingTranslatorReferencesExternal links\s*$",
    )

    if remove_navigation:
        regex_flags = re.IGNORECASE | re.MULTILINE | re.DOTALL
        cleaned = _remove_wikipedia_navigation_lists(html)
        for leftover in leftover_patterns:
            cleaned = re.sub(leftover, "", cleaned, flags=regex_flags)
        return cleaned

    return html
329
+
330
+
331
+ def _remove_wikipedia_navigation_lists(html: str) -> str:
332
+ """Remove Wikipedia-style navigation lists that appear at the start."""
333
+ patterns = [
334
+ r"Main menu\s*\n\n(-\s*\[.*?\]\(.*?\).*?\n){3,}",
335
+ r"(-\s*\[[^\]]*\]\(/wiki/[^)]*\).*?\n){5,}",
336
+ ]
337
+
338
+ for pattern in patterns:
339
+ html = re.sub(pattern, "", html, flags=re.DOTALL | re.MULTILINE)
340
+
341
+ return html
342
+
343
+
344
+ def _fix_whitespace_issues(html: str) -> str:
345
+ """Fix common whitespace issues in HTML."""
346
+ html = re.sub(r"[ \t]{2,}", " ", html)
347
+ html = re.sub(r"\n\s*\n", "\n\n", html)
348
+
349
+ return re.sub(r">\s*<", "><", html)
350
+
351
+
352
# Removal switches that every preset turns on; the presets differ only in what
# they preserve and in any extra tags they remove.
_ALL_REMOVALS_ENABLED: dict[str, Any] = {
    "remove_navigation": True,
    "remove_forms": True,
    "remove_scripts": True,
    "remove_styles": True,
    "remove_comments": True,
}

# Named option bundles, keyed by preset name.
PRESETS: dict[str, dict[str, Any]] = {
    # Tables only: no semantic containers, no media.
    "minimal": {
        **_ALL_REMOVALS_ENABLED,
        "preserve_semantic_structure": False,
        "preserve_tables": True,
        "preserve_media": False,
    },
    # Keeps semantic containers, tables and media.
    "standard": {
        **_ALL_REMOVALS_ENABLED,
        "preserve_semantic_structure": True,
        "preserve_tables": True,
        "preserve_media": True,
    },
    # Same switches as "minimal", plus explicit removal of aside/footer/header.
    "aggressive": {
        **_ALL_REMOVALS_ENABLED,
        "preserve_semantic_structure": False,
        "preserve_tables": True,
        "preserve_media": False,
        "custom_tags_to_remove": {"aside", "footer", "header"},
    },
}
385
+
386
+
387
def create_preprocessor(preset: str = "standard", **overrides: Any) -> dict[str, Any]:
    """Create a preprocessor configuration from a named preset.

    The preset's options are copied, including any set-valued option, so the
    returned dict can be modified freely without changing the module-level
    ``PRESETS`` table. ``overrides`` are applied on top of the copy and stored
    as given.

    Args:
        preset: The preset configuration to use (minimal, standard, aggressive).
        **overrides: Any configuration options to override.

    Returns:
        Configuration dict for the preprocessor.

    Raises:
        ValueError: If preset is unknown.
    """
    if preset not in PRESETS:
        msg = f"Unknown preset '{preset}'. Available presets: {list(PRESETS.keys())}"
        raise ValueError(msg)

    # A plain dict() copy would share mutable values (such as the
    # "custom_tags_to_remove" set) with PRESETS, so a caller mutating the
    # returned config would silently alter the preset for every later call.
    config: dict[str, Any] = {
        option: set(value) if isinstance(value, set) else value
        for option, value in PRESETS[preset].items()
    }
    config.update(overrides)

    return config