smol-html 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
smol_html/__init__.py CHANGED
@@ -1,4 +1,4 @@
1
- from smol_html.core import SmolHtmlCleaner
1
+ from smol_html.smol_html import SmolHtmlCleaner
2
2
 
3
3
  __all__ = ["__version__", "SmolHtmlCleaner"]
4
- __version__ = "0.1.0"
4
+ __version__ = "0.1.2"
@@ -1,299 +1,377 @@
1
- from __future__ import annotations
2
-
3
- import minify_html
4
- from bs4 import BeautifulSoup, Tag
5
- from lxml import html as lxml_html
6
- from lxml.html.clean import Cleaner
7
-
8
-
9
- # -------------------------
10
- # Public API
11
- # -------------------------
12
- class SmolHtmlCleaner:
13
- """
14
- Small, dependable HTML cleaner/minifier with sensible defaults.
15
-
16
- Parameters
17
- ----------
18
- non_text_to_keep : set of str, optional
19
- Tags preserved even if textless. Default includes meta/media/table/line-break tags.
20
- attr_stop_words : set of str, optional
21
- Attribute tokens indicating non-content scaffolding/UX. Default contains common UI tokens.
22
- remove_header_lists : bool, optional
23
- Prune links/lists inside ``<header>``. Default True.
24
- remove_footer_lists : bool, optional
25
- Prune links/lists inside ``<footer>``. Default True.
26
- minify : bool, optional
27
- Minify HTML output via ``minify_html``. Default True.
28
- minify_kwargs : dict, optional
29
- Extra args for ``minify_html.minify``. Default empty.
30
- pre_parse_hooks : sequence of callables, optional
31
- Functions ``(str) -> str`` applied before parsing.
32
- post_clean_hooks : sequence of callables, optional
33
- Functions ``(BeautifulSoup) -> BeautifulSoup`` applied after cleaning.
34
- lxml_* : various, optional
35
- Direct mapping to ``lxml.html.clean.Cleaner`` kwargs (e.g., ``lxml_comments``, ``lxml_style``).
36
-
37
- Notes
38
- -----
39
- Defaults and cleaning behavior are preserved; only the configuration surface
40
- moved from a dataclass to keyword-only parameters on the constructor.
41
- """
42
-
43
- def __init__(
44
- self,
45
- *,
46
- # Core behavior
47
- non_text_to_keep: set[str] = None,
48
- attr_stop_words: set[str] = None,
49
- remove_header_lists: bool = True,
50
- remove_footer_lists: bool = True,
51
- # Minify
52
- minify: bool = True,
53
- minify_kwargs: dict | None = None,
54
- # lxml Cleaner exposed explicitly (prefixed)
55
- meta: bool = False,
56
- page_structure: bool = False,
57
- links: bool = True,
58
- scripts: bool = False,
59
- javascript: bool = True,
60
- comments: bool = True,
61
- style: bool = True,
62
- processing_instructions: bool = True,
63
- embedded: bool = True,
64
- frames: bool = True,
65
- forms: bool = True,
66
- annoying_tags: bool = True,
67
- kill_tags: set[str] | None = None,
68
- remove_unknown_tags: bool = True,
69
- safe_attrs_only: bool = True,
70
- safe_attrs: set[str] = None,
71
- ):
72
- # Inline defaults identical to the prior CleanerConfig
73
- if safe_attrs is None:
74
- safe_attrs = {"href", "hreflang", "src", "srclang", "target", "alt", "kind", "type", "role", "abbr",
75
- "accept", "accept-charset", "datetime", "lang", "name", "rel", "title", "value", "content", "label",
76
- "item_type", "property", "itemprop"}
77
-
78
- if attr_stop_words is None:
79
- attr_stop_words = {"alert", "button", "checkbox", "dialog", "navigation", "tab", "tabpanel", "textbox",
80
- "menu", "banner", "form", "search", "progressbar", "radio", "slider", "comment", "nav", "sidebar",
81
- "breadcrumb", "dropdown", "menu-item", "toggle", "hamburger", "aside", "tooltip", "modal", "overlay",
82
- "popup", "advert", "hero", "utility", "login", "signup", "password", "email", "username"}
83
-
84
- if non_text_to_keep is None:
85
- non_text_to_keep = {"meta", "img", "picture", "figure", "figcaption", "video", "source", "audio", "table",
86
- "tr", "th", "td", "thead", "tbody", "tfoot", "caption", "br"}
87
-
88
- self.non_text_to_keep = non_text_to_keep
89
- self.attr_stop_words = attr_stop_words
90
- self.remove_header_lists = remove_header_lists
91
- self.remove_footer_lists = remove_footer_lists
92
- self.minify = minify
93
- self.minify_kwargs = dict(minify_kwargs or {})
94
-
95
- # Initialize lxml Cleaner with explicit kwargs gathered from parameters
96
- self._cleaner = Cleaner(
97
- meta=meta,
98
- page_structure=page_structure,
99
- links=links,
100
- scripts=scripts,
101
- javascript=javascript,
102
- comments=comments,
103
- style=style,
104
- processing_instructions=processing_instructions,
105
- embedded=embedded,
106
- frames=frames,
107
- forms=forms,
108
- annoying_tags=annoying_tags,
109
- kill_tags=kill_tags,
110
- remove_unknown_tags=remove_unknown_tags,
111
- safe_attrs_only=safe_attrs_only,
112
- safe_attrs=safe_attrs,
113
- )
114
-
115
- # -------------------------
116
- # User-friendly entry points
117
- # -------------------------
118
-
119
-
120
- def clean(self, *, raw_html: str | BeautifulSoup) -> str:
121
- """Clean and optionally minify HTML input.
122
-
123
- The cleaning pipeline applies pre-parse hooks (on strings), prunes elements
124
- by attribute stop words, sanitizes via lxml Cleaner, performs structural
125
- pruning of header/footer/body, then applies post-clean hooks.
126
-
127
- Parameters
128
- ----------
129
- raw_html : str or BeautifulSoup
130
- Raw HTML string or BeautifulSoup to be cleaned.
131
-
132
- Returns
133
- -------
134
- str
135
- Cleaned HTML as a string.
136
- """
137
-
138
- # Stage 0: hooks that operate on the raw string
139
- if isinstance(raw_html, str):
140
- soup = BeautifulSoup(raw_html or "", features="lxml")
141
- elif isinstance(raw_html, BeautifulSoup):
142
- soup = raw_html
143
- else:
144
- raise TypeError("raw_html must be a str or BeautifulSoup instance")
145
-
146
- # Stage 1: attribute-based pruning on the original soup
147
- # Remove small, likely non-content elements based on attribute tokens.
148
- self._strip_by_attribute_stop_words(soup=soup)
149
-
150
- # Stage 2: lxml cleaner pass (robust HTML sanitation)
151
- # Use lxml Cleaner to sanitize HTML, optionally minify afterwards.
152
- cleaned_html = self._lxml_clean(str(soup))
153
- clean_soup = BeautifulSoup(markup=cleaned_html, features="lxml")
154
-
155
- # Stage 3: structural pruning on header/body/footer of the cleaned soup
156
- self._prune_header_footer(clean_soup)
157
- self._prune_body(clean_soup)
158
- self._drop_empty_leaf_nodes(clean_soup)
159
-
160
- return str(clean_soup)
161
-
162
- # -------------------------
163
- # Internal helpers
164
- # -------------------------
165
- def _lxml_clean(self, html_str: str) -> str:
166
- """Sanitize and optionally minify HTML using lxml + minify_html.
167
-
168
- Parameters
169
- ----------
170
- html_str : str
171
- HTML markup to be cleaned.
172
-
173
- Returns
174
- -------
175
- str
176
- Cleaned (and possibly minified) HTML markup.
177
- """
178
- try:
179
- cleaned = self._cleaner.clean_html(html_str)
180
- return minify_html.minify(cleaned, **self.minify_kwargs) if self.minify else cleaned
181
- except ValueError as ex:
182
- # Handle encoding declaration edge-cases by round-tripping via lxml
183
- msg = (
184
- "Unicode strings with encoding declaration are not supported. "
185
- "Please use bytes input or XML fragments without declaration."
186
- )
187
- if str(ex) == msg:
188
- raw_bytes = html_str.encode("utf-8", errors="ignore")
189
- doc = lxml_html.fromstring(raw_bytes)
190
- cleaned = self._cleaner.clean_html(doc)
191
- rendered = lxml_html.tostring(cleaned, encoding="utf-8").decode("utf-8")
192
- return minify_html.minify(rendered, **self.minify_kwargs) if self.minify else rendered
193
- raise
194
-
195
- def _strip_by_attribute_stop_words(self, *, soup: BeautifulSoup) -> None:
196
- """Remove small, likely non-content elements by attribute tokens.
197
-
198
- Scans leaf-like descendants under ``<body>`` and collects elements whose
199
- ``id``, ``class``, ``role``, or ``item_type`` values contain any of the
200
- configured ``attr_stop_words`` tokens (case-insensitive), then decomposes
201
- them. Mirrors the baseline leaf-ness and concatenation behavior.
202
-
203
- Parameters
204
- ----------
205
- soup : BeautifulSoup
206
- Parsed document to prune in place.
207
- """
208
- body = soup.find("body") or soup
209
- to_decompose: list[Tag] = []
210
- for el in body.descendants:
211
- if not isinstance(el, Tag):
212
- continue
213
- attrs = el.attrs if isinstance(el.attrs, dict) else {}
214
- if not attrs:
215
- continue
216
- # Only prune simple leaf-ish nodes to avoid huge deletes unintentionally
217
- if sum(1 for _ in el.descendants) > 1:
218
- continue
219
- for name in ("id", "class", "role", "item_type"):
220
- val = attrs.get(name)
221
- if val is None:
222
- continue
223
- if isinstance(val, (list, tuple)):
224
- # Match baseline behavior: concatenate tokens without separator
225
- val_str = "".join(map(str, val))
226
- else:
227
- val_str = str(val)
228
- if any(sw in val_str.lower() for sw in self.attr_stop_words):
229
- to_decompose.append(el)
230
- break
231
- for el in to_decompose:
232
- el.decompose()
233
-
234
- def _prune_header_footer(self, soup: BeautifulSoup) -> None:
235
- """Prune likely navigational clutter inside header and footer.
236
-
237
- Removes common list-like elements and links inside ``<header>``/``<footer>``
238
- when the corresponding toggles are enabled.
239
- """
240
- header = soup.find("header")
241
- footer = soup.find("footer")
242
- if header and self.remove_header_lists:
243
- self._decompose_tags(header, {"a", "img", "ol", "ul", "li"})
244
- if footer and self.remove_footer_lists:
245
- self._decompose_tags(footer, {"a", "img", "ol", "ul", "li"})
246
-
247
- def _prune_body(self, soup: BeautifulSoup) -> None:
248
- body = soup.find("body") or soup
249
- always_remove = {
250
- "input", "textarea", "button", "select", "option", "optgroup", "datalist",
251
- "label", "fieldset", "legend", "output", "meter", "dialog", "form",
252
- "search", "progress", "svg", "canvas", "use", "nav", "object", "noscript",
253
- }
254
- to_decompose: list[Tag] = []
255
- for el in body.descendants:
256
- if not isinstance(el, Tag):
257
- continue
258
- if not isinstance(el.name, str):
259
- continue
260
- if el.name in self.non_text_to_keep:
261
- continue
262
- if el.name in always_remove:
263
- to_decompose.append(el)
264
- for el in to_decompose:
265
- el.decompose()
266
-
267
- def _drop_empty_leaf_nodes(self, soup: BeautifulSoup) -> None:
268
- """Iteratively remove empty leaves using the baseline's strict leaf check.
269
-
270
- Walks leaf nodes (no descendants) and removes those with no text content,
271
- excluding tags explicitly whitelisted in ``non_text_to_keep``.
272
- """
273
- body = soup.find("body") or soup
274
- while True:
275
- to_decompose: list[Tag] = []
276
- for el in body.descendants:
277
- if not isinstance(el, Tag):
278
- continue
279
- if not isinstance(el.name, str):
280
- continue
281
- if el.name in self.non_text_to_keep:
282
- continue
283
- # Baseline leaf check: element must have zero descendants at all
284
- if len(list(el.descendants)) != 0:
285
- continue
286
- # Remove if no text once stripped
287
- if (el.get_text() or "").strip():
288
- continue
289
- to_decompose.append(el)
290
- if not to_decompose:
291
- break
292
- for el in to_decompose:
293
- el.decompose()
294
-
295
- @staticmethod
296
- def _decompose_tags(root: Tag, names: set[str]) -> None:
297
- for el in list(root.descendants):
298
- if isinstance(el, Tag) and isinstance(el.name, str) and el.name in names:
299
- el.decompose()
1
+ from __future__ import annotations
2
+
3
+ import minify_html
4
+ from bs4 import BeautifulSoup, Tag
5
+ from lxml import html as lxml_html
6
+ from lxml.html.clean import Cleaner
7
+
8
+
9
+
10
+
11
+ # -------------------------
12
+ # Public API
13
+ # -------------------------
14
+ class SmolHtmlCleaner:
15
+ """
16
+ Small, dependable HTML cleaner/minifier with sensible defaults.
17
+
18
+ Parameters
19
+ ----------
20
+ non_text_to_keep : set of str, optional
21
+ Tags preserved even if textless. Default includes meta/media/table/line-break tags.
22
+ attr_stop_words : set of str, optional
23
+ Attribute tokens indicating non-content scaffolding/UX. Default contains common UI tokens.
24
+ remove_header_lists : bool, optional
25
+ Prune links/lists inside ``<header>``. Default True.
26
+ remove_footer_lists : bool, optional
27
+ Prune links/lists inside ``<footer>``. Default True.
28
+ minify : bool, optional
29
+ Minify HTML output via ``minify_html``. Default True.
30
+ minify_kwargs : dict, optional
31
+ Extra args for ``minify_html.minify``. Default empty.
32
+
33
+ lxml Cleaner parameters
34
+ -----------------------
35
+ meta : bool, optional
36
+ Remove meta tags. Default False.
37
+ page_structure : bool, optional
38
+ Remove page structure tags (html, head, body). Default False.
39
+ links : bool, optional
40
+ Remove link tags. Default True.
41
+ scripts : bool, optional
42
+ Remove script tags. Default False.
43
+ javascript : bool, optional
44
+ Remove JavaScript content. Default True.
45
+ comments : bool, optional
46
+ Remove comments. Default True.
47
+ style : bool, optional
48
+ Remove style tags. Default True.
49
+ processing_instructions : bool, optional
50
+ Remove processing instructions. Default True.
51
+ embedded : bool, optional
52
+ Remove embedded content (object, embed, applet). Default True.
53
+ frames : bool, optional
54
+ Remove frame/iframe tags. Default True.
55
+ forms : bool, optional
56
+ Remove form tags. Default True.
57
+ annoying_tags : bool, optional
58
+ Remove tags considered annoying (blink, marquee, etc). Default True.
59
+ kill_tags : set of str, optional
60
+ Additional tags to remove. Default None.
61
+ remove_unknown_tags : bool, optional
62
+ Remove unknown tags. Default True.
63
+ safe_attrs_only : bool, optional
64
+ Only keep safe attributes. Default True.
65
+ safe_attrs : set of str, optional
66
+ Set of safe attributes to keep. Default is a sensible set.
67
+
68
+ Notes
69
+ -----
70
+ Defaults and cleaning behavior are preserved; only the configuration surface
71
+ moved from a dataclass to keyword-only parameters on the constructor.
72
+ """
73
+
74
+ def __init__(
75
+ self,
76
+ *,
77
+ # Core behavior
78
+ non_text_to_keep: set[str] = None,
79
+ attr_stop_words: set[str] = None,
80
+ remove_header_lists: bool = True,
81
+ remove_footer_lists: bool = True,
82
+ # Minify
83
+ minify: bool = True,
84
+ minify_kwargs: dict | None = None,
85
+ # lxml Cleaner exposed explicitly (prefixed)
86
+ meta: bool = False,
87
+ page_structure: bool = False,
88
+ links: bool = True,
89
+ scripts: bool = False,
90
+ javascript: bool = True,
91
+ comments: bool = True,
92
+ style: bool = True,
93
+ processing_instructions: bool = True,
94
+ embedded: bool = True,
95
+ frames: bool = True,
96
+ forms: bool = True,
97
+ annoying_tags: bool = True,
98
+ kill_tags: set[str] | None = None,
99
+ remove_unknown_tags: bool = True,
100
+ safe_attrs_only: bool = True,
101
+ safe_attrs: set[str] = None,
102
+ ):
103
+ # Inline defaults identical to the prior CleanerConfig
104
+ if safe_attrs is None:
105
+ safe_attrs = {"href", "hreflang", "src", "srclang", "target", "alt", "kind", "type", "role", "abbr",
106
+ "accept", "accept-charset", "datetime", "lang", "name", "rel", "title", "value", "content", "label",
107
+ "item_type", "property", "itemprop"}
108
+
109
+ if attr_stop_words is None:
110
+ attr_stop_words = {"alert", "button", "checkbox", "dialog", "navigation", "tab", "tabpanel", "textbox",
111
+ "menu", "banner", "form", "search", "progressbar", "radio", "slider", "comment", "nav", "sidebar",
112
+ "breadcrumb", "dropdown", "menu-item", "toggle", "hamburger", "aside", "tooltip", "modal", "overlay",
113
+ "popup", "advert", "hero", "utility", "login", "signup", "password", "email", "username"}
114
+
115
+ if non_text_to_keep is None:
116
+ non_text_to_keep = {"meta", "img", "picture", "figure", "figcaption", "video", "source", "audio", "table",
117
+ "tr", "th", "td", "thead", "tbody", "tfoot", "caption", "br"}
118
+
119
+ self.non_text_to_keep = non_text_to_keep
120
+ self.attr_stop_words = attr_stop_words
121
+ self.remove_header_lists = remove_header_lists
122
+ self.remove_footer_lists = remove_footer_lists
123
+ self.minify = minify
124
+ self.minify_kwargs = dict(minify_kwargs or {})
125
+
126
+ # Initialize lxml Cleaner with explicit kwargs gathered from parameters
127
+ self._cleaner = Cleaner(
128
+ meta=meta,
129
+ page_structure=page_structure,
130
+ links=links,
131
+ scripts=scripts,
132
+ javascript=javascript,
133
+ comments=comments,
134
+ style=style,
135
+ processing_instructions=processing_instructions,
136
+ embedded=embedded,
137
+ frames=frames,
138
+ forms=forms,
139
+ annoying_tags=annoying_tags,
140
+ kill_tags=kill_tags,
141
+ remove_unknown_tags=remove_unknown_tags,
142
+ safe_attrs_only=safe_attrs_only,
143
+ safe_attrs=safe_attrs,
144
+ )
145
+
146
+ # -------------------------
147
+ # User-friendly entry points
148
+ # -------------------------
149
+
150
+
151
+ def make_smol(self, *, raw_html: str | BeautifulSoup) -> str:
152
+ """Clean and optionally minify HTML input.
153
+
154
+ The cleaning pipeline prunes elements by attribute stop words, sanitizes
155
+ the markup via the lxml Cleaner, and then performs structural pruning of
156
+ the header, footer, and body of the cleaned document.
157
+
158
+ Parameters
159
+ ----------
160
+ raw_html : str or BeautifulSoup
161
+ Raw HTML string or BeautifulSoup to be cleaned.
162
+
163
+ Returns
164
+ -------
165
+ str
166
+ Cleaned HTML as a string.
167
+ """
168
+
169
+ # Stage 0: hooks that operate on the raw string
170
+ if isinstance(raw_html, str):
171
+ soup = BeautifulSoup(raw_html or "", features="lxml")
172
+ elif isinstance(raw_html, BeautifulSoup):
173
+ soup = raw_html
174
+ else:
175
+ raise TypeError("raw_html must be a str or BeautifulSoup instance")
176
+
177
+ # Stage 1: attribute-based pruning on the original soup
178
+ # Remove small, likely non-content elements based on attribute tokens.
179
+ self._strip_by_attribute_stop_words(soup=soup)
180
+
181
+ # Stage 2: lxml cleaner pass (robust HTML sanitation)
182
+ # Use lxml Cleaner to sanitize HTML, optionally minify afterwards.
183
+ cleaned_html = self._lxml_clean(str(soup))
184
+ clean_soup = BeautifulSoup(markup=cleaned_html, features="lxml")
185
+
186
+ # Stage 3: structural pruning on header/body/footer of the cleaned soup
187
+ self._prune_header_footer(clean_soup)
188
+ self._prune_body(clean_soup)
189
+ self._drop_empty_leaf_nodes(clean_soup)
190
+
191
+ return str(clean_soup)
192
+
193
+
194
+ def make_smol_bytes(self, *,
195
+ raw_html: str | BeautifulSoup,
196
+ compression_level: int = 5,
197
+ ) -> bytes:
198
+ """Return cleaned HTML as bytes, optionally Brotli-compressed.
199
+
200
+ If ``compression_level`` is 0, returns UTF-8 encoded bytes without compression.
201
+ For ``compression_level`` > 0, compresses the bytes using Brotli.
202
+
203
+ Parameters
204
+ ----------
205
+ raw_html : str or BeautifulSoup
206
+ Raw HTML to clean.
207
+ compression_level : int, optional
208
+ Brotli quality/level (0-11). 0 disables compression. Default 5.
209
+ Values greater than 0 require the ``brotli`` or ``brotlicffi``
210
+ package to be installed.
211
+
212
+ Returns
213
+ -------
214
+ bytes
215
+ Cleaned (and possibly compressed) HTML as bytes.
216
+ """
217
+ html = self.make_smol(raw_html=raw_html)
218
+ data = html.encode("utf-8")
219
+
220
+ if compression_level <= 0:
221
+ return data
222
+
223
+ try:
224
+ import brotli as _brotli # type: ignore
225
+ except Exception as exc: # pragma: no cover - import-time dependency
226
+ raise RuntimeError(
227
+ "Brotli is required for compression. Install 'brotli' or 'brotlicffi', "
228
+ "or call with compression_level=0."
229
+ ) from exc
230
+
231
+ # Prefer TEXT mode if available for HTML content; fall back gracefully.
232
+ mode = getattr(_brotli, "MODE_TEXT", None)
233
+ if mode is None:
234
+ mode = getattr(_brotli, "BROTLI_MODE_TEXT", None)
235
+
236
+ if mode is not None:
237
+ return _brotli.compress(data, quality=int(compression_level), mode=mode)
238
+ return _brotli.compress(data, quality=int(compression_level))
239
+
240
+ # -------------------------
241
+ # Internal helpers
242
+ # -------------------------
243
+ def _lxml_clean(self, html_str: str) -> str:
244
+ """Sanitize and optionally minify HTML using lxml + minify_html.
245
+
246
+ Parameters
247
+ ----------
248
+ html_str : str
249
+ HTML markup to be cleaned.
250
+
251
+ Returns
252
+ -------
253
+ str
254
+ Cleaned (and possibly minified) HTML markup.
255
+ """
256
+ try:
257
+ cleaned = self._cleaner.clean_html(html_str)
258
+ return minify_html.minify(cleaned, **self.minify_kwargs) if self.minify else cleaned
259
+ except ValueError as ex:
260
+ # Handle encoding declaration edge-cases by round-tripping via lxml
261
+ msg = (
262
+ "Unicode strings with encoding declaration are not supported. "
263
+ "Please use bytes input or XML fragments without declaration."
264
+ )
265
+ if str(ex) == msg:
266
+ raw_bytes = html_str.encode("utf-8", errors="ignore")
267
+ doc = lxml_html.fromstring(raw_bytes)
268
+ cleaned = self._cleaner.clean_html(doc)
269
+ rendered = lxml_html.tostring(cleaned, encoding="utf-8").decode("utf-8")
270
+ return minify_html.minify(rendered, **self.minify_kwargs) if self.minify else rendered
271
+ raise
272
+
273
+ def _strip_by_attribute_stop_words(self, *, soup: BeautifulSoup) -> None:
274
+ """Remove small, likely non-content elements by attribute tokens.
275
+
276
+ Scans leaf-like descendants under ``<body>`` and collects elements whose
277
+ ``id``, ``class``, ``role``, or ``item_type`` values contain any of the
278
+ configured ``attr_stop_words`` tokens (case-insensitive), then decomposes
279
+ them. Mirrors the baseline leaf-ness and concatenation behavior.
280
+
281
+ Parameters
282
+ ----------
283
+ soup : BeautifulSoup
284
+ Parsed document to prune in place.
285
+ """
286
+ body = soup.find("body") or soup
287
+ to_decompose: list[Tag] = []
288
+ for el in body.descendants:
289
+ if not isinstance(el, Tag):
290
+ continue
291
+ attrs = el.attrs if isinstance(el.attrs, dict) else {}
292
+ if not attrs:
293
+ continue
294
+ # Only prune simple leaf-ish nodes to avoid huge deletes unintentionally
295
+ if sum(1 for _ in el.descendants) > 1:
296
+ continue
297
+ for name in ("id", "class", "role", "item_type"):
298
+ val = attrs.get(name)
299
+ if val is None:
300
+ continue
301
+ if isinstance(val, (list, tuple)):
302
+ # Match baseline behavior: concatenate tokens without separator
303
+ val_str = "".join(map(str, val))
304
+ else:
305
+ val_str = str(val)
306
+ if any(sw in val_str.lower() for sw in self.attr_stop_words):
307
+ to_decompose.append(el)
308
+ break
309
+ for el in to_decompose:
310
+ el.decompose()
311
+
312
+ def _prune_header_footer(self, soup: BeautifulSoup) -> None:
313
+ """Prune likely navigational clutter inside header and footer.
314
+
315
+ Removes common list-like elements and links inside ``<header>``/``<footer>``
316
+ when the corresponding toggles are enabled.
317
+ """
318
+ header = soup.find("header")
319
+ footer = soup.find("footer")
320
+ if header and self.remove_header_lists:
321
+ self._decompose_tags(header, {"a", "img", "ol", "ul", "li"})
322
+ if footer and self.remove_footer_lists:
323
+ self._decompose_tags(footer, {"a", "img", "ol", "ul", "li"})
324
+
325
+ def _prune_body(self, soup: BeautifulSoup) -> None:
326
+ body = soup.find("body") or soup
327
+ always_remove = {
328
+ "input", "textarea", "button", "select", "option", "optgroup", "datalist",
329
+ "label", "fieldset", "legend", "output", "meter", "dialog", "form",
330
+ "search", "progress", "svg", "canvas", "use", "nav", "object", "noscript",
331
+ }
332
+ to_decompose: list[Tag] = []
333
+ for el in body.descendants:
334
+ if not isinstance(el, Tag):
335
+ continue
336
+ if not isinstance(el.name, str):
337
+ continue
338
+ if el.name in self.non_text_to_keep:
339
+ continue
340
+ if el.name in always_remove:
341
+ to_decompose.append(el)
342
+ for el in to_decompose:
343
+ el.decompose()
344
+
345
+ def _drop_empty_leaf_nodes(self, soup: BeautifulSoup) -> None:
346
+ """Iteratively remove empty leaves using the baseline's strict leaf check.
347
+
348
+ Walks leaf nodes (no descendants) and removes those with no text content,
349
+ excluding tags explicitly whitelisted in ``non_text_to_keep``.
350
+ """
351
+ body = soup.find("body") or soup
352
+ while True:
353
+ to_decompose: list[Tag] = []
354
+ for el in body.descendants:
355
+ if not isinstance(el, Tag):
356
+ continue
357
+ if not isinstance(el.name, str):
358
+ continue
359
+ if el.name in self.non_text_to_keep:
360
+ continue
361
+ # Baseline leaf check: element must have zero descendants at all
362
+ if len(list(el.descendants)) != 0:
363
+ continue
364
+ # Remove if no text once stripped
365
+ if (el.get_text() or "").strip():
366
+ continue
367
+ to_decompose.append(el)
368
+ if not to_decompose:
369
+ break
370
+ for el in to_decompose:
371
+ el.decompose()
372
+
373
+ @staticmethod
374
+ def _decompose_tags(root: Tag, names: set[str]) -> None:
375
+ for el in list(root.descendants):
376
+ if isinstance(el, Tag) and isinstance(el.name, str) and el.name in names:
377
+ el.decompose()
@@ -0,0 +1,200 @@
1
+ Metadata-Version: 2.4
2
+ Name: smol-html
3
+ Version: 0.1.2
4
+ Summary: Small, dependable HTML cleaner/minifier with sensible defaults
5
+ Project-URL: Homepage, https://github.com/NosibleAI/smol-html
6
+ Project-URL: Repository, https://github.com/NosibleAI/smol-html
7
+ Project-URL: Issues, https://github.com/NosibleAI/smol-html/issues
8
+ Author-email: Gareth Warburton <garethw738@gmail.com>, Stuart Reid <stuart@nosible.com>, Matthew Dicks <matthew@nosible.com>, Richard Taylor <richard@nosible.com>
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3 :: Only
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
23
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
24
+ Requires-Python: >=3.9
25
+ Requires-Dist: beautifulsoup4>=4.0.1
26
+ Requires-Dist: brotli>=0.5.2
27
+ Requires-Dist: lxml[html-clean]>=1.3.2
28
+ Requires-Dist: minify-html>=0.2.6
29
+ Description-Content-Type: text/markdown
30
+
31
+ ![smol](smol.png)
32
+
33
+
34
+ # smol-html
35
+
36
+ Small, dependable HTML cleaner/minifier with sensible defaults.
37
+
38
+ ### 📦 Installation
39
+
40
+ ```bash
41
+ pip install smol-html
42
+ ```
43
+
44
+ ### ⚡ Installing with uv
45
+
46
+ ```bash
47
+ uv pip install smol-html
48
+ ```
49
+
50
+ ## Quick Start
51
+
52
+ Clean an HTML string (or page contents):
53
+
54
+ ```python
55
+ from smol_html import SmolHtmlCleaner
56
+
57
+ html = """
58
+ <html>
59
+ <head><title> Example </title></head>
60
+ <body>
61
+ <div> Hello <span> world </span> </div>
62
+ </body>
63
+ </html>
64
+ """
65
+
66
+ # All constructor arguments are keyword-only and optional.
67
+ cleaner = SmolHtmlCleaner()
68
+ cleaned = cleaner.make_smol(raw_html=html)
69
+
70
+ print(cleaned)
71
+ ```
72
+
73
+ ## Customization
74
+
75
+ `SmolHtmlCleaner` exposes keyword-only parameters with practical defaults. You can:
76
+ - Pass overrides to the constructor, or
77
+ - Adjust attributes on the instance after creation.
78
+
79
+ ```python
80
+ from smol_html import SmolHtmlCleaner
81
+
82
+ cleaner = SmolHtmlCleaner()
83
+ cleaner.attr_stop_words.add("advert") # e.g., add a custom stop word
84
+ ```
85
+
86
+ ## Usage Examples
87
+
88
+ Minimal:
89
+
90
+ ```python
91
+ from smol_html import SmolHtmlCleaner
92
+
93
+ cleaner = SmolHtmlCleaner()
94
+ out = cleaner.make_smol(raw_html="<p>Hi <!-- note --> <a href='x'>link</a></p>")
95
+ ```
96
+
97
+ Customize a few options:
98
+
99
+ ```python
100
+ from smol_html import SmolHtmlCleaner
101
+
102
+ cleaner = SmolHtmlCleaner(
103
+ attr_stop_words={"nav", "advert"},
104
+ remove_header_lists=False,
105
+ minify=True,
106
+ )
107
+
108
+ out = cleaner.make_smol(raw_html="<p>Hi</p>")
109
+ ```
110
+
111
+ ## Compressed Bytes Output
112
+
113
+ Produce compressed bytes using Brotli with `make_smol_bytes`
114
+
115
+
116
+ ```python
117
+ from smol_html import SmolHtmlCleaner
118
+ import brotli # only needed if you want to decompress here in the example
119
+
120
+ html = """
121
+ <html>
122
+ <body>
123
+ <div> Hello <span> world </span> </div>
124
+ </body>
125
+ </html>
126
+ """
127
+
128
+ cleaner = SmolHtmlCleaner()
129
+
130
+ # Get compressed bytes (quality 11 is strong compression)
131
+ compressed = cleaner.make_smol_bytes(raw_html=html, compression_level=11)
132
+
133
+ # Example: decompress back to text to inspect (optional)
134
+ decompressed = brotli.decompress(compressed).decode("utf-8")
135
+ print(decompressed)
136
+
137
+ # Or write compressed output directly to a file
138
+ with open("page.html.br", "wb") as f:
139
+ f.write(compressed)
140
+ ```
141
+
142
+ ## Parameter Reference
143
+
144
+ To improve readability, the reference is split into two tables:
145
+ - What it does and when to change
146
+ - Types and default values
147
+
148
+ ### What It Does
149
+
150
+ | Parameter | What it does | When to change |
151
+ |---|---|---|
152
+ | `non_text_to_keep` | Whitelist of empty/non-text tags to preserve (e.g., images, figures, tables, line breaks). | If important non-text elements are being removed or you want to keep/drop more empty tags. |
153
+ | `attr_stop_words` | Tokens matched against `id`/`class`/`role`/`item_type` on small elements; matches are removed as likely non-content. | Add tokens like `advert`, `hero`, `menu` to aggressively drop UI chrome, or remove tokens if content is lost. |
154
+ | `remove_header_lists` | Removes links/lists/images within `<header>` to reduce nav clutter. | Set `False` if your header contains meaningful content you want to keep. |
155
+ | `remove_footer_lists` | Removes links/lists/images within `<footer>` to reduce boilerplate. | Set `False` for content-heavy footers you need. |
156
+ | `minify` | Minifies output HTML using `minify_html`. | Set `False` for readability or debugging; use `--pretty` in the CLI. |
157
+ | `minify_kwargs` | Extra options passed to `minify_html.minify`. | Tune minification behavior (e.g., whitespace, comments) without changing cleaning. |
158
+ | `meta` | lxml Cleaner option: remove `<meta>` content when `True`. | Usually leave `False`; enable only for strict sanitation. |
159
+ | `page_structure` | lxml Cleaner option: remove page-structure tags (e.g., `<head>`, `<body>`) when `True`. | Rarely needed; keep `False` to preserve structure. |
160
+ | `links` | lxml Cleaner option: sanitize/clean links. | Leave `True` unless you need raw anchors untouched. |
161
+ | `scripts` | lxml Cleaner option: remove `<script>` tags when `True`. | Keep `False` to preserve scripts; usually safe to remove via `javascript=True` anyway. |
162
+ | `javascript` | lxml Cleaner option: remove JS and event handlers. | Set `False` only if you truly need inline JS (not recommended). |
163
+ | `comments` | lxml Cleaner option: remove HTML comments. | Set `False` to retain comments for debugging. |
164
+ | `style` | lxml Cleaner option: remove CSS and style attributes. | Set `False` to keep inline styles/CSS. |
165
+ | `processing_instructions` | lxml Cleaner option: remove processing instructions. | Rarely change; keep for safety. |
166
+ | `embedded` | lxml Cleaner option: remove embedded content (e.g., `<embed>`, `<object>`). | Set `False` to keep embedded media. |
167
+ | `frames` | lxml Cleaner option: remove frames/iframes. | Set `False` if iframes contain needed content. |
168
+ | `forms` | lxml Cleaner option: remove form elements. | Set `False` if you need to keep forms/inputs. |
169
+ | `annoying_tags` | lxml Cleaner option: remove tags considered "annoying" by lxml (e.g., `<blink>`, `<marquee>`). | Rarely change. |
170
+ | `kill_tags` | Additional explicit tags to remove entirely. | Add site-specific or custom tags to drop. |
171
+ | `remove_unknown_tags` | lxml Cleaner option: drop unknown/invalid tags. | Set `False` if you rely on custom elements. |
172
+ | `safe_attrs_only` | Only allow attributes listed in `safe_attrs`. | Set `False` if you need to keep arbitrary attributes. |
173
+ | `safe_attrs` | Allowed HTML attributes when `safe_attrs_only=True`. | Extend to keep additional attributes you trust. |
174
+
175
+ ### Types and Defaults
176
+
177
+ | Parameter | Type | Default |
178
+ |---|---|---|
179
+ | `non_text_to_keep` | `set[str]` | media/meta/table/`br` tags |
180
+ | `attr_stop_words` | `set[str]` | common UI/navigation tokens |
181
+ | `remove_header_lists` | `bool` | `True` |
182
+ | `remove_footer_lists` | `bool` | `True` |
183
+ | `minify` | `bool` | `True` |
184
+ | `minify_kwargs` | `dict` | `{}` |
185
+ | `meta` | `bool` | `False` |
186
+ | `page_structure` | `bool` | `False` |
187
+ | `links` | `bool` | `True` |
188
+ | `scripts` | `bool` | `False` |
189
+ | `javascript` | `bool` | `True` |
190
+ | `comments` | `bool` | `True` |
191
+ | `style` | `bool` | `True` |
192
+ | `processing_instructions` | `bool` | `True` |
193
+ | `embedded` | `bool` | `True` |
194
+ | `frames` | `bool` | `True` |
195
+ | `forms` | `bool` | `True` |
196
+ | `annoying_tags` | `bool` | `True` |
197
+ | `kill_tags` | `set[str] &#124; None` | `None` |
198
+ | `remove_unknown_tags` | `bool` | `True` |
199
+ | `safe_attrs_only` | `bool` | `True` |
200
+ | `safe_attrs` | `set[str]` | curated set |
@@ -0,0 +1,6 @@
1
+ smol_html/__init__.py,sha256=L_Clggminog8SY3a9TiN57ZajkYAQmXqdY8TOGEut2A,112
2
+ smol_html/smol_html.py,sha256=f9VGdIS3PTKfce3WpRQlhwMf7OEYOUWOlip8DI3dGV4,15194
3
+ smol_html-0.1.2.dist-info/METADATA,sha256=RHzSayuHJ0JalYqWxrI3lWcLfI3gi6BGwi8hEEY0m3A,7747
4
+ smol_html-0.1.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
5
+ smol_html-0.1.2.dist-info/licenses/LICENSE,sha256=88yg3BujRGq8MYlWhbrzB2YMNWJaXnBck3c7l23labs,1089
6
+ smol_html-0.1.2.dist-info/RECORD,,
@@ -1,127 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: smol-html
3
- Version: 0.1.0
4
- Summary: Small, dependable HTML cleaner/minifier with sensible defaults
5
- Project-URL: Homepage, https://github.com/NosibleAI/smol-html
6
- Project-URL: Repository, https://github.com/NosibleAI/smol-html
7
- Project-URL: Issues, https://github.com/NosibleAI/smol-html/issues
8
- Author-email: Gareth Warburton <garethw738@gmail.com>, Stuart Reid <stuart@nosible.com>, Matthew Dicks <matthew@nosible.com>, Richard Taylor <richard@nosible.com>
9
- License: MIT
10
- License-File: LICENSE
11
- Classifier: Development Status :: 4 - Beta
12
- Classifier: Intended Audience :: Developers
13
- Classifier: License :: OSI Approved :: MIT License
14
- Classifier: Operating System :: OS Independent
15
- Classifier: Programming Language :: Python :: 3
16
- Classifier: Programming Language :: Python :: 3 :: Only
17
- Classifier: Programming Language :: Python :: 3.9
18
- Classifier: Programming Language :: Python :: 3.10
19
- Classifier: Programming Language :: Python :: 3.11
20
- Classifier: Programming Language :: Python :: 3.12
21
- Classifier: Programming Language :: Python :: 3.13
22
- Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
23
- Classifier: Topic :: Software Development :: Libraries :: Python Modules
24
- Requires-Python: >=3.9
25
- Requires-Dist: beautifulsoup4>=4.13.5
26
- Requires-Dist: lxml[html-clean]>=6.0.1
27
- Requires-Dist: minify-html>=0.16.4
28
- Description-Content-Type: text/markdown
29
-
30
- # smol-html
31
-
32
- Small, dependable HTML cleaner/minifier with sensible defaults.
33
-
34
- ## Installation
35
-
36
- - pip: `pip install smol-html`
37
- - uv: `uv pip install smol-html`
38
-
39
- ## Quick Start
40
-
41
- Clean an HTML string (or page contents):
42
-
43
- ```python
44
- from smol_html import SmolHtmlCleaner
45
-
46
- html = """
47
- <html>
48
- <head><title> Example </title></head>
49
- <body>
50
- <div> Hello <span> world </span> </div>
51
- </body>
52
- </html>
53
- """
54
-
55
- # All constructor arguments are keyword-only and optional.
56
- cleaner = SmolHtmlCleaner()
57
- cleaned = cleaner.clean(raw_html=html)
58
-
59
- print(cleaned)
60
- ```
61
-
62
- ## Customization
63
-
64
- `SmolHtmlCleaner` exposes keyword-only parameters with practical defaults. You can:
65
- - Pass overrides to the constructor, or
66
- - Adjust attributes on the instance after creation.
67
-
68
- ```python
69
- from smol_html import SmolHtmlCleaner
70
-
71
- cleaner = SmolHtmlCleaner()
72
- cleaner.attr_stop_words.add("advert") # e.g., add a custom stop word
73
- ```
74
-
75
- ## Usage Examples
76
-
77
- Minimal:
78
-
79
- ```python
80
- from smol_html import SmolHtmlCleaner
81
-
82
- cleaner = SmolHtmlCleaner()
83
- out = cleaner.clean(raw_html="<p>Hi <!-- note --> <a href='x'>link</a></p>")
84
- ```
85
-
86
- Customize a few options:
87
-
88
- ```python
89
- from smol_html import SmolHtmlCleaner
90
-
91
- cleaner = SmolHtmlCleaner(
92
- attr_stop_words={"nav", "advert"},
93
- remove_header_lists=False,
94
- minify=True,
95
- )
96
-
97
- out = cleaner.clean(raw_html="<p>Hi</p>")
98
- ```
99
-
100
- ## Parameter Reference
101
-
102
- The most useful parameters, what they do, and when to change them:
103
-
104
- | Parameter | Type | Default | What it does | When to change |
105
- |---|---|---|---|---|
106
- | `non_text_to_keep` | `set[str]` | media/meta/table/`br` tags | Whitelist of empty/non-text tags to preserve (e.g., images, figures, tables, line breaks). | If important non-text elements are being removed or you want to keep/drop more empty tags. |
107
- | `attr_stop_words` | `set[str]` | common UI/navigation tokens | Tokens matched against `id`/`class`/`role`/`item_type` on small elements; matches are removed as likely non-content. | Add tokens like `advert`, `hero`, `menu` to aggressively drop UI chrome, or remove tokens if content is lost. |
108
- | `remove_header_lists` | `bool` | `True` | Removes links/lists/images within `<header>` to reduce nav clutter. | Set `False` if your header contains meaningful content you want to keep. |
109
- | `remove_footer_lists` | `bool` | `True` | Removes links/lists/images within `<footer>` to reduce boilerplate. | Set `False` for content-heavy footers you need. |
110
- | `minify` | `bool` | `True` | Minifies output HTML using `minify_html`. | Set `False` for readability or debugging; use `--pretty` in the CLI. |
111
- | `minify_kwargs` | `dict` | `{}` | Extra options passed to `minify_html.minify`. | Tune minification behavior (e.g., whitespace, comments) without changing cleaning. |
112
- | `meta` | `bool` | `False` | lxml Cleaner option: remove `<meta>` content when `True`. | Usually leave `False`; enable only for strict sanitation. |
113
- | `page_structure` | `bool` | `False` | lxml Cleaner option: remove page-structure tags (e.g., `<head>`, `<body>`) when `True`. | Rarely needed; keep `False` to preserve structure. |
114
- | `links` | `bool` | `True` | lxml Cleaner option: sanitize/clean links. | Leave `True` unless you need raw anchors untouched. |
115
- | `scripts` | `bool` | `False` | lxml Cleaner option: remove `<script>` tags when `True`. | Keep `False` to preserve scripts; usually safe to remove via `javascript=True` anyway. |
116
- | `javascript` | `bool` | `True` | lxml Cleaner option: remove JS and event handlers. | Set `False` only if you truly need inline JS (not recommended). |
117
- | `comments` | `bool` | `True` | lxml Cleaner option: remove HTML comments. | Set `False` to retain comments for debugging. |
118
- | `style` | `bool` | `True` | lxml Cleaner option: remove CSS and style attributes. | Set `False` to keep inline styles/CSS. |
119
- | `processing_instructions` | `bool` | `True` | lxml Cleaner option: remove processing instructions. | Rarely change; keep for safety. |
120
- | `embedded` | `bool` | `True` | lxml Cleaner option: remove embedded content (e.g., `<embed>`, `<object>`). | Set `False` to keep embedded media. |
121
- | `frames` | `bool` | `True` | lxml Cleaner option: remove frames/iframes. | Set `False` if iframes contain needed content. |
122
- | `forms` | `bool` | `True` | lxml Cleaner option: remove form elements. | Set `False` if you need to keep forms/inputs. |
123
- | `annoying_tags` | `bool` | `True` | lxml Cleaner option: remove tags considered "annoying" by lxml (e.g., `<blink>`, `<marquee>`). | Rarely change. |
124
- | `kill_tags` | `set[str] | None` | `None` | Additional explicit tags to remove entirely. | Add site-specific or custom tags to drop. |
125
- | `remove_unknown_tags` | `bool` | `True` | lxml Cleaner option: drop unknown/invalid tags. | Set `False` if you rely on custom elements. |
126
- | `safe_attrs_only` | `bool` | `True` | Only allow attributes listed in `safe_attrs`. | Set `False` if you need to keep arbitrary attributes. |
127
- | `safe_attrs` | `set[str]` | curated set | Allowed HTML attributes when `safe_attrs_only=True`. | Extend to keep additional attributes you trust. |
@@ -1,6 +0,0 @@
1
- smol_html/__init__.py,sha256=ea_KigbtSifgJYhoYvaco0CszvdBEX08BuT5qKILY7k,106
2
- smol_html/core.py,sha256=oJ7YAjocTVouCdmnxZYmIvx_30G5427uQn-nF8Q-rng,12095
3
- smol_html-0.1.0.dist-info/METADATA,sha256=CWySJr1U5V34e9i-x9BtQM64sKZ0ZbCOQI-5HYBZMr4,6390
4
- smol_html-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
5
- smol_html-0.1.0.dist-info/licenses/LICENSE,sha256=88yg3BujRGq8MYlWhbrzB2YMNWJaXnBck3c7l23labs,1089
6
- smol_html-0.1.0.dist-info/RECORD,,