smol-html 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {smol_html-0.1.2.dist-info → smol_html-0.1.3.dist-info}/METADATA +7 -1
- smol_html-0.1.3.dist-info/RECORD +4 -0
- smol_html/__init__.py +0 -4
- smol_html/smol_html.py +0 -377
- smol_html-0.1.2.dist-info/RECORD +0 -6
- {smol_html-0.1.2.dist-info → smol_html-0.1.3.dist-info}/WHEEL +0 -0
- {smol_html-0.1.2.dist-info → smol_html-0.1.3.dist-info}/licenses/LICENSE +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: smol-html
|
3
|
-
Version: 0.1.2
|
3
|
+
Version: 0.1.3
|
4
4
|
Summary: Small, dependable HTML cleaner/minifier with sensible defaults
|
5
5
|
Project-URL: Homepage, https://github.com/NosibleAI/smol-html
|
6
6
|
Project-URL: Repository, https://github.com/NosibleAI/smol-html
|
@@ -35,6 +35,12 @@ Description-Content-Type: text/markdown
|
|
35
35
|
|
36
36
|
Small, dependable HTML cleaner/minifier with sensible defaults.
|
37
37
|
|
38
|
+
## Motivation
|
39
|
+
|
40
|
+
Nosible is a search engine, which means we need to store and process a very large number of webpages. To make this tractable, we strip out visual chrome and other non-essential components that don’t matter for downstream tasks (indexing, ranking, retrieval, and LLM pipelines) while preserving the important content and structure. This package cleans and minifies HTML, greatly reducing size on disk; combined with Brotli compression (by Google), the savings are even larger.
|
41
|
+
|
42
|
+

|
43
|
+
|
38
44
|
### 📦 Installation
|
39
45
|
|
40
46
|
```bash
|
@@ -0,0 +1,4 @@
|
|
1
|
+
smol_html-0.1.3.dist-info/METADATA,sha256=MApb1E7-tzyEYuRymzRjUTg8TD14vFBsfceTsY07r3s,8279
|
2
|
+
smol_html-0.1.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
3
|
+
smol_html-0.1.3.dist-info/licenses/LICENSE,sha256=88yg3BujRGq8MYlWhbrzB2YMNWJaXnBck3c7l23labs,1089
|
4
|
+
smol_html-0.1.3.dist-info/RECORD,,
|
smol_html/__init__.py
DELETED
smol_html/smol_html.py
DELETED
@@ -1,377 +0,0 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
|
3
|
-
import minify_html
|
4
|
-
from bs4 import BeautifulSoup, Tag
|
5
|
-
from lxml import html as lxml_html
|
6
|
-
from lxml.html.clean import Cleaner
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
# -------------------------
|
12
|
-
# Public API
|
13
|
-
# -------------------------
|
14
|
-
class SmolHtmlCleaner:
|
15
|
-
"""
|
16
|
-
Small, dependable HTML cleaner/minifier with sensible defaults.
|
17
|
-
|
18
|
-
Parameters
|
19
|
-
----------
|
20
|
-
non_text_to_keep : set of str, optional
|
21
|
-
Tags preserved even if textless. Default includes meta/media/table/line-break tags.
|
22
|
-
attr_stop_words : set of str, optional
|
23
|
-
Attribute tokens indicating non-content scaffolding/UX. Default contains common UI tokens.
|
24
|
-
remove_header_lists : bool, optional
|
25
|
-
Prune links/lists inside ``<header>``. Default True.
|
26
|
-
remove_footer_lists : bool, optional
|
27
|
-
Prune links/lists inside ``<footer>``. Default True.
|
28
|
-
minify : bool, optional
|
29
|
-
Minify HTML output via ``minify_html``. Default True.
|
30
|
-
minify_kwargs : dict, optional
|
31
|
-
Extra args for ``minify_html.minify``. Default empty.
|
32
|
-
|
33
|
-
lxml Cleaner parameters
|
34
|
-
----------------------
|
35
|
-
meta : bool, optional
|
36
|
-
Remove meta tags. Default False.
|
37
|
-
page_structure : bool, optional
|
38
|
-
Remove page structure tags (html, head, body). Default False.
|
39
|
-
links : bool, optional
|
40
|
-
Remove link tags. Default True.
|
41
|
-
scripts : bool, optional
|
42
|
-
Remove script tags. Default False.
|
43
|
-
javascript : bool, optional
|
44
|
-
Remove JavaScript content. Default True.
|
45
|
-
comments : bool, optional
|
46
|
-
Remove comments. Default True.
|
47
|
-
style : bool, optional
|
48
|
-
Remove style tags. Default True.
|
49
|
-
processing_instructions : bool, optional
|
50
|
-
Remove processing instructions. Default True.
|
51
|
-
embedded : bool, optional
|
52
|
-
Remove embedded content (object, embed, applet). Default True.
|
53
|
-
frames : bool, optional
|
54
|
-
Remove frame/iframe tags. Default True.
|
55
|
-
forms : bool, optional
|
56
|
-
Remove form tags. Default True.
|
57
|
-
annoying_tags : bool, optional
|
58
|
-
Remove tags considered annoying (blink, marquee, etc). Default True.
|
59
|
-
kill_tags : set of str, optional
|
60
|
-
Additional tags to remove. Default None.
|
61
|
-
remove_unknown_tags : bool, optional
|
62
|
-
Remove unknown tags. Default True.
|
63
|
-
safe_attrs_only : bool, optional
|
64
|
-
Only keep safe attributes. Default True.
|
65
|
-
safe_attrs : set of str, optional
|
66
|
-
Set of safe attributes to keep. Default is a sensible set.
|
67
|
-
|
68
|
-
Notes
|
69
|
-
-----
|
70
|
-
Defaults and cleaning behavior are preserved; only the configuration surface
|
71
|
-
moved from a dataclass to keyword-only parameters on the constructor.
|
72
|
-
"""
|
73
|
-
|
74
|
-
def __init__(
|
75
|
-
self,
|
76
|
-
*,
|
77
|
-
# Core behavior
|
78
|
-
non_text_to_keep: set[str] = None,
|
79
|
-
attr_stop_words: set[str] = None,
|
80
|
-
remove_header_lists: bool = True,
|
81
|
-
remove_footer_lists: bool = True,
|
82
|
-
# Minify
|
83
|
-
minify: bool = True,
|
84
|
-
minify_kwargs: dict | None = None,
|
85
|
-
# lxml Cleaner exposed explicitly (prefixed)
|
86
|
-
meta: bool = False,
|
87
|
-
page_structure: bool = False,
|
88
|
-
links: bool = True,
|
89
|
-
scripts: bool = False,
|
90
|
-
javascript: bool = True,
|
91
|
-
comments: bool = True,
|
92
|
-
style: bool = True,
|
93
|
-
processing_instructions: bool = True,
|
94
|
-
embedded: bool = True,
|
95
|
-
frames: bool = True,
|
96
|
-
forms: bool = True,
|
97
|
-
annoying_tags: bool = True,
|
98
|
-
kill_tags: set[str] | None = None,
|
99
|
-
remove_unknown_tags: bool = True,
|
100
|
-
safe_attrs_only: bool = True,
|
101
|
-
safe_attrs: set[str] = None,
|
102
|
-
):
|
103
|
-
# Inline defaults identical to the prior CleanerConfig
|
104
|
-
if safe_attrs is None:
|
105
|
-
safe_attrs = {"href", "hreflang", "src", "srclang", "target", "alt", "kind", "type", "role", "abbr",
|
106
|
-
"accept", "accept-charset", "datetime", "lang", "name", "rel", "title", "value", "content", "label",
|
107
|
-
"item_type", "property", "itemprop"}
|
108
|
-
|
109
|
-
if attr_stop_words is None:
|
110
|
-
attr_stop_words = {"alert", "button", "checkbox", "dialog", "navigation", "tab", "tabpanel", "textbox",
|
111
|
-
"menu", "banner", "form", "search", "progressbar", "radio", "slider", "comment", "nav", "sidebar",
|
112
|
-
"breadcrumb", "dropdown", "menu-item", "toggle", "hamburger", "aside", "tooltip", "modal", "overlay",
|
113
|
-
"popup", "advert", "hero", "utility", "login", "signup", "password", "email", "username"}
|
114
|
-
|
115
|
-
if non_text_to_keep is None:
|
116
|
-
non_text_to_keep = {"meta", "img", "picture", "figure", "figcaption", "video", "source", "audio", "table",
|
117
|
-
"tr", "th", "td", "thead", "tbody", "tfoot", "caption", "br"}
|
118
|
-
|
119
|
-
self.non_text_to_keep = non_text_to_keep
|
120
|
-
self.attr_stop_words = attr_stop_words
|
121
|
-
self.remove_header_lists = remove_header_lists
|
122
|
-
self.remove_footer_lists = remove_footer_lists
|
123
|
-
self.minify = minify
|
124
|
-
self.minify_kwargs = dict(minify_kwargs or {})
|
125
|
-
|
126
|
-
# Initialize lxml Cleaner with explicit kwargs gathered from parameters
|
127
|
-
self._cleaner = Cleaner(
|
128
|
-
meta=meta,
|
129
|
-
page_structure=page_structure,
|
130
|
-
links=links,
|
131
|
-
scripts=scripts,
|
132
|
-
javascript=javascript,
|
133
|
-
comments=comments,
|
134
|
-
style=style,
|
135
|
-
processing_instructions=processing_instructions,
|
136
|
-
embedded=embedded,
|
137
|
-
frames=frames,
|
138
|
-
forms=forms,
|
139
|
-
annoying_tags=annoying_tags,
|
140
|
-
kill_tags=kill_tags,
|
141
|
-
remove_unknown_tags=remove_unknown_tags,
|
142
|
-
safe_attrs_only=safe_attrs_only,
|
143
|
-
safe_attrs=safe_attrs,
|
144
|
-
)
|
145
|
-
|
146
|
-
# -------------------------
|
147
|
-
# User-friendly entry points
|
148
|
-
# -------------------------
|
149
|
-
|
150
|
-
|
151
|
-
def make_smol(self, *, raw_html: str | BeautifulSoup) -> str:
|
152
|
-
"""Clean and optionally minify HTML input.
|
153
|
-
|
154
|
-
The cleaning pipeline applies pre-parse hooks (on strings), prunes elements
|
155
|
-
by attribute stop words, sanitizes via lxml Cleaner, performs structural
|
156
|
-
pruning of header/footer/body, then applies post-clean hooks.
|
157
|
-
|
158
|
-
Parameters
|
159
|
-
----------
|
160
|
-
raw_html : str or BeautifulSoup
|
161
|
-
Raw HTML string or BeautifulSoup to be cleaned.
|
162
|
-
|
163
|
-
Returns
|
164
|
-
-------
|
165
|
-
str
|
166
|
-
Cleaned HTML as a string.
|
167
|
-
"""
|
168
|
-
|
169
|
-
# Stage 0: hooks that operate on the raw string
|
170
|
-
if isinstance(raw_html, str):
|
171
|
-
soup = BeautifulSoup(raw_html or "", features="lxml")
|
172
|
-
elif isinstance(raw_html, BeautifulSoup):
|
173
|
-
soup = raw_html
|
174
|
-
else:
|
175
|
-
raise TypeError("raw_html must be a str or BeautifulSoup instance")
|
176
|
-
|
177
|
-
# Stage 1: attribute-based pruning on the original soup
|
178
|
-
# Remove small, likely non-content elements based on attribute tokens.
|
179
|
-
self._strip_by_attribute_stop_words(soup=soup)
|
180
|
-
|
181
|
-
# Stage 2: lxml cleaner pass (robust HTML sanitation)
|
182
|
-
# Use lxml Cleaner to sanitize HTML, optionally minify afterwards.
|
183
|
-
cleaned_html = self._lxml_clean(str(soup))
|
184
|
-
clean_soup = BeautifulSoup(markup=cleaned_html, features="lxml")
|
185
|
-
|
186
|
-
# Stage 3: structural pruning on header/body/footer of the cleaned soup
|
187
|
-
self._prune_header_footer(clean_soup)
|
188
|
-
self._prune_body(clean_soup)
|
189
|
-
self._drop_empty_leaf_nodes(clean_soup)
|
190
|
-
|
191
|
-
return str(clean_soup)
|
192
|
-
|
193
|
-
|
194
|
-
def make_smol_bytes(self, *,
|
195
|
-
raw_html: str | BeautifulSoup,
|
196
|
-
compression_level: int = 5,
|
197
|
-
) -> bytes:
|
198
|
-
"""Return cleaned HTML as bytes, optionally Brotli-compressed.
|
199
|
-
|
200
|
-
If ``compression_level`` is 0, returns UTF-8 encoded bytes without compression.
|
201
|
-
For ``compression_level`` > 0, compresses the bytes using Brotli.
|
202
|
-
|
203
|
-
Parameters
|
204
|
-
----------
|
205
|
-
raw_html : str or BeautifulSoup
|
206
|
-
Raw HTML to clean.
|
207
|
-
compression_level : int, optional
|
208
|
-
Brotli quality/level. 0 disables compression. Default 11.
|
209
|
-
**cleaner_kwargs : dict
|
210
|
-
Optional keyword args forwarded to ``SmolHtmlCleaner``.
|
211
|
-
|
212
|
-
Returns
|
213
|
-
-------
|
214
|
-
bytes
|
215
|
-
Cleaned (and possibly compressed) HTML as bytes.
|
216
|
-
"""
|
217
|
-
html = self.make_smol(raw_html=raw_html)
|
218
|
-
data = html.encode("utf-8")
|
219
|
-
|
220
|
-
if compression_level <= 0:
|
221
|
-
return data
|
222
|
-
|
223
|
-
try:
|
224
|
-
import brotli as _brotli # type: ignore
|
225
|
-
except Exception as exc: # pragma: no cover - import-time dependency
|
226
|
-
raise RuntimeError(
|
227
|
-
"Brotli is required for compression. Install 'brotli' or 'brotlicffi', "
|
228
|
-
"or call with compression_level=0."
|
229
|
-
) from exc
|
230
|
-
|
231
|
-
# Prefer TEXT mode if available for HTML content; fall back gracefully.
|
232
|
-
mode = getattr(_brotli, "MODE_TEXT", None)
|
233
|
-
if mode is None:
|
234
|
-
mode = getattr(_brotli, "BROTLI_MODE_TEXT", None)
|
235
|
-
|
236
|
-
if mode is not None:
|
237
|
-
return _brotli.compress(data, quality=int(compression_level), mode=mode)
|
238
|
-
return _brotli.compress(data, quality=int(compression_level))
|
239
|
-
|
240
|
-
# -------------------------
|
241
|
-
# Internal helpers
|
242
|
-
# -------------------------
|
243
|
-
def _lxml_clean(self, html_str: str) -> str:
|
244
|
-
"""Sanitize and optionally minify HTML using lxml + minify_html.
|
245
|
-
|
246
|
-
Parameters
|
247
|
-
----------
|
248
|
-
html_str : str
|
249
|
-
HTML markup to be cleaned.
|
250
|
-
|
251
|
-
Returns
|
252
|
-
-------
|
253
|
-
str
|
254
|
-
Cleaned (and possibly minified) HTML markup.
|
255
|
-
"""
|
256
|
-
try:
|
257
|
-
cleaned = self._cleaner.clean_html(html_str)
|
258
|
-
return minify_html.minify(cleaned, **self.minify_kwargs) if self.minify else cleaned
|
259
|
-
except ValueError as ex:
|
260
|
-
# Handle encoding declaration edge-cases by round-tripping via lxml
|
261
|
-
msg = (
|
262
|
-
"Unicode strings with encoding declaration are not supported. "
|
263
|
-
"Please use bytes input or XML fragments without declaration."
|
264
|
-
)
|
265
|
-
if str(ex) == msg:
|
266
|
-
raw_bytes = html_str.encode("utf-8", errors="ignore")
|
267
|
-
doc = lxml_html.fromstring(raw_bytes)
|
268
|
-
cleaned = self._cleaner.clean_html(doc)
|
269
|
-
rendered = lxml_html.tostring(cleaned, encoding="utf-8").decode("utf-8")
|
270
|
-
return minify_html.minify(rendered, **self.minify_kwargs) if self.minify else rendered
|
271
|
-
raise
|
272
|
-
|
273
|
-
def _strip_by_attribute_stop_words(self, *, soup: BeautifulSoup) -> None:
|
274
|
-
"""Remove small, likely non-content elements by attribute tokens.
|
275
|
-
|
276
|
-
Scans leaf-like descendants under ``<body>`` and collects elements whose
|
277
|
-
``id``, ``class``, ``role``, or ``item_type`` values contain any of the
|
278
|
-
configured ``attr_stop_words`` tokens (case-insensitive), then decomposes
|
279
|
-
them. Mirrors the baseline leaf-ness and concatenation behavior.
|
280
|
-
|
281
|
-
Parameters
|
282
|
-
----------
|
283
|
-
soup : BeautifulSoup
|
284
|
-
Parsed document to prune in place.
|
285
|
-
"""
|
286
|
-
body = soup.find("body") or soup
|
287
|
-
to_decompose: list[Tag] = []
|
288
|
-
for el in body.descendants:
|
289
|
-
if not isinstance(el, Tag):
|
290
|
-
continue
|
291
|
-
attrs = el.attrs if isinstance(el.attrs, dict) else {}
|
292
|
-
if not attrs:
|
293
|
-
continue
|
294
|
-
# Only prune simple leaf-ish nodes to avoid huge deletes unintentionally
|
295
|
-
if sum(1 for _ in el.descendants) > 1:
|
296
|
-
continue
|
297
|
-
for name in ("id", "class", "role", "item_type"):
|
298
|
-
val = attrs.get(name)
|
299
|
-
if val is None:
|
300
|
-
continue
|
301
|
-
if isinstance(val, (list, tuple)):
|
302
|
-
# Match baseline behavior: concatenate tokens without separator
|
303
|
-
val_str = "".join(map(str, val))
|
304
|
-
else:
|
305
|
-
val_str = str(val)
|
306
|
-
if any(sw in val_str.lower() for sw in self.attr_stop_words):
|
307
|
-
to_decompose.append(el)
|
308
|
-
break
|
309
|
-
for el in to_decompose:
|
310
|
-
el.decompose()
|
311
|
-
|
312
|
-
def _prune_header_footer(self, soup: BeautifulSoup) -> None:
|
313
|
-
"""Prune likely navigational clutter inside header and footer.
|
314
|
-
|
315
|
-
Removes common list-like elements and links inside ``<header>``/``<footer>``
|
316
|
-
when the corresponding toggles are enabled.
|
317
|
-
"""
|
318
|
-
header = soup.find("header")
|
319
|
-
footer = soup.find("footer")
|
320
|
-
if header and self.remove_header_lists:
|
321
|
-
self._decompose_tags(header, {"a", "img", "ol", "ul", "li"})
|
322
|
-
if footer and self.remove_footer_lists:
|
323
|
-
self._decompose_tags(footer, {"a", "img", "ol", "ul", "li"})
|
324
|
-
|
325
|
-
def _prune_body(self, soup: BeautifulSoup) -> None:
|
326
|
-
body = soup.find("body") or soup
|
327
|
-
always_remove = {
|
328
|
-
"input", "textarea", "button", "select", "option", "optgroup", "datalist",
|
329
|
-
"label", "fieldset", "legend", "output", "meter", "dialog", "form",
|
330
|
-
"search", "progress", "svg", "canvas", "use", "nav", "object", "noscript",
|
331
|
-
}
|
332
|
-
to_decompose: list[Tag] = []
|
333
|
-
for el in body.descendants:
|
334
|
-
if not isinstance(el, Tag):
|
335
|
-
continue
|
336
|
-
if not isinstance(el.name, str):
|
337
|
-
continue
|
338
|
-
if el.name in self.non_text_to_keep:
|
339
|
-
continue
|
340
|
-
if el.name in always_remove:
|
341
|
-
to_decompose.append(el)
|
342
|
-
for el in to_decompose:
|
343
|
-
el.decompose()
|
344
|
-
|
345
|
-
def _drop_empty_leaf_nodes(self, soup: BeautifulSoup) -> None:
|
346
|
-
"""Iteratively remove empty leaves using the baseline's strict leaf check.
|
347
|
-
|
348
|
-
Walks leaf nodes (no descendants) and removes those with no text content,
|
349
|
-
excluding tags explicitly whitelisted in ``non_text_to_keep``.
|
350
|
-
"""
|
351
|
-
body = soup.find("body") or soup
|
352
|
-
while True:
|
353
|
-
to_decompose: list[Tag] = []
|
354
|
-
for el in body.descendants:
|
355
|
-
if not isinstance(el, Tag):
|
356
|
-
continue
|
357
|
-
if not isinstance(el.name, str):
|
358
|
-
continue
|
359
|
-
if el.name in self.non_text_to_keep:
|
360
|
-
continue
|
361
|
-
# Baseline leaf check: element must have zero descendants at all
|
362
|
-
if len(list(el.descendants)) != 0:
|
363
|
-
continue
|
364
|
-
# Remove if no text once stripped
|
365
|
-
if (el.get_text() or "").strip():
|
366
|
-
continue
|
367
|
-
to_decompose.append(el)
|
368
|
-
if not to_decompose:
|
369
|
-
break
|
370
|
-
for el in to_decompose:
|
371
|
-
el.decompose()
|
372
|
-
|
373
|
-
@staticmethod
|
374
|
-
def _decompose_tags(root: Tag, names: set[str]) -> None:
|
375
|
-
for el in list(root.descendants):
|
376
|
-
if isinstance(el, Tag) and isinstance(el.name, str) and el.name in names:
|
377
|
-
el.decompose()
|
smol_html-0.1.2.dist-info/RECORD
DELETED
@@ -1,6 +0,0 @@
|
|
1
|
-
smol_html/__init__.py,sha256=L_Clggminog8SY3a9TiN57ZajkYAQmXqdY8TOGEut2A,112
|
2
|
-
smol_html/smol_html.py,sha256=f9VGdIS3PTKfce3WpRQlhwMf7OEYOUWOlip8DI3dGV4,15194
|
3
|
-
smol_html-0.1.2.dist-info/METADATA,sha256=RHzSayuHJ0JalYqWxrI3lWcLfI3gi6BGwi8hEEY0m3A,7747
|
4
|
-
smol_html-0.1.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
5
|
-
smol_html-0.1.2.dist-info/licenses/LICENSE,sha256=88yg3BujRGq8MYlWhbrzB2YMNWJaXnBck3c7l23labs,1089
|
6
|
-
smol_html-0.1.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|