smol-html 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: smol-html
- Version: 0.1.1
+ Version: 0.1.3
  Summary: Small, dependable HTML cleaner/minifier with sensible defaults
  Project-URL: Homepage, https://github.com/NosibleAI/smol-html
  Project-URL: Repository, https://github.com/NosibleAI/smol-html
@@ -22,17 +22,25 @@ Classifier: Programming Language :: Python :: 3.13
  Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
  Requires-Python: >=3.9
- Requires-Dist: beautifulsoup4>=4.13.5
- Requires-Dist: lxml[html-clean]>=6.0.1
- Requires-Dist: minify-html>=0.16.4
+ Requires-Dist: beautifulsoup4>=4.0.1
+ Requires-Dist: brotli>=0.5.2
+ Requires-Dist: lxml[html-clean]>=1.3.2
+ Requires-Dist: minify-html>=0.2.6
  Description-Content-Type: text/markdown

- ![Logo](https://github.com/NosibleAI/nosible-py/blob/main/docs/_static/readme.png?raw=true)
+ ![smol](smol.png)
+

  # smol-html

  Small, dependable HTML cleaner/minifier with sensible defaults.

+ ## Motivation
+
+ Nosible is a search engine, which means we need to store and process a very large number of webpages. To make this tractable, we strip out visual chrome and other non-essential components that don’t matter for downstream tasks (indexing, ranking, retrieval, and LLM pipelines) while preserving the important content and structure. This package cleans and minifies HTML, greatly reducing size on disk; combined with Brotli compression (by Google), the savings are even larger.
+
+ ![Cleaning impact on size](eval.png)
+
  ### 📦 Installation

  ```bash
@@ -63,7 +71,7 @@ html = """

  # All constructor arguments are keyword-only and optional.
  cleaner = SmolHtmlCleaner()
- cleaned = cleaner.clean(raw_html=html)
+ cleaned = cleaner.make_smol(raw_html=html)

  print(cleaned)
  ```
@@ -89,7 +97,7 @@ Minimal:
  from smol_html import SmolHtmlCleaner

  cleaner = SmolHtmlCleaner()
- out = cleaner.clean(raw_html="<p>Hi <!-- note --> <a href='x'>link</a></p>")
+ out = cleaner.make_smol(raw_html="<p>Hi <!-- note --> <a href='x'>link</a></p>")
  ```

  Customize a few options:
@@ -103,7 +111,38 @@ cleaner = SmolHtmlCleaner(
      minify=True,
  )

- out = cleaner.clean(raw_html="<p>Hi</p>")
+ out = cleaner.make_smol(raw_html="<p>Hi</p>")
+ ```
+
+ ## Compressed Bytes Output
+
+ Produce compressed bytes with Brotli via `make_smol_bytes`:
+
+
+ ```python
+ from smol_html import SmolHtmlCleaner
+ import brotli  # only needed if you want to decompress here in the example
+
+ html = """
+ <html>
+   <body>
+     <div> Hello <span> world </span> </div>
+   </body>
+ </html>
+ """
+
+ cleaner = SmolHtmlCleaner()
+
+ # Get compressed bytes (quality 11 is strong compression)
+ compressed = cleaner.make_smol_bytes(raw_html=html, compression_level=11)
+
+ # Example: decompress back to text to inspect (optional)
+ decompressed = brotli.decompress(compressed).decode("utf-8")
+ print(decompressed)
+
+ # Or write compressed output directly to a file
+ with open("page.html.br", "wb") as f:
+     f.write(compressed)
  ```

  ## Parameter Reference
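
The Motivation section above, together with the new `make_smol`/`make_smol_bytes` API, suggests a quick way to check the size claims on your own pages. A minimal sketch, assuming the 0.1.3 API shown in the README diff; `"page.html"` is a placeholder for any locally saved webpage, and the printed numbers are purely illustrative:

```python
# Compare raw, cleaned+minified, and cleaned+Brotli sizes for one page.
from smol_html import SmolHtmlCleaner

with open("page.html", encoding="utf-8") as f:
    raw = f.read()

cleaner = SmolHtmlCleaner()
cleaned = cleaner.make_smol(raw_html=raw)  # cleaned + minified HTML (str)
compressed = cleaner.make_smol_bytes(raw_html=raw, compression_level=11)  # Brotli bytes

print("raw:              ", len(raw.encode("utf-8")), "bytes")
print("cleaned+minified: ", len(cleaned.encode("utf-8")), "bytes")
print("cleaned+brotli:   ", len(compressed), "bytes")
```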
@@ -0,0 +1,4 @@
+ smol_html-0.1.3.dist-info/METADATA,sha256=MApb1E7-tzyEYuRymzRjUTg8TD14vFBsfceTsY07r3s,8279
+ smol_html-0.1.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ smol_html-0.1.3.dist-info/licenses/LICENSE,sha256=88yg3BujRGq8MYlWhbrzB2YMNWJaXnBck3c7l23labs,1089
+ smol_html-0.1.3.dist-info/RECORD,,
smol_html/__init__.py DELETED
@@ -1,4 +0,0 @@
- from smol_html.smol_html import SmolHtmlCleaner
-
- all = ["__version__", "SmolHtmlCleaner"]
- __version__ = "0.1.0"
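
Note that the deleted `__init__.py` bound the plain name `all` (shadowing the builtin) rather than `__all__`, so the export list had no effect on star imports, and its `__version__` string ("0.1.0") lagged the 0.1.1 wheel it shipped in. For comparison, the conventional form would look like this (illustrative, not the published code):

```python
from smol_html.smol_html import SmolHtmlCleaner

# The dunder name is what `from smol_html import *` actually consults.
__all__ = ["__version__", "SmolHtmlCleaner"]
__version__ = "0.1.1"
```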
smol_html/smol_html.py DELETED
@@ -1,328 +0,0 @@
- from __future__ import annotations
-
- import minify_html
- from bs4 import BeautifulSoup, Tag
- from lxml import html as lxml_html
- from lxml.html.clean import Cleaner
-
-
- # -------------------------
- # Public API
- # -------------------------
- class SmolHtmlCleaner:
-     """
-     Small, dependable HTML cleaner/minifier with sensible defaults.
-
-     Parameters
-     ----------
-     non_text_to_keep : set of str, optional
-         Tags preserved even if textless. Default includes meta/media/table/line-break tags.
-     attr_stop_words : set of str, optional
-         Attribute tokens indicating non-content scaffolding/UX. Default contains common UI tokens.
-     remove_header_lists : bool, optional
-         Prune links/lists inside ``<header>``. Default True.
-     remove_footer_lists : bool, optional
-         Prune links/lists inside ``<footer>``. Default True.
-     minify : bool, optional
-         Minify HTML output via ``minify_html``. Default True.
-     minify_kwargs : dict, optional
-         Extra args for ``minify_html.minify``. Default empty.
-
-     lxml Cleaner parameters
-     -----------------------
-     meta : bool, optional
-         Remove meta tags. Default False.
-     page_structure : bool, optional
-         Remove page structure tags (html, head, body). Default False.
-     links : bool, optional
-         Remove link tags. Default True.
-     scripts : bool, optional
-         Remove script tags. Default False.
-     javascript : bool, optional
-         Remove JavaScript content. Default True.
-     comments : bool, optional
-         Remove comments. Default True.
-     style : bool, optional
-         Remove style tags. Default True.
-     processing_instructions : bool, optional
-         Remove processing instructions. Default True.
-     embedded : bool, optional
-         Remove embedded content (object, embed, applet). Default True.
-     frames : bool, optional
-         Remove frame/iframe tags. Default True.
-     forms : bool, optional
-         Remove form tags. Default True.
-     annoying_tags : bool, optional
-         Remove tags considered annoying (blink, marquee, etc). Default True.
-     kill_tags : set of str, optional
-         Additional tags to remove. Default None.
-     remove_unknown_tags : bool, optional
-         Remove unknown tags. Default True.
-     safe_attrs_only : bool, optional
-         Only keep safe attributes. Default True.
-     safe_attrs : set of str, optional
-         Set of safe attributes to keep. Default is a sensible set.
-
-     Notes
-     -----
-     Defaults and cleaning behavior are preserved; only the configuration surface
-     moved from a dataclass to keyword-only parameters on the constructor.
-     """
-
-     def __init__(
-         self,
-         *,
-         # Core behavior
-         non_text_to_keep: set[str] = None,
-         attr_stop_words: set[str] = None,
-         remove_header_lists: bool = True,
-         remove_footer_lists: bool = True,
-         # Minify
-         minify: bool = True,
-         minify_kwargs: dict | None = None,
-         # lxml Cleaner exposed explicitly (prefixed)
-         meta: bool = False,
-         page_structure: bool = False,
-         links: bool = True,
-         scripts: bool = False,
-         javascript: bool = True,
-         comments: bool = True,
-         style: bool = True,
-         processing_instructions: bool = True,
-         embedded: bool = True,
-         frames: bool = True,
-         forms: bool = True,
-         annoying_tags: bool = True,
-         kill_tags: set[str] | None = None,
-         remove_unknown_tags: bool = True,
-         safe_attrs_only: bool = True,
-         safe_attrs: set[str] = None,
-     ):
-         # Inline defaults identical to the prior CleanerConfig
-         if safe_attrs is None:
-             safe_attrs = {"href", "hreflang", "src", "srclang", "target", "alt", "kind", "type", "role", "abbr",
-                 "accept", "accept-charset", "datetime", "lang", "name", "rel", "title", "value", "content", "label",
-                 "item_type", "property", "itemprop"}
-
-         if attr_stop_words is None:
-             attr_stop_words = {"alert", "button", "checkbox", "dialog", "navigation", "tab", "tabpanel", "textbox",
-                 "menu", "banner", "form", "search", "progressbar", "radio", "slider", "comment", "nav", "sidebar",
-                 "breadcrumb", "dropdown", "menu-item", "toggle", "hamburger", "aside", "tooltip", "modal", "overlay",
-                 "popup", "advert", "hero", "utility", "login", "signup", "password", "email", "username"}
-
-         if non_text_to_keep is None:
-             non_text_to_keep = {"meta", "img", "picture", "figure", "figcaption", "video", "source", "audio", "table",
-                 "tr", "th", "td", "thead", "tbody", "tfoot", "caption", "br"}
-
-         self.non_text_to_keep = non_text_to_keep
-         self.attr_stop_words = attr_stop_words
-         self.remove_header_lists = remove_header_lists
-         self.remove_footer_lists = remove_footer_lists
-         self.minify = minify
-         self.minify_kwargs = dict(minify_kwargs or {})
-
-         # Initialize lxml Cleaner with explicit kwargs gathered from parameters
-         self._cleaner = Cleaner(
-             meta=meta,
-             page_structure=page_structure,
-             links=links,
-             scripts=scripts,
-             javascript=javascript,
-             comments=comments,
-             style=style,
-             processing_instructions=processing_instructions,
-             embedded=embedded,
-             frames=frames,
-             forms=forms,
-             annoying_tags=annoying_tags,
-             kill_tags=kill_tags,
-             remove_unknown_tags=remove_unknown_tags,
-             safe_attrs_only=safe_attrs_only,
-             safe_attrs=safe_attrs,
-         )
-
-     # -------------------------
-     # User-friendly entry points
-     # -------------------------
-
-
-     def clean(self, *, raw_html: str | BeautifulSoup) -> str:
-         """Clean and optionally minify HTML input.
-
-         The cleaning pipeline applies pre-parse hooks (on strings), prunes elements
-         by attribute stop words, sanitizes via lxml Cleaner, performs structural
-         pruning of header/footer/body, then applies post-clean hooks.
-
-         Parameters
-         ----------
-         raw_html : str or BeautifulSoup
-             Raw HTML string or BeautifulSoup to be cleaned.
-
-         Returns
-         -------
-         str
-             Cleaned HTML as a string.
-         """
-
-         # Stage 0: hooks that operate on the raw string
-         if isinstance(raw_html, str):
-             soup = BeautifulSoup(raw_html or "", features="lxml")
-         elif isinstance(raw_html, BeautifulSoup):
-             soup = raw_html
-         else:
-             raise TypeError("raw_html must be a str or BeautifulSoup instance")
-
-         # Stage 1: attribute-based pruning on the original soup
-         # Remove small, likely non-content elements based on attribute tokens.
-         self._strip_by_attribute_stop_words(soup=soup)
-
-         # Stage 2: lxml cleaner pass (robust HTML sanitation)
-         # Use lxml Cleaner to sanitize HTML, optionally minify afterwards.
-         cleaned_html = self._lxml_clean(str(soup))
-         clean_soup = BeautifulSoup(markup=cleaned_html, features="lxml")
-
-         # Stage 3: structural pruning on header/body/footer of the cleaned soup
-         self._prune_header_footer(clean_soup)
-         self._prune_body(clean_soup)
-         self._drop_empty_leaf_nodes(clean_soup)
-
-         return str(clean_soup)
-
-     # -------------------------
-     # Internal helpers
-     # -------------------------
-     def _lxml_clean(self, html_str: str) -> str:
-         """Sanitize and optionally minify HTML using lxml + minify_html.
-
-         Parameters
-         ----------
-         html_str : str
-             HTML markup to be cleaned.
-
-         Returns
-         -------
-         str
-             Cleaned (and possibly minified) HTML markup.
-         """
-         try:
-             cleaned = self._cleaner.clean_html(html_str)
-             return minify_html.minify(cleaned, **self.minify_kwargs) if self.minify else cleaned
-         except ValueError as ex:
-             # Handle encoding declaration edge-cases by round-tripping via lxml
-             msg = (
-                 "Unicode strings with encoding declaration are not supported. "
-                 "Please use bytes input or XML fragments without declaration."
-             )
-             if str(ex) == msg:
-                 raw_bytes = html_str.encode("utf-8", errors="ignore")
-                 doc = lxml_html.fromstring(raw_bytes)
-                 cleaned = self._cleaner.clean_html(doc)
-                 rendered = lxml_html.tostring(cleaned, encoding="utf-8").decode("utf-8")
-                 return minify_html.minify(rendered, **self.minify_kwargs) if self.minify else rendered
-             raise
-
-     def _strip_by_attribute_stop_words(self, *, soup: BeautifulSoup) -> None:
-         """Remove small, likely non-content elements by attribute tokens.
-
-         Scans leaf-like descendants under ``<body>`` and collects elements whose
-         ``id``, ``class``, ``role``, or ``item_type`` values contain any of the
-         configured ``attr_stop_words`` tokens (case-insensitive), then decomposes
-         them. Mirrors the baseline leaf-ness and concatenation behavior.
-
-         Parameters
-         ----------
-         soup : BeautifulSoup
-             Parsed document to prune in place.
-         """
-         body = soup.find("body") or soup
-         to_decompose: list[Tag] = []
-         for el in body.descendants:
-             if not isinstance(el, Tag):
-                 continue
-             attrs = el.attrs if isinstance(el.attrs, dict) else {}
-             if not attrs:
-                 continue
-             # Only prune simple leaf-ish nodes to avoid huge deletes unintentionally
-             if sum(1 for _ in el.descendants) > 1:
-                 continue
-             for name in ("id", "class", "role", "item_type"):
-                 val = attrs.get(name)
-                 if val is None:
-                     continue
-                 if isinstance(val, (list, tuple)):
-                     # Match baseline behavior: concatenate tokens without separator
-                     val_str = "".join(map(str, val))
-                 else:
-                     val_str = str(val)
-                 if any(sw in val_str.lower() for sw in self.attr_stop_words):
-                     to_decompose.append(el)
-                     break
-         for el in to_decompose:
-             el.decompose()
-
-     def _prune_header_footer(self, soup: BeautifulSoup) -> None:
-         """Prune likely navigational clutter inside header and footer.
-
-         Removes common list-like elements and links inside ``<header>``/``<footer>``
-         when the corresponding toggles are enabled.
-         """
-         header = soup.find("header")
-         footer = soup.find("footer")
-         if header and self.remove_header_lists:
-             self._decompose_tags(header, {"a", "img", "ol", "ul", "li"})
-         if footer and self.remove_footer_lists:
-             self._decompose_tags(footer, {"a", "img", "ol", "ul", "li"})
-
-     def _prune_body(self, soup: BeautifulSoup) -> None:
-         body = soup.find("body") or soup
-         always_remove = {
-             "input", "textarea", "button", "select", "option", "optgroup", "datalist",
-             "label", "fieldset", "legend", "output", "meter", "dialog", "form",
-             "search", "progress", "svg", "canvas", "use", "nav", "object", "noscript",
-         }
-         to_decompose: list[Tag] = []
-         for el in body.descendants:
-             if not isinstance(el, Tag):
-                 continue
-             if not isinstance(el.name, str):
-                 continue
-             if el.name in self.non_text_to_keep:
-                 continue
-             if el.name in always_remove:
-                 to_decompose.append(el)
-         for el in to_decompose:
-             el.decompose()
-
-     def _drop_empty_leaf_nodes(self, soup: BeautifulSoup) -> None:
-         """Iteratively remove empty leaves using the baseline's strict leaf check.
-
-         Walks leaf nodes (no descendants) and removes those with no text content,
-         excluding tags explicitly whitelisted in ``non_text_to_keep``.
-         """
-         body = soup.find("body") or soup
-         while True:
-             to_decompose: list[Tag] = []
-             for el in body.descendants:
-                 if not isinstance(el, Tag):
-                     continue
-                 if not isinstance(el.name, str):
-                     continue
-                 if el.name in self.non_text_to_keep:
-                     continue
-                 # Baseline leaf check: element must have zero descendants at all
-                 if len(list(el.descendants)) != 0:
-                     continue
-                 # Remove if no text once stripped
-                 if (el.get_text() or "").strip():
-                     continue
-                 to_decompose.append(el)
-             if not to_decompose:
-                 break
-             for el in to_decompose:
-                 el.decompose()
-
-     @staticmethod
-     def _decompose_tags(root: Tag, names: set[str]) -> None:
-         for el in list(root.descendants):
-             if isinstance(el, Tag) and isinstance(el.name, str) and el.name in names:
-                 el.decompose()
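
One subtlety in the deleted `_strip_by_attribute_stop_words` above: multi-valued attributes are joined with no separator before the lowercased substring test, so a stop word fires on partial-token matches (e.g. `nav` inside `Navbar`) and can even match across token boundaries. A self-contained sketch of just that matching rule, using hypothetical markup rather than the package API:

```python
from bs4 import BeautifulSoup

# Subset of the deleted module's default stop words, for illustration.
stop_words = {"nav", "sidebar"}

soup = BeautifulSoup('<body><div class="Main Navbar"></div><p>text</p></body>', "lxml")
div = soup.find("div")

# bs4 returns `class` as a token list; the module concatenated tokens
# without a separator and lowercased the result before checking substrings.
val_str = "".join(div["class"]).lower()      # -> "mainnavbar"
if any(sw in val_str for sw in stop_words):  # "nav" matches inside "mainnavbar"
    div.decompose()

print(soup.body)  # <body><p>text</p></body>
```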
@@ -1,6 +0,0 @@
- smol_html/__init__.py,sha256=05SUWU4WVeAQgrzTdcQbH91vvQGx6V-oHgIpUtMBbGE,111
- smol_html/smol_html.py,sha256=QCGyi0uh5bz3ex_z1sP5BYxjTHEeZOejUDA3eM9a2lE,13115
- smol_html-0.1.1.dist-info/METADATA,sha256=SJAUtNXwpq4Qo0fdekcgUvPOapUTjWAonHZ27cyUFkc,7055
- smol_html-0.1.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- smol_html-0.1.1.dist-info/licenses/LICENSE,sha256=88yg3BujRGq8MYlWhbrzB2YMNWJaXnBck3c7l23labs,1089
- smol_html-0.1.1.dist-info/RECORD,,