smol-html 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smol_html-0.1.0/.gitignore +210 -0
- smol_html-0.1.0/.python-version +1 -0
- smol_html-0.1.0/LICENSE +21 -0
- smol_html-0.1.0/PKG-INFO +127 -0
- smol_html-0.1.0/README.md +98 -0
- smol_html-0.1.0/pyproject.toml +51 -0
- smol_html-0.1.0/src/smol_html/__init__.py +4 -0
- smol_html-0.1.0/src/smol_html/core.py +299 -0
@@ -0,0 +1,210 @@
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
2
|
+
__pycache__/
|
3
|
+
*.py[codz]
|
4
|
+
*$py.class
|
5
|
+
|
6
|
+
# C extensions
|
7
|
+
*.so
|
8
|
+
|
9
|
+
# Distribution / packaging
|
10
|
+
.Python
|
11
|
+
build/
|
12
|
+
develop-eggs/
|
13
|
+
dist/
|
14
|
+
downloads/
|
15
|
+
eggs/
|
16
|
+
.eggs/
|
17
|
+
lib/
|
18
|
+
lib64/
|
19
|
+
parts/
|
20
|
+
sdist/
|
21
|
+
var/
|
22
|
+
wheels/
|
23
|
+
share/python-wheels/
|
24
|
+
*.egg-info/
|
25
|
+
.installed.cfg
|
26
|
+
*.egg
|
27
|
+
MANIFEST
|
28
|
+
|
29
|
+
# PyInstaller
|
30
|
+
# Usually these files are written by a python script from a template
|
31
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
32
|
+
*.manifest
|
33
|
+
*.spec
|
34
|
+
|
35
|
+
# Installer logs
|
36
|
+
pip-log.txt
|
37
|
+
pip-delete-this-directory.txt
|
38
|
+
|
39
|
+
# Unit test / coverage reports
|
40
|
+
htmlcov/
|
41
|
+
.tox/
|
42
|
+
.nox/
|
43
|
+
.coverage
|
44
|
+
.coverage.*
|
45
|
+
.cache
|
46
|
+
nosetests.xml
|
47
|
+
coverage.xml
|
48
|
+
*.cover
|
49
|
+
*.py.cover
|
50
|
+
.hypothesis/
|
51
|
+
.pytest_cache/
|
52
|
+
cover/
|
53
|
+
|
54
|
+
# Translations
|
55
|
+
*.mo
|
56
|
+
*.pot
|
57
|
+
|
58
|
+
# Django stuff:
|
59
|
+
*.log
|
60
|
+
local_settings.py
|
61
|
+
db.sqlite3
|
62
|
+
db.sqlite3-journal
|
63
|
+
|
64
|
+
# Flask stuff:
|
65
|
+
instance/
|
66
|
+
.webassets-cache
|
67
|
+
|
68
|
+
# Scrapy stuff:
|
69
|
+
.scrapy
|
70
|
+
|
71
|
+
# Sphinx documentation
|
72
|
+
docs/_build/
|
73
|
+
|
74
|
+
# PyBuilder
|
75
|
+
.pybuilder/
|
76
|
+
target/
|
77
|
+
|
78
|
+
# Jupyter Notebook
|
79
|
+
.ipynb_checkpoints
|
80
|
+
|
81
|
+
# IPython
|
82
|
+
profile_default/
|
83
|
+
ipython_config.py
|
84
|
+
|
85
|
+
# pyenv
|
86
|
+
# For a library or package, you might want to ignore these files since the code is
|
87
|
+
# intended to run in multiple environments; otherwise, check them in:
|
88
|
+
# .python-version
|
89
|
+
|
90
|
+
# pipenv
|
91
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
92
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
93
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
94
|
+
# install all needed dependencies.
|
95
|
+
#Pipfile.lock
|
96
|
+
|
97
|
+
# UV
|
98
|
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
99
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
100
|
+
# commonly ignored for libraries.
|
101
|
+
#uv.lock
|
102
|
+
|
103
|
+
# poetry
|
104
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
105
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
106
|
+
# commonly ignored for libraries.
|
107
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
108
|
+
#poetry.lock
|
109
|
+
#poetry.toml
|
110
|
+
|
111
|
+
# pdm
|
112
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
113
|
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
114
|
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
115
|
+
#pdm.lock
|
116
|
+
#pdm.toml
|
117
|
+
.pdm-python
|
118
|
+
.pdm-build/
|
119
|
+
|
120
|
+
# pixi
|
121
|
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
122
|
+
#pixi.lock
|
123
|
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
124
|
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
125
|
+
.pixi
|
126
|
+
|
127
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
128
|
+
__pypackages__/
|
129
|
+
|
130
|
+
# Celery stuff
|
131
|
+
celerybeat-schedule
|
132
|
+
celerybeat.pid
|
133
|
+
|
134
|
+
# SageMath parsed files
|
135
|
+
*.sage.py
|
136
|
+
|
137
|
+
# Environments
|
138
|
+
.env
|
139
|
+
.envrc
|
140
|
+
.venv
|
141
|
+
env/
|
142
|
+
venv/
|
143
|
+
ENV/
|
144
|
+
env.bak/
|
145
|
+
venv.bak/
|
146
|
+
|
147
|
+
# Spyder project settings
|
148
|
+
.spyderproject
|
149
|
+
.spyproject
|
150
|
+
|
151
|
+
# Rope project settings
|
152
|
+
.ropeproject
|
153
|
+
|
154
|
+
# mkdocs documentation
|
155
|
+
/site
|
156
|
+
|
157
|
+
# mypy
|
158
|
+
.mypy_cache/
|
159
|
+
.dmypy.json
|
160
|
+
dmypy.json
|
161
|
+
|
162
|
+
# Pyre type checker
|
163
|
+
.pyre/
|
164
|
+
|
165
|
+
# pytype static type analyzer
|
166
|
+
.pytype/
|
167
|
+
|
168
|
+
# Cython debug symbols
|
169
|
+
cython_debug/
|
170
|
+
|
171
|
+
# PyCharm
|
172
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
173
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
174
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
175
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
176
|
+
#.idea/
|
177
|
+
|
178
|
+
# Abstra
|
179
|
+
# Abstra is an AI-powered process automation framework.
|
180
|
+
# Ignore directories containing user credentials, local state, and settings.
|
181
|
+
# Learn more at https://abstra.io/docs
|
182
|
+
.abstra/
|
183
|
+
|
184
|
+
# Visual Studio Code
|
185
|
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
186
|
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
187
|
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
188
|
+
# you could uncomment the following to ignore the entire vscode folder
|
189
|
+
# .vscode/
|
190
|
+
|
191
|
+
# Ruff stuff:
|
192
|
+
.ruff_cache/
|
193
|
+
|
194
|
+
# PyPI configuration file
|
195
|
+
.pypirc
|
196
|
+
|
197
|
+
# Cursor
|
198
|
+
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
|
199
|
+
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
|
200
|
+
# refer to https://docs.cursor.com/context/ignore-files
|
201
|
+
.cursorignore
|
202
|
+
.cursorindexingignore
|
203
|
+
|
204
|
+
# Marimo
|
205
|
+
marimo/_static/
|
206
|
+
marimo/_lsp/
|
207
|
+
__marimo__/
|
208
|
+
|
209
|
+
|
210
|
+
.scratch/
|
@@ -0,0 +1 @@
|
|
1
|
+
3.12
|
smol_html-0.1.0/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2025 Nosible Ltd
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
smol_html-0.1.0/PKG-INFO
ADDED
@@ -0,0 +1,127 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: smol-html
|
3
|
+
Version: 0.1.0
|
4
|
+
Summary: Small, dependable HTML cleaner/minifier with sensible defaults
|
5
|
+
Project-URL: Homepage, https://github.com/NosibleAI/smol-html
|
6
|
+
Project-URL: Repository, https://github.com/NosibleAI/smol-html
|
7
|
+
Project-URL: Issues, https://github.com/NosibleAI/smol-html/issues
|
8
|
+
Author-email: Gareth Warburton <garethw738@gmail.com>, Stuart Reid <stuart@nosible.com>, Matthew Dicks <matthew@nosible.com>, Richard Taylor <richard@nosible.com>
|
9
|
+
License: MIT
|
10
|
+
License-File: LICENSE
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
12
|
+
Classifier: Intended Audience :: Developers
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
14
|
+
Classifier: Operating System :: OS Independent
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
16
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
22
|
+
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
|
23
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
24
|
+
Requires-Python: >=3.9
|
25
|
+
Requires-Dist: beautifulsoup4>=4.13.5
|
26
|
+
Requires-Dist: lxml[html-clean]>=6.0.1
|
27
|
+
Requires-Dist: minify-html>=0.16.4
|
28
|
+
Description-Content-Type: text/markdown
|
29
|
+
|
30
|
+
# smol-html
|
31
|
+
|
32
|
+
Small, dependable HTML cleaner/minifier with sensible defaults.
|
33
|
+
|
34
|
+
## Installation
|
35
|
+
|
36
|
+
- pip: `pip install smol-html`
|
37
|
+
- uv: `uv pip install smol-html`
|
38
|
+
|
39
|
+
## Quick Start
|
40
|
+
|
41
|
+
Clean an HTML string (or page contents):
|
42
|
+
|
43
|
+
```python
|
44
|
+
from smol_html import SmolHtmlCleaner
|
45
|
+
|
46
|
+
html = """
|
47
|
+
<html>
|
48
|
+
<head><title> Example </title></head>
|
49
|
+
<body>
|
50
|
+
<div> Hello <span> world </span> </div>
|
51
|
+
</body>
|
52
|
+
</html>
|
53
|
+
"""
|
54
|
+
|
55
|
+
# All constructor arguments are keyword-only and optional.
|
56
|
+
cleaner = SmolHtmlCleaner()
|
57
|
+
cleaned = cleaner.clean(raw_html=html)
|
58
|
+
|
59
|
+
print(cleaned)
|
60
|
+
```
|
61
|
+
|
62
|
+
## Customization
|
63
|
+
|
64
|
+
`SmolHtmlCleaner` exposes keyword-only parameters with practical defaults. You can:
|
65
|
+
- Pass overrides to the constructor, or
|
66
|
+
- Adjust attributes on the instance after creation.
|
67
|
+
|
68
|
+
```python
|
69
|
+
from smol_html import SmolHtmlCleaner
|
70
|
+
|
71
|
+
cleaner = SmolHtmlCleaner()
|
72
|
+
cleaner.attr_stop_words.add("advert") # e.g., add a custom stop word
|
73
|
+
```
|
74
|
+
|
75
|
+
## Usage Examples
|
76
|
+
|
77
|
+
Minimal:
|
78
|
+
|
79
|
+
```python
|
80
|
+
from smol_html import SmolHtmlCleaner
|
81
|
+
|
82
|
+
cleaner = SmolHtmlCleaner()
|
83
|
+
out = cleaner.clean(raw_html="<p>Hi <!-- note --> <a href='x'>link</a></p>")
|
84
|
+
```
|
85
|
+
|
86
|
+
Customize a few options:
|
87
|
+
|
88
|
+
```python
|
89
|
+
from smol_html import SmolHtmlCleaner
|
90
|
+
|
91
|
+
cleaner = SmolHtmlCleaner(
|
92
|
+
attr_stop_words={"nav", "advert"},
|
93
|
+
remove_header_lists=False,
|
94
|
+
minify=True,
|
95
|
+
)
|
96
|
+
|
97
|
+
out = cleaner.clean(raw_html="<p>Hi</p>")
|
98
|
+
```
|
99
|
+
|
100
|
+
## Parameter Reference
|
101
|
+
|
102
|
+
The most useful parameters, what they do, and when to change them:
|
103
|
+
|
104
|
+
| Parameter | Type | Default | What it does | When to change |
|
105
|
+
|---|---|---|---|---|
|
106
|
+
| `non_text_to_keep` | `set[str]` | media/meta/table/`br` tags | Whitelist of empty/non-text tags to preserve (e.g., images, figures, tables, line breaks). | If important non-text elements are being removed or you want to keep/drop more empty tags. |
|
107
|
+
| `attr_stop_words` | `set[str]` | common UI/navigation tokens | Tokens matched against `id`/`class`/`role`/`item_type` on small elements; matches are removed as likely non-content. | Add tokens like `advert`, `hero`, `menu` to aggressively drop UI chrome, or remove tokens if content is lost. |
|
108
|
+
| `remove_header_lists` | `bool` | `True` | Removes links/lists/images within `<header>` to reduce nav clutter. | Set `False` if your header contains meaningful content you want to keep. |
|
109
|
+
| `remove_footer_lists` | `bool` | `True` | Removes links/lists/images within `<footer>` to reduce boilerplate. | Set `False` for content-heavy footers you need. |
|
110
|
+
| `minify` | `bool` | `True` | Minifies output HTML using `minify_html`. | Set `False` for readability or debugging. |
|
111
|
+
| `minify_kwargs` | `dict` | `{}` | Extra options passed to `minify_html.minify`. | Tune minification behavior (e.g., whitespace, comments) without changing cleaning. |
|
112
|
+
| `meta` | `bool` | `False` | lxml Cleaner option: remove `<meta>` content when `True`. | Usually leave `False`; enable only for strict sanitation. |
|
113
|
+
| `page_structure` | `bool` | `False` | lxml Cleaner option: remove page-structure tags (e.g., `<head>`, `<body>`) when `True`. | Rarely needed; keep `False` to preserve structure. |
|
114
|
+
| `links` | `bool` | `True` | lxml Cleaner option: sanitize/clean links. | Leave `True` unless you need raw anchors untouched. |
|
115
|
+
| `scripts` | `bool` | `False` | lxml Cleaner option: remove `<script>` tags when `True`. | Keep `False` to preserve scripts; usually safe to remove via `javascript=True` anyway. |
|
116
|
+
| `javascript` | `bool` | `True` | lxml Cleaner option: remove JS and event handlers. | Set `False` only if you truly need inline JS (not recommended). |
|
117
|
+
| `comments` | `bool` | `True` | lxml Cleaner option: remove HTML comments. | Set `False` to retain comments for debugging. |
|
118
|
+
| `style` | `bool` | `True` | lxml Cleaner option: remove CSS and style attributes. | Set `False` to keep inline styles/CSS. |
|
119
|
+
| `processing_instructions` | `bool` | `True` | lxml Cleaner option: remove processing instructions. | Rarely change; keep for safety. |
|
120
|
+
| `embedded` | `bool` | `True` | lxml Cleaner option: remove embedded content (e.g., `<embed>`, `<object>`). | Set `False` to keep embedded media. |
|
121
|
+
| `frames` | `bool` | `True` | lxml Cleaner option: remove frames/iframes. | Set `False` if iframes contain needed content. |
|
122
|
+
| `forms` | `bool` | `True` | lxml Cleaner option: remove form elements. | Set `False` if you need to keep forms/inputs. |
|
123
|
+
| `annoying_tags` | `bool` | `True` | lxml Cleaner option: remove tags considered "annoying" by lxml (e.g., `<blink>`, `<marquee>`). | Rarely change. |
|
124
|
+
| `kill_tags` | `set[str] \| None` | `None` | Additional explicit tags to remove entirely. | Add site-specific or custom tags to drop. |
|
125
|
+
| `remove_unknown_tags` | `bool` | `True` | lxml Cleaner option: drop unknown/invalid tags. | Set `False` if you rely on custom elements. |
|
126
|
+
| `safe_attrs_only` | `bool` | `True` | Only allow attributes listed in `safe_attrs`. | Set `False` if you need to keep arbitrary attributes. |
|
127
|
+
| `safe_attrs` | `set[str]` | curated set | Allowed HTML attributes when `safe_attrs_only=True`. | Extend to keep additional attributes you trust. |
|
@@ -0,0 +1,98 @@
|
|
1
|
+
# smol-html
|
2
|
+
|
3
|
+
Small, dependable HTML cleaner/minifier with sensible defaults.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
- pip: `pip install smol-html`
|
8
|
+
- uv: `uv pip install smol-html`
|
9
|
+
|
10
|
+
## Quick Start
|
11
|
+
|
12
|
+
Clean an HTML string (or page contents):
|
13
|
+
|
14
|
+
```python
|
15
|
+
from smol_html import SmolHtmlCleaner
|
16
|
+
|
17
|
+
html = """
|
18
|
+
<html>
|
19
|
+
<head><title> Example </title></head>
|
20
|
+
<body>
|
21
|
+
<div> Hello <span> world </span> </div>
|
22
|
+
</body>
|
23
|
+
</html>
|
24
|
+
"""
|
25
|
+
|
26
|
+
# All constructor arguments are keyword-only and optional.
|
27
|
+
cleaner = SmolHtmlCleaner()
|
28
|
+
cleaned = cleaner.clean(raw_html=html)
|
29
|
+
|
30
|
+
print(cleaned)
|
31
|
+
```
|
32
|
+
|
33
|
+
## Customization
|
34
|
+
|
35
|
+
`SmolHtmlCleaner` exposes keyword-only parameters with practical defaults. You can:
|
36
|
+
- Pass overrides to the constructor, or
|
37
|
+
- Adjust attributes on the instance after creation.
|
38
|
+
|
39
|
+
```python
|
40
|
+
from smol_html import SmolHtmlCleaner
|
41
|
+
|
42
|
+
cleaner = SmolHtmlCleaner()
|
43
|
+
cleaner.attr_stop_words.add("advert") # e.g., add a custom stop word
|
44
|
+
```
|
45
|
+
|
46
|
+
## Usage Examples
|
47
|
+
|
48
|
+
Minimal:
|
49
|
+
|
50
|
+
```python
|
51
|
+
from smol_html import SmolHtmlCleaner
|
52
|
+
|
53
|
+
cleaner = SmolHtmlCleaner()
|
54
|
+
out = cleaner.clean(raw_html="<p>Hi <!-- note --> <a href='x'>link</a></p>")
|
55
|
+
```
|
56
|
+
|
57
|
+
Customize a few options:
|
58
|
+
|
59
|
+
```python
|
60
|
+
from smol_html import SmolHtmlCleaner
|
61
|
+
|
62
|
+
cleaner = SmolHtmlCleaner(
|
63
|
+
attr_stop_words={"nav", "advert"},
|
64
|
+
remove_header_lists=False,
|
65
|
+
minify=True,
|
66
|
+
)
|
67
|
+
|
68
|
+
out = cleaner.clean(raw_html="<p>Hi</p>")
|
69
|
+
```
|
70
|
+
|
71
|
+
## Parameter Reference
|
72
|
+
|
73
|
+
The most useful parameters, what they do, and when to change them:
|
74
|
+
|
75
|
+
| Parameter | Type | Default | What it does | When to change |
|
76
|
+
|---|---|---|---|---|
|
77
|
+
| `non_text_to_keep` | `set[str]` | media/meta/table/`br` tags | Whitelist of empty/non-text tags to preserve (e.g., images, figures, tables, line breaks). | If important non-text elements are being removed or you want to keep/drop more empty tags. |
|
78
|
+
| `attr_stop_words` | `set[str]` | common UI/navigation tokens | Tokens matched against `id`/`class`/`role`/`item_type` on small elements; matches are removed as likely non-content. | Add tokens like `advert`, `hero`, `menu` to aggressively drop UI chrome, or remove tokens if content is lost. |
|
79
|
+
| `remove_header_lists` | `bool` | `True` | Removes links/lists/images within `<header>` to reduce nav clutter. | Set `False` if your header contains meaningful content you want to keep. |
|
80
|
+
| `remove_footer_lists` | `bool` | `True` | Removes links/lists/images within `<footer>` to reduce boilerplate. | Set `False` for content-heavy footers you need. |
|
81
|
+
| `minify` | `bool` | `True` | Minifies output HTML using `minify_html`. | Set `False` for readability or debugging. |
|
82
|
+
| `minify_kwargs` | `dict` | `{}` | Extra options passed to `minify_html.minify`. | Tune minification behavior (e.g., whitespace, comments) without changing cleaning. |
|
83
|
+
| `meta` | `bool` | `False` | lxml Cleaner option: remove `<meta>` content when `True`. | Usually leave `False`; enable only for strict sanitation. |
|
84
|
+
| `page_structure` | `bool` | `False` | lxml Cleaner option: remove page-structure tags (e.g., `<head>`, `<body>`) when `True`. | Rarely needed; keep `False` to preserve structure. |
|
85
|
+
| `links` | `bool` | `True` | lxml Cleaner option: sanitize/clean links. | Leave `True` unless you need raw anchors untouched. |
|
86
|
+
| `scripts` | `bool` | `False` | lxml Cleaner option: remove `<script>` tags when `True`. | Keep `False` to preserve scripts; usually safe to remove via `javascript=True` anyway. |
|
87
|
+
| `javascript` | `bool` | `True` | lxml Cleaner option: remove JS and event handlers. | Set `False` only if you truly need inline JS (not recommended). |
|
88
|
+
| `comments` | `bool` | `True` | lxml Cleaner option: remove HTML comments. | Set `False` to retain comments for debugging. |
|
89
|
+
| `style` | `bool` | `True` | lxml Cleaner option: remove CSS and style attributes. | Set `False` to keep inline styles/CSS. |
|
90
|
+
| `processing_instructions` | `bool` | `True` | lxml Cleaner option: remove processing instructions. | Rarely change; keep for safety. |
|
91
|
+
| `embedded` | `bool` | `True` | lxml Cleaner option: remove embedded content (e.g., `<embed>`, `<object>`). | Set `False` to keep embedded media. |
|
92
|
+
| `frames` | `bool` | `True` | lxml Cleaner option: remove frames/iframes. | Set `False` if iframes contain needed content. |
|
93
|
+
| `forms` | `bool` | `True` | lxml Cleaner option: remove form elements. | Set `False` if you need to keep forms/inputs. |
|
94
|
+
| `annoying_tags` | `bool` | `True` | lxml Cleaner option: remove tags considered "annoying" by lxml (e.g., `<blink>`, `<marquee>`). | Rarely change. |
|
95
|
+
| `kill_tags` | `set[str] \| None` | `None` | Additional explicit tags to remove entirely. | Add site-specific or custom tags to drop. |
|
96
|
+
| `remove_unknown_tags` | `bool` | `True` | lxml Cleaner option: drop unknown/invalid tags. | Set `False` if you rely on custom elements. |
|
97
|
+
| `safe_attrs_only` | `bool` | `True` | Only allow attributes listed in `safe_attrs`. | Set `False` if you need to keep arbitrary attributes. |
|
98
|
+
| `safe_attrs` | `set[str]` | curated set | Allowed HTML attributes when `safe_attrs_only=True`. | Extend to keep additional attributes you trust. |
|
@@ -0,0 +1,51 @@
|
|
1
|
+
[project]
|
2
|
+
name = "smol-html"
|
3
|
+
version = "0.1.0"
|
4
|
+
description = "Small, dependable HTML cleaner/minifier with sensible defaults"
|
5
|
+
readme = { file = "README.md", content-type = "text/markdown" }
|
6
|
+
requires-python = ">=3.9"
|
7
|
+
authors = [
|
8
|
+
{ name = "Gareth Warburton", email = "garethw738@gmail.com" },
|
9
|
+
{ name = "Stuart Reid", email = "stuart@nosible.com" },
|
10
|
+
{ name = "Matthew Dicks", email = "matthew@nosible.com" },
|
11
|
+
{ name = "Richard Taylor", email = "richard@nosible.com" },
|
12
|
+
]
|
13
|
+
license = { text = "MIT" }
|
14
|
+
dependencies = [
|
15
|
+
"beautifulsoup4>=4.13.5",
|
16
|
+
"lxml[html-clean]>=6.0.1",
|
17
|
+
"minify-html>=0.16.4",
|
18
|
+
]
|
19
|
+
classifiers = [
|
20
|
+
"Development Status :: 4 - Beta",
|
21
|
+
"Intended Audience :: Developers",
|
22
|
+
"License :: OSI Approved :: MIT License",
|
23
|
+
"Programming Language :: Python :: 3",
|
24
|
+
"Programming Language :: Python :: 3 :: Only",
|
25
|
+
"Programming Language :: Python :: 3.9",
|
26
|
+
"Programming Language :: Python :: 3.10",
|
27
|
+
"Programming Language :: Python :: 3.11",
|
28
|
+
"Programming Language :: Python :: 3.12",
|
29
|
+
"Programming Language :: Python :: 3.13",
|
30
|
+
"Topic :: Internet :: WWW/HTTP :: Indexing/Search",
|
31
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
32
|
+
"Operating System :: OS Independent",
|
33
|
+
]
|
34
|
+
|
35
|
+
[project.urls]
|
36
|
+
Homepage = "https://github.com/NosibleAI/smol-html"
|
37
|
+
Repository = "https://github.com/NosibleAI/smol-html"
|
38
|
+
Issues = "https://github.com/NosibleAI/smol-html/issues"
|
39
|
+
|
40
|
+
[build-system]
|
41
|
+
requires = ["hatchling>=1.0.0"]
|
42
|
+
build-backend = "hatchling.build"
|
43
|
+
|
44
|
+
[tool.hatch.build.targets.wheel]
|
45
|
+
packages = ["src/smol_html"]
|
46
|
+
|
47
|
+
[tool.uv]
|
48
|
+
dev-dependencies = [
|
49
|
+
"pytest",
|
50
|
+
"pytest-xdist",
|
51
|
+
]
|
@@ -0,0 +1,299 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import minify_html
|
4
|
+
from bs4 import BeautifulSoup, Tag
|
5
|
+
from lxml import html as lxml_html
|
6
|
+
from lxml.html.clean import Cleaner
|
7
|
+
|
8
|
+
|
9
|
+
# -------------------------
|
10
|
+
# Public API
|
11
|
+
# -------------------------
|
12
|
+
class SmolHtmlCleaner:
|
13
|
+
"""
|
14
|
+
Small, dependable HTML cleaner/minifier with sensible defaults.
|
15
|
+
|
16
|
+
Parameters
|
17
|
+
----------
|
18
|
+
non_text_to_keep : set of str, optional
|
19
|
+
Tags preserved even if textless. Default includes meta/media/table/line-break tags.
|
20
|
+
attr_stop_words : set of str, optional
|
21
|
+
Attribute tokens indicating non-content scaffolding/UX. Default contains common UI tokens.
|
22
|
+
remove_header_lists : bool, optional
|
23
|
+
Prune links/lists inside ``<header>``. Default True.
|
24
|
+
remove_footer_lists : bool, optional
|
25
|
+
Prune links/lists inside ``<footer>``. Default True.
|
26
|
+
minify : bool, optional
|
27
|
+
Minify HTML output via ``minify_html``. Default True.
|
28
|
+
minify_kwargs : dict, optional
|
29
|
+
Extra args for ``minify_html.minify``. Default empty.
|
30
|
+
pre_parse_hooks : sequence of callables, optional
|
31
|
+
Functions ``(str) -> str`` applied before parsing.
|
32
|
+
post_clean_hooks : sequence of callables, optional
|
33
|
+
Functions ``(BeautifulSoup) -> BeautifulSoup`` applied after cleaning.
|
34
|
+
lxml_* : various, optional
|
35
|
+
Direct mapping to ``lxml.html.clean.Cleaner`` kwargs (e.g., ``lxml_comments``, ``lxml_style``).
|
36
|
+
|
37
|
+
Notes
|
38
|
+
-----
|
39
|
+
Defaults and cleaning behavior are preserved; only the configuration surface
|
40
|
+
moved from a dataclass to keyword-only parameters on the constructor.
|
41
|
+
"""
|
42
|
+
|
43
|
+
def __init__(
    self,
    *,
    # Core behavior
    non_text_to_keep: set[str] | None = None,
    attr_stop_words: set[str] | None = None,
    remove_header_lists: bool = True,
    remove_footer_lists: bool = True,
    # Minify
    minify: bool = True,
    minify_kwargs: dict | None = None,
    # lxml Cleaner options, forwarded under the same (unprefixed) names
    meta: bool = False,
    page_structure: bool = False,
    links: bool = True,
    scripts: bool = False,
    javascript: bool = True,
    comments: bool = True,
    style: bool = True,
    processing_instructions: bool = True,
    embedded: bool = True,
    frames: bool = True,
    forms: bool = True,
    annoying_tags: bool = True,
    kill_tags: set[str] | None = None,
    remove_unknown_tags: bool = True,
    safe_attrs_only: bool = True,
    safe_attrs: set[str] | None = None,
):
    """Store the cleaning options and build the underlying lxml ``Cleaner``.

    All arguments are keyword-only and optional.

    Parameters
    ----------
    non_text_to_keep : set of str, optional
        Tag names kept on the instance as ``non_text_to_keep``. ``None``
        selects the built-in meta/media/table/``br`` set.
    attr_stop_words : set of str, optional
        Tokens used by the attribute-based pruning stage. ``None`` selects
        the built-in set of common UI/navigation tokens.
    remove_header_lists, remove_footer_lists : bool, optional
        Toggles stored for the header/footer pruning stage.
    minify : bool, optional
        Whether sanitized markup is passed through ``minify_html.minify``.
    minify_kwargs : dict, optional
        Extra keyword arguments for ``minify_html.minify``.
    meta, page_structure, links, scripts, javascript, comments, style, \
processing_instructions, embedded, frames, forms, annoying_tags, kill_tags, \
remove_unknown_tags, safe_attrs_only, safe_attrs : optional
        Forwarded to ``lxml.html.clean.Cleaner`` under the same keyword
        names. ``safe_attrs=None`` selects the built-in attribute allow-list.
    """
    # Built-in defaults, used whenever the caller does not supply a collection.
    if safe_attrs is None:
        safe_attrs = {"href", "hreflang", "src", "srclang", "target", "alt", "kind", "type", "role", "abbr",
                      "accept", "accept-charset", "datetime", "lang", "name", "rel", "title", "value", "content",
                      "label", "item_type", "property", "itemprop"}

    if attr_stop_words is None:
        attr_stop_words = {"alert", "button", "checkbox", "dialog", "navigation", "tab", "tabpanel", "textbox",
                           "menu", "banner", "form", "search", "progressbar", "radio", "slider", "comment", "nav",
                           "sidebar", "breadcrumb", "dropdown", "menu-item", "toggle", "hamburger", "aside",
                           "tooltip", "modal", "overlay", "popup", "advert", "hero", "utility", "login", "signup",
                           "password", "email", "username"}

    if non_text_to_keep is None:
        non_text_to_keep = {"meta", "img", "picture", "figure", "figcaption", "video", "source", "audio", "table",
                            "tr", "th", "td", "thead", "tbody", "tfoot", "caption", "br"}

    # Keep private copies (as ``minify_kwargs`` already does) so that in-place
    # edits such as ``cleaner.attr_stop_words.add(...)`` never modify a
    # collection the caller still owns, and vice versa.
    self.non_text_to_keep = set(non_text_to_keep)
    self.attr_stop_words = set(attr_stop_words)
    self.remove_header_lists = remove_header_lists
    self.remove_footer_lists = remove_footer_lists
    self.minify = minify
    self.minify_kwargs = dict(minify_kwargs or {})

    # Initialize lxml Cleaner with explicit kwargs gathered from parameters
    self._cleaner = Cleaner(
        meta=meta,
        page_structure=page_structure,
        links=links,
        scripts=scripts,
        javascript=javascript,
        comments=comments,
        style=style,
        processing_instructions=processing_instructions,
        embedded=embedded,
        frames=frames,
        forms=forms,
        annoying_tags=annoying_tags,
        kill_tags=kill_tags,
        remove_unknown_tags=remove_unknown_tags,
        safe_attrs_only=safe_attrs_only,
        safe_attrs=set(safe_attrs),
    )
|
114
|
+
|
115
|
+
# -------------------------
|
116
|
+
# User-friendly entry points
|
117
|
+
# -------------------------
|
118
|
+
|
119
|
+
|
120
|
+
def clean(self, *, raw_html: str | BeautifulSoup) -> str:
    """Clean and optionally minify HTML input.

    The pipeline parses the input (when it is a string), removes small
    elements whose attributes contain a configured stop word, sanitizes the
    markup through the lxml ``Cleaner`` (minifying it when ``self.minify``
    is set), re-parses the result and finally runs the structural pruning
    helpers ``_prune_header_footer``, ``_prune_body`` and
    ``_drop_empty_leaf_nodes``.

    Parameters
    ----------
    raw_html : str or BeautifulSoup
        Raw HTML string or BeautifulSoup to be cleaned. A ``BeautifulSoup``
        argument is copied first and is never modified.

    Returns
    -------
    str
        Cleaned HTML as a string.

    Raises
    ------
    TypeError
        If ``raw_html`` is neither a ``str`` nor a ``BeautifulSoup``.
    """
    import copy  # local import keeps this method self-contained

    # Stage 0: obtain a parse tree that this method owns
    if isinstance(raw_html, str):
        soup = BeautifulSoup(raw_html, features="lxml")
    elif isinstance(raw_html, BeautifulSoup):
        # Stage 1 decomposes nodes in place; work on a copy so the caller's
        # document is left intact.
        soup = copy.deepcopy(raw_html)
    else:
        raise TypeError("raw_html must be a str or BeautifulSoup instance")

    # Stage 1: attribute-based pruning
    # Remove small, likely non-content elements based on attribute tokens.
    self._strip_by_attribute_stop_words(soup=soup)

    # Stage 2: lxml cleaner pass (sanitation, plus minification if enabled)
    cleaned_html = self._lxml_clean(str(soup))
    clean_soup = BeautifulSoup(markup=cleaned_html, features="lxml")

    # Stage 3: structural pruning on header/body/footer of the cleaned soup
    self._prune_header_footer(clean_soup)
    self._prune_body(clean_soup)
    self._drop_empty_leaf_nodes(clean_soup)

    return str(clean_soup)
|
161
|
+
|
162
|
+
# -------------------------
|
163
|
+
# Internal helpers
|
164
|
+
# -------------------------
|
165
|
+
def _lxml_clean(self, html_str: str) -> str:
|
166
|
+
"""Sanitize and optionally minify HTML using lxml + minify_html.
|
167
|
+
|
168
|
+
Parameters
|
169
|
+
----------
|
170
|
+
html_str : str
|
171
|
+
HTML markup to be cleaned.
|
172
|
+
|
173
|
+
Returns
|
174
|
+
-------
|
175
|
+
str
|
176
|
+
Cleaned (and possibly minified) HTML markup.
|
177
|
+
"""
|
178
|
+
try:
|
179
|
+
cleaned = self._cleaner.clean_html(html_str)
|
180
|
+
return minify_html.minify(cleaned, **self.minify_kwargs) if self.minify else cleaned
|
181
|
+
except ValueError as ex:
|
182
|
+
# Handle encoding declaration edge-cases by round-tripping via lxml
|
183
|
+
msg = (
|
184
|
+
"Unicode strings with encoding declaration are not supported. "
|
185
|
+
"Please use bytes input or XML fragments without declaration."
|
186
|
+
)
|
187
|
+
if str(ex) == msg:
|
188
|
+
raw_bytes = html_str.encode("utf-8", errors="ignore")
|
189
|
+
doc = lxml_html.fromstring(raw_bytes)
|
190
|
+
cleaned = self._cleaner.clean_html(doc)
|
191
|
+
rendered = lxml_html.tostring(cleaned, encoding="utf-8").decode("utf-8")
|
192
|
+
return minify_html.minify(rendered, **self.minify_kwargs) if self.minify else rendered
|
193
|
+
raise
|
194
|
+
|
195
|
+
def _strip_by_attribute_stop_words(self, *, soup: BeautifulSoup) -> None:
    """Remove small, likely non-content elements by attribute tokens.

    Scans the descendants of ``<body>`` (or of the whole document when no
    ``<body>`` is found) and collects every tag that

    * has at least one attribute,
    * has at most one descendant node (a "leaf-ish" element), and
    * has an ``id``, ``class``, ``role`` or ``item_type`` value containing
      any of the ``attr_stop_words`` tokens as a substring.

    The attribute value is lower-cased before matching; the stop words are
    compared as given. Multi-valued attributes (lists/tuples) are joined
    without a separator before matching. Collected tags are decomposed only
    after the scan has finished, so the tree is not mutated while it is
    being iterated.

    NOTE(review): the HTML microdata attribute is spelled ``itemtype``; the
    ``item_type`` name is kept as-is here to preserve existing behavior --
    confirm whether it is intentional.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed document to prune in place.
    """
    body = soup.find("body") or soup
    to_decompose: list[Tag] = []
    for el in body.descendants:
        if not isinstance(el, Tag):
            continue
        attrs = el.attrs if isinstance(el.attrs, dict) else {}
        if not attrs:
            continue
        # Only prune simple leaf-ish nodes to avoid huge deletes
        # unintentionally. Stop counting as soon as a second descendant is
        # seen instead of walking the element's whole subtree.
        descendant_count = 0
        for _ in el.descendants:
            descendant_count += 1
            if descendant_count > 1:
                break
        if descendant_count > 1:
            continue
        for name in ("id", "class", "role", "item_type"):
            val = attrs.get(name)
            if val is None:
                continue
            if isinstance(val, (list, tuple)):
                # Match baseline behavior: concatenate tokens without separator
                val_str = "".join(map(str, val))
            else:
                val_str = str(val)
            # Lower-case once rather than once per stop word.
            lowered = val_str.lower()
            if any(sw in lowered for sw in self.attr_stop_words):
                to_decompose.append(el)
                break
    for el in to_decompose:
        el.decompose()
|
233
|
+
|
234
|
+
def _prune_header_footer(self, soup: BeautifulSoup) -> None:
    """Strip link, image and list tags from the header and footer.

    Looks up the first ``<header>`` and the first ``<footer>`` in ``soup``.
    For each one that exists, and whose toggle (``remove_header_lists`` /
    ``remove_footer_lists``) is truthy, every descendant ``<a>``, ``<img>``,
    ``<ol>``, ``<ul>`` and ``<li>`` tag is decomposed.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed document to prune in place.
    """
    clutter_tags = {"a", "img", "ol", "ul", "li"}
    # Resolve both containers before mutating anything.
    header_el = soup.find("header")
    footer_el = soup.find("footer")
    if header_el and self.remove_header_lists:
        self._decompose_tags(header_el, clutter_tags)
    if footer_el and self.remove_footer_lists:
        self._decompose_tags(footer_el, clutter_tags)
|
246
|
+
|
247
|
+
def _prune_body(self, soup: BeautifulSoup) -> None:
    """Remove form controls and other non-text widgets from the body.

    Scans the descendants of ``<body>`` (or of the whole document when no
    ``<body>`` is found) and decomposes every tag whose name is in the fixed
    removal set below, unless that name is listed in ``non_text_to_keep``.
    Matching tags are collected first and decomposed after the scan.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed document to prune in place.
    """
    scope = soup.find("body") or soup
    removable_names = {
        "input", "textarea", "button", "select", "option", "optgroup", "datalist",
        "label", "fieldset", "legend", "output", "meter", "dialog", "form",
        "search", "progress", "svg", "canvas", "use", "nav", "object", "noscript",
    }
    doomed = [
        node
        for node in scope.descendants
        if isinstance(node, Tag)
        and isinstance(node.name, str)
        and node.name not in self.non_text_to_keep
        and node.name in removable_names
    ]
    for node in doomed:
        node.decompose()
|
266
|
+
|
267
|
+
def _drop_empty_leaf_nodes(self, soup: BeautifulSoup) -> None:
    """Iteratively remove empty leaves using the baseline's strict leaf check.

    Repeatedly scans the descendants of ``<body>`` (or of the whole document
    when no ``<body>`` is found) for tags that have no descendants at all
    and whose name is not listed in ``non_text_to_keep``, and decomposes
    them. Removing a leaf can leave its parent empty, so the scan is
    repeated until a pass finds nothing to remove.

    A tag with zero descendants cannot contain any text node, so no separate
    text check is needed.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed document to prune in place.
    """
    body = soup.find("body") or soup
    while True:
        to_decompose: list[Tag] = [
            el
            for el in body.descendants
            if isinstance(el, Tag)
            and isinstance(el.name, str)
            and el.name not in self.non_text_to_keep
            # Baseline leaf check: element must have zero descendants at
            # all. Probe for the first descendant only instead of
            # materializing the whole subtree.
            and not any(True for _ in el.descendants)
        ]
        if not to_decompose:
            break
        for el in to_decompose:
            el.decompose()
|
294
|
+
|
295
|
+
@staticmethod
def _decompose_tags(root: Tag, names: set[str]) -> None:
    """Decompose every descendant tag of ``root`` whose name is in ``names``.

    The descendants are snapshotted into a list before any removal, so the
    tree is not mutated while it is being iterated.

    Parameters
    ----------
    root : Tag
        Element whose descendants are inspected; ``root`` itself is kept.
    names : set[str]
        Tag names to remove.
    """
    snapshot = list(root.descendants)
    for node in snapshot:
        if not isinstance(node, Tag):
            continue
        tag_name = node.name
        if isinstance(tag_name, str) and tag_name in names:
            node.decompose()
|