veilrender 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- veilrender-0.1.0/PKG-INFO +129 -0
- veilrender-0.1.0/README.md +109 -0
- veilrender-0.1.0/pyproject.toml +61 -0
- veilrender-0.1.0/setup.cfg +4 -0
- veilrender-0.1.0/src/veilrender/__init__.py +3 -0
- veilrender-0.1.0/src/veilrender/__main__.py +6 -0
- veilrender-0.1.0/src/veilrender/_vendor/__init__.py +2 -0
- veilrender-0.1.0/src/veilrender/_vendor/benchmark_compare.py +323 -0
- veilrender-0.1.0/src/veilrender/_vendor/cache.py +1023 -0
- veilrender-0.1.0/src/veilrender/_vendor/config.py +713 -0
- veilrender-0.1.0/src/veilrender/_vendor/dotenv.py +514 -0
- veilrender-0.1.0/src/veilrender/_vendor/httpserver.py +1007 -0
- veilrender-0.1.0/src/veilrender/_vendor/jsonc.py +352 -0
- veilrender-0.1.0/src/veilrender/_vendor/markdown.py +904 -0
- veilrender-0.1.0/src/veilrender/_vendor/readability.py +1002 -0
- veilrender-0.1.0/src/veilrender/_vendor/retry.py +503 -0
- veilrender-0.1.0/src/veilrender/_vendor/soup.py +998 -0
- veilrender-0.1.0/src/veilrender/_vendor/structlog.py +888 -0
- veilrender-0.1.0/src/veilrender/_vendor/useragent.py +475 -0
- veilrender-0.1.0/src/veilrender/_vendor/yaml.py +1124 -0
- veilrender-0.1.0/src/veilrender/app.py +158 -0
- veilrender-0.1.0/src/veilrender/auth.py +39 -0
- veilrender-0.1.0/src/veilrender/browser.py +172 -0
- veilrender-0.1.0/src/veilrender/cdp_proxy.py +314 -0
- veilrender-0.1.0/src/veilrender/config.py +25 -0
- veilrender-0.1.0/src/veilrender/models.py +109 -0
- veilrender-0.1.0/src/veilrender/routes/__init__.py +1 -0
- veilrender-0.1.0/src/veilrender/routes/health.py +17 -0
- veilrender-0.1.0/src/veilrender/routes/render.py +122 -0
- veilrender-0.1.0/src/veilrender/routes/screenshot.py +65 -0
- veilrender-0.1.0/src/veilrender.egg-info/PKG-INFO +129 -0
- veilrender-0.1.0/src/veilrender.egg-info/SOURCES.txt +34 -0
- veilrender-0.1.0/src/veilrender.egg-info/dependency_links.txt +1 -0
- veilrender-0.1.0/src/veilrender.egg-info/entry_points.txt +2 -0
- veilrender-0.1.0/src/veilrender.egg-info/requires.txt +7 -0
- veilrender-0.1.0/src/veilrender.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: veilrender
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Headless browser rendering API — self-hostable on HF Spaces, Docker, or bare metal
|
|
5
|
+
Author: Peng Ding
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Repository, https://github.com/Oaklight/veilrender
|
|
8
|
+
Project-URL: Issues, https://github.com/Oaklight/veilrender/issues
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
12
|
+
Requires-Python: >=3.10
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
Requires-Dist: cloakbrowser>=0.3.0
|
|
15
|
+
Provides-Extra: dev
|
|
16
|
+
Requires-Dist: ruff>=0.11.0; extra == "dev"
|
|
17
|
+
Requires-Dist: ty>=0.0.1a0; extra == "dev"
|
|
18
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
19
|
+
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
|
|
20
|
+
|
|
21
|
+
# VeilRender
|
|
22
|
+
|
|
23
|
+
[PyPI](https://pypi.org/project/veilrender/)
|
|
24
|
+
[Latest release](https://github.com/Oaklight/veilrender/releases/latest)
|
|
25
|
+
[CI](https://github.com/Oaklight/veilrender/actions/workflows/ci.yml)
|
|
26
|
+
[Docker Hub](https://hub.docker.com/r/oaklight/veilrender)
|
|
27
|
+
[Docker Hub](https://hub.docker.com/r/oaklight/veilrender)
|
|
28
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
29
|
+
[Hugging Face Space](https://huggingface.co/spaces/oaklight/veilrender)
|
|
30
|
+
|
|
31
|
+
[中文](README_zh.md) | **English**
|
|
32
|
+
|
|
33
|
+
Headless browser rendering API — self-hostable on HF Spaces, Docker, or bare metal.
|
|
34
|
+
|
|
35
|
+
VeilRender accepts a URL and returns the fully rendered page content (HTML, Markdown, readability-extracted article) using a headless Chromium browser. Designed as a fallback for fetch tools that fail on JavaScript-rendered pages.
|
|
36
|
+
|
|
37
|
+
## Quick Start
|
|
38
|
+
|
|
39
|
+
### Docker
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
docker run -p 7860:7860 -e VEILRENDER_API_TOKEN=your-secret ghcr.io/oaklight/veilrender
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
### Local Development
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
pip install -e ".[dev]"
|
|
49
|
+
playwright install chromium
|
|
50
|
+
python -m veilrender
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## API
|
|
54
|
+
|
|
55
|
+
### GET /health
|
|
56
|
+
|
|
57
|
+
Returns `{"status": "ok"}` if the service is running.
|
|
58
|
+
|
|
59
|
+
### POST /render
|
|
60
|
+
|
|
61
|
+
Render a URL and return the page content.
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
curl -X POST http://localhost:7860/render \
|
|
65
|
+
-H "Authorization: Bearer your-secret" \
|
|
66
|
+
-H "Content-Type: application/json" \
|
|
67
|
+
-d '{"url": "https://example.com"}'
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
Response:
|
|
71
|
+
|
|
72
|
+
```json
|
|
73
|
+
{
|
|
74
|
+
"content": {
|
|
75
|
+
"html": "...",
|
|
76
|
+
"markdown": "...",
|
|
77
|
+
"readability": "..."
|
|
78
|
+
},
|
|
79
|
+
"metadata": {
|
|
80
|
+
"title": "Example Domain",
|
|
81
|
+
"url": "https://example.com",
|
|
82
|
+
"status_code": 200
|
|
83
|
+
},
|
|
84
|
+
"links": [{"url": "https://www.iana.org/domains/example", "text": "More information..."}]
|
|
85
|
+
}
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### POST /screenshot
|
|
89
|
+
|
|
90
|
+
Capture a screenshot of a URL.
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
curl -X POST http://localhost:7860/screenshot \
|
|
94
|
+
-H "Authorization: Bearer your-secret" \
|
|
95
|
+
-H "Content-Type: application/json" \
|
|
96
|
+
-d '{"url": "https://example.com"}' \
|
|
97
|
+
-o screenshot.png
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## Configuration
|
|
101
|
+
|
|
102
|
+
All settings are configured via environment variables with the `VEILRENDER_` prefix:
|
|
103
|
+
|
|
104
|
+
| Variable | Default | Description |
|
|
105
|
+
|----------|---------|-------------|
|
|
106
|
+
| `VEILRENDER_API_TOKEN` | *(none)* | API token for authentication. If unset, auth is disabled. |
|
|
107
|
+
| `VEILRENDER_PORT` | `7860` | Server port |
|
|
108
|
+
| `VEILRENDER_HOST` | `0.0.0.0` | Server bind address |
|
|
109
|
+
| `VEILRENDER_TIMEOUT` | `30000` | Browser navigation timeout (ms) |
|
|
110
|
+
| `VEILRENDER_VIEWPORT_WIDTH` | `1280` | Browser viewport width |
|
|
111
|
+
| `VEILRENDER_VIEWPORT_HEIGHT` | `720` | Browser viewport height |
|
|
112
|
+
| `VEILRENDER_MAX_CONCURRENT` | `3` | Max concurrent browser contexts |
|
|
113
|
+
|
|
114
|
+
## Benchmark
|
|
115
|
+
|
|
116
|
+
Tested on HF Spaces (free tier, 2 vCPU) and a self-hosted VPS (3 vCPU, 1 GB container). **100% success rate** across all 46 requests per target.
|
|
117
|
+
|
|
118
|
+
| Test | HF Spaces | Self-hosted |
|
|
119
|
+
|------|-----------|-------------|
|
|
120
|
+
| Sequential × 5 (mixed URLs) | 8.72 s total | 11.81 s total |
|
|
121
|
+
| Concurrent × 10 (mixed URLs) | 1.40 – 9.37 s | 1.29 – 13.45 s |
|
|
122
|
+
| Rapid-fire × 20 (sequential) | 0.885 s avg | 1.029 s avg |
|
|
123
|
+
| Peak container memory | — | 614 MiB / 1 GB |
|
|
124
|
+
|
|
125
|
+
Full results: [BENCHMARK.md](BENCHMARK.md)
|
|
126
|
+
|
|
127
|
+
## License
|
|
128
|
+
|
|
129
|
+
MIT
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
# VeilRender
|
|
2
|
+
|
|
3
|
+
[PyPI](https://pypi.org/project/veilrender/)
|
|
4
|
+
[Latest release](https://github.com/Oaklight/veilrender/releases/latest)
|
|
5
|
+
[CI](https://github.com/Oaklight/veilrender/actions/workflows/ci.yml)
|
|
6
|
+
[Docker Hub](https://hub.docker.com/r/oaklight/veilrender)
|
|
7
|
+
[Docker Hub](https://hub.docker.com/r/oaklight/veilrender)
|
|
8
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
9
|
+
[Hugging Face Space](https://huggingface.co/spaces/oaklight/veilrender)
|
|
10
|
+
|
|
11
|
+
[中文](README_zh.md) | **English**
|
|
12
|
+
|
|
13
|
+
Headless browser rendering API — self-hostable on HF Spaces, Docker, or bare metal.
|
|
14
|
+
|
|
15
|
+
VeilRender accepts a URL and returns the fully rendered page content (HTML, Markdown, readability-extracted article) using a headless Chromium browser. Designed as a fallback for fetch tools that fail on JavaScript-rendered pages.
|
|
16
|
+
|
|
17
|
+
## Quick Start
|
|
18
|
+
|
|
19
|
+
### Docker
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
docker run -p 7860:7860 -e VEILRENDER_API_TOKEN=your-secret ghcr.io/oaklight/veilrender
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
### Local Development
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip install -e ".[dev]"
|
|
29
|
+
playwright install chromium
|
|
30
|
+
python -m veilrender
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## API
|
|
34
|
+
|
|
35
|
+
### GET /health
|
|
36
|
+
|
|
37
|
+
Returns `{"status": "ok"}` if the service is running.
|
|
38
|
+
|
|
39
|
+
### POST /render
|
|
40
|
+
|
|
41
|
+
Render a URL and return the page content.
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
curl -X POST http://localhost:7860/render \
|
|
45
|
+
-H "Authorization: Bearer your-secret" \
|
|
46
|
+
-H "Content-Type: application/json" \
|
|
47
|
+
-d '{"url": "https://example.com"}'
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
Response:
|
|
51
|
+
|
|
52
|
+
```json
|
|
53
|
+
{
|
|
54
|
+
"content": {
|
|
55
|
+
"html": "...",
|
|
56
|
+
"markdown": "...",
|
|
57
|
+
"readability": "..."
|
|
58
|
+
},
|
|
59
|
+
"metadata": {
|
|
60
|
+
"title": "Example Domain",
|
|
61
|
+
"url": "https://example.com",
|
|
62
|
+
"status_code": 200
|
|
63
|
+
},
|
|
64
|
+
"links": [{"url": "https://www.iana.org/domains/example", "text": "More information..."}]
|
|
65
|
+
}
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### POST /screenshot
|
|
69
|
+
|
|
70
|
+
Capture a screenshot of a URL.
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
curl -X POST http://localhost:7860/screenshot \
|
|
74
|
+
-H "Authorization: Bearer your-secret" \
|
|
75
|
+
-H "Content-Type: application/json" \
|
|
76
|
+
-d '{"url": "https://example.com"}' \
|
|
77
|
+
-o screenshot.png
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Configuration
|
|
81
|
+
|
|
82
|
+
All settings are configured via environment variables with the `VEILRENDER_` prefix:
|
|
83
|
+
|
|
84
|
+
| Variable | Default | Description |
|
|
85
|
+
|----------|---------|-------------|
|
|
86
|
+
| `VEILRENDER_API_TOKEN` | *(none)* | API token for authentication. If unset, auth is disabled. |
|
|
87
|
+
| `VEILRENDER_PORT` | `7860` | Server port |
|
|
88
|
+
| `VEILRENDER_HOST` | `0.0.0.0` | Server bind address |
|
|
89
|
+
| `VEILRENDER_TIMEOUT` | `30000` | Browser navigation timeout (ms) |
|
|
90
|
+
| `VEILRENDER_VIEWPORT_WIDTH` | `1280` | Browser viewport width |
|
|
91
|
+
| `VEILRENDER_VIEWPORT_HEIGHT` | `720` | Browser viewport height |
|
|
92
|
+
| `VEILRENDER_MAX_CONCURRENT` | `3` | Max concurrent browser contexts |
|
|
93
|
+
|
|
94
|
+
## Benchmark
|
|
95
|
+
|
|
96
|
+
Tested on HF Spaces (free tier, 2 vCPU) and a self-hosted VPS (3 vCPU, 1 GB container). **100% success rate** across all 46 requests per target.
|
|
97
|
+
|
|
98
|
+
| Test | HF Spaces | Self-hosted |
|
|
99
|
+
|------|-----------|-------------|
|
|
100
|
+
| Sequential × 5 (mixed URLs) | 8.72 s total | 11.81 s total |
|
|
101
|
+
| Concurrent × 10 (mixed URLs) | 1.40 – 9.37 s | 1.29 – 13.45 s |
|
|
102
|
+
| Rapid-fire × 20 (sequential) | 0.885 s avg | 1.029 s avg |
|
|
103
|
+
| Peak container memory | — | 614 MiB / 1 GB |
|
|
104
|
+
|
|
105
|
+
Full results: [BENCHMARK.md](BENCHMARK.md)
|
|
106
|
+
|
|
107
|
+
## License
|
|
108
|
+
|
|
109
|
+
MIT
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "veilrender"
|
|
7
|
+
dynamic = ["version"]
|
|
8
|
+
description = "Headless browser rendering API — self-hostable on HF Spaces, Docker, or bare metal"
|
|
9
|
+
authors = [{ name = "Peng Ding" }]
|
|
10
|
+
readme = "README.md"
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
license = "MIT"
|
|
13
|
+
classifiers = [
|
|
14
|
+
"Intended Audience :: Developers",
|
|
15
|
+
"Programming Language :: Python :: 3",
|
|
16
|
+
"Topic :: Internet :: WWW/HTTP",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
dependencies = [
|
|
20
|
+
"cloakbrowser>=0.3.0",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
[project.optional-dependencies]
|
|
24
|
+
dev = [
|
|
25
|
+
"ruff>=0.11.0",
|
|
26
|
+
"ty>=0.0.1a0",
|
|
27
|
+
"pytest>=7.0.0",
|
|
28
|
+
"pytest-asyncio>=0.21.0",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
[project.scripts]
|
|
32
|
+
veilrender = "veilrender.app:main"
|
|
33
|
+
|
|
34
|
+
[project.urls]
|
|
35
|
+
Repository = "https://github.com/Oaklight/veilrender"
|
|
36
|
+
Issues = "https://github.com/Oaklight/veilrender/issues"
|
|
37
|
+
|
|
38
|
+
[tool.setuptools.packages.find]
|
|
39
|
+
where = ["src"]
|
|
40
|
+
|
|
41
|
+
[tool.setuptools.dynamic]
|
|
42
|
+
version = { attr = "veilrender.__version__" }
|
|
43
|
+
|
|
44
|
+
[tool.setuptools.package-data]
|
|
45
|
+
"veilrender" = ["py.typed"]
|
|
46
|
+
|
|
47
|
+
[tool.ruff]
|
|
48
|
+
target-version = "py310"
|
|
49
|
+
|
|
50
|
+
[tool.ruff.lint]
|
|
51
|
+
select = ["E", "F", "UP"]
|
|
52
|
+
ignore = ["UP007", "E501"]
|
|
53
|
+
|
|
54
|
+
[tool.ty.environment]
|
|
55
|
+
python-version = "3.10"
|
|
56
|
+
|
|
57
|
+
[tool.ty.src]
|
|
58
|
+
exclude = ["src/veilrender/_vendor/**"]
|
|
59
|
+
|
|
60
|
+
[tool.ty.rules]
|
|
61
|
+
unresolved-import = "ignore"
|
|
@@ -0,0 +1,323 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Three-way readability benchmark: zerodep vs readability-lxml vs Mozilla JS.
|
|
3
|
+
|
|
4
|
+
Runs each implementation on Mozilla's test fixtures and prints a comparison
|
|
5
|
+
table. JS timing is measured internally by bench_mozilla.js (no subprocess
|
|
6
|
+
overhead in the numbers).
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
python benchmark_compare.py # all fixtures
|
|
10
|
+
python benchmark_compare.py 001 bbc-1 # specific fixtures
|
|
11
|
+
python benchmark_compare.py --rounds 20 # more rounds
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import argparse
|
|
17
|
+
import json
|
|
18
|
+
import os
|
|
19
|
+
import shutil
|
|
20
|
+
import subprocess
|
|
21
|
+
import sys
|
|
22
|
+
import timeit
|
|
23
|
+
|
|
24
|
+
# ── Setup paths ──────────────────────────────────────────────────────────────
|
|
25
|
+
|
|
26
|
+
# Absolute directory of this script; every other path below is derived from it.
_THIS_DIR = os.path.dirname(os.path.abspath(__file__))
# Fixture root: discover_fixtures() treats each sub-directory that contains a
# source.html file as one fixture.
_TEST_PAGES_DIR = os.path.join(_THIS_DIR, "test-pages")
# Node.js script that bench_mozilla_js() passes to `node`.
_BENCH_JS = os.path.join(_THIS_DIR, "bench_mozilla.js")

# Search this directory first so that `from readability import extract` in
# bench_zerodep() can resolve to a module located next to this script
# (presumably the sibling readability.py -- confirm against the package layout).
sys.path.insert(0, _THIS_DIR)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# ── Discover fixtures ────────────────────────────────────────────────────────
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def discover_fixtures() -> list[str]:
    """List the fixtures available under the test-pages directory.

    A fixture is any sub-directory of ``_TEST_PAGES_DIR`` that contains a
    ``source.html`` file.

    Returns:
        The fixture directory names in sorted order, or an empty list when
        the test-pages directory does not exist.
    """
    if not os.path.isdir(_TEST_PAGES_DIR):
        return []

    names = []
    for entry in os.listdir(_TEST_PAGES_DIR):
        fixture_dir = os.path.join(_TEST_PAGES_DIR, entry)
        if not os.path.isdir(fixture_dir):
            continue
        if os.path.isfile(os.path.join(fixture_dir, "source.html")):
            names.append(entry)
    names.sort()
    return names
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def load_source(name: str) -> str:
    """Read the ``source.html`` of fixture *name* and return it as text.

    The file is looked up under ``_TEST_PAGES_DIR/<name>/`` and decoded as
    UTF-8.
    """
    fixture_dir = os.path.join(_TEST_PAGES_DIR, name)
    with open(os.path.join(fixture_dir, "source.html"), encoding="utf-8") as handle:
        contents = handle.read()
    return contents
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# ── Python: zerodep readability ──────────────────────────────────────────────
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def bench_zerodep(html: str, rounds: int) -> dict:
    """Benchmark ``readability.extract()`` on *html*.

    One untimed warm-up call is made first; its result supplies the reported
    title and length. The call is then timed ``rounds`` times.

    Args:
        html: HTML document to run the extractor on.
        rounds: Number of timed calls; must be at least 1.

    Returns:
        A dict with ``times_ms`` (one entry per round), ``min_ms``,
        ``mean_ms``, ``max_ms``, plus ``title`` and ``length`` taken from the
        warm-up result.

    Raises:
        ValueError: If ``rounds`` is less than 1.
    """
    # With no rounds the timing list would be empty and min()/mean below
    # would fail with an unhelpful error, so reject the argument up front.
    if rounds < 1:
        raise ValueError(f"rounds must be >= 1, got {rounds}")

    from readability import extract

    # Warm-up.
    result = extract(html)

    times = []
    for _ in range(rounds):
        t0 = timeit.default_timer()
        extract(html)
        t1 = timeit.default_timer()
        times.append((t1 - t0) * 1000)  # ms

    return {
        "times_ms": times,
        "min_ms": min(times),
        "mean_ms": sum(times) / len(times),
        "max_ms": max(times),
        "title": result.title,
        "length": result.length,
    }
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# ── Python: readability-lxml ────────────────────────────────────────────────
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _load_readability_lxml():
    """Load readability-lxml's Document class, working around name clash.

    ``bench_zerodep`` imports a module named ``readability`` and this
    function also imports a module named ``readability``. To get a fresh
    lookup, any already-imported ``readability`` modules are detached from
    ``sys.modules`` and this script's directory is hidden from ``sys.path``
    for the duration of the import; both are restored before returning.

    Returns:
        The ``Document`` attribute of the imported ``readability`` module,
        or ``None`` when the ``readability-lxml`` distribution is not
        installed.
    """
    import importlib
    import importlib.metadata

    try:
        # Only an installation probe; the version string itself is discarded.
        importlib.metadata.version("readability-lxml")
    except importlib.metadata.PackageNotFoundError:
        return None

    # Snapshot the import state so the finally block can put it back.
    saved_path = sys.path[:]
    # Pop (not just copy) cached "readability" modules so the import below
    # cannot be satisfied from the module cache.
    saved_modules = {
        k: sys.modules.pop(k)
        for k in list(sys.modules)
        if k == "readability" or k.startswith("readability.")
    }
    try:
        # Hide this script's directory so the import cannot resolve to a
        # module that lives next to this script.
        sys.path = [
            p for p in sys.path if os.path.abspath(p) != os.path.abspath(_THIS_DIR)
        ]
        mod = importlib.import_module("readability")
        return mod.Document
    finally:
        sys.path = saved_path
        # Remove whatever was just imported under the "readability" name,
        # then reinstate the modules that were cached before this call.
        for k in list(sys.modules):
            if k == "readability" or k.startswith("readability."):
                del sys.modules[k]
        sys.modules.update(saved_modules)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
# Resolved once at import time; None when readability-lxml is not installed.
_RefDocument = _load_readability_lxml()
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def bench_readability_lxml(html: str, rounds: int) -> dict | None:
    """Benchmark readability-lxml on *html*.

    One untimed warm-up run is made first; the length of its summary is the
    reported ``length``. Building a document and calling ``summary()`` is
    then timed ``rounds`` times.

    Args:
        html: HTML document to run the extractor on.
        rounds: Number of timed runs; must be at least 1.

    Returns:
        A dict with ``times_ms``, ``min_ms``, ``mean_ms``, ``max_ms``,
        ``title`` and ``length``, or ``None`` when readability-lxml is not
        available (``_RefDocument`` is ``None``).

    Raises:
        ValueError: If readability-lxml is available and ``rounds`` is less
            than 1.
    """
    if _RefDocument is None:
        return None
    # With no rounds the timing list would be empty and min()/mean below
    # would fail with an unhelpful error.
    if rounds < 1:
        raise ValueError(f"rounds must be >= 1, got {rounds}")

    # Warm-up.
    doc = _RefDocument(html)
    summary = doc.summary()

    times = []
    for _ in range(rounds):
        t0 = timeit.default_timer()
        doc = _RefDocument(html)
        doc.summary()
        t1 = timeit.default_timer()
        times.append((t1 - t0) * 1000)  # ms

    # Title comes from the document's short_title() when that method exists;
    # length is the character count of the warm-up summary.
    title = doc.short_title() if hasattr(doc, "short_title") else ""
    length = len(summary) if summary else 0

    return {
        "times_ms": times,
        "min_ms": min(times),
        "mean_ms": sum(times) / len(times),
        "max_ms": max(times),
        "title": title,
        "length": length,
    }
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
# ── JavaScript: Mozilla Readability.js ───────────────────────────────────────
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def bench_mozilla_js(fixture_name: str, rounds: int) -> dict | None:
    """Benchmark Mozilla Readability.js via a Node.js subprocess.

    Timing is measured internally by bench_mozilla.js, so subprocess start-up
    is not part of the reported numbers.

    Args:
        fixture_name: Fixture whose ``source.html`` is passed to the script.
        rounds: Number of rounds, forwarded to the script as a string.

    Returns:
        The JSON object printed by the script, or ``None`` when ``node`` or
        the script is missing, the process cannot be started, times out
        (120 s) or exits non-zero, or its output is not a JSON object with a
        numeric ``mean_ms``.
    """
    node = shutil.which("node")
    if not node:
        return None
    if not os.path.isfile(_BENCH_JS):
        return None

    source_path = os.path.join(_TEST_PAGES_DIR, fixture_name, "source.html")
    try:
        # Run the executable that which() resolved rather than relying on a
        # second PATH lookup by the OS.
        result = subprocess.run(
            [node, _BENCH_JS, source_path, str(rounds)],
            capture_output=True,
            text=True,
            timeout=120,
            cwd=_THIS_DIR,
        )
    except (subprocess.TimeoutExpired, OSError):
        # OSError covers a vanished or non-executable node binary.
        return None

    if result.returncode != 0:
        print(f" [JS error: {result.stderr.strip()[:100]}]", file=sys.stderr)
        return None

    try:
        payload = json.loads(result.stdout)
    except json.JSONDecodeError:
        return None

    # print_results() reads payload["mean_ms"]; reject anything else here so
    # a malformed payload shows up as "n/a" instead of a KeyError later.
    if not isinstance(payload, dict) or not isinstance(
        payload.get("mean_ms"), (int, float)
    ):
        print(" [JS error: unexpected benchmark output]", file=sys.stderr)
        return None
    return payload
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
# ── Output formatting ────────────────────────────────────────────────────────
|
|
181
|
+
|
|
182
|
+
# ANSI colors (disabled if not a terminal).
|
|
183
|
+
# Escape sequences used to style the table; all empty when stdout is not a
# terminal so redirected output stays free of control codes.
_BOLD, _GREEN, _YELLOW, _CYAN, _RESET, _DIM = (
    ("\033[1m", "\033[32m", "\033[33m", "\033[36m", "\033[0m", "\033[2m")
    if sys.stdout.isatty()
    else ("", "", "", "", "", "")
)
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def _fmt_ms(ms: float) -> str:
    """Render a duration given in milliseconds using a readable unit.

    Values under 1 ms are shown in whole microseconds, values under 1000 ms
    in milliseconds with one decimal, and everything else in seconds with
    two decimals.
    """
    if ms < 1:
        text = f"{ms * 1000:.0f} µs"
    elif ms < 1000:
        text = f"{ms:.1f} ms"
    else:
        text = f"{ms / 1000:.2f} s"
    return text
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def _ratio_str(ms: float, baseline: float) -> str:
    """Format ``ms`` as a multiple of ``baseline``, e.g. ``"1.37x"``.

    The actual ratio is always printed. Ratios below 1.05 (on par with, or
    faster than, the baseline) are coloured green and slower ones yellow.
    Earlier, every ratio below 1.05 was printed as the literal "1.00x",
    which hid cases where another implementation beat the baseline.

    Args:
        ms: Measured time of the implementation being reported.
        baseline: Time it is compared against.

    Returns:
        The coloured ratio string, or ``""`` when ``baseline`` is not
        positive (no meaningful ratio exists).
    """
    if baseline <= 0:
        return ""
    ratio = ms / baseline
    color = _GREEN if ratio < 1.05 else _YELLOW
    return f"{color}{ratio:.2f}x{_RESET}"
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def print_results(
    fixture_name: str,
    html_size: int,
    zd: dict,
    lxml: dict | None,
    js: dict | None,
) -> None:
    """Print one table row comparing the implementations on a fixture.

    Args:
        fixture_name: Fixture label shown in the first column.
        html_size: Size of the fixture's HTML in bytes (displayed in KB).
        zd: Timing dict for zerodep; its ``mean_ms`` is the ratio baseline.
        lxml: Timing dict for readability-lxml, or ``None`` to print "n/a".
        js: Timing dict for Mozilla Readability.js, or ``None`` to print "n/a".
    """
    baseline = zd["mean_ms"]

    def _cell(label: str, stats: dict | None) -> str:
        # One implementation column: mean time plus ratio, or a dimmed n/a.
        if stats is None:
            return f"{_DIM}{label} {'n/a':>10s}{_RESET}"
        mean = stats["mean_ms"]
        return (
            f"{_CYAN}{label}{_RESET} {_fmt_ms(mean):>10s}"
            f" {_ratio_str(mean, baseline)}"
        )

    row = [
        f" {_BOLD}{fixture_name:<28s}{_RESET}",
        f"{_DIM}{html_size / 1024:>7.1f} KB{_RESET}",
        _cell("zerodep", zd),
        _cell("lxml", lxml),
        _cell("mozilla", js),
    ]
    print(" ".join(row))
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
# ── Main ─────────────────────────────────────────────────────────────────────
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def main() -> None:
    """Parse the command line, run all benchmarks and print the table.

    Positional arguments select fixtures (default: all discovered ones);
    ``--rounds`` sets the number of timed rounds per fixture.

    Exits with status 1 when no fixtures are found or an unknown fixture is
    requested, and with an argparse usage error (status 2) when ``--rounds``
    is below 1.
    """
    parser = argparse.ArgumentParser(
        description="Three-way readability benchmark comparison."
    )
    parser.add_argument(
        "fixtures",
        nargs="*",
        help="Fixture names to benchmark (default: all).",
    )
    parser.add_argument(
        "--rounds",
        type=int,
        default=10,
        help="Number of timing rounds per fixture (default: 10).",
    )
    args = parser.parse_args()

    rounds = args.rounds
    if rounds < 1:
        # Zero rounds would leave the timing lists empty and crash with a
        # traceback inside the bench_* functions; report it as a usage error.
        parser.error(f"--rounds must be >= 1 (got {rounds})")

    all_fixtures = discover_fixtures()
    if not all_fixtures:
        print("No test fixtures found in test-pages/", file=sys.stderr)
        sys.exit(1)

    fixtures = args.fixtures if args.fixtures else all_fixtures
    # Validate fixture names.
    for name in fixtures:
        if name not in all_fixtures:
            print(f"Unknown fixture: {name}", file=sys.stderr)
            print(f"Available: {', '.join(all_fixtures)}", file=sys.stderr)
            sys.exit(1)

    def _yes_no(available: bool) -> str:
        # Coloured availability marker for the header line.
        return _GREEN + "yes" + _RESET if available else _DIM + "no" + _RESET

    # Header.
    print()
    print(f"{_BOLD}Readability Benchmark ({rounds} rounds per fixture){_RESET}")
    # bench_mozilla_js() needs both the node executable and the JS script,
    # so report "yes" only when both are present.
    has_js = shutil.which("node") is not None and os.path.isfile(_BENCH_JS)
    has_lxml = _RefDocument is not None
    status = [
        f"zerodep: {_GREEN}yes{_RESET}",
        f"readability-lxml: {_yes_no(has_lxml)}",
        f"mozilla js: {_yes_no(has_js)}",
    ]
    print(f" Implementations: {' | '.join(status)}")
    print(f" {_DIM}Times shown are mean. Ratios relative to zerodep.{_RESET}")
    print()

    # Column headers.
    print(
        f" {'Fixture':<28s} {'Size':>9s} "
        f"{'zerodep':>19s} {'readability-lxml':>19s} "
        f"{'mozilla js':>19s}"
    )
    print(" " + "─" * 110)

    for name in fixtures:
        html = load_source(name)
        html_size = len(html.encode("utf-8"))

        # Benchmark all three.
        zd = bench_zerodep(html, rounds)
        lxml = bench_readability_lxml(html, rounds)
        js = bench_mozilla_js(name, rounds)

        print_results(name, html_size, zd, lxml, js)

    print()
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
# Run the benchmark only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()
|