veilrender 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
veilrender/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """VeilRender — headless browser rendering API."""
2
+
3
+ __version__ = "0.1.0"
veilrender/__main__.py ADDED
@@ -0,0 +1,6 @@
1
+ """Entry point for ``python -m veilrender``."""
2
+
3
+ from veilrender.app import main
4
+
5
+ if __name__ == "__main__":
6
+ main()
@@ -0,0 +1,2 @@
1
+ # Vendored zerodep modules — do not edit manually.
2
+ # Update via: make vendor
@@ -0,0 +1,323 @@
1
+ #!/usr/bin/env python3
2
+ """Three-way readability benchmark: zerodep vs readability-lxml vs Mozilla JS.
3
+
4
+ Runs each implementation on Mozilla's test fixtures and prints a comparison
5
+ table. JS timing is measured internally by bench_mozilla.js (no subprocess
6
+ overhead in the numbers).
7
+
8
+ Usage:
9
+ python benchmark_compare.py # all fixtures
10
+ python benchmark_compare.py 001 bbc-1 # specific fixtures
11
+ python benchmark_compare.py --rounds 20 # more rounds
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import argparse
17
+ import json
18
+ import os
19
+ import shutil
20
+ import subprocess
21
+ import sys
22
+ import timeit
23
+
24
+ # ── Setup paths ──────────────────────────────────────────────────────────────
25
+
26
+ _THIS_DIR = os.path.dirname(os.path.abspath(__file__))
27
+ _TEST_PAGES_DIR = os.path.join(_THIS_DIR, "test-pages")
28
+ _BENCH_JS = os.path.join(_THIS_DIR, "bench_mozilla.js")
29
+
30
+ sys.path.insert(0, _THIS_DIR)
31
+
32
+
33
+ # ── Discover fixtures ────────────────────────────────────────────────────────
34
+
35
+
36
def discover_fixtures() -> list[str]:
    """List the fixtures available under the test-pages directory.

    A fixture is any sub-directory of ``_TEST_PAGES_DIR`` that contains a
    ``source.html`` file.

    Returns:
        Fixture directory names in sorted order, or an empty list when the
        test-pages directory does not exist.
    """
    if not os.path.isdir(_TEST_PAGES_DIR):
        return []

    names = []
    for entry in os.listdir(_TEST_PAGES_DIR):
        fixture_dir = os.path.join(_TEST_PAGES_DIR, entry)
        if not os.path.isdir(fixture_dir):
            continue
        if os.path.isfile(os.path.join(fixture_dir, "source.html")):
            names.append(entry)
    names.sort()
    return names
46
+
47
+
48
def load_source(name: str) -> str:
    """Read a fixture's ``source.html`` and return its text.

    Args:
        name: Fixture directory name inside ``_TEST_PAGES_DIR``.

    Returns:
        The file contents decoded as UTF-8.
    """
    source_path = os.path.join(_TEST_PAGES_DIR, name, "source.html")
    with open(source_path, encoding="utf-8") as handle:
        text = handle.read()
    return text
53
+
54
+
55
+ # ── Python: zerodep readability ──────────────────────────────────────────────
56
+
57
+
58
def bench_zerodep(html: str, rounds: int) -> dict:
    """Time the local ``readability.extract()`` on one document.

    Args:
        html: Source HTML to extract from.
        rounds: Number of timed calls to make.

    Returns:
        A dict with the per-round timings (``times_ms``), their ``min_ms``,
        ``mean_ms`` and ``max_ms`` in milliseconds, and the ``title`` and
        ``length`` attributes of the warm-up extraction result.
    """
    from readability import extract

    # One untimed call first; its result also supplies the reported metadata.
    warmup = extract(html)

    clock = timeit.default_timer
    samples = []
    for _ in range(rounds):
        started = clock()
        extract(html)
        finished = clock()
        samples.append((finished - started) * 1000)  # seconds -> ms

    return {
        "times_ms": samples,
        "min_ms": min(samples),
        "mean_ms": sum(samples) / len(samples),
        "max_ms": max(samples),
        "title": warmup.title,
        "length": warmup.length,
    }
80
+
81
+
82
+ # ── Python: readability-lxml ────────────────────────────────────────────────
83
+
84
+
85
+ def _load_readability_lxml():
86
+ """Load readability-lxml's Document class, working around name clash."""
87
+ import importlib
88
+ import importlib.metadata
89
+
90
+ try:
91
+ importlib.metadata.version("readability-lxml")
92
+ except importlib.metadata.PackageNotFoundError:
93
+ return None
94
+
95
+ saved_path = sys.path[:]
96
+ saved_modules = {
97
+ k: sys.modules.pop(k)
98
+ for k in list(sys.modules)
99
+ if k == "readability" or k.startswith("readability.")
100
+ }
101
+ try:
102
+ sys.path = [
103
+ p for p in sys.path if os.path.abspath(p) != os.path.abspath(_THIS_DIR)
104
+ ]
105
+ mod = importlib.import_module("readability")
106
+ return mod.Document
107
+ finally:
108
+ sys.path = saved_path
109
+ for k in list(sys.modules):
110
+ if k == "readability" or k.startswith("readability."):
111
+ del sys.modules[k]
112
+ sys.modules.update(saved_modules)
113
+
114
+
115
+ _RefDocument = _load_readability_lxml()
116
+
117
+
118
def bench_readability_lxml(html: str, rounds: int) -> dict | None:
    """Time readability-lxml (``Document(html).summary()``) on one document.

    Args:
        html: Source HTML to process.
        rounds: Number of timed construct-and-summarise cycles.

    Returns:
        ``None`` when ``_RefDocument`` is unavailable. Otherwise a dict with
        the per-round timings (``times_ms``), their ``min_ms``, ``mean_ms``
        and ``max_ms`` in milliseconds, a ``title`` and the ``length`` of the
        warm-up summary.
    """
    if _RefDocument is None:
        return None

    # Untimed warm-up; the summary produced here is the one that is measured
    # for the reported length.
    doc = _RefDocument(html)
    summary = doc.summary()

    clock = timeit.default_timer
    samples = []
    for _ in range(rounds):
        started = clock()
        doc = _RefDocument(html)
        doc.summary()
        finished = clock()
        samples.append((finished - started) * 1000)

    # Title comes from the most recently built document's short_title(),
    # when that method exists.
    if hasattr(doc, "short_title"):
        title = doc.short_title()
    else:
        title = ""
    length = len(summary) if summary else 0

    return {
        "times_ms": samples,
        "min_ms": min(samples),
        "mean_ms": sum(samples) / len(samples),
        "max_ms": max(samples),
        "title": title,
        "length": length,
    }
147
+
148
+
149
+ # ── JavaScript: Mozilla Readability.js ───────────────────────────────────────
150
+
151
+
152
def bench_mozilla_js(fixture_name: str, rounds: int) -> dict | None:
    """Benchmark Mozilla Readability.js by running ``bench_mozilla.js`` in Node.

    The script is invoked as ``node bench_mozilla.js <source.html> <rounds>``
    with ``_THIS_DIR`` as working directory, and its stdout is parsed as JSON.
    Timing is expected to be measured inside the script itself, so subprocess
    start-up cost is not part of the reported numbers.

    Args:
        fixture_name: Fixture directory name inside ``_TEST_PAGES_DIR``.
        rounds: Number of rounds, passed through to the script.

    Returns:
        The parsed JSON object, or ``None`` when Node or the script is
        missing, the process fails or times out, or the output is not a JSON
        object carrying a numeric ``mean_ms``. Every failure after launch is
        reported on stderr.
    """
    node = shutil.which("node")
    if not node or not os.path.isfile(_BENCH_JS):
        return None

    source_path = os.path.join(_TEST_PAGES_DIR, fixture_name, "source.html")
    try:
        result = subprocess.run(
            [node, _BENCH_JS, source_path, str(rounds)],
            capture_output=True,
            text=True,
            # Decode explicitly so non-ASCII output does not depend on the
            # locale encoding.
            encoding="utf-8",
            errors="replace",
            timeout=120,
            cwd=_THIS_DIR,
        )
    except subprocess.TimeoutExpired:
        print(" [JS error: timed out after 120 s]", file=sys.stderr)
        return None
    except OSError as exc:
        print(f" [JS error: {exc}]", file=sys.stderr)
        return None

    if result.returncode != 0:
        print(f" [JS error: {result.stderr.strip()[:100]}]", file=sys.stderr)
        return None

    try:
        payload = json.loads(result.stdout)
    except json.JSONDecodeError:
        print(" [JS error: output is not valid JSON]", file=sys.stderr)
        return None

    # The caller formats payload["mean_ms"] as a number; reject anything else
    # here instead of failing later while printing the row.
    mean = payload.get("mean_ms") if isinstance(payload, dict) else None
    if isinstance(mean, bool) or not isinstance(mean, (int, float)):
        print(" [JS error: output has no numeric 'mean_ms']", file=sys.stderr)
        return None
    return payload
178
+
179
+
180
+ # ── Output formatting ────────────────────────────────────────────────────────
181
+
182
+ # ANSI colors (disabled if not a terminal).
183
+ if sys.stdout.isatty():
184
+ _BOLD = "\033[1m"
185
+ _GREEN = "\033[32m"
186
+ _YELLOW = "\033[33m"
187
+ _CYAN = "\033[36m"
188
+ _RESET = "\033[0m"
189
+ _DIM = "\033[2m"
190
+ else:
191
+ _BOLD = _GREEN = _YELLOW = _CYAN = _RESET = _DIM = ""
192
+
193
+
194
+ def _fmt_ms(ms: float) -> str:
195
+ """Format milliseconds with appropriate unit."""
196
+ if ms < 1:
197
+ return f"{ms * 1000:.0f} µs"
198
+ if ms < 1000:
199
+ return f"{ms:.1f} ms"
200
+ return f"{ms / 1000:.2f} s"
201
+
202
+
203
def _ratio_str(ms: float, baseline: float) -> str:
    """Format ``ms`` as a multiple of ``baseline``, with a colour hint.

    The real ratio is always shown, so an implementation that is faster than
    the baseline (ratio below 1) is visible as such instead of being printed
    as ``1.00x``.

    Args:
        ms: Measured time.
        baseline: Time to compare against.

    Returns:
        ``"<ratio>x"`` with two decimals, wrapped in ``_GREEN`` when the ratio
        is below 1.05 and in ``_YELLOW`` otherwise. An empty string when
        ``baseline`` is zero or negative.
    """
    if baseline <= 0:
        return ""
    ratio = ms / baseline
    # Below the 5 % threshold counts as "not slower than the baseline".
    colour = _GREEN if ratio < 1.05 else _YELLOW
    return f"{colour}{ratio:.2f}x{_RESET}"
211
+
212
+
213
def print_results(
    fixture_name: str,
    html_size: int,
    zd: dict,
    lxml: dict | None,
    js: dict | None,
) -> None:
    """Print one table row with the results for a single fixture.

    Args:
        fixture_name: Name shown in the first column.
        html_size: Source size in bytes; displayed in KB.
        zd: Timing dict for the zerodep implementation. Its ``mean_ms`` is the
            baseline for every ratio in the row.
        lxml: Timing dict for readability-lxml, or ``None`` to print "n/a".
        js: Timing dict for Mozilla Readability.js, or ``None`` to print "n/a".
    """
    baseline = zd["mean_ms"]

    def timed_cell(label: str, stats: dict) -> str:
        # "<label> <mean> <ratio>" for an implementation that produced data.
        mean = stats["mean_ms"]
        return (
            f"{_CYAN}{label}{_RESET} {_fmt_ms(mean):>10s}"
            f" {_ratio_str(mean, baseline)}"
        )

    def missing_cell(label: str) -> str:
        # Dimmed placeholder for an implementation that did not run.
        return f"{_DIM}{label} {'n/a':>10s}{_RESET}"

    row = [
        f" {_BOLD}{fixture_name:<28s}{_RESET}",
        f"{_DIM}{html_size / 1024:>7.1f} KB{_RESET}",
        timed_cell("zerodep", zd),
    ]
    for label, stats in (("lxml", lxml), ("mozilla", js)):
        if stats is None:
            row.append(missing_cell(label))
        else:
            row.append(timed_cell(label, stats))

    print(" ".join(row))
247
+
248
+
249
+ # ── Main ─────────────────────────────────────────────────────────────────────
250
+
251
+
252
def main() -> None:
    """Parse the command line, run every benchmark and print the table.

    Exits with status 1 when no fixtures are found or an unknown fixture name
    is given, and with argparse's usage error when ``--rounds`` is below 1.
    """
    parser = argparse.ArgumentParser(
        description="Three-way readability benchmark comparison."
    )
    parser.add_argument(
        "fixtures",
        nargs="*",
        help="Fixture names to benchmark (default: all).",
    )
    parser.add_argument(
        "--rounds",
        type=int,
        default=10,
        help="Number of timing rounds per fixture (default: 10).",
    )
    args = parser.parse_args()

    rounds = args.rounds
    if rounds < 1:
        # With no timed rounds the min/mean/max statistics cannot be computed.
        parser.error("--rounds must be at least 1")

    all_fixtures = discover_fixtures()
    if not all_fixtures:
        print("No test fixtures found in test-pages/", file=sys.stderr)
        sys.exit(1)

    fixtures = args.fixtures if args.fixtures else all_fixtures
    # Validate fixture names.
    known = set(all_fixtures)
    for name in fixtures:
        if name not in known:
            print(f"Unknown fixture: {name}", file=sys.stderr)
            print(f"Available: {', '.join(all_fixtures)}", file=sys.stderr)
            sys.exit(1)

    def availability(flag: bool) -> str:
        return _GREEN + "yes" + _RESET if flag else _DIM + "no" + _RESET

    # The JS benchmark needs both the node binary and the harness script;
    # report it as available only when both are present.
    has_js = shutil.which("node") is not None and os.path.isfile(_BENCH_JS)
    has_lxml = _RefDocument is not None

    # Header.
    print()
    print(f"{_BOLD}Readability Benchmark ({rounds} rounds per fixture){_RESET}")
    status = [
        f"zerodep: {_GREEN}yes{_RESET}",
        f"readability-lxml: {availability(has_lxml)}",
        f"mozilla js: {availability(has_js)}",
    ]
    print(f" Implementations: {' | '.join(status)}")
    print(f" {_DIM}Times shown are mean. Ratios relative to zerodep.{_RESET}")
    print()

    # Column headers.
    print(
        f" {'Fixture':<28s} {'Size':>9s} "
        f"{'zerodep':>19s} {'readability-lxml':>19s} "
        f"{'mozilla js':>19s}"
    )
    print(" " + "─" * 110)

    for name in fixtures:
        html = load_source(name)
        html_size = len(html.encode("utf-8"))

        # Benchmark all three.
        zd = bench_zerodep(html, rounds)
        lxml = bench_readability_lxml(html, rounds)
        js = bench_mozilla_js(name, rounds)

        print_results(name, html_size, zd, lxml, js)

    print()
320
+
321
+
322
+ if __name__ == "__main__":
323
+ main()