justhtml 0.6.0__py3-none-any.whl → 0.33.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
justhtml/__init__.py CHANGED
@@ -1,14 +1,42 @@
1
1
  from .parser import JustHTML, StrictModeError
2
+ from .sanitize import (
3
+ CSS_PRESET_TEXT,
4
+ DEFAULT_DOCUMENT_POLICY,
5
+ DEFAULT_POLICY,
6
+ SanitizationPolicy,
7
+ UnsafeHtmlError,
8
+ UrlPolicy,
9
+ UrlProxy,
10
+ UrlRule,
11
+ )
2
12
  from .selector import SelectorError, matches, query
3
13
  from .serialize import to_html, to_test_format
4
14
  from .stream import stream
5
15
  from .tokens import ParseError
16
+ from .transforms import CollapseWhitespace, Drop, Edit, Empty, Linkify, PruneEmpty, Sanitize, SetAttrs, Unwrap
6
17
 
7
18
  __all__ = [
19
+ "CSS_PRESET_TEXT",
20
+ "DEFAULT_DOCUMENT_POLICY",
21
+ "DEFAULT_POLICY",
22
+ "CollapseWhitespace",
23
+ "Drop",
24
+ "Edit",
25
+ "Empty",
8
26
  "JustHTML",
27
+ "Linkify",
9
28
  "ParseError",
29
+ "PruneEmpty",
30
+ "SanitizationPolicy",
31
+ "Sanitize",
10
32
  "SelectorError",
33
+ "SetAttrs",
11
34
  "StrictModeError",
35
+ "UnsafeHtmlError",
36
+ "Unwrap",
37
+ "UrlPolicy",
38
+ "UrlProxy",
39
+ "UrlRule",
12
40
  "matches",
13
41
  "query",
14
42
  "stream",
justhtml/__main__.py CHANGED
@@ -1,28 +1,176 @@
1
1
  #!/usr/bin/env python3
2
2
  """Command-line interface for JustHTML."""
3
3
 
4
- # ruff: noqa: PTH123
4
+ from __future__ import annotations
5
5
 
6
+ import argparse
7
+ import io
6
8
  import sys
9
+ from importlib.metadata import PackageNotFoundError, version
10
+ from pathlib import Path
11
+ from typing import TextIO, cast
7
12
 
8
13
  from . import JustHTML
14
+ from .context import FragmentContext
15
+ from .selector import SelectorError
9
16
 
10
17
 
11
- def main():
12
- if len(sys.argv) < 2:
13
- print("Usage: python -m justhtml <file.html>", file=sys.stderr)
14
- print(" python -m justhtml - (read from stdin)", file=sys.stderr)
15
- sys.exit(1)
18
+ def _get_version() -> str:
19
+ try:
20
+ return version("justhtml")
21
+ except PackageNotFoundError: # pragma: no cover
22
+ return "dev"
16
23
 
17
- path = sys.argv[1]
24
+
25
+ def _parse_args(argv: list[str]) -> argparse.Namespace:
26
+ parser = argparse.ArgumentParser(
27
+ prog="justhtml",
28
+ description="Parse HTML5 and output text, pretty-printed HTML, or Markdown.",
29
+ epilog=(
30
+ "Examples:\n"
31
+ " justhtml page.html\n"
32
+ " curl -s https://example.com | justhtml -\n"
33
+ " justhtml page.html --selector 'main p' --format text\n"
34
+ " justhtml page.html --selector 'a' --format html\n"
35
+ " justhtml page.html --selector 'article' --format markdown\n"
36
+ "\n"
37
+ "If you don't have the 'justhtml' command available, use:\n"
38
+ " python -m justhtml ...\n"
39
+ ),
40
+ formatter_class=argparse.RawDescriptionHelpFormatter,
41
+ )
42
+
43
+ parser.add_argument(
44
+ "path",
45
+ nargs="?",
46
+ help="HTML file to parse, or '-' to read from stdin",
47
+ )
48
+ parser.add_argument("--output", help="File to write output to")
49
+ parser.add_argument(
50
+ "--selector",
51
+ help="CSS selector for choosing nodes (defaults to the document root)",
52
+ )
53
+ parser.add_argument(
54
+ "--format",
55
+ choices=["html", "text", "markdown"],
56
+ default="html",
57
+ help="Output format (default: html)",
58
+ )
59
+
60
+ parser.add_argument(
61
+ "--unsafe",
62
+ action="store_true",
63
+ help="Disable sanitization (trusted input only)",
64
+ )
65
+ parser.add_argument(
66
+ "--first",
67
+ action="store_true",
68
+ help="Only output the first matching node",
69
+ )
70
+
71
+ parser.add_argument(
72
+ "--fragment",
73
+ action="store_true",
74
+ help="Parse input as an HTML fragment (context: <div>)",
75
+ )
76
+
77
+ parser.add_argument(
78
+ "--separator",
79
+ default=" ",
80
+ help="Text-only: join string between text nodes (default: a single space)",
81
+ )
82
+ strip_group = parser.add_mutually_exclusive_group()
83
+ strip_group.add_argument(
84
+ "--strip",
85
+ action="store_true",
86
+ default=True,
87
+ help="Text-only: strip each text node and drop empty segments (default)",
88
+ )
89
+ strip_group.add_argument(
90
+ "--no-strip",
91
+ action="store_false",
92
+ dest="strip",
93
+ help="Text-only: preserve text node whitespace",
94
+ )
95
+
96
+ parser.add_argument(
97
+ "--version",
98
+ action="version",
99
+ version=f"justhtml {_get_version()}",
100
+ )
101
+
102
+ args = parser.parse_args(argv)
103
+
104
+ if not args.path:
105
+ parser.print_help(sys.stderr)
106
+ raise SystemExit(1)
107
+
108
+ return args
109
+
110
+
111
+ def _read_html(path: str) -> str | bytes:
18
112
  if path == "-":
19
- html = sys.stdin.read()
20
- else:
21
- with open(path) as f:
22
- html = f.read()
113
+ stdin = sys.stdin
114
+ if isinstance(stdin, io.TextIOWrapper):
115
+ data: bytes = stdin.buffer.read()
116
+ return data
117
+ return cast("str", stdin.read())
118
+
119
+ return Path(path).read_bytes()
120
+
121
+
122
+ def main() -> None:
123
+ args = _parse_args(sys.argv[1:])
124
+ html = _read_html(args.path)
125
+ fragment_context = FragmentContext("div") if args.fragment else None
126
+ doc = JustHTML(html, fragment_context=fragment_context)
127
+
128
+ try:
129
+ nodes = doc.query(args.selector) if args.selector else [doc.root]
130
+ except SelectorError as e:
131
+ print(str(e), file=sys.stderr)
132
+ raise SystemExit(2) from e
133
+
134
+ if not nodes:
135
+ raise SystemExit(1)
136
+
137
+ if args.first:
138
+ nodes = [nodes[0]]
139
+
140
+ def write_output(out: TextIO) -> None:
141
+ safe = not args.unsafe
142
+ if args.format == "html":
143
+ outputs = [node.to_html(safe=safe) for node in nodes]
144
+ out.write("\n".join(outputs))
145
+ out.write("\n")
146
+ return
147
+
148
+ if args.format == "text":
149
+ # Keep these branches explicit so coverage will highlight untested CLI options.
150
+ if args.separator == " ":
151
+ if args.strip:
152
+ outputs = [node.to_text(strip=True, safe=safe) for node in nodes]
153
+ else:
154
+ outputs = [node.to_text(strip=False, safe=safe) for node in nodes]
155
+ else:
156
+ if args.strip:
157
+ outputs = [node.to_text(separator=args.separator, strip=True, safe=safe) for node in nodes]
158
+ else:
159
+ outputs = [node.to_text(separator=args.separator, strip=False, safe=safe) for node in nodes]
160
+ out.write("\n".join(outputs))
161
+ out.write("\n")
162
+ return
163
+
164
+ outputs = [node.to_markdown(safe=safe) for node in nodes]
165
+ out.write("\n\n".join(outputs))
166
+ out.write("\n")
167
+
168
+ if args.output:
169
+ with Path(args.output).open(mode="w", encoding="utf-8") as outfile:
170
+ write_output(outfile)
171
+ return
23
172
 
24
- doc = JustHTML(html)
25
- print(doc.root.to_html())
173
+ write_output(sys.stdout)
26
174
 
27
175
 
28
176
  if __name__ == "__main__":
justhtml/constants.py CHANGED
@@ -1,5 +1,9 @@
1
1
  """HTML5 spec constants for tree building."""
2
2
 
3
+ from __future__ import annotations
4
+
5
+ from typing import Final
6
+
3
7
  # HTML5 spec: Foreign attribute adjustments for SVG/MathML
4
8
  # Maps lowercase attribute names to (prefix, local_name, namespace_url)
5
9
  FOREIGN_ATTRIBUTE_ADJUSTMENTS = {
@@ -180,6 +184,18 @@ HTML4_PUBLIC_PREFIXES = (
180
184
 
181
185
  HEADING_ELEMENTS = {"h1", "h2", "h3", "h4", "h5", "h6"}
182
186
 
187
+ # Elements where pretty-printing and whitespace-collapsing transforms should
188
+ # preserve text node whitespace.
189
+ WHITESPACE_PRESERVING_ELEMENTS: Final[frozenset[str]] = frozenset(
190
+ {
191
+ "code",
192
+ "pre",
193
+ "script",
194
+ "style",
195
+ "textarea",
196
+ }
197
+ )
198
+
183
199
  FORMATTING_ELEMENTS = {
184
200
  "a",
185
201
  "b",
@@ -284,7 +300,7 @@ SPECIAL_ELEMENTS = {
284
300
  "wbr",
285
301
  }
286
302
 
287
- FORMAT_MARKER = object()
303
+ FORMAT_MARKER: Final[object] = object()
288
304
 
289
305
  DEFAULT_SCOPE_TERMINATORS = {
290
306
  "applet",
justhtml/context.py CHANGED
@@ -1,6 +1,12 @@
1
+ from __future__ import annotations
2
+
3
+
1
4
  class FragmentContext:
2
5
  __slots__ = ("namespace", "tag_name")
3
6
 
4
- def __init__(self, tag_name, namespace=None):
7
+ tag_name: str
8
+ namespace: str | None
9
+
10
+ def __init__(self, tag_name: str, namespace: str | None = None) -> None:
5
11
  self.tag_name = tag_name
6
12
  self.namespace = namespace
justhtml/encoding.py ADDED
@@ -0,0 +1,405 @@
1
+ """HTML encoding sniffing and decoding.
2
+
3
+ Implements the HTML encoding sniffing behavior needed for the html5lib-tests
4
+ encoding fixtures.
5
+
6
+ Inputs are bytes and an optional transport-supplied encoding label.
7
+ Outputs are a decoded Unicode string and the chosen encoding name.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ _ASCII_WHITESPACE: set[int] = {0x09, 0x0A, 0x0C, 0x0D, 0x20}
13
+
14
+
15
+ def _ascii_lower(b: int) -> int:
16
+ # b is an int 0..255
17
+ if 0x41 <= b <= 0x5A:
18
+ return b | 0x20
19
+ return b
20
+
21
+
22
+ def _is_ascii_alpha(b: int) -> bool:
23
+ b = _ascii_lower(b)
24
+ return 0x61 <= b <= 0x7A
25
+
26
+
27
+ def _skip_ascii_whitespace(data: bytes, i: int) -> int:
28
+ n = len(data)
29
+ while i < n and data[i] in _ASCII_WHITESPACE:
30
+ i += 1
31
+ return i
32
+
33
+
34
+ def _strip_ascii_whitespace(value: bytes | None) -> bytes | None:
35
+ if value is None:
36
+ return None
37
+ start = 0
38
+ end = len(value)
39
+ while start < end and value[start] in _ASCII_WHITESPACE:
40
+ start += 1
41
+ while end > start and value[end - 1] in _ASCII_WHITESPACE:
42
+ end -= 1
43
+ return value[start:end]
44
+
45
+
46
+ def normalize_encoding_label(label: str | bytes | None) -> str | None:
47
+ if not label:
48
+ return None
49
+
50
+ if isinstance(label, bytes):
51
+ label = label.decode("ascii", "ignore")
52
+
53
+ s = str(label).strip()
54
+ if not s:
55
+ return None
56
+
57
+ s = s.lower()
58
+
59
+ # Security: never allow utf-7.
60
+ if s in {"utf-7", "utf7", "x-utf-7"}:
61
+ return "windows-1252"
62
+
63
+ if s in {"utf-8", "utf8"}:
64
+ return "utf-8"
65
+
66
+ # HTML treats latin-1 labels as windows-1252.
67
+ if s in {
68
+ "iso-8859-1",
69
+ "iso8859-1",
70
+ "latin1",
71
+ "latin-1",
72
+ "l1",
73
+ "cp819",
74
+ "ibm819",
75
+ }:
76
+ return "windows-1252"
77
+
78
+ if s in {"windows-1252", "windows1252", "cp1252", "x-cp1252"}:
79
+ return "windows-1252"
80
+
81
+ if s in {"iso-8859-2", "iso8859-2", "latin2", "latin-2"}:
82
+ return "iso-8859-2"
83
+
84
+ if s in {"euc-jp", "eucjp"}:
85
+ return "euc-jp"
86
+
87
+ if s in {"utf-16", "utf16"}:
88
+ return "utf-16"
89
+ if s in {"utf-16le", "utf16le"}:
90
+ return "utf-16le"
91
+ if s in {"utf-16be", "utf16be"}:
92
+ return "utf-16be"
93
+
94
+ return None
95
+
96
+
97
+ def _normalize_meta_declared_encoding(label: bytes | None) -> str | None:
98
+ enc = normalize_encoding_label(label)
99
+ if enc is None:
100
+ return None
101
+
102
+ # Per HTML meta charset handling: ignore UTF-16/UTF-32 declarations and
103
+ # treat them as UTF-8.
104
+ if enc in {"utf-16", "utf-16le", "utf-16be", "utf-32", "utf-32le", "utf-32be"}:
105
+ return "utf-8"
106
+
107
+ return enc
108
+
109
+
110
+ def _sniff_bom(data: bytes) -> tuple[str | None, int]:
111
+ if len(data) >= 3 and data[0:3] == b"\xef\xbb\xbf":
112
+ return "utf-8", 3
113
+ if len(data) >= 2 and data[0:2] == b"\xff\xfe":
114
+ return "utf-16le", 2
115
+ if len(data) >= 2 and data[0:2] == b"\xfe\xff":
116
+ return "utf-16be", 2
117
+ return None, 0
118
+
119
+
120
+ def _extract_charset_from_content(content_bytes: bytes) -> bytes | None:
121
+ if not content_bytes:
122
+ return None
123
+
124
+ # Normalize whitespace to spaces for robust matching.
125
+ b = bytearray()
126
+ for ch in content_bytes:
127
+ if ch in _ASCII_WHITESPACE:
128
+ b.append(0x20)
129
+ else:
130
+ b.append(_ascii_lower(ch))
131
+ s = bytes(b)
132
+
133
+ idx = s.find(b"charset")
134
+ if idx == -1:
135
+ return None
136
+
137
+ i = idx + len(b"charset")
138
+ n = len(s)
139
+ while i < n and s[i] in _ASCII_WHITESPACE:
140
+ i += 1
141
+ if i >= n or s[i] != 0x3D: # '='
142
+ return None
143
+ i += 1
144
+ while i < n and s[i] in _ASCII_WHITESPACE:
145
+ i += 1
146
+ if i >= n:
147
+ return None
148
+
149
+ quote: int | None = None
150
+ if s[i] in (0x22, 0x27): # '"' or "'"
151
+ quote = s[i]
152
+ i += 1
153
+
154
+ start = i
155
+ while i < n:
156
+ ch = s[i]
157
+ if quote is not None:
158
+ if ch == quote:
159
+ break
160
+ else:
161
+ if ch in _ASCII_WHITESPACE or ch == 0x3B: # ';'
162
+ break
163
+ i += 1
164
+
165
+ if quote is not None and (i >= n or s[i] != quote):
166
+ return None
167
+
168
+ return s[start:i]
169
+
170
+
171
+ def _prescan_for_meta_charset(data: bytes) -> str | None:
172
+ # Scan up to 1024 bytes worth of non-comment input, but allow skipping
173
+ # arbitrarily large comments (bounded by a hard cap).
174
+ max_non_comment = 1024
175
+ max_total_scan = 65536
176
+
177
+ n = len(data)
178
+ i = 0
179
+ non_comment = 0
180
+
181
+ while i < n and i < max_total_scan and non_comment < max_non_comment:
182
+ if data[i] != 0x3C: # '<'
183
+ i += 1
184
+ non_comment += 1
185
+ continue
186
+
187
+ # Comment
188
+ if i + 3 < n and data[i + 1 : i + 4] == b"!--":
189
+ end = data.find(b"-->", i + 4)
190
+ if end == -1:
191
+ return None
192
+ i = end + 3
193
+ continue
194
+
195
+ # Tag open
196
+ j = i + 1
197
+ if j < n and data[j] == 0x2F: # '/'
198
+ # Skip end tag.
199
+ k = i
200
+ quote: int | None = None
201
+ while k < n and k < max_total_scan and non_comment < max_non_comment:
202
+ ch = data[k]
203
+ if quote is None:
204
+ if ch in (0x22, 0x27):
205
+ quote = ch
206
+ elif ch == 0x3E: # '>'
207
+ k += 1
208
+ non_comment += 1
209
+ break
210
+ else:
211
+ if ch == quote:
212
+ quote = None
213
+ k += 1
214
+ non_comment += 1
215
+ i = k
216
+ continue
217
+
218
+ if j >= n or not _is_ascii_alpha(data[j]):
219
+ i += 1
220
+ non_comment += 1
221
+ continue
222
+
223
+ name_start = j
224
+ while j < n and _is_ascii_alpha(data[j]):
225
+ j += 1
226
+
227
+ tag_name = data[name_start:j]
228
+ if tag_name.lower() != b"meta":
229
+ # Skip the rest of this tag so we don't accidentally interpret '<'
230
+ # inside an attribute value as a new tag.
231
+ k = i
232
+ quote = None
233
+ while k < n and k < max_total_scan and non_comment < max_non_comment:
234
+ ch = data[k]
235
+ if quote is None:
236
+ if ch in (0x22, 0x27):
237
+ quote = ch
238
+ elif ch == 0x3E: # '>'
239
+ k += 1
240
+ non_comment += 1
241
+ break
242
+ else:
243
+ if ch == quote:
244
+ quote = None
245
+ k += 1
246
+ non_comment += 1
247
+ i = k
248
+ continue
249
+
250
+ # Parse attributes until '>'
251
+ charset: bytes | None = None
252
+ http_equiv: bytes | None = None
253
+ content: bytes | None = None
254
+
255
+ k = j
256
+ saw_gt = False
257
+ start_i = i
258
+ while k < n and k < max_total_scan:
259
+ ch = data[k]
260
+ if ch == 0x3E: # '>'
261
+ saw_gt = True
262
+ k += 1
263
+ break
264
+
265
+ if ch == 0x3C: # '<' - restart scanning from here
266
+ break
267
+
268
+ if ch in _ASCII_WHITESPACE or ch == 0x2F: # '/'
269
+ k += 1
270
+ continue
271
+
272
+ # Attribute name
273
+ attr_start = k
274
+ while k < n:
275
+ ch = data[k]
276
+ if ch in _ASCII_WHITESPACE or ch in {0x3D, 0x3E, 0x2F, 0x3C}:
277
+ break
278
+ k += 1
279
+ attr_name = data[attr_start:k].lower()
280
+ k = _skip_ascii_whitespace(data, k)
281
+
282
+ value: bytes | None = None
283
+ if k < n and data[k] == 0x3D: # '='
284
+ k += 1
285
+ k = _skip_ascii_whitespace(data, k)
286
+ if k >= n:
287
+ break
288
+
289
+ quote = None
290
+ if data[k] in (0x22, 0x27):
291
+ quote = data[k]
292
+ k += 1
293
+ val_start = k
294
+ end_quote = data.find(bytes((quote,)), k)
295
+ if end_quote == -1:
296
+ # Unclosed quote: ignore this meta.
297
+ i += 1
298
+ non_comment += 1
299
+ charset = None
300
+ http_equiv = None
301
+ content = None
302
+ saw_gt = False
303
+ break
304
+ value = data[val_start:end_quote]
305
+ k = end_quote + 1
306
+ else:
307
+ val_start = k
308
+ while k < n:
309
+ ch = data[k]
310
+ if ch in _ASCII_WHITESPACE or ch in {0x3E, 0x3C}:
311
+ break
312
+ k += 1
313
+ value = data[val_start:k]
314
+
315
+ if attr_name == b"charset":
316
+ charset = _strip_ascii_whitespace(value)
317
+ elif attr_name == b"http-equiv":
318
+ http_equiv = value
319
+ elif attr_name == b"content":
320
+ content = value
321
+
322
+ if saw_gt:
323
+ if charset:
324
+ enc = _normalize_meta_declared_encoding(charset)
325
+ if enc:
326
+ return enc
327
+
328
+ if http_equiv and http_equiv.lower() == b"content-type" and content:
329
+ extracted = _extract_charset_from_content(content)
330
+ if extracted:
331
+ enc = _normalize_meta_declared_encoding(extracted)
332
+ if enc:
333
+ return enc
334
+
335
+ # Continue scanning after this tag.
336
+ i = k
337
+ consumed = i - start_i
338
+ non_comment += consumed
339
+ else:
340
+ # Continue scanning after this tag attempt
341
+ i += 1
342
+ non_comment += 1
343
+
344
+ return None
345
+
346
+
347
+ def sniff_html_encoding(data: bytes, transport_encoding: str | None = None) -> tuple[str, int]:
348
+ # Transport overrides everything.
349
+ transport = normalize_encoding_label(transport_encoding)
350
+ if transport:
351
+ return transport, 0
352
+
353
+ bom_enc, bom_len = _sniff_bom(data)
354
+ if bom_enc:
355
+ return bom_enc, bom_len
356
+
357
+ meta_enc = _prescan_for_meta_charset(data)
358
+ if meta_enc:
359
+ return meta_enc, 0
360
+
361
+ return "windows-1252", 0
362
+
363
+
364
+ def decode_html(data: bytes, transport_encoding: str | None = None) -> tuple[str, str]:
365
+ """Decode an HTML byte stream using HTML encoding sniffing.
366
+
367
+ Returns (text, encoding_name).
368
+ """
369
+ enc, bom_len = sniff_html_encoding(data, transport_encoding=transport_encoding)
370
+
371
+ # Allowlist supported decoders.
372
+ if enc not in {
373
+ "utf-8",
374
+ "windows-1252",
375
+ "iso-8859-2",
376
+ "euc-jp",
377
+ "utf-16",
378
+ "utf-16le",
379
+ "utf-16be",
380
+ }: # pragma: no cover
381
+ enc = "windows-1252"
382
+ bom_len = 0
383
+
384
+ payload = data[bom_len:] if bom_len else data
385
+
386
+ if enc == "windows-1252":
387
+ return payload.decode("cp1252"), "windows-1252"
388
+
389
+ if enc == "iso-8859-2":
390
+ return payload.decode("iso-8859-2", "replace"), "iso-8859-2"
391
+
392
+ if enc == "euc-jp":
393
+ return payload.decode("euc_jp", "replace"), "euc-jp"
394
+
395
+ if enc == "utf-16le":
396
+ return payload.decode("utf-16le", "replace"), "utf-16le"
397
+
398
+ if enc == "utf-16be":
399
+ return payload.decode("utf-16be", "replace"), "utf-16be"
400
+
401
+ if enc == "utf-16":
402
+ return payload.decode("utf-16", "replace"), "utf-16"
403
+
404
+ # Default utf-8
405
+ return payload.decode("utf-8", "replace"), "utf-8"