justhtml 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of justhtml might be problematic. Click here for more details.

justhtml/__init__.py ADDED
@@ -0,0 +1,17 @@
1
+ from .parser import JustHTML, StrictModeError
2
+ from .selector import SelectorError, matches, query
3
+ from .serialize import to_html, to_test_format
4
+ from .stream import stream
5
+ from .tokens import ParseError
6
+
7
# Names re-exported as the package's public API.
__all__ = [
    # Classes and exception types
    "JustHTML",
    "ParseError",
    "SelectorError",
    "StrictModeError",
    # Functions
    "matches",
    "query",
    "stream",
    "to_html",
    "to_test_format",
]
justhtml/__main__.py ADDED
@@ -0,0 +1,144 @@
1
+ #!/usr/bin/env python3
2
+ """Command-line interface for JustHTML."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import io
8
+ import sys
9
+ from importlib.metadata import PackageNotFoundError, version
10
+ from pathlib import Path
11
+ from typing import cast
12
+
13
+ from . import JustHTML
14
+ from .selector import SelectorError
15
+
16
+
17
def _get_version() -> str:
    """Return the installed ``justhtml`` distribution version, or ``"dev"``.

    ``"dev"`` is returned when no distribution metadata named ``justhtml``
    can be found.
    """
    try:
        installed = version("justhtml")
    except PackageNotFoundError:  # pragma: no cover
        installed = "dev"
    return installed
22
+
23
+
24
def _parse_args(argv: list[str]) -> argparse.Namespace:
    """Parse the command-line arguments in *argv*.

    Args:
        argv: Argument strings, without the program name.

    Returns:
        The parsed namespace with ``path``, ``selector``, ``format``,
        ``first``, ``separator`` and ``strip`` attributes.

    Raises:
        SystemExit: With status 1 (after printing help to stderr) when no
            path was given; argparse itself may also exit on invalid input
            or ``--version``.
    """
    usage_examples = (
        "Examples:\n"
        " justhtml page.html\n"
        " curl -s https://example.com | justhtml -\n"
        " justhtml page.html --selector 'main p' --format text\n"
        " justhtml page.html --selector 'a' --format html\n"
        " justhtml page.html --selector 'article' --format markdown\n"
        "\n"
        "If you don't have the 'justhtml' command available, use:\n"
        " python -m justhtml ...\n"
    )
    cli = argparse.ArgumentParser(
        prog="justhtml",
        description="Parse HTML5 and output text, pretty-printed HTML, or Markdown.",
        epilog=usage_examples,
        # Keep the hand-formatted examples in the epilog exactly as written.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # Input and node selection.
    cli.add_argument("path", nargs="?", help="HTML file to parse, or '-' to read from stdin")
    cli.add_argument("--selector", help="CSS selector for choosing nodes (defaults to the document root)")
    cli.add_argument(
        "--format",
        choices=["html", "text", "markdown"],
        default="html",
        help="Output format (default: html)",
    )
    cli.add_argument("--first", action="store_true", help="Only output the first matching node")

    # Options that only affect the text output format.
    cli.add_argument(
        "--separator",
        default=" ",
        help="Text-only: join string between text nodes (default: a single space)",
    )
    # --strip / --no-strip share one destination; stripping is on by default.
    whitespace_group = cli.add_mutually_exclusive_group()
    whitespace_group.add_argument(
        "--strip",
        action="store_true",
        default=True,
        help="Text-only: strip each text node and drop empty segments (default)",
    )
    whitespace_group.add_argument(
        "--no-strip",
        action="store_false",
        dest="strip",
        help="Text-only: preserve text node whitespace",
    )

    cli.add_argument("--version", action="version", version=f"justhtml {_get_version()}")

    parsed = cli.parse_args(argv)
    if parsed.path:
        return parsed

    # No input given: show usage on stderr and signal failure.
    cli.print_help(sys.stderr)
    raise SystemExit(1)
95
+
96
+
97
def _read_html(path: str) -> str | bytes:
    """Load the HTML input named by *path*.

    Args:
        path: A filesystem path, or ``"-"`` to read from standard input.

    Returns:
        Raw bytes when reading a file or a ``TextIOWrapper`` stdin (via its
        underlying binary buffer); otherwise whatever ``sys.stdin.read()``
        returns, treated as ``str``.
    """
    if path != "-":
        return Path(path).read_bytes()

    source = sys.stdin
    if not isinstance(source, io.TextIOWrapper):
        # Replaced stdin (e.g. a StringIO) has no binary buffer to read from.
        return cast("str", source.read())

    raw: bytes = source.buffer.read()
    return raw
106
+
107
+
108
def main() -> None:
    """Run the command-line interface.

    Parses ``sys.argv``, loads and parses the HTML input, selects nodes
    (the document root when no selector is given) and writes them to stdout
    in the requested format, followed by a newline.

    Raises:
        SystemExit: With status 2 when the selector is invalid (the error is
            printed to stderr), or status 1 when the selector matches nothing.
    """
    args = _parse_args(sys.argv[1:])
    html = _read_html(args.path)
    doc = JustHTML(html)

    try:
        nodes = doc.query(args.selector) if args.selector else [doc.root]
    except SelectorError as exc:
        print(str(exc), file=sys.stderr)
        raise SystemExit(2) from exc

    if not nodes:
        raise SystemExit(1)

    if args.first:
        nodes = [nodes[0]]

    # Render every node, then pick the separator for the chosen format:
    # Markdown fragments are separated by a blank line, the others by a newline.
    if args.format == "html":
        rendered = [node.to_html() for node in nodes]
        joiner = "\n"
    elif args.format == "text":
        rendered = [node.to_text(separator=args.separator, strip=args.strip) for node in nodes]
        joiner = "\n"
    else:
        rendered = [node.to_markdown() for node in nodes]
        joiner = "\n\n"

    sys.stdout.write(joiner.join(rendered))
    sys.stdout.write("\n")


if __name__ == "__main__":
    main()
justhtml/constants.py ADDED
@@ -0,0 +1,445 @@
1
+ """HTML5 spec constants for tree building."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Final
6
+
7
# HTML5 spec: foreign attribute adjustments for SVG/MathML.
# Each lowercase attribute name maps to (prefix, local_name, namespace_url).
FOREIGN_ATTRIBUTE_ADJUSTMENTS = {
    # xlink:* attributes
    **{
        f"xlink:{local}": ("xlink", local, "http://www.w3.org/1999/xlink")
        for local in ("actuate", "arcrole", "href", "role", "show", "title", "type")
    },
    # xml:* attributes
    **{
        f"xml:{local}": ("xml", local, "http://www.w3.org/XML/1998/namespace")
        for local in ("lang", "space")
    },
    # xmlns declarations (the bare "xmlns" attribute has no prefix)
    "xmlns": (None, "xmlns", "http://www.w3.org/2000/xmlns/"),
    "xmlns:xlink": ("xmlns", "xlink", "http://www.w3.org/2000/xmlns/"),
}
22
+
23
# MathML attribute case adjustments
MATHML_ATTRIBUTE_ADJUSTMENTS = {"definitionurl": "definitionURL"}

# SVG attribute case adjustments: every key is the all-lowercase spelling of
# the mixed-case attribute name it maps to, so the keys are derived here.
SVG_ATTRIBUTE_ADJUSTMENTS = {
    mixed_case.lower(): mixed_case
    for mixed_case in (
        "attributeName",
        "attributeType",
        "baseFrequency",
        "baseProfile",
        "calcMode",
        "clipPathUnits",
        "diffuseConstant",
        "edgeMode",
        "filterUnits",
        "glyphRef",
        "gradientTransform",
        "gradientUnits",
        "kernelMatrix",
        "kernelUnitLength",
        "keyPoints",
        "keySplines",
        "keyTimes",
        "lengthAdjust",
        "limitingConeAngle",
        "markerHeight",
        "markerUnits",
        "markerWidth",
        "maskContentUnits",
        "maskUnits",
        "numOctaves",
        "pathLength",
        "patternContentUnits",
        "patternTransform",
        "patternUnits",
        "pointsAtX",
        "pointsAtY",
        "pointsAtZ",
        "preserveAlpha",
        "preserveAspectRatio",
        "primitiveUnits",
        "refX",
        "refY",
        "repeatCount",
        "repeatDur",
        "requiredExtensions",
        "requiredFeatures",
        "specularConstant",
        "specularExponent",
        "spreadMethod",
        "startOffset",
        "stdDeviation",
        "stitchTiles",
        "surfaceScale",
        "systemLanguage",
        "tableValues",
        "targetX",
        "targetY",
        "textLength",
        "viewBox",
        "viewTarget",
        "xChannelSelector",
        "yChannelSelector",
        "zoomAndPan",
    )
}
89
+
90
# HTML integration points (SVG/MathML elements that allow HTML content).
# Members are (namespace_url, element_name) pairs.
HTML_INTEGRATION_POINT_ELEMENTS = {
    ("http://www.w3.org/1998/Math/MathML", "annotation-xml"),
    *(
        ("http://www.w3.org/2000/svg", svg_name)
        for svg_name in ("foreignObject", "desc", "title")
    ),
}

# MathML text integration points.
# Members are (namespace_url, element_name) pairs.
MATHML_TEXT_INTEGRATION_POINT_ELEMENTS = {
    ("http://www.w3.org/1998/Math/MathML", mathml_name)
    for mathml_name in ("mi", "mo", "mn", "ms", "mtext")
}
108
+
109
# Lowercase DOCTYPE public-identifier prefixes from the HTML spec's "initial"
# insertion mode list of identifiers that put the document in quirks mode.
# Every entry is unique. The list includes the "+//silmaril//..." prefix from
# the spec, which was previously missing, and drops a duplicated
# "-//w3c//dtd html experimental 970421//" entry.
QUIRKY_PUBLIC_PREFIXES = (
    "+//silmaril//dtd html pro v0r11 19970101//",
    "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
    "-//as//dtd html 3.0 aswedit + extensions//",
    "-//ietf//dtd html 2.0 level 1//",
    "-//ietf//dtd html 2.0 level 2//",
    "-//ietf//dtd html 2.0 strict level 1//",
    "-//ietf//dtd html 2.0 strict level 2//",
    "-//ietf//dtd html 2.0 strict//",
    "-//ietf//dtd html 2.0//",
    "-//ietf//dtd html 2.1e//",
    "-//ietf//dtd html 3.0//",
    "-//ietf//dtd html 3.2 final//",
    "-//ietf//dtd html 3.2//",
    "-//ietf//dtd html 3//",
    "-//ietf//dtd html level 0//",
    "-//ietf//dtd html level 1//",
    "-//ietf//dtd html level 2//",
    "-//ietf//dtd html level 3//",
    "-//ietf//dtd html strict level 0//",
    "-//ietf//dtd html strict level 1//",
    "-//ietf//dtd html strict level 2//",
    "-//ietf//dtd html strict level 3//",
    "-//ietf//dtd html strict//",
    "-//ietf//dtd html//",
    "-//metrius//dtd metrius presentational//",
    "-//microsoft//dtd internet explorer 2.0 html strict//",
    "-//microsoft//dtd internet explorer 2.0 html//",
    "-//microsoft//dtd internet explorer 2.0 tables//",
    "-//microsoft//dtd internet explorer 3.0 html strict//",
    "-//microsoft//dtd internet explorer 3.0 html//",
    "-//microsoft//dtd internet explorer 3.0 tables//",
    "-//netscape comm. corp.//dtd html//",
    "-//netscape comm. corp.//dtd strict html//",
    "-//o'reilly and associates//dtd html 2.0//",
    "-//o'reilly and associates//dtd html extended 1.0//",
    "-//o'reilly and associates//dtd html extended relaxed 1.0//",
    "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
    "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
    "-//spyglass//dtd html 2.0 extended//",
    "-//sq//dtd html 2.0 hotmetal + extensions//",
    "-//sun microsystems corp.//dtd hotjava html//",
    "-//sun microsystems corp.//dtd hotjava strict html//",
    "-//w3c//dtd html 3 1995-03-24//",
    "-//w3c//dtd html 3.2 draft//",
    "-//w3c//dtd html 3.2 final//",
    "-//w3c//dtd html 3.2//",
    "-//w3c//dtd html 3.2s draft//",
    "-//w3c//dtd html 4.0 frameset//",
    "-//w3c//dtd html 4.0 transitional//",
    "-//w3c//dtd html experimental 19960712//",
    "-//w3c//dtd html experimental 970421//",
    "-//w3c//dtd w3 html//",
    "-//w3o//dtd w3 html 3.0//",
    "-//webtechs//dtd mozilla html 2.0//",
    "-//webtechs//dtd mozilla html//",
)
166
+
167
# Complete (not prefix) lowercase public identifiers in the quirky group.
QUIRKY_PUBLIC_MATCHES = ("-//w3o//dtd w3 html strict 3.0//en//", "-/w3c/dtd html 4.0 transitional/en", "html")

# Complete lowercase system identifiers in the quirky group.
QUIRKY_SYSTEM_MATCHES = ("http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd",)

# XHTML 1.0 public-identifier prefixes in the limited-quirky group.
LIMITED_QUIRKY_PUBLIC_PREFIXES = tuple(
    f"-//w3c//dtd xhtml 1.0 {variant}//" for variant in ("frameset", "transitional")
)

# HTML 4.01 frameset/transitional public-identifier prefixes.
HTML4_PUBLIC_PREFIXES = tuple(
    f"-//w3c//dtd html 4.01 {variant}//" for variant in ("frameset", "transitional")
)
184
+
185
# Heading tag names h1 through h6.
HEADING_ELEMENTS = {f"h{level}" for level in range(1, 7)}

# Tag names in the formatting-element group.
FORMATTING_ELEMENTS = {
    "a", "b", "big", "code", "em", "font", "i",
    "nobr", "s", "small", "strike", "strong", "tt", "u",
}  # fmt: skip
203
+
204
# Tag names in the "special" element group, laid out one initial letter per row.
SPECIAL_ELEMENTS = {
    "address", "applet", "area", "article", "aside",
    "base", "basefont", "bgsound", "blockquote", "body", "br", "button",
    "caption", "center", "col", "colgroup",
    "dd", "details", "dialog", "dir", "div", "dl", "dt",
    "embed",
    "fieldset", "figcaption", "figure", "footer", "form", "frame", "frameset",
    "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html",
    "iframe", "img", "input",
    "keygen",
    "li", "link", "listing",
    "main", "marquee", "menu", "menuitem", "meta",
    "nav", "noembed", "noframes", "noscript",
    "object", "ol",
    "p", "param", "plaintext", "pre",
    "script", "search", "section", "select", "source", "style", "summary",
    "table", "tbody", "td", "template", "textarea", "tfoot", "th", "thead", "title", "tr", "track",
    "ul",
    "wbr",
}  # fmt: skip
290
+
291
# Unique sentinel object; compare against it by identity.
FORMAT_MARKER: Final[object] = object()

# Base set of tag names that terminate a scope search.
DEFAULT_SCOPE_TERMINATORS = {
    "applet", "caption", "html", "marquee", "object",
    "table", "td", "template", "th",
}  # fmt: skip

# Scope variants: the default terminators plus scope-specific extras.
BUTTON_SCOPE_TERMINATORS = {*DEFAULT_SCOPE_TERMINATORS, "button"}
LIST_ITEM_SCOPE_TERMINATORS = {*DEFAULT_SCOPE_TERMINATORS, "ol", "ul"}
DEFINITION_SCOPE_TERMINATORS = {*DEFAULT_SCOPE_TERMINATORS, "dl"}

TABLE_FOSTER_TARGETS = {"table", "tbody", "tfoot", "thead", "tr"}
310
+
311
# SVG element-name case adjustments: all-lowercase tag name -> mixed-case name.
# Follows the HTML spec's foreign-content table of SVG element names,
# including "feDropShadow", which was previously missing here.
SVG_TAG_NAME_ADJUSTMENTS = {
    "altglyph": "altGlyph",
    "altglyphdef": "altGlyphDef",
    "altglyphitem": "altGlyphItem",
    "animatecolor": "animateColor",
    "animatemotion": "animateMotion",
    "animatetransform": "animateTransform",
    "clippath": "clipPath",
    "feblend": "feBlend",
    "fecolormatrix": "feColorMatrix",
    "fecomponenttransfer": "feComponentTransfer",
    "fecomposite": "feComposite",
    "feconvolvematrix": "feConvolveMatrix",
    "fediffuselighting": "feDiffuseLighting",
    "fedisplacementmap": "feDisplacementMap",
    "fedistantlight": "feDistantLight",
    "fedropshadow": "feDropShadow",
    "feflood": "feFlood",
    "fefunca": "feFuncA",
    "fefuncb": "feFuncB",
    "fefuncg": "feFuncG",
    "fefuncr": "feFuncR",
    "fegaussianblur": "feGaussianBlur",
    "feimage": "feImage",
    "femerge": "feMerge",
    "femergenode": "feMergeNode",
    "femorphology": "feMorphology",
    "feoffset": "feOffset",
    "fepointlight": "fePointLight",
    "fespecularlighting": "feSpecularLighting",
    "fespotlight": "feSpotLight",
    "fetile": "feTile",
    "feturbulence": "feTurbulence",
    "foreignobject": "foreignObject",
    "glyphref": "glyphRef",
    "lineargradient": "linearGradient",
    "radialgradient": "radialGradient",
    "textpath": "textPath",
}
349
+
350
# Tag names in the foreign-content "breakout" group, one initial letter per row.
FOREIGN_BREAKOUT_ELEMENTS = {
    "b", "big", "blockquote", "body", "br",
    "center", "code",
    "dd", "div", "dl", "dt",
    "em", "embed",
    "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr",
    "i", "img",
    "li", "listing",
    "menu", "meta",
    "nobr",
    "ol",
    "p", "pre",
    "ruby",
    "s", "small", "span", "strong", "strike", "sub", "sup",
    "table", "tt",
    "u", "ul",
    "var",
}  # fmt: skip
396
+
397
# Short prefix for each known namespace URL.
NAMESPACE_URL_TO_PREFIX = {
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1998/Math/MathML": "math",
    "http://www.w3.org/2000/svg": "svg",
}


def _with_namespace_prefixes(elements: set[tuple[str, str]]) -> set[tuple[str, str]]:
    """Return *elements* with each namespace URL replaced by its short prefix.

    A URL without an entry in ``NAMESPACE_URL_TO_PREFIX`` is kept unchanged.
    """
    return {
        (NAMESPACE_URL_TO_PREFIX.get(namespace_url, namespace_url), element_name)
        for namespace_url, element_name in elements
    }


# (prefix, element_name) forms of the URL-keyed integration point sets.
HTML_INTEGRATION_POINT_SET = _with_namespace_prefixes(HTML_INTEGRATION_POINT_ELEMENTS)

MATHML_TEXT_INTEGRATION_POINT_SET = _with_namespace_prefixes(MATHML_TEXT_INTEGRATION_POINT_ELEMENTS)
410
+
411
# Tag names in the allowed-table-children group.
TABLE_ALLOWED_CHILDREN = {
    "caption", "colgroup",
    "tbody", "tfoot", "thead",
    "tr", "td", "th",
    "script", "template", "style",
}  # fmt: skip

# Tag names that terminate a table-scope search.
TABLE_SCOPE_TERMINATORS = {"html", "table", "template"}

# Tag names in the implied-end-tag group.
IMPLIED_END_TAGS = {
    "dd", "dt", "li",
    "option", "optgroup",
    "p",
    "rb", "rp", "rt", "rtc",
}  # fmt: skip
427
+
428
# Immutable set of void element tag names.
VOID_ELEMENTS = frozenset(
    (
        "area", "base", "br", "col", "embed", "hr", "img",
        "input", "link", "meta", "param", "source", "track", "wbr",
    )  # fmt: skip
)
justhtml/context.py ADDED
@@ -0,0 +1,12 @@
1
+ from __future__ import annotations
2
+
3
+
4
class FragmentContext:
    """Tag name and optional namespace describing a fragment's context.

    Instances use ``__slots__``, so they carry only these two attributes.
    """

    __slots__ = ("namespace", "tag_name")

    # Name of the context element's tag.
    tag_name: str
    # Namespace of the context element; None when not specified.
    namespace: str | None

    def __init__(self, tag_name: str, namespace: str | None = None) -> None:
        """Store *tag_name* and *namespace* on the new instance."""
        self.tag_name, self.namespace = tag_name, namespace