justhtml 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
justhtml/__init__.py ADDED
@@ -0,0 +1,17 @@
1
+ from .parser import JustHTML, StrictModeError
2
+ from .selector import SelectorError, matches, query
3
+ from .serialize import to_html, to_test_format
4
+ from .stream import stream
5
+ from .tokens import ParseError
6
+
7
# Public API of the package: every name listed here is imported above and
# re-exported by ``from justhtml import *``.
__all__ = [
    # Classes and exception types.
    "JustHTML", "ParseError", "SelectorError", "StrictModeError",
    # Functions.
    "matches", "query", "stream", "to_html", "to_test_format",
]
justhtml/__main__.py ADDED
@@ -0,0 +1,29 @@
1
+ #!/usr/bin/env python3
2
+ """Command-line interface for JustHTML."""
3
+
4
+ # ruff: noqa: PTH123
5
+
6
+ import sys
7
+
8
+ from . import JustHTML
9
+
10
+
11
def main():
    """Parse one HTML document and print its serialized form to stdout.

    The first command-line argument selects the input: a file path, or ``-``
    to read the document from standard input. When no argument is given, a
    usage message is written to stderr and the process exits with status 1.

    Raises:
        SystemExit: with code 1 when no input argument was supplied.
        OSError: if the named file cannot be opened.
        UnicodeDecodeError: if the named file is not valid UTF-8.
    """
    if len(sys.argv) < 2:
        print("Usage: python -m justhtml <file.html>", file=sys.stderr)
        print(" python -m justhtml - (read from stdin)", file=sys.stderr)
        sys.exit(1)

    path = sys.argv[1]
    if path == "-":
        html = sys.stdin.read()
    else:
        # Decode explicitly as UTF-8: without ``encoding`` the result depends
        # on the platform's locale (e.g. cp1252 on Windows), so the same file
        # could parse differently from one machine to the next.
        with open(path, encoding="utf-8") as f:
            html = f.read()

    doc = JustHTML(html)
    print(doc.root.to_html())


if __name__ == "__main__":
    main()
justhtml/constants.py ADDED
@@ -0,0 +1,441 @@
1
+ """HTML5 spec constants for tree building."""
2
+
3
# HTML5 spec: namespaced attributes on foreign (SVG/MathML) elements.
# Keys are the lowercase attribute names; values are
# (prefix, local_name, namespace_url). Entries are grouped by prefix and the
# resulting key order is: xlink:*, xml:*, xmlns, xmlns:xlink.
FOREIGN_ATTRIBUTE_ADJUSTMENTS = {
    **{
        f"xlink:{local}": ("xlink", local, "http://www.w3.org/1999/xlink")
        for local in ("actuate", "arcrole", "href", "role", "show", "title", "type")
    },
    **{
        f"xml:{local}": ("xml", local, "http://www.w3.org/XML/1998/namespace")
        for local in ("lang", "space")
    },
    # Bare "xmlns" is the only entry without a prefix.
    "xmlns": (None, "xmlns", "http://www.w3.org/2000/xmlns/"),
    "xmlns:xlink": ("xmlns", "xlink", "http://www.w3.org/2000/xmlns/"),
}
18
+
19
# MathML attribute case adjustments: lowercase name -> mixed-case name.
# Only one attribute is listed.
MATHML_ATTRIBUTE_ADJUSTMENTS = {"definitionurl": "definitionURL"}
23
+
24
# SVG attribute case adjustments: lowercase name -> mixed-case name.
# Every key is exactly the lowercased form of its value, so the mapping is
# derived from the list of correctly-cased names (insertion order preserved).
SVG_ATTRIBUTE_ADJUSTMENTS = {
    cased.lower(): cased
    for cased in (
        "attributeName", "attributeType",
        "baseFrequency", "baseProfile",
        "calcMode", "clipPathUnits",
        "diffuseConstant",
        "edgeMode",
        "filterUnits",
        "glyphRef", "gradientTransform", "gradientUnits",
        "kernelMatrix", "kernelUnitLength", "keyPoints", "keySplines", "keyTimes",
        "lengthAdjust", "limitingConeAngle",
        "markerHeight", "markerUnits", "markerWidth", "maskContentUnits", "maskUnits",
        "numOctaves",
        "pathLength", "patternContentUnits", "patternTransform", "patternUnits",
        "pointsAtX", "pointsAtY", "pointsAtZ",
        "preserveAlpha", "preserveAspectRatio", "primitiveUnits",
        "refX", "refY", "repeatCount", "repeatDur",
        "requiredExtensions", "requiredFeatures",
        "specularConstant", "specularExponent", "spreadMethod", "startOffset",
        "stdDeviation", "stitchTiles", "surfaceScale", "systemLanguage",
        "tableValues", "targetX", "targetY", "textLength",
        "viewBox", "viewTarget",
        "xChannelSelector",
        "yChannelSelector",
        "zoomAndPan",
    )
}
85
+
86
# HTML integration points: SVG/MathML elements that allow HTML content.
# Each member is a (namespace_url, element_name) pair.
HTML_INTEGRATION_POINT_ELEMENTS = {
    ("http://www.w3.org/1998/Math/MathML", "annotation-xml"),
    *(("http://www.w3.org/2000/svg", tag) for tag in ("foreignObject", "desc", "title")),
}
94
+
95
# MathML text integration points.
# Each member is a (namespace_url, element_name) pair; all share the MathML
# namespace, so only the element names vary.
MATHML_TEXT_INTEGRATION_POINT_ELEMENTS = {
    ("http://www.w3.org/1998/Math/MathML", tag) for tag in ("mi", "mo", "mn", "ms", "mtext")
}
104
+
105
# Lowercased DOCTYPE public-identifier prefixes from the HTML Standard's
# quirks-mode list. Kept as a tuple so it can be passed straight to
# ``str.startswith``. Every entry is lowercase and ends with "//".
#
# Compared with the previous revision: the "+//silmaril//..." prefix from the
# spec list was missing and has been added, and a duplicated
# "-//w3c//dtd html experimental 970421//" entry has been removed.
QUIRKY_PUBLIC_PREFIXES = (
    "+//silmaril//dtd html pro v0r11 19970101//",
    "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
    "-//as//dtd html 3.0 aswedit + extensions//",
    "-//ietf//dtd html 2.0 level 1//",
    "-//ietf//dtd html 2.0 level 2//",
    "-//ietf//dtd html 2.0 strict level 1//",
    "-//ietf//dtd html 2.0 strict level 2//",
    "-//ietf//dtd html 2.0 strict//",
    "-//ietf//dtd html 2.0//",
    "-//ietf//dtd html 2.1e//",
    "-//ietf//dtd html 3.0//",
    "-//ietf//dtd html 3.2 final//",
    "-//ietf//dtd html 3.2//",
    "-//ietf//dtd html 3//",
    "-//ietf//dtd html level 0//",
    "-//ietf//dtd html level 1//",
    "-//ietf//dtd html level 2//",
    "-//ietf//dtd html level 3//",
    "-//ietf//dtd html strict level 0//",
    "-//ietf//dtd html strict level 1//",
    "-//ietf//dtd html strict level 2//",
    "-//ietf//dtd html strict level 3//",
    "-//ietf//dtd html strict//",
    "-//ietf//dtd html//",
    "-//metrius//dtd metrius presentational//",
    "-//microsoft//dtd internet explorer 2.0 html strict//",
    "-//microsoft//dtd internet explorer 2.0 html//",
    "-//microsoft//dtd internet explorer 2.0 tables//",
    "-//microsoft//dtd internet explorer 3.0 html strict//",
    "-//microsoft//dtd internet explorer 3.0 html//",
    "-//microsoft//dtd internet explorer 3.0 tables//",
    "-//netscape comm. corp.//dtd html//",
    "-//netscape comm. corp.//dtd strict html//",
    "-//o'reilly and associates//dtd html 2.0//",
    "-//o'reilly and associates//dtd html extended 1.0//",
    "-//o'reilly and associates//dtd html extended relaxed 1.0//",
    "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
    "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
    "-//spyglass//dtd html 2.0 extended//",
    "-//sq//dtd html 2.0 hotmetal + extensions//",
    "-//sun microsystems corp.//dtd hotjava html//",
    "-//sun microsystems corp.//dtd hotjava strict html//",
    "-//w3c//dtd html 3 1995-03-24//",
    "-//w3c//dtd html 3.2 draft//",
    "-//w3c//dtd html 3.2 final//",
    "-//w3c//dtd html 3.2//",
    "-//w3c//dtd html 3.2s draft//",
    "-//w3c//dtd html 4.0 frameset//",
    "-//w3c//dtd html 4.0 transitional//",
    "-//w3c//dtd html experimental 19960712//",
    "-//w3c//dtd html experimental 970421//",
    "-//w3c//dtd w3 html//",
    "-//w3o//dtd w3 html 3.0//",
    "-//webtechs//dtd mozilla html 2.0//",
    "-//webtechs//dtd mozilla html//",
)
162
+
163
# Remaining DOCTYPE identifier tables, all lowercase tuples.
# NOTE(review): going by the names, the *_MATCHES tuples are compared for
# equality and the *_PREFIXES tuples with startswith — confirm in the
# tree builder's doctype handling.

# Whole public identifiers.
QUIRKY_PUBLIC_MATCHES = ("-//w3o//dtd w3 html strict 3.0//en//", "-/w3c/dtd html 4.0 transitional/en", "html")

# Whole system identifiers (a single entry).
QUIRKY_SYSTEM_MATCHES = ("http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd",)

# XHTML 1.0 public-identifier prefixes.
LIMITED_QUIRKY_PUBLIC_PREFIXES = ("-//w3c//dtd xhtml 1.0 frameset//", "-//w3c//dtd xhtml 1.0 transitional//")

# HTML 4.01 public-identifier prefixes.
HTML4_PUBLIC_PREFIXES = ("-//w3c//dtd html 4.01 frameset//", "-//w3c//dtd html 4.01 transitional//")
180
+
181
# The six heading tag names, h1 through h6.
HEADING_ELEMENTS = {f"h{level}" for level in range(1, 7)}
182
+
183
# Tag names treated as formatting elements (14 entries).
FORMATTING_ELEMENTS = {
    "a", "b", "big", "code", "em", "font", "i",
    "nobr", "s", "small", "strike", "strong", "tt", "u",
}
199
+
200
# Tag names in the "special" element category (84 entries), laid out
# alphabetically with one row per initial letter.
SPECIAL_ELEMENTS = {
    "address", "applet", "area", "article", "aside",
    "base", "basefont", "bgsound", "blockquote", "body", "br", "button",
    "caption", "center", "col", "colgroup",
    "dd", "details", "dialog", "dir", "div", "dl", "dt",
    "embed",
    "fieldset", "figcaption", "figure", "footer", "form", "frame", "frameset",
    "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html",
    "iframe", "img", "input",
    "keygen",
    "li", "link", "listing",
    "main", "marquee", "menu", "menuitem", "meta",
    "nav", "noembed", "noframes", "noscript",
    "object", "ol",
    "p", "param", "plaintext", "pre",
    "script", "search", "section", "select", "source", "style", "summary",
    "table", "tbody", "td", "template", "textarea", "tfoot", "th", "thead", "title", "tr", "track",
    "ul",
    "wbr",
}
286
+
287
# Unique sentinel: a bare object() that equals nothing but itself, so it is
# recognised by identity (``is``) rather than by value.
# NOTE(review): presumably the marker entry for the list of active formatting
# elements — confirm against the tree builder.
FORMAT_MARKER = object()
288
+
289
# Base set of tag names that terminate a scope search.
DEFAULT_SCOPE_TERMINATORS = {
    "applet", "caption", "html", "table", "td", "th", "marquee", "object", "template",
}

# Specialised scopes: each is a new set holding the base terminators plus
# the extra tag names for that scope (the base set itself is not modified).
BUTTON_SCOPE_TERMINATORS = DEFAULT_SCOPE_TERMINATORS.union(("button",))
LIST_ITEM_SCOPE_TERMINATORS = DEFAULT_SCOPE_TERMINATORS.union(("ol", "ul"))
DEFINITION_SCOPE_TERMINATORS = DEFAULT_SCOPE_TERMINATORS.union(("dl",))

# Table-related tag names.
# NOTE(review): presumably the current-node names that trigger foster
# parenting — confirm against the tree builder.
TABLE_FOSTER_TARGETS = {"table", "tr", "tbody", "tfoot", "thead"}
306
+
307
# SVG tag-name case adjustments: lowercase name -> mixed-case name, following
# the HTML Standard's "adjust SVG tag names" table. Every key is the
# lowercased form of its value.
#
# "fedropshadow" is part of that table but was missing from the previous
# revision; it is included here.
SVG_TAG_NAME_ADJUSTMENTS = {
    "altglyph": "altGlyph",
    "altglyphdef": "altGlyphDef",
    "altglyphitem": "altGlyphItem",
    "animatecolor": "animateColor",
    "animatemotion": "animateMotion",
    "animatetransform": "animateTransform",
    "clippath": "clipPath",
    "feblend": "feBlend",
    "fecolormatrix": "feColorMatrix",
    "fecomponenttransfer": "feComponentTransfer",
    "fecomposite": "feComposite",
    "feconvolvematrix": "feConvolveMatrix",
    "fediffuselighting": "feDiffuseLighting",
    "fedisplacementmap": "feDisplacementMap",
    "fedistantlight": "feDistantLight",
    "fedropshadow": "feDropShadow",
    "feflood": "feFlood",
    "fefunca": "feFuncA",
    "fefuncb": "feFuncB",
    "fefuncg": "feFuncG",
    "fefuncr": "feFuncR",
    "fegaussianblur": "feGaussianBlur",
    "feimage": "feImage",
    "femerge": "feMerge",
    "femergenode": "feMergeNode",
    "femorphology": "feMorphology",
    "feoffset": "feOffset",
    "fepointlight": "fePointLight",
    "fespecularlighting": "feSpecularLighting",
    "fespotlight": "feSpotLight",
    "fetile": "feTile",
    "feturbulence": "feTurbulence",
    "foreignobject": "foreignObject",
    "glyphref": "glyphRef",
    "lineargradient": "linearGradient",
    "radialgradient": "radialGradient",
    "textpath": "textPath",
}
345
+
346
# HTML tag names in the foreign-content "breakout" list (44 entries).
# NOTE(review): presumably a start tag with one of these names inside
# SVG/MathML content returns the parser to HTML handling — confirm in the
# tree builder.
FOREIGN_BREAKOUT_ELEMENTS = {
    "b", "big", "blockquote", "body", "br",
    "center", "code",
    "dd", "div", "dl", "dt",
    "em", "embed",
    "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr",
    "i", "img",
    "li", "listing",
    "menu", "meta",
    "nobr",
    "ol",
    "p", "pre",
    "ruby",
    "s", "small", "span", "strong", "strike", "sub", "sup",
    "table", "tt",
    "u", "ul",
    "var",
}
392
+
393
# Maps a namespace URL to its short prefix. Written as (prefix, url) pairs
# and inverted so the short names line up; key order is html, math, svg.
NAMESPACE_URL_TO_PREFIX = {
    url: prefix
    for prefix, url in (
        ("html", "http://www.w3.org/1999/xhtml"),
        ("math", "http://www.w3.org/1998/Math/MathML"),
        ("svg", "http://www.w3.org/2000/svg"),
    )
}
398
+
399
# Prefix-keyed copies of the integration-point tables: each
# (namespace_url, element_name) pair becomes (prefix, element_name).
# A URL with no entry in NAMESPACE_URL_TO_PREFIX is kept unchanged.
HTML_INTEGRATION_POINT_SET = {
    (NAMESPACE_URL_TO_PREFIX.get(url, url), tag)
    for url, tag in HTML_INTEGRATION_POINT_ELEMENTS
}

MATHML_TEXT_INTEGRATION_POINT_SET = {
    (NAMESPACE_URL_TO_PREFIX.get(url, url), tag)
    for url, tag in MATHML_TEXT_INTEGRATION_POINT_ELEMENTS
}
406
+
407
# Tag names accepted as children in table context (11 entries): the table
# structure elements plus script, template and style.
TABLE_ALLOWED_CHILDREN = {
    "caption", "colgroup", "tbody", "tfoot", "thead", "tr", "td", "th",
    "script", "template", "style",
}

# Tag names that terminate a table-scope search.
TABLE_SCOPE_TERMINATORS = {"html", "table", "template"}

# Tag names whose end tags may be implied (10 entries).
IMPLIED_END_TAGS = {
    "dd", "dt", "li", "option", "optgroup", "p",
    "rb", "rp", "rt", "rtc",
}
423
+
424
# Void element tag names (14 entries). Immutable: unlike the plain sets
# elsewhere in this module, this one is a frozenset.
VOID_ELEMENTS = frozenset(
    (
        "area", "base", "br", "col", "embed", "hr", "img",
        "input", "link", "meta", "param", "source", "track", "wbr",
    ),
)
justhtml/context.py ADDED
@@ -0,0 +1,6 @@
1
class FragmentContext:
    """Plain record holding a tag name and an optional namespace.

    Both attributes are declared in ``__slots__``, so instances have no
    ``__dict__`` and cannot grow extra attributes.

    NOTE(review): presumably describes the context element used when parsing
    an HTML fragment — confirm against the parser.
    """

    __slots__ = ("namespace", "tag_name")

    def __init__(self, tag_name, namespace=None):
        # ``namespace`` is optional and stays None when the caller omits it.
        self.namespace, self.tag_name = namespace, tag_name