dommy 0.5.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +31 -13
  3. data/lib/dommy/animation.rb +288 -0
  4. data/lib/dommy/attr.rb +23 -11
  5. data/lib/dommy/backend/nokogiri_adapter.rb +51 -0
  6. data/lib/dommy/backend/nokolexbor_adapter.rb +80 -0
  7. data/lib/dommy/backend.rb +129 -0
  8. data/lib/dommy/blob.rb +2 -2
  9. data/lib/dommy/compression_streams.rb +147 -0
  10. data/lib/dommy/cookie_store.rb +128 -0
  11. data/lib/dommy/crypto.rb +396 -0
  12. data/lib/dommy/css.rb +7 -7
  13. data/lib/dommy/custom_elements.rb +6 -6
  14. data/lib/dommy/document.rb +190 -32
  15. data/lib/dommy/dom_parser.rb +5 -4
  16. data/lib/dommy/element.rb +356 -53
  17. data/lib/dommy/event.rb +431 -25
  18. data/lib/dommy/event_source.rb +131 -0
  19. data/lib/dommy/fetch.rb +76 -6
  20. data/lib/dommy/file_reader.rb +176 -0
  21. data/lib/dommy/form_data.rb +1 -3
  22. data/lib/dommy/history.rb +82 -0
  23. data/lib/dommy/html_collection.rb +4 -4
  24. data/lib/dommy/html_elements.rb +130 -67
  25. data/lib/dommy/internal/cookie_jar.rb +2 -0
  26. data/lib/dommy/internal/css_pseudo_handlers.rb +28 -0
  27. data/lib/dommy/internal/dom_matching.rb +4 -4
  28. data/lib/dommy/internal/idna.rb +443 -0
  29. data/lib/dommy/internal/idna_data.rb +10379 -0
  30. data/lib/dommy/internal/ipv4_parser.rb +78 -0
  31. data/lib/dommy/internal/node_traversal.rb +1 -1
  32. data/lib/dommy/internal/node_wrapper_cache.rb +23 -12
  33. data/lib/dommy/internal/observable_callback.rb +25 -0
  34. data/lib/dommy/internal/punycode.rb +202 -0
  35. data/lib/dommy/internal/range_text_serializer.rb +72 -0
  36. data/lib/dommy/internal/reflected_attributes.rb +45 -0
  37. data/lib/dommy/internal/template_content_registry.rb +6 -6
  38. data/lib/dommy/intersection_observer.rb +82 -0
  39. data/lib/dommy/{router.rb → location.rb} +8 -142
  40. data/lib/dommy/media_query_list.rb +118 -0
  41. data/lib/dommy/message_channel.rb +249 -0
  42. data/lib/dommy/{observer.rb → mutation_observer.rb} +21 -11
  43. data/lib/dommy/navigator.rb +365 -5
  44. data/lib/dommy/node.rb +12 -0
  45. data/lib/dommy/notification.rb +89 -0
  46. data/lib/dommy/parser.rb +13 -13
  47. data/lib/dommy/performance.rb +146 -0
  48. data/lib/dommy/performance_observer.rb +55 -0
  49. data/lib/dommy/range.rb +597 -0
  50. data/lib/dommy/resize_observer.rb +53 -0
  51. data/lib/dommy/shadow_root.rb +10 -8
  52. data/lib/dommy/streams.rb +386 -0
  53. data/lib/dommy/svg_elements.rb +3863 -0
  54. data/lib/dommy/text_codec.rb +175 -0
  55. data/lib/dommy/tree_walker.rb +21 -21
  56. data/lib/dommy/url.rb +274 -29
  57. data/lib/dommy/url_pattern.rb +144 -0
  58. data/lib/dommy/version.rb +1 -1
  59. data/lib/dommy/web_socket.rb +209 -0
  60. data/lib/dommy/window.rb +369 -0
  61. data/lib/dommy/worker.rb +143 -0
  62. data/lib/dommy/xml_http_request.rb +438 -0
  63. data/lib/dommy.rb +43 -5
  64. metadata +44 -29
  65. data/lib/dommy/world.rb +0 -209
@@ -0,0 +1,443 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "idna_data"
4
+
5
+ module Dommy
6
+ module Internal
7
+ # IDNA ToASCII / ToUnicode for domain names. Built on
8
+ # `Internal::Punycode` plus the Unicode tables in
9
+ # `Internal::IDNAData` (generated by `script/build_idna_tables.rb`
10
+ # from Unicode 16.0 source files in `vendor/unicode/`).
11
+ #
12
+ # Conforms (approximately) to UTS #46 with WHATWG URL parameters:
13
+ #
14
+ # UseSTD3ASCIIRules = false (so `_` etc. are allowed)
15
+ # Transitional_Processing = false (so `ß` stays as `ß`)
16
+ # CheckHyphens = true (strict per RFC 5891)
17
+ # CheckBidi = true (RFC 5893)
18
+ # CheckJoiners = true (RFC 5892 ContextJ for ZWJ/ZWNJ)
19
+ #
20
+ # Algorithm: UTS #46 §4 (Processing). Each input goes through:
21
+ # 1. Map (UTS #46 mapping table)
22
+ # 2. Normalize (NFC)
23
+ # 3. Break into labels on `.`
24
+ # 4. ACE-decode any `xn--`-prefixed label
25
+ # 5. Validate (hyphen rules, leading combining marks, Bidi, ContextJ)
26
+ # 6. Punycode-encode non-ASCII labels
27
+ # 7. Length-validate the result
28
module IDNA
  ACE_PREFIX = "xn--"
  MAX_LABEL_OCTETS = 63
  MAX_DOMAIN_OCTETS = 253

  # Bidi classes permitted in the body of each kind of label
  # (per RFC 5893 §2).
  RTL_BODY_CLASSES = %i[R AL AN EN ES CS ET ON BN NSM].freeze
  LTR_BODY_CLASSES = %i[L EN ES CS ET ON BN NSM].freeze

  # Bidi classes whose presence makes a label an RTL label (and thus
  # the whole name a "Bidi domain name").
  RTL_TRIGGER_CLASSES = %i[R AL AN].freeze

  # Bidi classes allowed as the last non-NSM character of an RTL /
  # LTR label (RFC 5893 §2 rules 3 and 6).
  RTL_TRAILING_CLASSES = %i[R AL EN AN].freeze
  LTR_TRAILING_CLASSES = %i[L EN].freeze

  # Script ranges used by RFC 5892 ContextO checks. We only need
  # Greek / Hebrew / Hiragana / Katakana / Han, so we hardcode
  # the block ranges instead of pulling in the full Script
  # property table. Covers the practical cases; a code point in
  # one of these blocks but not actually that script is rare and
  # the spec-stricter interpretation still flags it correctly
  # for IDN.
  GREEK_RANGES = [
    # Greek and Coptic
    [0x0370, 0x03FF],
    # Greek Extended
    [0x1F00, 0x1FFF]
  ].freeze

  HEBREW_RANGES = [
    # Hebrew
    [0x0590, 0x05FF],
    # Alphabetic Presentation Forms — Hebrew
    [0xFB1D, 0xFB4F]
  ].freeze

  HIRAGANA_KATAKANA_HAN_RANGES = [
    # Hiragana
    [0x3040, 0x309F],
    # Katakana (incl. U+30FB itself)
    [0x30A0, 0x30FF],
    # Katakana Phonetic Extensions
    [0x31F0, 0x31FF],
    # CJK Unified Ideographs Extension A
    [0x3400, 0x4DBF],
    # CJK Unified Ideographs
    [0x4E00, 0x9FFF],
    # CJK Compatibility Ideographs
    [0xF900, 0xFAFF],
    # Halfwidth Katakana
    [0xFF66, 0xFF9F],
    # Kana Extended-A
    [0x1B100, 0x1B12F],
    # Small Kana Extension
    [0x1B130, 0x1B16F],
    # CJK Ext B
    [0x20000, 0x2A6DF],
    # CJK Ext C–G
    [0x2A700, 0x2EBEF],
    # CJK Compatibility Supplement
    [0x2F800, 0x2FA1F]
  ].freeze

  # Raised for every validation failure detected by this module.
  class Error < StandardError
  end

  # `domain` → ASCII-only form. Returns nil for nil input.
  #
  # Pipeline: map → NFC → split on "." → ACE-decode → validate →
  # Punycode-encode → length checks. All labels are decoded *before*
  # Bidi detection so that an RTL label supplied in `xn--` form makes
  # the name a Bidi domain exactly like its Unicode spelling does;
  # a consequence is that decoding problems in a later label are
  # reported before validation problems in an earlier one.
  #
  # Raises `Error` when any check fails (plus whatever
  # `Punycode.decode` raises on malformed input).
  def self.to_ascii(domain)
    return domain if domain.nil?

    mapped = uts46_map(domain.to_s)
    normalized = mapped.unicode_normalize(:nfc)
    labels = normalized.split(".", -1)

    validate_no_empty_intermediate(labels)

    decoded_labels = labels.map do |label|
      decoded = ace_decode(label)

      # A-labels carry an extra invariant: the decoded U-label
      # must itself be valid IDNA (no mapped / ignored /
      # disallowed code points), and re-encoding it must produce
      # the original A-label modulo case. UTS #46 §4 step 4 / RFC
      # 5891 §4.2.
      if label.downcase.start_with?(ACE_PREFIX)
        validate_decoded_u_label(decoded)
        validate_a_label_roundtrip(label, decoded)
      end

      decoded
    end

    # Must look at the decoded U-labels: the raw `xn--…` spelling is
    # pure ASCII and would never contain an R / AL / AN character.
    bidi_domain = decoded_labels.any? { |label| bidi_label?(label) }

    encoded = decoded_labels.map do |label|
      validate_label(label, bidi_domain: bidi_domain)
      encode_label(label)
    end

    encoded.each { |label| validate_a_label_form(label) }
    result = encoded.join(".")
    validate_total_length(result)
    result
  end

  # Inverse: any `xn--`-prefixed label is Punycode-decoded back to
  # Unicode. ASCII labels pass through unchanged. Returns nil for
  # nil input. No validation is performed on the decoded labels.
  def self.to_unicode(domain)
    return domain if domain.nil?

    labels = domain.to_s.split(".", -1)
    labels.map { |label| ace_decode(label) }.join(".")
  end

  # --- UTS #46 step 1: map -----------------------------------------

  # Applies the mapping table to every code point of `input` and
  # returns the mapped string. Code points absent from the table are
  # treated as disallowed and raise `Error`.
  #
  # NOTE(review): any status other than :valid / :ignored / :mapped /
  # :disallowed falls through the `case` and the code point is
  # silently dropped — confirm the generated table never emits other
  # statuses (e.g. :deviation).
  def self.uts46_map(input)
    out = +""
    input.each_codepoint do |cp|
      row = IDNAData.lookup(IDNAData::IDNA_MAPPING, cp)
      status = row ? row[2] : :disallowed

      case status
      when :valid
        out << [cp].pack("U*")
      when :ignored
        # drop
      when :mapped
        mapping = row[3]
        # A :mapped row without a replacement keeps the code point.
        out << (mapping || [cp].pack("U*"))
      when :disallowed
        raise Error, "disallowed code point: U+#{cp.to_s(16).upcase}"
      end
    end

    out
  end

  # --- Step 4: ACE decode if prefixed -----------------------------

  # Returns `label` unchanged unless it starts with `xn--`
  # (case-insensitively); otherwise returns the Punycode-decoded
  # remainder. A bare `xn--` decodes to the empty string.
  def self.ace_decode(label)
    return label unless label.downcase.start_with?(ACE_PREFIX)
    return "" if label.length == ACE_PREFIX.length

    Punycode.decode(label[ACE_PREFIX.length..])
  end

  # --- Step 5: validate per-label ---------------------------------

  # Runs every per-label check on a decoded label. Empty labels are
  # skipped. The Bidi rules are applied only when the enclosing name
  # is a Bidi domain (`bidi_domain:`).
  def self.validate_label(label, bidi_domain:)
    return if label.empty?

    validate_hyphens(label)
    validate_no_leading_combining_mark(label)
    check_contextj(label)
    check_contexto(label)
    check_bidi(label) if bidi_domain
  end

  # Rejects a leading or trailing hyphen, and `--` in the third and
  # fourth positions unless the label starts with the ACE prefix.
  def self.validate_hyphens(label)
    if label.start_with?("-")
      raise Error, "label starts with hyphen: #{label.inspect}"
    end

    if label.end_with?("-")
      raise Error, "label ends with hyphen: #{label.inspect}"
    end

    if label.length >= 4 &&
       label[2] == "-" &&
       label[3] == "-" &&
       !label.downcase.start_with?(ACE_PREFIX)
      raise Error, "label has reserved hyphens at positions 3-4: #{label.inspect}"
    end
  end

  # Rejects a label whose first code point has Bidi class NSM.
  #
  # NOTE(review): NSM is used as the stand-in for "combining mark";
  # spacing combining marks may carry a different Bidi class and
  # would then not be caught here — confirm that is acceptable.
  def self.validate_no_leading_combining_mark(label)
    first_cp = label.codepoints.first
    return unless first_cp

    bidi_class = bidi_class_of(first_cp)
    return unless bidi_class == :NSM

    raise Error, "label starts with combining mark: #{label.inspect}"
  end

  # --- Step 6: encode --------------------------------------------

  # Empty and ASCII-only labels pass through; anything else becomes
  # `xn--` + Punycode.
  def self.encode_label(label)
    return label if label.empty?
    return label if label.ascii_only?

    ACE_PREFIX + Punycode.encode(label)
  end

  # Enforces the 63-octet limit on an encoded label.
  def self.validate_a_label_form(label)
    if label.bytesize > MAX_LABEL_OCTETS
      raise Error, "label exceeds 63 octets: #{label.inspect}"
    end
  end

  # Per RFC 5891 §4.2.3 a non-final label must be non-empty.
  # `example.test.` (trailing dot) parses as
  # `["example", "test", ""]` — the trailing empty is OK; any
  # other empty (e.g. `a..b` → `["a", "", "b"]`) is invalid.
  def self.validate_no_empty_intermediate(labels)
    labels[0...-1].each_with_index do |label, idx|
      next unless label.empty?

      raise Error, "empty label at position #{idx}"
    end
  end

  # After ACE-decoding an A-label, every code point in the
  # resulting U-label must itself be IDNA :valid — `:mapped`,
  # `:ignored`, or `:disallowed` are not allowed at this stage.
  def self.validate_decoded_u_label(label)
    label.each_codepoint do |cp|
      row = IDNAData.lookup(IDNAData::IDNA_MAPPING, cp)
      status = row ? row[2] : :disallowed
      next if status == :valid

      raise(
        Error,
        "A-label decodes to invalid code point U+#{cp.to_s(16).upcase} (status #{status})"
      )
    end
  end

  # Round-trip invariant: re-encoding the U-label must produce
  # the original A-label (case-insensitively). Catches malformed
  # `xn--` inputs whose Punycode decodes-but-doesn't-recover.
  def self.validate_a_label_roundtrip(a_label, u_label)
    re_encoded = u_label.ascii_only? ? u_label : ACE_PREFIX + Punycode.encode(u_label)
    return if re_encoded.downcase == a_label.downcase

    raise(
      Error,
      "A-label fails round-trip: #{a_label.inspect} ↔ #{re_encoded.inspect}"
    )
  end

  # Enforces the 253-octet limit on the encoded name; a single
  # trailing dot is not counted.
  def self.validate_total_length(domain)
    measured = domain.end_with?(".") ? domain[0...-1] : domain
    if measured.bytesize > MAX_DOMAIN_OCTETS
      raise Error, "domain exceeds 253 octets: #{measured.bytesize} octets"
    end
  end

  # --- Bidi (RFC 5893) -------------------------------------------

  # True when `label` contains at least one R / AL / AN character.
  # Expects a decoded (Unicode) label.
  def self.bidi_label?(label)
    label.each_codepoint.any? do |cp|
      RTL_TRIGGER_CLASSES.include?(bidi_class_of(cp))
    end
  end

  # Applies the six numbered rules of RFC 5893 §2 to one label and
  # raises `Error` naming the first rule that is violated.
  def self.check_bidi(label)
    cps = label.codepoints
    return if cps.empty?

    classes = cps.map { |cp| bidi_class_of(cp) }
    first = classes.first
    # Rules 3 and 6 look at the last character ignoring trailing NSMs.
    last_non_nsm = classes.reverse_each.find { |c| c != :NSM }

    case first
    when :R, :AL
      classes.each do |c|
        next if RTL_BODY_CLASSES.include?(c)

        raise Error, "Bidi rule 2 violation: class #{c} in RTL label"
      end

      unless RTL_TRAILING_CLASSES.include?(last_non_nsm)
        raise Error, "Bidi rule 3 violation: RTL label trailing class #{last_non_nsm}"
      end

      if classes.include?(:EN) && classes.include?(:AN)
        raise Error, "Bidi rule 4 violation: EN and AN both present"
      end

    when :L
      classes.each do |c|
        next if LTR_BODY_CLASSES.include?(c)

        raise Error, "Bidi rule 5 violation: class #{c} in LTR label"
      end

      unless LTR_TRAILING_CLASSES.include?(last_non_nsm)
        raise Error, "Bidi rule 6 violation: LTR label trailing class #{last_non_nsm}"
      end
    else
      raise Error, "Bidi rule 1 violation: label starts with #{first}"
    end
  end

  # --- ContextJ (RFC 5892) ---------------------------------------

  # Raises `Error` for any ZWNJ (U+200C) or ZWJ (U+200D) that is not
  # in a permitted context.
  def self.check_contextj(label)
    cps = label.codepoints
    cps.each_with_index do |cp, i|
      case cp
      # ZWNJ
      when 0x200C
        next if zwnj_allowed?(cps, i)

        raise Error, "ZWNJ in invalid context"
      # ZWJ
      when 0x200D
        next if zwj_allowed?(cps, i)

        raise Error, "ZWJ in invalid context"
      end
    end
  end

  # ZWNJ at `idx` is allowed after a virama, or between joining
  # characters as described by the pattern below.
  def self.zwnj_allowed?(cps, idx)
    prev = idx.positive? ? cps[idx - 1] : nil
    return true if prev && IDNAData::VIRAMA.include?(prev)

    # Or: (Joining_Type:L|D)(Joining_Type:T)* . (Joining_Type:T)*(Joining_Type:R|D)
    left = scan_joining(cps, idx - 1, -1)
    right = scan_joining(cps, idx + 1, 1)
    %i[L D].include?(left) && %i[R D].include?(right)
  end

  # ZWJ at `idx` is allowed only directly after a virama. Returns a
  # truthy / falsy value (nil when there is no preceding code point).
  def self.zwj_allowed?(cps, idx)
    prev = idx.positive? ? cps[idx - 1] : nil
    prev && IDNAData::VIRAMA.include?(prev)
  end

  # Walk in `step` direction from `start`, skipping Joining_Type=T,
  # and return the first non-T joining type encountered (or nil at
  # the edge).
  def self.scan_joining(cps, start, step)
    i = start
    while i >= 0 && i < cps.length
      jt = joining_type_of(cps[i])
      return jt unless jt == :T

      i += step
    end

    nil
  end

  # --- ContextO (RFC 5892 §4) ------------------------------------
  #
  # Position-sensitive rules for seven specific code points whose
  # validity depends on neighbors / script co-occurrence.

  # Raises `Error` for the first ContextO rule that `label` violates.
  def self.check_contexto(label)
    cps = label.codepoints
    cps.each_with_index do |cp, i|
      # Resolve neighbours explicitly: at i == 0 `cps[i - 1]` would be
      # `cps[-1]`, i.e. the *last* code point, letting a leading
      # context-dependent character be validated against the label's
      # tail.
      before = i.positive? ? cps[i - 1] : nil
      after = cps[i + 1]

      case cp
      when 0x00B7
        # §4.1 MIDDLE DOT — allowed only between two `l` characters
        # (Catalan `l·l` ligature).
        unless before == 0x006C && after == 0x006C
          raise Error, "U+00B7 MIDDLE DOT requires surrounding 'l' characters"
        end

      when 0x0375
        # §4.2 GREEK LOWER NUMERAL SIGN — next char must be Greek.
        unless in_ranges?(after, GREEK_RANGES)
          raise Error, "U+0375 must precede a Greek-script character"
        end

      when 0x05F3, 0x05F4
        # §4.3, §4.4 HEBREW GERESH / GERSHAYIM — previous char
        # must be Hebrew.
        unless in_ranges?(before, HEBREW_RANGES)
          raise(
            Error,
            "U+#{cp.to_s(16).upcase} must follow a Hebrew-script character"
          )
        end

      when 0x30FB
        # §4.5 KATAKANA MIDDLE DOT — label must contain at least
        # one Hiragana/Katakana/Han character. U+30FB itself has
        # Script=Common, not Katakana — only its block falls in
        # the Katakana block, so we exclude it from the cohort.
        has_companion = cps.any? do |c|
          c != 0x30FB && in_ranges?(c, HIRAGANA_KATAKANA_HAN_RANGES)
        end
        unless has_companion
          raise(
            Error,
            "U+30FB requires another Hiragana/Katakana/Han character in the label"
          )
        end

      when 0x0660..0x0669
        # §4.6 Arabic-Indic Digits cannot mix with Extended
        # Arabic-Indic Digits in the same label.
        if cps.any? { |c| (0x06F0..0x06F9).cover?(c) }
          raise Error, "Arabic-Indic digit forbidden alongside Extended Arabic-Indic digit"
        end

      when 0x06F0..0x06F9
        # §4.7 symmetric to §4.6.
        if cps.any? { |c| (0x0660..0x0669).cover?(c) }
          raise Error, "Extended Arabic-Indic digit forbidden alongside Arabic-Indic digit"
        end
      end
    end
  end

  # True when `cp` lies inside any inclusive `[lo, hi]` pair of
  # `ranges`. A nil `cp` (no neighbour) is never in range.
  def self.in_ranges?(cp, ranges)
    return false if cp.nil?

    ranges.any? { |(lo, hi)| cp >= lo && cp <= hi }
  end

  # Bidi class of `cp` from the data table; :L when the table has no
  # entry.
  def self.bidi_class_of(cp)
    row = IDNAData.lookup(IDNAData::BIDI_CLASS, cp)
    row ? row[2] : :L
  end

  # Joining type of `cp` from the data table; :U when the table has
  # no entry.
  def self.joining_type_of(cp)
    row = IDNAData.lookup(IDNAData::JOINING_TYPE, cp)
    row ? row[2] : :U
  end
end
442
+ end
443
+ end