RubyGems - dommy - Versions diffs - 0.5.0 → 0.7.0 - Mend

dommy 0.5.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (65) hide show

checksums.yaml +4 -4
data/README.md +31 -13
data/lib/dommy/animation.rb +288 -0
data/lib/dommy/attr.rb +23 -11
data/lib/dommy/backend/nokogiri_adapter.rb +51 -0
data/lib/dommy/backend/nokolexbor_adapter.rb +80 -0
data/lib/dommy/backend.rb +129 -0
data/lib/dommy/blob.rb +2 -2
data/lib/dommy/compression_streams.rb +147 -0
data/lib/dommy/cookie_store.rb +128 -0
data/lib/dommy/crypto.rb +396 -0
data/lib/dommy/css.rb +7 -7
data/lib/dommy/custom_elements.rb +6 -6
data/lib/dommy/document.rb +190 -32
data/lib/dommy/dom_parser.rb +5 -4
data/lib/dommy/element.rb +356 -53
data/lib/dommy/event.rb +431 -25
data/lib/dommy/event_source.rb +131 -0
data/lib/dommy/fetch.rb +76 -6
data/lib/dommy/file_reader.rb +176 -0
data/lib/dommy/form_data.rb +1 -3
data/lib/dommy/history.rb +82 -0
data/lib/dommy/html_collection.rb +4 -4
data/lib/dommy/html_elements.rb +130 -67
data/lib/dommy/internal/cookie_jar.rb +2 -0
data/lib/dommy/internal/css_pseudo_handlers.rb +28 -0
data/lib/dommy/internal/dom_matching.rb +4 -4
data/lib/dommy/internal/idna.rb +443 -0
data/lib/dommy/internal/idna_data.rb +10379 -0
data/lib/dommy/internal/ipv4_parser.rb +78 -0
data/lib/dommy/internal/node_traversal.rb +1 -1
data/lib/dommy/internal/node_wrapper_cache.rb +23 -12
data/lib/dommy/internal/observable_callback.rb +25 -0
data/lib/dommy/internal/punycode.rb +202 -0
data/lib/dommy/internal/range_text_serializer.rb +72 -0
data/lib/dommy/internal/reflected_attributes.rb +45 -0
data/lib/dommy/internal/template_content_registry.rb +6 -6
data/lib/dommy/intersection_observer.rb +82 -0
data/lib/dommy/{router.rb → location.rb} +8 -142
data/lib/dommy/media_query_list.rb +118 -0
data/lib/dommy/message_channel.rb +249 -0
data/lib/dommy/{observer.rb → mutation_observer.rb} +21 -11
data/lib/dommy/navigator.rb +365 -5
data/lib/dommy/node.rb +12 -0
data/lib/dommy/notification.rb +89 -0
data/lib/dommy/parser.rb +13 -13
data/lib/dommy/performance.rb +146 -0
data/lib/dommy/performance_observer.rb +55 -0
data/lib/dommy/range.rb +597 -0
data/lib/dommy/resize_observer.rb +53 -0
data/lib/dommy/shadow_root.rb +10 -8
data/lib/dommy/streams.rb +386 -0
data/lib/dommy/svg_elements.rb +3863 -0
data/lib/dommy/text_codec.rb +175 -0
data/lib/dommy/tree_walker.rb +21 -21
data/lib/dommy/url.rb +274 -29
data/lib/dommy/url_pattern.rb +144 -0
data/lib/dommy/version.rb +1 -1
data/lib/dommy/web_socket.rb +209 -0
data/lib/dommy/window.rb +369 -0
data/lib/dommy/worker.rb +143 -0
data/lib/dommy/xml_http_request.rb +438 -0
data/lib/dommy.rb +43 -5
metadata +44 -29
data/lib/dommy/world.rb +0 -209

data/lib/dommy/internal/idna.rb ADDED Viewed

@@ -0,0 +1,443 @@
+# frozen_string_literal: true
+require_relative "idna_data"
+module Dommy
+  module Internal
+    # IDNA ToASCII / ToUnicode for domain names. Built on
+    # `Internal::Punycode` plus the Unicode tables in
+    # `Internal::IDNAData` (generated by `script/build_idna_tables.rb`
+    # from Unicode 16.0 source files in `vendor/unicode/`).
+    #
+    # Conforms (approximately) to UTS #46 with WHATWG URL parameters:
+    #
+    #   UseSTD3ASCIIRules       = false   (so `_` etc. are allowed)
+    #   Transitional_Processing = false   (so `ß` stays as `ß`)
+    #   CheckHyphens            = true    (strict per RFC 5891)
+    #   CheckBidi               = true    (RFC 5893)
+    #   CheckJoiners            = true    (RFC 5892 ContextJ for ZWJ/ZWNJ)
+    #
+    # Algorithm: UTS #46 §4 (Processing). Each input goes through:
+    #   1. Map (UTS #46 mapping table)
+    #   2. Normalize (NFC)
+    #   3. Break into labels on `.`
+    #   4. ACE-decode any `xn--`-prefixed label
+    #   5. Validate (hyphen rules, leading combining marks, ContextJ, ContextO, Bidi)
+    #   6. Punycode-encode non-ASCII labels
+    #   7. Length-validate the result
+    module IDNA
+      ACE_PREFIX = "xn--"
+      MAX_LABEL_OCTETS = 63
+      MAX_DOMAIN_OCTETS = 253
+      # Bidi classes permitted in the body of each kind of label
+      # (per RFC 5893 §2).
+      RTL_BODY_CLASSES = %i[R AL AN EN ES CS ET ON BN NSM].freeze
+      LTR_BODY_CLASSES = %i[L EN ES CS ET ON BN NSM].freeze
+      # Script ranges used by the RFC 5892 ContextO checks in
+      # `check_contexto`. Only Greek / Hebrew / Hiragana / Katakana /
+      # Han membership is needed, so the Unicode block ranges are
+      # hardcoded here instead of pulling in the full Script property
+      # table. This is an approximation: a block can contain code
+      # points whose Script property differs (U+30FB, for example,
+      # sits in the Katakana block but is Script=Common), so a few
+      # code points are treated as members of a script they do not
+      # strictly belong to.
+      # Each entry is an inclusive `[first, last]` code point pair, as
+      # consumed by `in_ranges?`.
+      GREEK_RANGES = [
+        # Greek and Coptic
+        [0x0370, 0x03FF],
+        # Greek Extended
+        [0x1F00, 0x1FFF]
+      ].freeze
+      HEBREW_RANGES = [
+        # Hebrew
+        [0x0590, 0x05FF],
+        # Alphabetic Presentation Forms — Hebrew
+        [0xFB1D, 0xFB4F]
+      ].freeze
+      HIRAGANA_KATAKANA_HAN_RANGES = [
+        # Hiragana
+        [0x3040, 0x309F],
+        # Katakana (incl. U+30FB itself)
+        [0x30A0, 0x30FF],
+        # Katakana Phonetic Extensions
+        [0x31F0, 0x31FF],
+        # CJK Unified Ideographs Extension A
+        [0x3400, 0x4DBF],
+        # CJK Unified Ideographs
+        [0x4E00, 0x9FFF],
+        # CJK Compatibility Ideographs
+        [0xF900, 0xFAFF],
+        # Halfwidth Katakana
+        [0xFF66, 0xFF9F],
+        # Kana Extended-A
+        [0x1B100, 0x1B12F],
+        # Small Kana Extension
+        [0x1B130, 0x1B16F],
+        # CJK Ext B
+        [0x20000, 0x2A6DF],
+        # CJK Ext C–G
+        [0x2A700, 0x2EBEF],
+        # CJK Compatibility Supplement
+        [0x2F800, 0x2FA1F]
+      ].freeze
+      # Raised by the mapping, validation and length checks in this
+      # module when a domain or label is rejected.
+      class Error < StandardError; end
+      # `domain` → ASCII-only form. Returns nil for nil input.
+      def self.to_ascii(domain)
+        return domain if domain.nil?
+        mapped = uts46_map(domain.to_s)
+        normalized = mapped.unicode_normalize(:nfc)
+        labels = normalized.split(".", -1)
+        validate_no_empty_intermediate(labels)
+        bidi_domain = labels.any? { |l| bidi_label?(l) }
+        encoded = labels.map do |label|
+          ace_encoded = label.downcase.start_with?(ACE_PREFIX)
+          decoded = ace_decode(label)
+          # A-labels carry an extra invariant: the decoded U-label
+          # must itself be valid IDNA (no mapped / ignored /
+          # disallowed code points), and re-encoding it must produce
+          # the original A-label modulo case. UTS #46 §4 step 4 / RFC
+          # 5891 §4.2.
+          if ace_encoded
+            validate_decoded_u_label(decoded)
+            validate_a_label_roundtrip(label, decoded)
+          end
+          validate_label(decoded, bidi_domain: bidi_domain)
+          encode_label(decoded)
+        end
+        encoded.each { |label| validate_a_label_form(label) }
+        result = encoded.join(".")
+        validate_total_length(result)
+        result
+      end
+      # Inverse: any `xn--`-prefixed label is Punycode-decoded back to
+      # Unicode. ASCII labels pass through unchanged.
+      def self.to_unicode(domain)
+        return domain if domain.nil?
+        labels = domain.to_s.split(".", -1)
+        labels.map { |label| ace_decode(label) }.join(".")
+      end
+      # --- UTS #46 step 1: map -----------------------------------------
+      # Rebuilds `input` code point by code point according to the
+      # status recorded in `IDNAData::IDNA_MAPPING`: valid code points
+      # are copied, mapped ones are replaced by their mapping, ignored
+      # ones are dropped. A code point that is disallowed, or has no
+      # table row at all, raises `Error`.
+      def self.uts46_map(input)
+        input.each_codepoint.with_object(+"") do |codepoint, buffer|
+          entry = IDNAData.lookup(IDNAData::IDNA_MAPPING, codepoint)
+          status = entry ? entry[2] : :disallowed
+          if status == :disallowed
+            raise Error, "disallowed code point: U+#{codepoint.to_s(16).upcase}"
+          elsif status == :valid
+            buffer << [codepoint].pack("U*")
+          elsif status == :mapped
+            # Fall back to the code point itself when the row carries
+            # no replacement string.
+            buffer << (entry[3] || [codepoint].pack("U*"))
+          end
+          # :ignored (and any status not handled above) contributes
+          # nothing to the output.
+        end
+      end
+      # --- Step 4: ACE decode if prefixed -----------------------------
+      def self.ace_decode(label)
+        return label unless label.downcase.start_with?(ACE_PREFIX)
+        return "" if label.length == ACE_PREFIX.length
+        Punycode.decode(label[ACE_PREFIX.length..])
+      end
+      # --- Step 5: validate per-label ---------------------------------
+      def self.validate_label(label, bidi_domain:)
+        return if label.empty?
+        validate_hyphens(label)
+        validate_no_leading_combining_mark(label)
+        check_contextj(label)
+        check_contexto(label)
+        check_bidi(label) if bidi_domain
+      end
+      def self.validate_hyphens(label)
+        if label.start_with?("-")
+          raise Error, "label starts with hyphen: #{label.inspect}"
+        end
+        if label.end_with?("-")
+          raise Error, "label ends with hyphen: #{label.inspect}"
+        end
+        if label.length >= 4 &&
+            label[2] == "-" &&
+            label[3] == "-" &&
+            !label.downcase.start_with?(ACE_PREFIX)
+          raise Error, "label has reserved hyphens at positions 3-4: #{label.inspect}"
+        end
+      end
+      # Rejects a label whose first code point has Bidi class NSM,
+      # which this module uses as its test for "combining mark".
+      def self.validate_no_leading_combining_mark(label)
+        lead = label.codepoints.first
+        if lead && bidi_class_of(lead) == :NSM
+          raise Error, "label starts with combining mark: #{label.inspect}"
+        end
+      end
+      # --- Step 6: encode --------------------------------------------
+      def self.encode_label(label)
+        return label if label.empty?
+        return label if label.ascii_only?
+        ACE_PREFIX + Punycode.encode(label)
+      end
+      # Raises `Error` when an encoded label is larger than
+      # `MAX_LABEL_OCTETS` bytes.
+      def self.validate_a_label_form(label)
+        return unless label.bytesize > MAX_LABEL_OCTETS
+        raise Error, "label exceeds 63 octets: #{label.inspect}"
+      end
+      # Per RFC 5891 §4.2.3 a non-final label must be non-empty.
+      # `example.test.` (trailing dot) parses as
+      # `["example", "test", ""]` — the trailing empty is OK; any
+      # other empty (e.g. `a..b` → `["a", "", "b"]`) is invalid.
+      def self.validate_no_empty_intermediate(labels)
+        labels[0...-1].each_with_index do |label, idx|
+          next unless label.empty?
+          raise Error, "empty label at position #{idx}"
+        end
+      end
+      # After ACE-decoding an A-label, every code point in the
+      # resulting U-label must itself be IDNA :valid — `:mapped`,
+      # `:ignored`, or `:disallowed` are not allowed at this stage.
+      def self.validate_decoded_u_label(label)
+        label.each_codepoint do |cp|
+          row = IDNAData.lookup(IDNAData::IDNA_MAPPING, cp)
+          status = row ? row[2] : :disallowed
+          next if status == :valid
+          raise(
+            Error,
+            "A-label decodes to invalid code point U+#{cp.to_s(16).upcase} (status #{status})"
+          )
+        end
+      end
+      # Round-trip invariant: re-encoding the U-label must produce
+      # the original A-label (case-insensitively). Catches malformed
+      # `xn--` inputs whose Punycode decodes-but-doesn't-recover.
+      def self.validate_a_label_roundtrip(a_label, u_label)
+        re_encoded = u_label.ascii_only? ? u_label : ACE_PREFIX + Punycode.encode(u_label)
+        return if re_encoded.downcase == a_label.downcase
+        raise(
+          Error,
+          "A-label fails round-trip: #{a_label.inspect} ↔ #{re_encoded.inspect}"
+        )
+      end
+      def self.validate_total_length(domain)
+        measured = domain.end_with?(".") ? domain[0...-1] : domain
+        if measured.bytesize > MAX_DOMAIN_OCTETS
+          raise Error, "domain exceeds 253 octets: #{measured.bytesize} octets"
+        end
+      end
+      # --- Bidi (RFC 5893) -------------------------------------------
+      def self.bidi_label?(label)
+        label.each_codepoint.any? do |cp|
+          %i[R AL AN].include?(bidi_class_of(cp))
+        end
+      end
+      def self.check_bidi(label)
+        cps = label.codepoints
+        return if cps.empty?
+        classes = cps.map { |cp| bidi_class_of(cp) }
+        first = classes.first
+        last_non_nsm = classes.reverse.find { |c| c != :NSM }
+        case first
+        when :R, :AL
+          classes.each do |c|
+            next if RTL_BODY_CLASSES.include?(c)
+            raise Error, "Bidi rule 2 violation: class #{c} in RTL label"
+          end
+          unless %i[R AL EN AN].include?(last_non_nsm)
+            raise Error, "Bidi rule 3 violation: RTL label trailing class #{last_non_nsm}"
+          end
+          if classes.include?(:EN) && classes.include?(:AN)
+            raise Error, "Bidi rule 4 violation: EN and AN both present"
+          end
+        when :L
+          classes.each do |c|
+            next if LTR_BODY_CLASSES.include?(c)
+            raise Error, "Bidi rule 5 violation: class #{c} in LTR label"
+          end
+          unless %i[L EN].include?(last_non_nsm)
+            raise Error, "Bidi rule 6 violation: LTR label trailing class #{last_non_nsm}"
+          end
+        else
+          raise Error, "Bidi rule 1 violation: label starts with #{first}"
+        end
+      end
+      # --- ContextJ (RFC 5892) ---------------------------------------
+      def self.check_contextj(label)
+        cps = label.codepoints
+        cps.each_with_index do |cp, i|
+          case cp
+          # ZWNJ
+          when 0x200C
+            next if zwnj_allowed?(cps, i)
+            raise Error, "ZWNJ in invalid context"
+            # ZWJ
+          when 0x200D
+            next if zwj_allowed?(cps, i)
+            raise Error, "ZWJ in invalid context"
+          end
+        end
+      end
+      def self.zwnj_allowed?(cps, idx)
+        prev = idx.positive? ? cps[idx - 1] : nil
+        return true if prev && IDNAData::VIRAMA.include?(prev)
+        # Or: (Joining_Type:L|D)(Joining_Type:T)* . (Joining_Type:T)*(Joining_Type:R|D)
+        left = scan_joining(cps, idx - 1, -1)
+        right = scan_joining(cps, idx + 1, 1)
+        %i[L D].include?(left) && %i[R D].include?(right)
+      end
+      def self.zwj_allowed?(cps, idx)
+        prev = idx.positive? ? cps[idx - 1] : nil
+        prev && IDNAData::VIRAMA.include?(prev)
+      end
+      # Walk in `step` direction from `start`, skipping Joining_Type=T,
+      # and return the first non-T joining type encountered (or nil at
+      # the edge).
+      def self.scan_joining(cps, start, step)
+        i = start
+        while i >= 0 && i < cps.length
+          jt = joining_type_of(cps[i])
+          return jt unless jt == :T
+          i += step
+        end
+        nil
+      end
+      # --- ContextO (RFC 5892 Appendix A.3–A.9) ----------------------
+      #
+      # Position-sensitive rules for a handful of code points whose
+      # validity depends on their neighbors or on which scripts /
+      # digit sets occur in the same label. Raises `Error` on the
+      # first rule that fails.
+      def self.check_contexto(label)
+        cps = label.codepoints
+        cps.each_with_index do |cp, i|
+          # Neighbors of the current code point, nil at the label
+          # edges. The left neighbor is guarded explicitly because
+          # `cps[i - 1]` with i == 0 is `cps[-1]`, i.e. the *last* code
+          # point, which would let a leading U+00B7 / U+05F3 / U+05F4
+          # be validated against the end of the label.
+          prev_cp = i.positive? ? cps[i - 1] : nil
+          next_cp = cps[i + 1]
+          case cp
+          when 0x00B7
+            # A.3 MIDDLE DOT — allowed only between two `l` characters
+            # (Catalan `l·l`).
+            unless prev_cp == 0x006C && next_cp == 0x006C
+              raise Error, "U+00B7 MIDDLE DOT requires surrounding 'l' characters"
+            end
+          when 0x0375
+            # A.4 GREEK LOWER NUMERAL SIGN — next char must be Greek.
+            unless in_ranges?(next_cp, GREEK_RANGES)
+              raise Error, "U+0375 must precede a Greek-script character"
+            end
+          when 0x05F3, 0x05F4
+            # A.5, A.6 HEBREW GERESH / GERSHAYIM — previous char must
+            # be Hebrew.
+            unless in_ranges?(prev_cp, HEBREW_RANGES)
+              raise(
+                Error,
+                "U+#{cp.to_s(16).upcase} must follow a Hebrew-script character"
+              )
+            end
+          when 0x30FB
+            # A.7 KATAKANA MIDDLE DOT — label must contain at least
+            # one Hiragana/Katakana/Han character. U+30FB itself has
+            # Script=Common, not Katakana — it merely sits in the
+            # Katakana block — so occurrences of U+30FB never count
+            # as the required companion.
+            unless cps.any? { |c| c != 0x30FB && in_ranges?(c, HIRAGANA_KATAKANA_HAN_RANGES) }
+              raise(
+                Error,
+                "U+30FB requires another Hiragana/Katakana/Han character in the label"
+              )
+            end
+          when 0x0660..0x0669
+            # A.8 Arabic-Indic Digits cannot mix with Extended
+            # Arabic-Indic Digits in the same label.
+            if cps.any? { |c| (0x06F0..0x06F9).cover?(c) }
+              raise Error, "Arabic-Indic digit forbidden alongside Extended Arabic-Indic digit"
+            end
+          when 0x06F0..0x06F9
+            # A.9 symmetric to A.8.
+            if cps.any? { |c| (0x0660..0x0669).cover?(c) }
+              raise Error, "Extended Arabic-Indic digit forbidden alongside Arabic-Indic digit"
+            end
+          end
+        end
+      end
+      def self.in_ranges?(cp, ranges)
+        return false if cp.nil?
+        ranges.any? { |(lo, hi)| cp >= lo && cp <= hi }
+      end
+      def self.bidi_class_of(cp)
+        row = IDNAData.lookup(IDNAData::BIDI_CLASS, cp)
+        row ? row[2] : :L
+      end
+      def self.joining_type_of(cp)
+        row = IDNAData.lookup(IDNAData::JOINING_TYPE, cp)
+        row ? row[2] : :U
+      end
+    end
+  end
+end