RubyGems - homographic_spoofing - Versions diffs - 0.1.0 - Mend

homographic_spoofing 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39) hide show

data/lib/homographic_spoofing/detector/rule/disallowed_characters.rb ADDED Viewed

@@ -0,0 +1,140 @@
+# 3. and 4. of Google Chrome IDN policy See https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%3AIdentifierStatus%3DAllowed%3A&abb=on&g=&i=
+class HomographicSpoofing::Detector::Rule::DisallowedCharacters < HomographicSpoofing::Detector::Rule::Base
+  class << self
+    # See http://kb.mozillazine.org/Network.IDN.blacklist_chars
+    MOZZILLA_DISALLOWED_CHARACTERS = Set[
+      "\u0020", # Space
+      "\u00a0", # No-break space
+      "\u00bc", # Vulgar fraction one quarter
+      "\u00bd", # Vulgar fraction one half
+      "\u00be", # Vulgar fraction three quarters
+      "\u01c3", # Latin letter retroflex click
+      "\u02d0", # Modifier letter triangular colon
+      "\u0337", # Combining short solidus overlay
+      "\u0338", # Combining long solidus overlay
+      "\u0589", # Armenian full stop
+      "\u058a", # Armenian hyphen
+      "\u05c3", # Hebrew punctuation sof pasuq
+      "\u05f4", # Hebrew punctuation gershayim
+      "\u0609", # Arabic-indic per mille sign
+      "\u060a", # Arabic-indic per ten thousand sign
+      "\u066a", # Arabic percent sign
+      "\u06d4", # Arabic full stop
+      "\u0701", # Syriac supralinear full stop
+      "\u0702", # Syriac sublinear full stop
+      "\u0703", # Syriac supralinear colon
+      "\u0704", # Syriac sublinear colon
+      "\u115f", # Hangul choseong filler
+      "\u1160", # Hangul jungseong filler
+      "\u1735", # Philippine single punctuation
+      "\u2000", # En quad
+      "\u2001", # Em quad
+      "\u2002", # En space
+      "\u2003", # Em space
+      "\u2004", # Three-per-em space
+      "\u2005", # Four-per-em space
+      "\u2006", # Six-per-em-space
+      "\u2007", # Figure space
+      "\u2008", # Punctuation space
+      "\u2009", # Thin space
+      "\u200a", # Hair space
+      "\u200b", # Zero width space
+      "\u200e", # Left-to-right mark
+      "\u200f", # Right-to-left mark
+      "\u2010", # Hyphen
+      "\u2019", # Right single quotation mark
+      "\u2024", # One dot leader
+      "\u2027", # Hyphenation point
+      "\u2028", # Line separator
+      "\u2029", # Paragraph separator
+      "\u202a", # Left-to-right embedding
+      "\u202b", # Right-to-left embedding
+      "\u202c", # Pop directional formatting
+      "\u202d", # Left-to-right override
+      "\u202e", # Right-to-left override
+      "\u202f", # Narrow no-break space
+      "\u2039", # Single left-pointing angle quotation mark
+      "\u203a", # Single right-pointing angle quotation mark
+      "\u2041", # Caret insertion point
+      "\u2044", # Fraction slash
+      "\u2052", # Commercial minus sign
+      "\u205f", # Medium mathematical space
+      "\u2153", # Vulgar fraction one third
+      "\u2154", # Vulgar fraction two thirds
+      "\u2155", # Vulgar fraction one fifth
+      "\u2156", # Vulgar fraction two fifths
+      "\u2157", # Vulgar fraction three fifths
+      "\u2158", # Vulgar fraction four fifths
+      "\u2159", # Vulgar fraction one sixth
+      "\u215a", # Vulgar fraction five sixths
+      "\u215b", # Vulgar fraction one eight
+      "\u215c", # Vulgar fraction three eighths
+      "\u215d", # Vulgar fraction five eighths
+      "\u215e", # Vulgar fraction seven eighths
+      "\u215f", # Fraction numerator one
+      "\u2215", # Division slash
+      "\u2236", # Ratio
+      "\u23ae", # Integral extension
+      "\u2571", # Box drawings light diagonal upper right to lower left
+      "\u29f6", # Solidus with overbar
+      "\u29f8", # Big solidus
+      "\u2afb", # Triple solidus binary relation
+      "\u2afd", # Double solidus operator
+      "\u2ff0", # Ideographic description character left to right
+      "\u2ff1", # Ideographic description character above to below
+      "\u2ff2", # Ideographic description character left to middle and right
+      "\u2ff3", # Ideographic description character above to middle and below
+      "\u2ff4", # Ideographic description character full surround
+      "\u2ff5", # Ideographic description character surround from above
+      "\u2ff6", # Ideographic description character surround from below
+      "\u2ff7", # Ideographic description character surround from left
+      "\u2ff8", # Ideographic description character surround from upper left
+      "\u2ff9", # Ideographic description character surround from upper right
+      "\u2ffa", # Ideographic description character surround from lower left
+      "\u2ffb", # Ideographic description character overlaid
+      "\u3000", # Ideographic space
+      "\u3002", # Ideographic full stop
+      "\u3014", # Left tortoise shell bracket
+      "\u3015", # Right tortoise shell bracket
+      "\u3033", # Vertical kana repeat mark upper half
+      "\u30a0", # Katakana-hiragana double hyphen
+      "\u3164", # Hangul filler
+      "\u321d", # Parenthesized korean character ojeon
+      "\u321e", # Parenthesized korean character o hu
+      "\u33ae", # Square rad over s
+      "\u33af", # Square rad over s squared
+      "\u33c6", # Square c over kg
+      "\u33df", # Square a over m
+      "\ua789", # Modifier letter colon
+      "\ufe14", # Presentation form for vertical semicolon
+      "\ufe15", # Presentation form for vertical exclamation mark
+      "\ufe3f", # Presentation form for vertical left angle bracket
+      "\ufe5d", # Small left tortoise shell bracket
+      "\ufe5e", # Small right tortoise shell bracket
+      "\ufeff", # Zero-width no-break space
+      "\uff0e", # Fullwidth full stop
+      "\uff0f", # Fullwidth solidus
+      "\uff61", # Halfwidth ideographic full stop
+      "\uffa0", # Halfwidth hangul filler
+      "\ufff9", # Interlinear annotation anchor
+      "\ufffa", # Interlinear annotation separator
+      "\ufffb", # Interlinear annotation terminator
+      "\ufffc", # Object replacement character
+      "\ufffd"  # Replacement character
+    ]
+    def allowed_chars_set
+      @@allowed_chars_set ||= (read_allowed_idn_chars.chars.to_set - MOZZILLA_DISALLOWED_CHARACTERS)
+    end
+    private
+      # Built with script/development/generate_allowed_idn_characters.rb
+      def read_allowed_idn_chars
+        File.read("#{__dir__}/data/allowed_idn_characters.txt")
+      end
+  end
+  def attack_detected?
+    !label_set.subset?(self.class.allowed_chars_set)
+  end
+end

data/lib/homographic_spoofing/detector/rule/idn/base.rb ADDED Viewed

@@ -0,0 +1,3 @@
+# Common parent of the IDN rules: a generic rule that additionally exposes the
+# TLD of the domain being checked.
+class HomographicSpoofing::Detector::Rule::Idn::Base < HomographicSpoofing::Detector::Rule::Base
+  # Defines #tld by forwarding to the object stored in @context
+  # (presumably an Idn::Context set up by the parent class; verify there).
+  delegate :tld, to: :@context
+end

data/lib/homographic_spoofing/detector/rule/idn/context.rb ADDED Viewed

@@ -0,0 +1,8 @@
+class HomographicSpoofing::Detector::Rule::Idn::Context < HomographicSpoofing::Detector::Rule::Context
+  attr_reader :tld
+  def initialize(label:, tld:)
+    @tld = tld
+    super(label:)
+  end
+end

data/lib/homographic_spoofing/detector/rule/idn/dangerous_pattern.rb ADDED Viewed

@@ -0,0 +1,73 @@
+# Rule 12 of the Google Chrome IDN policy: reject labels containing character
+# sequences that are known to be visually deceptive.
+class HomographicSpoofing::Detector::Rule::Idn::DangerousPattern < HomographicSpoofing::Detector::Rule::Idn::Base
+  # One regexp that matches when ANY of the patterns below occurs in the label.
+  # Each alternative carries its own explanation as an extended-mode (/x)
+  # comment inside the regexp literal.
+  # NOTE(review): several alternatives anchor with `^`, which in Ruby matches at
+  # the start of every line, not only the start of the string. Confirm labels
+  # can never contain a newline, or switch to `\A`.
+  DANGEROUS_PATTERNS = Regexp.union(
+    /# Disallow the following as they may be mistaken for slashes when
+    # they're surrounded by non-Japanese scripts (i.e. has non-Katakana
+    # Hiragana or Han scripts on both sides):
+    # "ノ" (Katakana no, U+30ce), "ソ" (Katakana so, U+30bd),
+    # "ゾ" (Katakana zo, U+30be), "ン" (Katakana n, U+30f3),
+    # "丶" (CJK unified ideograph, U+4E36),
+    # "乀" (CJK unified ideograph, U+4E40),
+    # "乁" (CJK unified ideograph, U+4E41),
+    # "丿" (CJK unified ideograph, U+4E3F).
+    # If {no, so, zo, n} next to a
+    # non-Japanese script on either side is disallowed.
+    [^\p{kana}\p{hira}\p{hani}]
+    [\u30ce\u30f3\u30bd\u30be\u4e36\u4e40\u4e41\u4e3f]
+    [^\p{kana}\p{hira}\p{hani}]/x,
+      /# Disallow three Hiragana letters (U+307[8-A]) or Katakana letters
+      # (U+30D[8-A]) that look exactly like each other when they're used
+      # in a label otherwise entirely in Katakana or Hiragana.
+      ^[\p{kana}]+[\u3078-\u307a][\p{kana}]+\z/x,
+      /^[\p{hira}]+[\u30d8-\u30da][\p{hira}]+\z/,
+      /# Disallow U+30FD (Katakana iteration mark) and U+30FE (Katakana
+                                                               # voiced iteration mark) unless they're preceded by a Katakana.
+        ([^\p{kana}][\u30fd\u30fe]|^[\u30fd\u30fe])/x,
+        /# Disallow U+30FB (Katakana Middle Dot) and U+30FC (Hiragana-
+                                                             # Katakana Prolonged Sound) used out-of-context.
+          ([^\p{kana}\p{hira}]\u30fc|^\u30fc|[a-z]\u30fb|\u30fb[a-z])/x,
+          /# Disallow these CJK ideographs if they are next to non-CJK
+          # characters. These characters can be used to spoof Latin
+          # characters or punctuation marks:
+          # U+4E00 (一), U+3127 (ㄧ), U+4E28 (丨), U+4E5B (乛), U+4E03 (七),
+          # U+4E05 (丅), U+5341 (十), U+3007 (〇), U+3112 (ㄒ), U+311A (ㄚ),
+          # U+311F (ㄟ), U+3128 (ㄨ), U+3129 (ㄩ), U+3108 (ㄈ), U+31BA (ㆺ),
+          # U+31B3 (ㆳ), U+5DE5 (工), U+31B2 (ㆲ), U+8BA0 (讠), U+4E01 (丁)
+          # These characters are already blocked:
+          # U+2F00 (⼀) (normalized to U+4E00), U+3192 (㆒), U+2F02 (⼂),
+          # U+2F17 (⼗) and U+3038 (〸) (both normalized to U+5341 (十)).
+          # Check if there is non-{Hiragana, Katagana, Han, Bopomofo} on the
+          # left.
+          [^\p{kana}\p{hira}\p{hani}\p{bopo}]
+    [\u4e00\u3127\u4e28\u4e5b\u4e03\u4e05\u5341\u3007\u3112\u311a\u311f\u3128\u3129\u3108\u31ba\u31b3\u5dE5\u31b2\u8ba0\u4e01]/x,
+      /# Check if there is non-{Hiragana, Katagana, Han, Bopomofo} on the
+      # right.
+      [\u4e00\u3127\u4e28\u4e5b\u4e03\u4e05\u5341\u3007\u3112\u311a\u311f\u3128\u3129\u3108\u31ba\u31b3\u5de5\u31b2\u8ba0\u4e01]
+    [^\p{kana}\p{hira}\p{hani}\p{bopo}]/x,
+      /# Disallow combining diacritical mark (U+0300-U+0339) after a
+      # non-LGC character. Other combining diacritical marks are not in
+      # the allowed character set.
+      [^\p{latn}\p{grek}\p{cyrl}][\u0300-\u0339]/x,
+      /# Disallow dotless i (U+0131) followed by a combining mark.
+      \u0131[\u0300-\u0339]/x,
+      /# Disallow combining Kana voiced sound marks.
+      (\u3099|\u309a)/x,
+      /# Disallow U+0307 (dot above) after 'i', 'j', 'l' or dotless i
+      # (U+0131). Dotless j (U+0237) is not in the allowed set to begin
+      # with.
+      [ijl]\u0307/x,
+      /^\u0237/
+  )
+  # True when any dangerous pattern occurs anywhere in the label.
+  def attack_detected?
+    DANGEROUS_PATTERNS.match?(label)
+  end
+end

data/lib/homographic_spoofing/detector/rule/idn/deviation_characters.rb ADDED Viewed

@@ -0,0 +1,10 @@
+# Four characters handled differently by IDNA 2003 and IDNA 2008. UTS46
+# transitional processing treats them as IDNA 2003 does; maps U+00DF and
+# U+03C2 and drops U+200[CD].
+class HomographicSpoofing::Detector::Rule::Idn::DeviationCharacters < HomographicSpoofing::Detector::Rule::Idn::Base
+  DEVIATION_CHARACTERS = %W[ ß ς \u200c \u200d ].to_set
+  def attack_detected?
+    (label_set & DEVIATION_CHARACTERS).present?
+  end
+end

data/lib/homographic_spoofing/detector/rule/idn/digits.rb ADDED Viewed

@@ -0,0 +1,25 @@
+# 11. of Google Chrome IDN policy
+class HomographicSpoofing::Detector::Rule::Idn::Digits < HomographicSpoofing::Detector::Rule::Idn::Base
+  def attack_detected?
+    contains_digit_lookalike? && contains_only_digits_or_digit_lookalike?
+  end
+  private
+    def contains_digit_lookalike?
+      label.chars.any? { |char| digit_lookalike?(char) }
+    end
+    def contains_only_digits_or_digit_lookalike?
+      label.chars.all? { |char| digit?(char) || digit_lookalike?(char) }
+    end
+    def digit?(char)
+      /[0-9]/.match?(char)
+    end
+    DIGIT_LOOKALIKES = %w[ θ २ ২ ੨ ੨ ૨ ೩ ೭ շ з ҙ ӡ उ ও ਤ ੩ ૩ ౩ ဒ ვ პ ੜ კ ੫ 丩 ㄐ ճ ৪ ੪ ୫ ૭ ୨ ౨ ].to_set
+    def digit_lookalike?(char)
+      DIGIT_LOOKALIKES.include?(char)
+    end
+end

data/lib/homographic_spoofing/detector/rule/idn/invisible_characters.rb ADDED Viewed

@@ -0,0 +1,14 @@
+# 7. of Google Chrome IDN policy
+class HomographicSpoofing::Detector::Rule::Idn::InvisibleCharacters < HomographicSpoofing::Detector::Rule::Idn::Base
+  def attack_detected?
+    INVISIBLE_CHARACTERS_REGEXP.match?(label)
+  end
+  private
+    INVISIBLE_CHARACTERS_REGEXP = Regexp.union(
+      /\u0e48{2,}/,   # Thai tone repeated
+      /\u0301{2,}/,   # accute accent repeated
+      /\u00e1\u0301/, # 'a' with acuted accent + another acute accent
+      /^\u0300/,      # Combining mark at the beginning
+    )
+end

data/lib/homographic_spoofing/detector/rule/idn/script_confusable.rb ADDED Viewed

@@ -0,0 +1,59 @@
+# 9. and 10. of Google Chrome IDN policy See http://unicode.org/reports/tr39/#Confusable_Detection
+class HomographicSpoofing::Detector::Rule::Idn::ScriptConfusable < HomographicSpoofing::Detector::Rule::Idn::Base
+  def attack_detected?
+    SCRIPT_CONFUSABLES.any? do |confusable|
+      confusable_chars = label.scan(confusable.script)
+      confusable_chars.present? &&
+        confusable_chars.all? { confusable.latin_lookalike.match?(_1) } &&
+        !is_script_confusable_allowed_for_tld?(confusable)
+    end
+  end
+  private
+    Confusable = Struct.new(:script, :latin_lookalike, :allowed_tlds)
+    SCRIPT_CONFUSABLES = [
+      # Armenian
+      [ /\p{armn}/, /[ագզէլհյոսւօՙ]/, /am/ ],
+      # Cyrillic
+      [ /\p{cyrl}/, /[аысԁеԍһіюјӏорԗԛѕԝхуъьҽпгѵѡ]/, /bg|by|kz|pyc|ru|su|ua|uz/ ],
+      # # Ethiopic (Ge'ez).
+      [ /\p{ethi}/, /[ሀሠሰስበነተከዐዕዘጠፐꬅ]/, /er|et/ ],
+      # # Georgian
+      [ /\p{geor}/, /[იოყძხჽჿ]/, /ge/ ],
+      # # Greek
+      [ /\p{grek}/, /[αικνρυωηοτ]/, /gr/ ],
+      # # Hebrew
+      [ /\p{hebr}/, /[דוחיןסװײ׳ﬦ]/, /il/ ],
+      # # Bengali
+      [ /\p{beng}/, /[০৭]/, nil ],
+      # # Devanagari
+      [ /\p{Deva}/, /[ऽ०ॱ]/, nil ],
+      # # Gujarati
+      [ /\p{Gujr}/, /[ડટ૦૧]/, nil ],
+      # # Gurmukhi
+      [ /\p{Guru}/, /[੦੧]/, nil ],
+      # # Kannada
+      [ /\p{Knda}/, /[ಽ೦೧]/, nil ],
+      # # Malayalam
+      [ /\p{Mlym}/, /[ടഠധനറ൦]/, nil ],
+      # # Oriya
+      [ /\p{Orya}/, /[ଠ୦୮]/, nil ],
+      # # Tamil
+      [ /\p{Taml}/, /[டப௦]/, nil ],
+      # # Telugu
+      [ /\p{Telu}/, /[౦౧]/, nil ],
+      # # Myanmar
+      [ /\p{Mymr}/, /[ခဂငထပဝ၀၂ၔၜ\u1090\u1091\u1095\u1096\u1097]/, /[a-z]+\.mm/ ],
+      # # Thai
+      [ /\p{Thai}/, /[ทนบพรหเแ๐ดลปฟม]/, /th/ ]
+    ].map { Confusable.new(*_1) }
+    def is_script_confusable_allowed_for_tld?(confusable)
+      tld_contains_any_letter_from_script?(confusable.script) ||
+        confusable.allowed_tlds&.match?(tld)
+    end
+    def tld_contains_any_letter_from_script?(script)
+      script.match?(tld)
+    end
+end

data/lib/homographic_spoofing/detector/rule/idn/script_specific.rb ADDED Viewed

@@ -0,0 +1,31 @@
+class HomographicSpoofing::Detector::Rule::Idn::ScriptSpecific < HomographicSpoofing::Detector::Rule::Idn::Base
+  def attack_detected?
+    latin_spoof? || icelandic_spoof? || azerbaijan_spoof?
+  end
+  private
+    LATN = "Latin"
+    # Disallow non-ASCII Latin letters to mix with a non-Latin script.
+    # Note that the non-ASCII Latin check should not be applied when the entire label is made of Latin.
+    def latin_spoof?
+      scripts != Set[LATN] && non_ascii_latin_letters.present?
+    end
+    def non_ascii_latin_letters
+      label.scan(/\p{latin}/).reject { _1 =~ /[a-z0-9]/ }
+    end
+    ICELANDIC_CHARACTERS = %w[ þ ð ].to_set
+    # Latin small letter thorn ("þ", U+00FE) can be used to spoof both b and p.
+    # It's used in modern Icelandic orthography, so allow it for the Icelandic
+    # ccTLD (.is) but block in any other TLD. Also block Latin small letter eth
+    # ("ð", U+00F0) which can be used to spoof the letter o.
+    def icelandic_spoof?
+      tld != "is" && (label_set & ICELANDIC_CHARACTERS).any?
+    end
+    # ə is only allowed under the .az TLD.
+    def azerbaijan_spoof?
+      tld != "az" && label_set.include?("ə")
+    end
+end

data/lib/homographic_spoofing/detector/rule/idn/unsafe_middle_dot.rb ADDED Viewed

@@ -0,0 +1,12 @@
+# 8. of Google Chrome IDN policy
+#
+# Allow middle dot (U+00B7) only on Catalan domains when between two 'l's, to
+# permit the Catalan character ela geminada to be expressed.
+# See https://tools.ietf.org/html/rfc5892#appendix-A.3 for details.
+class HomographicSpoofing::Detector::Rule::Idn::UnsafeMiddleDot < HomographicSpoofing::Detector::Rule::Idn::Base
+  def attack_detected?
+    label.scan(/l?·l?/).find do |match|
+      tld != "cat" || match != "l·l"
+    end
+  end
+end

data/lib/homographic_spoofing/detector/rule/local/dot_atom_text.rb ADDED Viewed

@@ -0,0 +1,49 @@
+# See http://www.unicode.org/reports/tr39/#Email_Security_Profiles dot-atom-text.
+class HomographicSpoofing::Detector::Rule::Local::DotAtomText < HomographicSpoofing::Detector::Rule::Base
+  def attack_detected?
+    if invalid_dot_sequence?
+      true
+    elsif label_no_dots.present?
+      !valid_start_sequence? || contains_invalid_char?
+    end
+  end
+  private
+    def invalid_dot_sequence?
+      label.starts_with?(".") || label.ends_with?(".") || multiple_dots?
+    end
+    def multiple_dots?
+      /\.{2,}/.match?(label)
+    end
+    def label_no_dots
+      @label_no_dots ||= label.tr(".", "")
+    end
+    XID_Start_REGEXP = /\p{XIDS}/
+    def valid_start_sequence?
+      start = label.first
+      simple_char?(start) || XID_Start_REGEXP.match?(start)
+    end
+    def contains_invalid_char?
+      label_no_dots.chars.any? { invalid_char?(_1) }
+    end
+    # https://tools.ietf.org/html/rfc5322#section-3.2.3 atext
+    ATEXT_REGEXP = %r{[!#-'*+\-/-9=?A-Z\^-~]}
+    def invalid_char?(c)
+      if simple_char?(c)
+        !ATEXT_REGEXP.match(c)
+      else
+        HomographicSpoofing::Detector::Rule::DisallowedCharacters.allowed_chars_set.exclude?(c)
+      end
+    end
+    def simple_char?(c)
+      c < "\u007f"
+    end
+end

data/lib/homographic_spoofing/detector/rule/local/nfkc.rb ADDED Viewed

@@ -0,0 +1,6 @@
+# See http://www.unicode.org/reports/tr39/#Email_Security_Profiles nkfc.
+class HomographicSpoofing::Detector::Rule::Local::Nfkc < HomographicSpoofing::Detector::Rule::Base
+  def attack_detected?
+    !label.unicode_normalized?(:nfkc)
+  end
+end

data/lib/homographic_spoofing/detector/rule/mixed_digits.rb ADDED Viewed

@@ -0,0 +1,30 @@
+# 6. of Google Chrome IDN policy
+class HomographicSpoofing::Detector::Rule::MixedDigits < HomographicSpoofing::Detector::Rule::Base
+  def attack_detected?
+    digits_scripts.many?
+  end
+  private
+    def digits_scripts
+      digits.map { digits_map[_1] }.uniq
+    end
+    def digits
+      label.scan(/[[:digit:]]/)
+    end
+    def digits_map
+      @@digits_map ||= build_digits_map
+    end
+    def build_digits_map
+      CSV.parse(read_digits).each_with_object({}) do |(char, script), map|
+        map[char] = script
+      end
+    end
+    # Built with script/development/generate_digits_characters.rb
+    def read_digits
+      File.read("#{__dir__}/data/digits.csv")
+    end
+end

data/lib/homographic_spoofing/detector/rule/mixed_scripts.rb ADDED Viewed

@@ -0,0 +1,30 @@
+# 5. of Google Chrome IDN policy See http://www.unicode.org/reports/tr39/#highly_restrictive
+class HomographicSpoofing::Detector::Rule::MixedScripts < HomographicSpoofing::Detector::Rule::Base
+  def attack_detected?
+    !highly_restrictive_scripts_combination?
+  end
+  private
+    BOPO = "Bopomofo"
+    HANG = "Hangul"
+    HANI = "Han"
+    HIRA = "Hiragana"
+    KANA = "Katakana"
+    LATN = "Latin"
+    JAPANESE = Set[HANI, HIRA, KANA]
+    CHINESE  = Set[BOPO, HANI]
+    KOREAN   = Set[HANI, HANG]
+    HIGHLY_RESTRICTIVE_SCRIPT_COMBINATIONS = [
+      Set[*JAPANESE, LATN],
+      Set[*CHINESE,  LATN],
+      Set[*KOREAN,   LATN]
+    ]
+    def highly_restrictive_scripts_combination?
+      scripts.length == 1 || HIGHLY_RESTRICTIVE_SCRIPT_COMBINATIONS.any? do |highly_restrictive_script_combination|
+        scripts.subset?(highly_restrictive_script_combination)
+      end
+    end
+end

data/lib/homographic_spoofing/detector/rule/quoted_string/bidi_control.rb ADDED Viewed

@@ -0,0 +1,10 @@
+# See http://www.unicode.org/reports/tr39/#Email_Security_Profiles bidicontrol.
+class HomographicSpoofing::Detector::Rule::QuotedString::BidiControl < HomographicSpoofing::Detector::Rule::Base
+  # See https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%3Abidicontrol%3A%5D&c=on&g=&i=
+  # for the full list of bidirectional format characters.
+  DISALLOWED_REGEXP = /[\u202a\u202b\u202c\u202d\u202e\u2066\u2067\u2068\u2069]/
+  def attack_detected?
+    DISALLOWED_REGEXP.match?(label)
+  end
+end

data/lib/homographic_spoofing/detector/rule/quoted_string/data/nonspacing_marks.txt ADDED Viewed

@@ -0,0 +1 @@

+ ̴̵̶̷̸̡̢̧̨̛̖̗̘̙̜̝̞̟̠̣̤̥̦̩̪̫̬̭̮̯̰̱̲̳̹̺̻̼͙͚͇͈͉͍͎̀́̂̃̄̅̆̇̈̉̊̋̌̍̎̏̐̑̒̓̔̽̾̿͛̀́͂̓̈́͆͊͋͌̕̚͘ͅ͏͓͔͕͖͐͑͒͗ͣͤͥͦͧͨͩͪͫͬͭͮͯ҃҄҅҆҇͜͟͢͝͞͠͡҈҉ְֱֲֳִֵֶַָׇֹֺֻּֽֿׁׂًٌٍؘَؙُؚِّْٰܑ֑֖֛֢֣֤֥֦֧֪ׅۣ۪ۭٕٖٜٟܱܴܷܸܹܻܼܾ݂݄݆݈֚֭֮֒֓֔֕֗֘֙֜֝֞֟֠֡֨֩֫֬֯ׄؐؑؒؓؔؕؗۖۗۘۙۚۛۜ۟۠ۡۢۤۧۨ۫۬ؖٓٔٗ٘ٙٚٛٝٞܰܲܳܵܶܺܽܿ݀݁݃݅݇݉݊ަާިީުޫެޭޮޯްࣰࣱࣲ߲߽࡙࡚࡛࢙࢚࢛࣏࣐࣑࣒࣓ࣣࣦࣩ࣭࣮࣯ࣶࣹࣺ߫߬߭߮߯߰߱߳ࠖࠗ࠘࠙ࠛࠜࠝࠞࠟࠠࠡࠢࠣࠥࠦࠧࠩࠪࠫࠬ࠭࢘࢜࢝࢞࢟࣊࣋࣌࣍࣎ࣔࣕࣖࣗࣘࣙࣚࣛࣜࣝࣞࣟ࣠࣡ࣳࣤࣥࣧࣨ࣪࣫࣬ࣴࣵࣷࣸࣻࣼࣽࣾࣿऀँं़ऺुूृॄॅॆेैॕ्॒॑॓॔ॖॗॢॣঁ়ুূৃৄ্ৢৣ৾ਁਂ਼ੁੂੇੈੋੌ੍ੑੰੱੵઁં઼ુૂૃૄૅેૈ્ૢૣૺૻૼ૽૾૿ଁ଼୕ୖିୁୂୃୄ୍ୢୣஂீ்ఀఄ఼ౕౖాిీెేైొోౌ్ౢౣಁ಼ೌ್ೢೣഀഁ഻഼ുൂൃൄ്ൢൣඁ්ිීුූัิีึืฺุู็่้๊๋์ํ๎ັິີຶືຸູົ຺ຼ໌ໍ໎྄່້໊໋ཱཱཱིིུུ༹༘༙༵༷࿆ྂྃ྆྇ྲྀཷླྀཹཱེཻོཽྀྀཾྍྎྏྐྑྒྒྷྔྕྖྗྙྚྛྜྜྷྞྟྠྡྡྷྣྤྥྦྦྷྨྩྪྫྫྷྭྮྯྰྱྲླྴྵྶྷྸྐྵྺྻྼိီုူဲဳဴဵံ့္်ွှၘၙၞၟၠၱၲၳၴႂႅႆႍႝ፝፞፟ᜒᜓ᜔ᜲᜳᝒᝓᝲᝳ឴឵ិីឹឺុូួំ់៌៍៎៏័៑្៝៉៊៓᠋᠌᠍᠏ᢅᢆᢩᤠᤡᤢᤧᤨᤲ᤻ᨘ᤹᤺ᨗᨛᩖᩘᩙᩚᩛᩜᩝᩞ᩠᩺᩻᩼ᩢᩥᩦᩧᩨᩩᩪᩫᩬᩳᩴ᩿᪵᪶᪷᪸᪹᪺᪽᩵᩶᩷᩸᩹᪰᪱᪲᪳᪴᪻᪼᪾ᪿᫀ᫃᫄᫊᫁᫂᫅᫆᫇᫈᫉᫋ᫌᫍᫎᬀᬁᬂᬃ᬴ᬶᬷᬸᬹᬺᬼᭂ᭬᭫᭭᭮᭯᭰᭱᭲᭳ᮀᮁᮢᮣᮬᮭᮤᮥᮨᮩ᯦᮫ᯨᯩᯭᯯᯰᯱᰬᰭᰮᰯᰰᰱᰲᰳᰶ᳔᳢᳣᳤᳥᳦᳧᳨⃒⃓⃘⃙⃚᰷᷐᷎᷺᳕᳖᳗᳘᳙᳜᳝᳞᳟᳭᷂᷊᷹᷽᷏᷿᷸᷷᳐᳑᳒᳚᳛᳴᳠᳸᳹᷀᷁᷃᷻᷄᷅᷆᷇᷈᷉᷋᷌᷑᷒ᷓᷔᷕᷖᷗᷘᷙᷚᷛᷜᷝᷞᷟᷠᷡᷢᷣᷤᷥᷦᷧᷨᷩᷪᷫᷬᷭᷮᷯᷰᷱᷲᷳᷴ᷵᷾⃐⃑⃔⃕⃖⃗⃛⃜᷶᷼᷍⃝⃞⃟⃠⃢⃣⃤⃥⃦⃪⃫゙゚⵿〪⃨⃬⃭⃮⃯〭〫⃡⃧⃩⃰⳯⳰⳱ⷠⷡⷢⷣⷤⷥⷦⷧⷨⷩⷪⷫⷬⷭⷮⷯⷰⷱⷲⷳⷴⷵⷶⷷⷸⷹⷺⷻⷼⷽⷾⷿ꙯〬꙰꙱꙲ꙴꙵꙶꙷꙸꙹꙺꙻ꙼꙽ꚞꚟ꛰꛱ꠂ꠆꠬ꠋꠥꠦ꣄ꣅ꣠꣡꣢꣣꣤꣥꣦꣧꣨꣩꣪꣫꣬꣭꣮꣯꣰꣱ꣿꤦꤧꤨꤩꤪ꤫꤬꤭ꥇꥈꥉꥊꥋꥌꥍꥎꥏꥐꥑꦀꦁꦂ꦳ꦶꦷꦸꦹꦼꦽꧥꨩꨪꨫꨬꨭꨮꨱꨲꨵꨶꩃꩌꩼꪴꪰꪲꪳꪷꪸꪾ꪿꫁ꫬꫭ꫶ꯥꯨ꯭ﬞ︀︁︂︃︄︅︆︇︈︉︊︋︌︍︎️︧︨︩︪︫︬︭𐇽𐋠︠︡︢︣︮︯︤︥︦𐍶𐍷𐍸𐍹𐍺𐨁𐨂𐨃𐨅𐨆𐨌𐨍𐨎𐨹𐨿𐨺𐫦𐻽𐻾𐻿𐽆𐽇𐽋𐽍𐽎𐽏𐽐𐾃𐾅𐨏𐨸𐫥𐴤𐴥𐴦𐴧𐺫𐺬𐽈𐽉𐽊𐽌𐾂𐾄𑀁𑀸𑀹𑀺𑀻𑀼𑀽𑀾𑀿𑁀𑁁𑁂𑁃𑁄𑁅𑁳𑁴𑁆𑁰𑁿𑂀𑂁𑂺𑂳𑂴𑂵𑂶𑂹𑃂𑄳𑄴𑄀𑄁𑄂𑄧𑄨𑄩𑄪𑄫𑄭𑄮𑄯𑄰𑄱𑄲𑅳𑆀𑆁𑆶𑆷𑆸𑆹𑆺𑆻𑆼𑆽𑆾𑇉𑇏𑇊𑇋𑇌𑈯𑈰𑈱𑉁𑈴𑈶𑈷𑈾𑋟𑋣𑋤𑋥𑋦𑋧𑋨𑋩𑋪𑌀𑌁𑌻𑌼𑍀𑍦𑍧𑍨𑍩𑍪𑍫𑍬𑍰𑍱𑍲𑍳𑍴𑐸𑐹𑐺𑐻𑐼𑐽𑐾𑐿𑑂𑑃𑑄𑑆𑑞𑒳𑒴𑒵𑒶𑒷𑒸𑒺𑒿𑓀𑓃𑓂𑖲𑖳𑖴𑖵𑖼𑖽𑗀𑖿𑗜𑗝𑘳𑘴𑘵𑘶𑘷𑘸𑘹𑘺𑘽𑘿𑙀𑚫𑚭𑚰𑚱𑚲𑚳𑚴𑚵𑚷𑜝𑜞𑜟𑜢𑜣𑜤𑜥𑜧𑜨𑜩𑜪𑜫𑠯𑠰𑠱𑠲𑠳𑠴𑠵𑠶𑠷𑠺𑠹𑤻𑤼𑥃𑤾𑧔𑧕𑧖𑧗𑧚𑧛𑧠𑨁𑨂𑨃𑨄𑨅𑨆𑨉𑨊𑨳𑨴𑨵𑨶𑨷𑨸𑨻𑨼𑨽𑨾𑩇𑩑𑩒𑩓𑩔𑩕𑩖𑩙𑩚𑩛𑪊𑪋𑪌𑪍𑪎𑪏𑪐𑪑𑪒𑪓𑪔𑪕𑪖𑪘𑪙𑰰𑰱𑰲𑰳𑰴𑰵𑰶𑰸𑰹𑰺𑰻𑰼𑰽𑲒𑲓𑲔𑲕𑲖𑲗𑲘𑲙𑲚𑲛𑲜𑲝𑲞𑲟𑲠𑲡𑲢𑲣𑲤𑲥𑲦𑲧𑲪𑲫𑲬𑲭𑲮𑲯𑲰𑲲𑲳𑲵𑲶𑴱𑴲𑴳𑴴𑴵𑴶𑴺𑴼𑴽𑴿𑵀𑵁𑵂𑵃𑵄𑵅𑵇𑶐𑶑𑶕𑶗𑻳𑻴𑼀𑼁𑼶𑼷𑼸𑼹𑼺𑽀𑽂𓑀𓑇𓑈𓑉𓑊𓑋𓑌𓑍𓑎𓑏𓑐𓑑𓑒𓑓𓑔𓑕𖫰𖫱𖫲𖫳𖫴𖬰𖬱𖬲𖬳𖬴𖬵𖬶𖽏𖾏𖾐𖾑𖾒𖿤𛲝𛲞𜼀𜼁𜼂𜼃𜼄𜼅𜼆𜼇𜼈𜼉𜼊𜼋𜼌𜼍𜼎𜼏𜼐𜼑𜼒𜼓𜼔𜼕𜼖𜼗𜼘𜼙𜼚𜼛𜼜𜼝𜼞𜼟𜼠𜼡𜼢𜼣𜼤𜼥𜼦𜼧𜼨𜼩𜼪𜼫𜼬𜼭𜼰𜼱𜼲𜼳𜼴𜼵𜼶𜼷𜼸𜼹𜼺𜼻𜼼𜼽𜼾𜼿𜽀𜽁𜽂𜽃𜽄𜽅𜽆𝅧𝅨𝅩𝅻𝅼𝅽𝅾𝅿𝆀𝆁𝆂𝆊𝆋𝆅𝆆𝆇𝆈𝆉𝆪𝆫𝆬𝆭𝉂𝉃𝉄𝨀𝨁𝨂𝨃𝨄𝨅𝨆𝨇𝨈𝨉𝨊𝨋𝨌𝨍𝨎𝨏𝨐𝨑𝨒𝨓𝨔𝨕𝨖𝨗𝨘𝨙𝨚𝨛𝨜𝨝𝨞𝨟𝨠𝨡𝨢𝨣𝨤𝨥𝨦𝨧𝨨𝨩𝨪𝨫𝨬𝨭𝨮𝨯𝨰𝨱𝨲𝨳𝨴𝨵𝨶𝨻𝨼𝨽𝨾𝨿𝩀𝩁𝩂𝩃𝩄𝩅𝩆𝩇𝩈𝩉𝩊𝩋𝩌𝩍𝩎𝩏𝩐𝩑𝩒𝩓𝩔𝩕𝩖𝩗𝩘𝩙𝩚𝩛𝩜𝩝𝩞𝩟𝩠𝩡𝩢𝩣𝩤𝩥𝩦𝩧𝩨𝩩𝩪𝩫𝩬𝩵𝪄𝪛𝪜𝪝𝪞𝪟𝪡𝪢𝪣𝪤𝪥𝪦𝪧𝪨𝪩𝪪𝪫𝪬𝪭𝪮𝪯𞥊𞓮𞣐𞣑𞣒𞣓𞣔𞣕𞣖𞀀𞀁𞀂𞀃𞀄𞀅𞀆𞀈𞀉𞀊𞀋𞀌𞀍𞀎𞀏𞀐𞀑𞀒𞀓𞀔𞀕𞀖𞀗𞀘𞀛𞀜𞀝𞀞𞀟𞀠𞀡𞀣𞀤𞀦𞀧𞀨𞀩𞀪𞂏𞄰𞄱𞄲𞄳𞄴𞄵𞄶𞊮𞋬𞋭𞋮𞋯𞓯𞥄𞥅𞥆𞥇𞥈𞥉𞓬𞓭󠄀󠄁󠄂󠄃󠄄󠄅󠄆󠄇󠄈󠄉󠄊󠄋󠄌󠄍󠄎󠄏󠄐󠄑󠄒󠄓󠄔󠄕󠄖󠄗󠄘󠄙󠄚󠄛󠄜󠄝󠄞󠄟󠄠󠄡󠄢󠄣󠄤󠄥󠄦󠄧󠄨󠄩󠄪󠄫󠄬󠄭󠄮󠄯󠄰󠄱󠄲󠄳󠄴󠄵󠄶󠄷󠄸󠄹󠄺󠄻󠄼󠄽󠄾󠄿󠅀󠅁󠅂󠅃󠅄󠅅󠅆󠅇󠅈󠅉󠅊󠅋󠅌󠅍󠅎󠅏󠅐󠅑󠅒󠅓󠅔󠅕󠅖󠅗󠅘󠅙󠅚󠅛󠅜󠅝󠅞󠅟󠅠󠅡󠅢󠅣󠅤󠅥󠅦󠅧󠅨󠅩󠅪󠅫󠅬󠅭󠅮󠅯󠅰󠅱󠅲󠅳󠅴󠅵󠅶󠅷󠅸󠅹󠅺󠅻󠅼󠅽󠅾󠅿󠆀󠆁󠆂󠆃󠆄󠆅󠆆󠆇󠆈󠆉󠆊󠆋󠆌󠆍󠆎󠆏󠆐󠆑󠆒󠆓󠆔󠆕󠆖󠆗󠆘󠆙󠆚󠆛󠆜󠆝󠆞󠆟󠆠󠆡󠆢󠆣󠆤󠆥󠆦󠆧󠆨󠆩󠆪󠆫󠆬󠆭󠆮󠆯󠆰󠆱󠆲󠆳󠆴󠆵󠆶󠆷󠆸󠆹󠆺󠆻󠆼󠆽󠆾󠆿󠇀󠇁󠇂󠇃󠇄󠇅󠇆󠇇󠇈󠇉󠇊󠇋󠇌󠇍󠇎󠇏󠇐󠇑󠇒󠇓󠇔󠇕󠇖󠇗󠇘󠇙󠇚󠇛󠇜󠇝󠇞󠇟󠇠󠇡󠇢󠇣󠇤󠇥󠇦󠇧󠇨󠇩󠇪󠇫󠇬󠇭󠇮󠇯

data/lib/homographic_spoofing/detector/rule/quoted_string/nfc.rb ADDED Viewed

@@ -0,0 +1,6 @@
+# See http://www.unicode.org/reports/tr39/#Email_Security_Profiles nfc.
+class HomographicSpoofing::Detector::Rule::QuotedString::Nfc < HomographicSpoofing::Detector::Rule::Base
+  def attack_detected?
+    !label.unicode_normalized?(:nfc)
+  end
+end

data/lib/homographic_spoofing/detector/rule/quoted_string/nonspacing_marks.rb ADDED Viewed

@@ -0,0 +1,21 @@
+# See http://www.unicode.org/reports/tr39/#Email_Security_Profiles nonspacing marks.
+class HomographicSpoofing::Detector::Rule::QuotedString::NonspacingMarks < HomographicSpoofing::Detector::Rule::Base
+  def attack_detected?
+    nonspacing_marks_regexp.match?(label)
+  end
+  private
+    def nonspacing_marks_regexp
+      # 5 or more nonspacing marks in a row or 2 or more repetitions of the same nonspacing mark.
+      @@nonspacing_marks_regexp ||= /[#{nonspacing_marks}]{5,}|([#{nonspacing_marks}])\1/
+    end
+    def nonspacing_marks
+      @nonspacing_marks ||= read_nonspacing_marks
+    end
+    # Built with script/development/generate_nonspacing_marks.rb
+    def read_nonspacing_marks
+      File.read("#{__dir__}/data/nonspacing_marks.txt")
+    end
+end

data/lib/homographic_spoofing/railtie.rb ADDED Viewed

@@ -0,0 +1,5 @@
+class HomographicSpoofing::Railtie < ::Rails::Railtie
+  initializer "homographic_spoofing.logger" do
+    HomographicSpoofing.logger ||= Rails.logger
+  end
+end

data/lib/homographic_spoofing/sanitizer/base.rb ADDED Viewed

@@ -0,0 +1,39 @@
+class HomographicSpoofing::Sanitizer::Base
+  class_attribute :logger, default: HomographicSpoofing.logger
+  def self.sanitize(field)
+    new(field).sanitize
+  end
+  def initialize(field)
+    @field = field
+  end
+  def sanitize
+    result = field.dup
+    detector_class.new(field).detections.each do |detection|
+      log(detection.reason, detection.label)
+      result = punycode(result, detection.label)
+    end
+    result
+  end
+  private
+    attr_reader :field
+    def punycode(source, label)
+      source.gsub(label, Dnsruby::Name.punycode(label))
+    end
+    def detector_class
+      raise NotImplementedError, "subclasses must override this"
+    end
+    def log(reason, label)
+      self.class.logger.info("#{spoofing_type} Spoofing detected for: \"#{reason}\" on: \"#{label}\".") if self.class.logger
+    end
+    def spoofing_type
+      raise NotImplementedError, "subclasses must override this"
+    end
+end

data/lib/homographic_spoofing/sanitizer/email_address.rb ADDED Viewed

@@ -0,0 +1,10 @@
+class HomographicSpoofing::Sanitizer::EmailAddress < HomographicSpoofing::Sanitizer::Base
+   private
+    def detector_class
+      HomographicSpoofing::Detector::EmailAddress
+    end
+    def spoofing_type
+      "EmailAddress"
+    end
+end

data/lib/homographic_spoofing/sanitizer/idn.rb ADDED Viewed

@@ -0,0 +1,10 @@
+class HomographicSpoofing::Sanitizer::Idn < HomographicSpoofing::Sanitizer::Base
+  private
+    def detector_class
+      HomographicSpoofing::Detector::Idn
+    end
+    def spoofing_type
+      "EmailIDN"
+    end
+end

data/lib/homographic_spoofing/sanitizer/quoted_string.rb ADDED Viewed

@@ -0,0 +1,10 @@
+class HomographicSpoofing::Sanitizer::QuotedString < HomographicSpoofing::Sanitizer::Base
+  private
+    def detector_class
+      HomographicSpoofing::Detector::QuotedString
+    end
+    def spoofing_type
+      "EmailQuotedString"
+    end
+end

data/lib/homographic_spoofing/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+# Top-level namespace of the gem.
+module HomographicSpoofing
+  # Version string of this release of the gem.
+  VERSION = "0.1.0"
+end