homographic_spoofing 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/MIT-LICENSE +20 -0
- data/README.md +117 -0
- data/lib/homographic_spoofing/detector/base.rb +41 -0
- data/lib/homographic_spoofing/detector/detection.rb +2 -0
- data/lib/homographic_spoofing/detector/email_address.rb +40 -0
- data/lib/homographic_spoofing/detector/idn.rb +78 -0
- data/lib/homographic_spoofing/detector/local.rb +14 -0
- data/lib/homographic_spoofing/detector/quoted_string.rb +13 -0
- data/lib/homographic_spoofing/detector/rule/base.rb +15 -0
- data/lib/homographic_spoofing/detector/rule/context.rb +19 -0
- data/lib/homographic_spoofing/detector/rule/data/allowed_idn_characters.txt +1 -0
- data/lib/homographic_spoofing/detector/rule/data/digits.csv +680 -0
- data/lib/homographic_spoofing/detector/rule/disallowed_characters.rb +140 -0
- data/lib/homographic_spoofing/detector/rule/idn/base.rb +3 -0
- data/lib/homographic_spoofing/detector/rule/idn/context.rb +8 -0
- data/lib/homographic_spoofing/detector/rule/idn/dangerous_pattern.rb +73 -0
- data/lib/homographic_spoofing/detector/rule/idn/deviation_characters.rb +10 -0
- data/lib/homographic_spoofing/detector/rule/idn/digits.rb +25 -0
- data/lib/homographic_spoofing/detector/rule/idn/invisible_characters.rb +14 -0
- data/lib/homographic_spoofing/detector/rule/idn/script_confusable.rb +59 -0
- data/lib/homographic_spoofing/detector/rule/idn/script_specific.rb +31 -0
- data/lib/homographic_spoofing/detector/rule/idn/unsafe_middle_dot.rb +12 -0
- data/lib/homographic_spoofing/detector/rule/local/dot_atom_text.rb +49 -0
- data/lib/homographic_spoofing/detector/rule/local/nfkc.rb +6 -0
- data/lib/homographic_spoofing/detector/rule/mixed_digits.rb +30 -0
- data/lib/homographic_spoofing/detector/rule/mixed_scripts.rb +30 -0
- data/lib/homographic_spoofing/detector/rule/quoted_string/bidi_control.rb +10 -0
- data/lib/homographic_spoofing/detector/rule/quoted_string/data/nonspacing_marks.txt +1 -0
- data/lib/homographic_spoofing/detector/rule/quoted_string/nfc.rb +6 -0
- data/lib/homographic_spoofing/detector/rule/quoted_string/nonspacing_marks.rb +21 -0
- data/lib/homographic_spoofing/railtie.rb +5 -0
- data/lib/homographic_spoofing/sanitizer/base.rb +39 -0
- data/lib/homographic_spoofing/sanitizer/email_address.rb +10 -0
- data/lib/homographic_spoofing/sanitizer/idn.rb +10 -0
- data/lib/homographic_spoofing/sanitizer/quoted_string.rb +10 -0
- data/lib/homographic_spoofing/version.rb +3 -0
- data/lib/homographic_spoofing.rb +47 -0
- metadata +166 -0
@@ -0,0 +1,140 @@
|
|
1
|
+
# 3. and 4. of Google Chrome IDN policy See https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%3AIdentifierStatus%3DAllowed%3A&abb=on&g=&i=
|
2
|
+
class HomographicSpoofing::Detector::Rule::DisallowedCharacters < HomographicSpoofing::Detector::Rule::Base
|
3
|
+
class << self
|
4
|
+
# See http://kb.mozillazine.org/Network.IDN.blacklist_chars
|
5
|
+
MOZZILLA_DISALLOWED_CHARACTERS = Set[
|
6
|
+
"\u0020", # Space
|
7
|
+
"\u00a0", # No-break space
|
8
|
+
"\u00bc", # Vulgar fraction one quarter
|
9
|
+
"\u00bd", # Vulgar fraction one half
|
10
|
+
"\u00be", # Vulgar fraction three quarters
|
11
|
+
"\u01c3", # Latin letter retroflex click
|
12
|
+
"\u02d0", # Modifier letter triangular colon
|
13
|
+
"\u0337", # Combining short solidus overlay
|
14
|
+
"\u0338", # Combining long solidus overlay
|
15
|
+
"\u0589", # Armenian full stop
|
16
|
+
"\u058a", # Armenian hyphen
|
17
|
+
"\u05c3", # Hebrew punctuation sof pasuq
|
18
|
+
"\u05f4", # Hebrew punctuation gershayim
|
19
|
+
"\u0609", # Arabic-indic per mille sign
|
20
|
+
"\u060a", # Arabic-indic per ten thousand sign
|
21
|
+
"\u066a", # Arabic percent sign
|
22
|
+
"\u06d4", # Arabic full stop
|
23
|
+
"\u0701", # Syriac supralinear full stop
|
24
|
+
"\u0702", # Syriac sublinear full stop
|
25
|
+
"\u0703", # Syriac supralinear colon
|
26
|
+
"\u0704", # Syriac sublinear colon
|
27
|
+
"\u115f", # Hangul choseong filler
|
28
|
+
"\u1160", # Hangul jungseong filler
|
29
|
+
"\u1735", # Philippine single punctuation
|
30
|
+
"\u2000", # En quad
|
31
|
+
"\u2001", # Em quad
|
32
|
+
"\u2002", # En space
|
33
|
+
"\u2003", # Em space
|
34
|
+
"\u2004", # Three-per-em space
|
35
|
+
"\u2005", # Four-per-em space
|
36
|
+
"\u2006", # Six-per-em-space
|
37
|
+
"\u2007", # Figure space
|
38
|
+
"\u2008", # Punctuation space
|
39
|
+
"\u2009", # Thin space
|
40
|
+
"\u200a", # Hair space
|
41
|
+
"\u200b", # Zero width space
|
42
|
+
"\u200e", # Left-to-right mark
|
43
|
+
"\u200f", # Right-to-left mark
|
44
|
+
"\u2010", # Hyphen
|
45
|
+
"\u2019", # Right single quotation mark
|
46
|
+
"\u2024", # One dot leader
|
47
|
+
"\u2027", # Hyphenation point
|
48
|
+
"\u2028", # Line separator
|
49
|
+
"\u2029", # Paragraph separator
|
50
|
+
"\u202a", # Left-to-right embedding
|
51
|
+
"\u202b", # Right-to-left embedding
|
52
|
+
"\u202c", # Pop directional formatting
|
53
|
+
"\u202d", # Left-to-right override
|
54
|
+
"\u202e", # Right-to-left override
|
55
|
+
"\u202f", # Narrow no-break space
|
56
|
+
"\u2039", # Single left-pointing angle quotation mark
|
57
|
+
"\u203a", # Single right-pointing angle quotation mark
|
58
|
+
"\u2041", # Caret insertion point
|
59
|
+
"\u2044", # Fraction slash
|
60
|
+
"\u2052", # Commercial minus sign
|
61
|
+
"\u205f", # Medium mathematical space
|
62
|
+
"\u2153", # Vulgar fraction one third
|
63
|
+
"\u2154", # Vulgar fraction two thirds
|
64
|
+
"\u2155", # Vulgar fraction one fifth
|
65
|
+
"\u2156", # Vulgar fraction two fifths
|
66
|
+
"\u2157", # Vulgar fraction three fifths
|
67
|
+
"\u2158", # Vulgar fraction four fifths
|
68
|
+
"\u2159", # Vulgar fraction one sixth
|
69
|
+
"\u215a", # Vulgar fraction five sixths
|
70
|
+
"\u215b", # Vulgar fraction one eighth
|
71
|
+
"\u215c", # Vulgar fraction three eighths
|
72
|
+
"\u215d", # Vulgar fraction five eighths
|
73
|
+
"\u215e", # Vulgar fraction seven eighths
|
74
|
+
"\u215f", # Fraction numerator one
|
75
|
+
"\u2215", # Division slash
|
76
|
+
"\u2236", # Ratio
|
77
|
+
"\u23ae", # Integral extension
|
78
|
+
"\u2571", # Box drawings light diagonal upper right to lower left
|
79
|
+
"\u29f6", # Solidus with overbar
|
80
|
+
"\u29f8", # Big solidus
|
81
|
+
"\u2afb", # Triple solidus binary relation
|
82
|
+
"\u2afd", # Double solidus operator
|
83
|
+
"\u2ff0", # Ideographic description character left to right
|
84
|
+
"\u2ff1", # Ideographic description character above to below
|
85
|
+
"\u2ff2", # Ideographic description character left to middle and right
|
86
|
+
"\u2ff3", # Ideographic description character above to middle and below
|
87
|
+
"\u2ff4", # Ideographic description character full surround
|
88
|
+
"\u2ff5", # Ideographic description character surround from above
|
89
|
+
"\u2ff6", # Ideographic description character surround from below
|
90
|
+
"\u2ff7", # Ideographic description character surround from left
|
91
|
+
"\u2ff8", # Ideographic description character surround from upper left
|
92
|
+
"\u2ff9", # Ideographic description character surround from upper right
|
93
|
+
"\u2ffa", # Ideographic description character surround from lower left
|
94
|
+
"\u2ffb", # Ideographic description character overlaid
|
95
|
+
"\u3000", # Ideographic space
|
96
|
+
"\u3002", # Ideographic full stop
|
97
|
+
"\u3014", # Left tortoise shell bracket
|
98
|
+
"\u3015", # Right tortoise shell bracket
|
99
|
+
"\u3033", # Vertical kana repeat mark upper half
|
100
|
+
"\u30a0", # Katakana-hiragana double hyphen
|
101
|
+
"\u3164", # Hangul filler
|
102
|
+
"\u321d", # Parenthesized korean character ojeon
|
103
|
+
"\u321e", # Parenthesized korean character o hu
|
104
|
+
"\u33ae", # Square rad over s
|
105
|
+
"\u33af", # Square rad over s squared
|
106
|
+
"\u33c6", # Square c over kg
|
107
|
+
"\u33df", # Square a over m
|
108
|
+
"\ua789", # Modifier letter colon
|
109
|
+
"\ufe14", # Presentation form for vertical semicolon
|
110
|
+
"\ufe15", # Presentation form for vertical exclamation mark
|
111
|
+
"\ufe3f", # Presentation form for vertical left angle bracket
|
112
|
+
"\ufe5d", # Small left tortoise shell bracket
|
113
|
+
"\ufe5e", # Small right tortoise shell bracket
|
114
|
+
"\ufeff", # Zero-width no-break space
|
115
|
+
"\uff0e", # Fullwidth full stop
|
116
|
+
"\uff0f", # Fullwidth solidus
|
117
|
+
"\uff61", # Halfwidth ideographic full stop
|
118
|
+
"\uffa0", # Halfwidth hangul filler
|
119
|
+
"\ufff9", # Interlinear annotation anchor
|
120
|
+
"\ufffa", # Interlinear annotation separator
|
121
|
+
"\ufffb", # Interlinear annotation terminator
|
122
|
+
"\ufffc", # Object replacement character
|
123
|
+
"\ufffd" # Replacement character
|
124
|
+
]
|
125
|
+
|
126
|
+
# Returns the Set of characters a label may contain: every character of the
# string returned by read_allowed_idn_chars, minus MOZZILLA_DISALLOWED_CHARACTERS.
#
# The set is built on first use and memoized. It is stored in an instance
# variable of the receiver (this method lives in `class << self`, so that is
# the class itself) instead of a class variable, and frozen so that the shared
# cache cannot be mutated by a caller.
def allowed_chars_set
  @allowed_chars_set ||= (read_allowed_idn_chars.chars.to_set - MOZZILLA_DISALLOWED_CHARACTERS).freeze
end
|
129
|
+
|
130
|
+
private
|
131
|
+
# Built with script/development/generate_allowed_idn_characters.rb
|
132
|
+
# Reads data/allowed_idn_characters.txt (located next to this file) and returns
# its content as a String.
#
# The encoding is requested explicitly: without it File.read tags the content
# with Encoding.default_external, so under a C/POSIX locale the data would not
# be split into the intended UTF-8 characters.
def read_allowed_idn_chars
  File.read(File.join(__dir__, "data", "allowed_idn_characters.txt"), encoding: "UTF-8")
end
|
135
|
+
end
|
136
|
+
|
137
|
+
# Reports an attack when label_set is not fully contained in the class-level
# set of allowed characters.
def attack_detected?
  allowed = self.class.allowed_chars_set
  return false if label_set.subset?(allowed)

  true
end
|
140
|
+
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
# 12. of Google Chrome IDN policy
|
2
|
+
class HomographicSpoofing::Detector::Rule::Idn::DangerousPattern < HomographicSpoofing::Detector::Rule::Idn::Base
|
3
|
+
# Union of the character sequences considered dangerous inside a label.
# Most alternatives are written in extended (/x) mode, so their explanation
# lives inside the regexp literal itself.
#
# Start-of-label checks use \A rather than ^: in Ruby ^ also matches right
# after every newline, which does not agree with the \z end anchors used by
# the same patterns.
DANGEROUS_PATTERNS = Regexp.union(
  /# Disallow the following as they may be mistaken for slashes when
   # they're surrounded by non-Japanese scripts (i.e. has non-Katakana
   # Hiragana or Han scripts on both sides):
   # "ノ" (Katakana no, U+30ce), "ソ" (Katakana so, U+30bd),
   # "ゾ" (Katakana zo, U+30be), "ン" (Katakana n, U+30f3),
   # "丶" (CJK unified ideograph, U+4E36),
   # "乀" (CJK unified ideograph, U+4E40),
   # "乁" (CJK unified ideograph, U+4E41),
   # "丿" (CJK unified ideograph, U+4E3F).
   # Both neighbours have to be non-Japanese for the pattern to match.
   [^\p{kana}\p{hira}\p{hani}]
   [\u30ce\u30f3\u30bd\u30be\u4e36\u4e40\u4e41\u4e3f]
   [^\p{kana}\p{hira}\p{hani}]/x,

  /# Disallow three Hiragana letters (U+307[8-A]) or Katakana letters
   # (U+30D[8-A]) that look exactly like each other when they're used
   # in a label otherwise entirely in Katakana or Hiragana.
   \A[\p{kana}]+[\u3078-\u307a][\p{kana}]+\z/x,
  /\A[\p{hira}]+[\u30d8-\u30da][\p{hira}]+\z/,

  /# Disallow U+30FD (Katakana iteration mark) and U+30FE (Katakana
   # voiced iteration mark) unless they're preceded by a Katakana.
   ([^\p{kana}][\u30fd\u30fe]|\A[\u30fd\u30fe])/x,

  /# Disallow U+30FB (Katakana Middle Dot) and U+30FC (Hiragana-
   # Katakana Prolonged Sound) used out-of-context.
   ([^\p{kana}\p{hira}]\u30fc|\A\u30fc|[a-z]\u30fb|\u30fb[a-z])/x,

  /# Disallow these CJK ideographs if they are next to non-CJK
   # characters. These characters can be used to spoof Latin
   # characters or punctuation marks:
   # U+4E00 (一), U+3127 (ㄧ), U+4E28 (丨), U+4E5B (乛), U+4E03 (七),
   # U+4E05 (丅), U+5341 (十), U+3007 (〇), U+3112 (ㄒ), U+311A (ㄚ),
   # U+311F (ㄟ), U+3128 (ㄨ), U+3129 (ㄩ), U+3108 (ㄈ), U+31BA (ㆺ),
   # U+31B3 (ㆳ), U+5DE5 (工), U+31B2 (ㆲ), U+8BA0 (讠), U+4E01 (丁)
   # These characters are already blocked:
   # U+2F00 (⼀) (normalized to U+4E00), U+3192 (㆒), U+2F02 (⼂),
   # U+2F17 (⼗) and U+3038 (〸) (both normalized to U+5341 (十)).
   # Check if there is non-{Hiragana, Katakana, Han, Bopomofo} on the
   # left.
   [^\p{kana}\p{hira}\p{hani}\p{bopo}]
   [\u4e00\u3127\u4e28\u4e5b\u4e03\u4e05\u5341\u3007\u3112\u311a\u311f\u3128\u3129\u3108\u31ba\u31b3\u5dE5\u31b2\u8ba0\u4e01]/x,
  /# Check if there is non-{Hiragana, Katakana, Han, Bopomofo} on the
   # right.
   [\u4e00\u3127\u4e28\u4e5b\u4e03\u4e05\u5341\u3007\u3112\u311a\u311f\u3128\u3129\u3108\u31ba\u31b3\u5de5\u31b2\u8ba0\u4e01]
   [^\p{kana}\p{hira}\p{hani}\p{bopo}]/x,

  /# Disallow combining diacritical mark (U+0300-U+0339) after a
   # non-LGC (Latin, Greek, Cyrillic) character. Other combining
   # diacritical marks are not in the allowed character set.
   [^\p{latn}\p{grek}\p{cyrl}][\u0300-\u0339]/x,

  /# Disallow dotless i (U+0131) followed by a combining mark.
   \u0131[\u0300-\u0339]/x,

  /# Disallow combining Kana voiced sound marks.
   (\u3099|\u309a)/x,

  /# Disallow U+0307 (dot above) after 'i', 'j' or 'l'. Dotless i
   # (U+0131) followed by U+0307 is already caught by the dotless i
   # pattern above. Dotless j (U+0237) is not in the allowed set to
   # begin with.
   [ijl]\u0307/x,
  # A label starting with dotless j (U+0237) is rejected as well.
  /\A\u0237/
)
|
69
|
+
|
70
|
+
# Reports an attack when the label matches at least one of DANGEROUS_PATTERNS.
def attack_detected?
|
71
|
+
DANGEROUS_PATTERNS.match?(label)
|
72
|
+
end
|
73
|
+
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
# Four characters handled differently by IDNA 2003 and IDNA 2008. UTS46
|
2
|
+
# transitional processing treats them as IDNA 2003 does; maps U+00DF and
|
3
|
+
# U+03C2 and drops U+200[CD].
|
4
|
+
class HomographicSpoofing::Detector::Rule::Idn::DeviationCharacters < HomographicSpoofing::Detector::Rule::Idn::Base
  # ß (U+00DF), ς (U+03C2), U+200C and U+200D.
  DEVIATION_CHARACTERS = Set["ß", "ς", "\u200c", "\u200d"]

  # Reports an attack when the label contains at least one deviation character.
  def attack_detected?
    label_set.intersect?(DEVIATION_CHARACTERS)
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# 11. of Google Chrome IDN policy
|
2
|
+
class HomographicSpoofing::Detector::Rule::Idn::Digits < HomographicSpoofing::Detector::Rule::Idn::Base
  # Characters of other scripts treated as lookalikes of ASCII digits.
  DIGIT_LOOKALIKES = %w[ θ २ ২ ੨ ੨ ૨ ೩ ೭ շ з ҙ ӡ उ ও ਤ ੩ ૩ ౩ ဒ ვ პ ੜ კ ੫ 丩 ㄐ ճ ৪ ੪ ୫ ૭ ୨ ౨ ].to_set

  # Reports an attack when the label holds at least one digit lookalike and
  # every one of its characters is either an ASCII digit or a digit lookalike.
  def attack_detected?
    characters = label.chars
    characters.any? { |char| digit_lookalike?(char) } &&
      characters.all? { |char| digit?(char) || digit_lookalike?(char) }
  end

  private

  # True for the ASCII digits 0-9.
  def digit?(char)
    char.match?(/[0-9]/)
  end

  # True when the character is listed in DIGIT_LOOKALIKES.
  def digit_lookalike?(char)
    DIGIT_LOOKALIKES.member?(char)
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# 7. of Google Chrome IDN policy
|
2
|
+
class HomographicSpoofing::Detector::Rule::Idn::InvisibleCharacters < HomographicSpoofing::Detector::Rule::Idn::Base
  # Sequences whose extra combining character is expected to be invisible when
  # rendered. Defined before its first use and outside the `private` section,
  # which has no effect on constants.
  INVISIBLE_CHARACTERS_REGEXP = Regexp.union(
    /\u0e48{2,}/,    # Thai tone mark (U+0E48) repeated
    /\u0301{2,}/,    # Combining acute accent (U+0301) repeated
    /\u00e1\u0301/,  # 'a' with acute accent (U+00E1) followed by another acute accent
    /\A\u0300/       # Combining grave accent (U+0300) at the very beginning of the label
  )

  # Reports an attack when the label contains one of the sequences above.
  def attack_detected?
    INVISIBLE_CHARACTERS_REGEXP.match?(label)
  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
# 9. and 10. of Google Chrome IDN policy See http://unicode.org/reports/tr39/#Confusable_Detection
|
2
|
+
class HomographicSpoofing::Detector::Rule::Idn::ScriptConfusable < HomographicSpoofing::Detector::Rule::Idn::Base
|
3
|
+
# Reports an attack when, for at least one entry of SCRIPT_CONFUSABLES, the
# label contains characters of that script, all of them are Latin lookalikes,
# and the TLD does not allow the script.
def attack_detected?
  SCRIPT_CONFUSABLES.any? do |confusable|
    script_chars = label.scan(confusable.script)
    next false if script_chars.empty?
    next false unless script_chars.all? { |char| confusable.latin_lookalike.match?(char) }

    !is_script_confusable_allowed_for_tld?(confusable)
  end
end
|
11
|
+
|
12
|
+
private
|
13
|
+
# One entry per script whose characters can imitate Latin letters:
#   script          - regexp matching any character of the script
#   latin_lookalike - regexp matching the characters of that script treated as Latin lookalikes
#   allowed_tlds    - regexp tested against the TLD, or nil when no TLD is exempted
Confusable = Struct.new(:script, :latin_lookalike, :allowed_tlds)

# The allowed_tlds patterns are anchored on a label boundary at the end of the
# TLD. Unanchored, /er|et/ accepted "net", /il/ accepted "mil" and /am/ accepted
# "amazon". The `(?:\A|\.)` prefix keeps multi-label values such as "com.et"
# working (the Myanmar entry suggests the TLD may contain a dot — NOTE(review):
# confirm against the implementation of `tld`).
SCRIPT_CONFUSABLES = [
  # Armenian
  [ /\p{armn}/, /[ագզէլհյոսւօՙ]/, /(?:\A|\.)am\z/ ],
  # Cyrillic
  [ /\p{cyrl}/, /[аысԁеԍһіюјӏорԗԛѕԝхуъьҽпгѵѡ]/, /(?:\A|\.)(?:bg|by|kz|pyc|ru|su|ua|uz)\z/ ],
  # Ethiopic (Ge'ez)
  [ /\p{ethi}/, /[ሀሠሰስበነተከዐዕዘጠፐꬅ]/, /(?:\A|\.)(?:er|et)\z/ ],
  # Georgian
  [ /\p{geor}/, /[იოყძხჽჿ]/, /(?:\A|\.)ge\z/ ],
  # Greek
  [ /\p{grek}/, /[αικνρυωηοτ]/, /(?:\A|\.)gr\z/ ],
  # Hebrew
  [ /\p{hebr}/, /[דוחיןסװײ׳ﬦ]/, /(?:\A|\.)il\z/ ],
  # Bengali
  [ /\p{beng}/, /[০৭]/, nil ],
  # Devanagari
  [ /\p{Deva}/, /[ऽ०ॱ]/, nil ],
  # Gujarati
  [ /\p{Gujr}/, /[ડટ૦૧]/, nil ],
  # Gurmukhi
  [ /\p{Guru}/, /[੦੧]/, nil ],
  # Kannada
  [ /\p{Knda}/, /[ಽ೦೧]/, nil ],
  # Malayalam
  [ /\p{Mlym}/, /[ടഠധനറ൦]/, nil ],
  # Oriya
  [ /\p{Orya}/, /[ଠ୦୮]/, nil ],
  # Tamil
  [ /\p{Taml}/, /[டப௦]/, nil ],
  # Telugu
  [ /\p{Telu}/, /[౦౧]/, nil ],
  # Myanmar
  [ /\p{Mymr}/, /[ခဂငထပဝ၀၂ၔၜ\u1090\u1091\u1095\u1096\u1097]/, /(?:\A|\.)[a-z]+\.mm\z/ ],
  # Thai
  [ /\p{Thai}/, /[ทนบพรหเแ๐ดลปฟม]/, /(?:\A|\.)th\z/ ]
].map { Confusable.new(*_1) }
|
50
|
+
|
51
|
+
# A confusable script is accepted when the TLD itself contains a character of
# that script, or when the TLD matches the entry's allowed_tlds pattern.
# Yields nil when the first check fails and the entry has no allowed_tlds.
def is_script_confusable_allowed_for_tld?(confusable)
  return true if tld_contains_any_letter_from_script?(confusable.script)

  confusable.allowed_tlds&.match?(tld)
end
|
55
|
+
|
56
|
+
# True when the given script regexp matches somewhere in the TLD.
def tld_contains_any_letter_from_script?(script)
|
57
|
+
script.match?(tld)
|
58
|
+
end
|
59
|
+
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
class HomographicSpoofing::Detector::Rule::Idn::ScriptSpecific < HomographicSpoofing::Detector::Rule::Idn::Base
  LATN = "Latin"
  ICELANDIC_CHARACTERS = Set["þ", "ð"]

  # Reports an attack when any of the script-specific checks below fires.
  def attack_detected?
    latin_spoof? || icelandic_spoof? || azerbaijan_spoof?
  end

  private

  # Disallow non-ASCII Latin letters to mix with a non-Latin script.
  # The check is skipped when the entire label is made of Latin.
  def latin_spoof?
    return false if scripts == Set[LATN]

    non_ascii_latin_letters.any?
  end

  # Latin-script characters of the label other than a-z and 0-9.
  def non_ascii_latin_letters
    label.scan(/\p{latin}/).grep_v(/[a-z0-9]/)
  end

  # Latin small letter thorn ("þ", U+00FE) can be used to spoof both b and p.
  # It's used in modern Icelandic orthography, so it is allowed for the
  # Icelandic ccTLD (.is) and blocked everywhere else, together with Latin
  # small letter eth ("ð", U+00F0), which can be used to spoof the letter o.
  def icelandic_spoof?
    return false if tld == "is"

    label_set.intersect?(ICELANDIC_CHARACTERS)
  end

  # ə is only allowed under the .az TLD.
  def azerbaijan_spoof?
    return false if tld == "az"

    label_set.include?("ə")
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
# 8. of Google Chrome IDN policy
|
2
|
+
#
|
3
|
+
# Allow middle dot (U+00B7) only on Catalan domains when between two 'l's, to
|
4
|
+
# permit the Catalan character ela geminada to be expressed.
|
5
|
+
# See https://tools.ietf.org/html/rfc5892#appendix-A.3 for details.
|
6
|
+
class HomographicSpoofing::Detector::Rule::Idn::UnsafeMiddleDot < HomographicSpoofing::Detector::Rule::Idn::Base
|
7
|
+
# Reports an attack when the label contains a middle dot (U+00B7) that is not
# allowed: any middle dot outside the .cat TLD, or, under .cat, a middle dot
# that does not sit between two 'l's.
#
# Always returns true or false (the scan/find version returned the matched
# String or nil). Each dot is inspected with lookarounds, so consecutive
# occurrences sharing an 'l' ("l·l·l") are all recognised as being between two
# 'l's instead of the second one being reported.
def attack_detected?
  return false unless label.include?("·")

  tld != "cat" || label.match?(/(?<!l)·|·(?!l)/)
end
|
12
|
+
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
# See http://www.unicode.org/reports/tr39/#Email_Security_Profiles dot-atom-text.
|
2
|
+
class HomographicSpoofing::Detector::Rule::Local::DotAtomText < HomographicSpoofing::Detector::Rule::Base
  XID_Start_REGEXP = /\p{XIDS}/

  # https://tools.ietf.org/html/rfc5322#section-3.2.3 atext
  ATEXT_REGEXP = %r{[!#-'*+\-/-9=?A-Z\^-~]}

  # Reports an attack when the dots are misplaced, or when the label (dots
  # removed) is present and either starts with an invalid character or
  # contains an invalid one.
  #
  # Always returns true or false; a label that is blank once its dots are
  # removed yields false (it used to fall through and yield nil).
  def attack_detected?
    return true if invalid_dot_sequence?
    return false if label_no_dots.blank?

    !valid_start_sequence? || contains_invalid_char?
  end

  private

  # A dot may neither start nor end the label, nor be doubled.
  def invalid_dot_sequence?
    label.start_with?(".") || label.end_with?(".") || multiple_dots?
  end

  def multiple_dots?
    /\.{2,}/.match?(label)
  end

  # The label with every dot removed (memoized).
  def label_no_dots
    @label_no_dots ||= label.delete(".")
  end

  # The first character must be a simple character or match XID_Start_REGEXP.
  # Only called once the label is known not to be blank, so label[0] is a
  # one-character String.
  def valid_start_sequence?
    start = label[0]
    simple_char?(start) || XID_Start_REGEXP.match?(start)
  end

  def contains_invalid_char?
    label_no_dots.each_char.any? { |char| invalid_char?(char) }
  end

  # Simple characters must be atext; all other characters must belong to the
  # allowed character set of the DisallowedCharacters rule.
  def invalid_char?(c)
    if simple_char?(c)
      !ATEXT_REGEXP.match?(c)
    else
      HomographicSpoofing::Detector::Rule::DisallowedCharacters.allowed_chars_set.exclude?(c)
    end
  end

  # True for characters sorting below U+007F.
  def simple_char?(c)
    c < "\u007f"
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# 6. of Google Chrome IDN policy
|
2
|
+
class HomographicSpoofing::Detector::Rule::MixedDigits < HomographicSpoofing::Detector::Rule::Base
  # Reports an attack when the digits of the label map to more than one
  # distinct value of the digits map.
  def attack_detected?
    digits_scripts.many?
  end

  private

  # Distinct map values for the digits of the label. A digit missing from the
  # map contributes nil.
  def digits_scripts
    digits.map { |digit| digits_map[digit] }.uniq
  end

  # Every character of the label matched by the POSIX [[:digit:]] class.
  def digits
    label.scan(/[[:digit:]]/)
  end

  # First-column => second-column Hash built from digits.csv, memoized once
  # for the whole class hierarchy.
  def digits_map
    @@digits_map ||= build_digits_map
  end

  # Only the first two columns of each CSV row are used; a later row with the
  # same first column overrides an earlier one.
  def build_digits_map
    CSV.parse(read_digits).to_h { |char, script| [char, script] }
  end

  # Built with script/development/generate_digits_characters.rb
  #
  # The encoding is requested explicitly so that the non-ASCII digits are read
  # correctly whatever Encoding.default_external is.
  def read_digits
    File.read(File.join(__dir__, "data", "digits.csv"), encoding: "UTF-8")
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# 5. of Google Chrome IDN policy See http://www.unicode.org/reports/tr39/#highly_restrictive
|
2
|
+
class HomographicSpoofing::Detector::Rule::MixedScripts < HomographicSpoofing::Detector::Rule::Base
  BOPO = "Bopomofo"
  HANG = "Hangul"
  HANI = "Han"
  HIRA = "Hiragana"
  KANA = "Katakana"
  LATN = "Latin"

  JAPANESE = Set[HANI, HIRA, KANA]
  CHINESE = Set[BOPO, HANI]
  KOREAN = Set[HANI, HANG]

  # Each CJK script group combined with Latin.
  HIGHLY_RESTRICTIVE_SCRIPT_COMBINATIONS = [JAPANESE, CHINESE, KOREAN].map { |cjk_scripts| cjk_scripts | [LATN] }

  # Reports an attack unless the scripts of the label form a highly
  # restrictive combination.
  def attack_detected?
    !highly_restrictive_scripts_combination?
  end

  private

  # True for a single script, or when all scripts fit in one of the
  # HIGHLY_RESTRICTIVE_SCRIPT_COMBINATIONS.
  def highly_restrictive_scripts_combination?
    return true if scripts.length == 1

    HIGHLY_RESTRICTIVE_SCRIPT_COMBINATIONS.any? { |combination| scripts.subset?(combination) }
  end
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
# See http://www.unicode.org/reports/tr39/#Email_Security_Profiles bidicontrol.
|
2
|
+
class HomographicSpoofing::Detector::Rule::QuotedString::BidiControl < HomographicSpoofing::Detector::Rule::Base
|
3
|
+
# See https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%3Abidicontrol%3A%5D&c=on&g=&i=
|
4
|
+
# for the full list of bidirectional format characters.
# The class below lists U+202A to U+202E and U+2066 to U+2069 only.
# NOTE(review): U+061C, U+200E and U+200F are not listed here — presumably on
# purpose (the referenced email security profile appears to exempt them); confirm.
|
5
|
+
DISALLOWED_REGEXP = /[\u202a\u202b\u202c\u202d\u202e\u2066\u2067\u2068\u2069]/
|
6
|
+
|
7
|
+
# Reports an attack when the label contains one of the characters of DISALLOWED_REGEXP.
def attack_detected?
|
8
|
+
DISALLOWED_REGEXP.match?(label)
|
9
|
+
end
|
10
|
+
end
|
@@ -0,0 +1 @@
|
|
1
|
+
̴̵̶̷̸̡̢̧̨̛̖̗̘̙̜̝̞̟̠̣̤̥̦̩̪̫̬̭̮̯̰̱̲̳̹̺̻̼͙͚͇͈͉͍͎̀́̂̃̄̅̆̇̈̉̊̋̌̍̎̏̐̑̒̓̔̽̾̿͛̀́͂̓̈́͆͊͋͌̕̚͘ͅ͏͓͔͕͖͐͑͒͗ͣͤͥͦͧͨͩͪͫͬͭͮͯ҃҄҅҆҇͜͟͢͝͞͠͡҈҉ְֱֲֳִֵֶַָׇֹֺֻּֽֿׁׂًٌٍؘَؙُؚِّْٰܑ֑֖֛֢֣֤֥֦֧֪ׅۣ۪ۭٕٖٜٟܱܴܷܸܹܻܼܾ݂݄݆݈֚֭֮֒֓֔֕֗֘֙֜֝֞֟֠֡֨֩֫֬֯ׄؐؑؒؓؔؕؗۖۗۘۙۚۛۜ۟۠ۡۢۤۧۨ۫۬ؖٓٔٗ٘ٙٚٛٝٞܰܲܳܵܶܺܽܿ݀݁݃݅݇݉݊ަާިީުޫެޭޮޯްࣰࣱࣲ߲߽࡙࡚࡛࢙࢚࢛࣏࣐࣑࣒࣓ࣣࣦࣩ࣭࣮࣯ࣶࣹࣺ߫߬߭߮߯߰߱߳ࠖࠗ࠘࠙ࠛࠜࠝࠞࠟࠠࠡࠢࠣࠥࠦࠧࠩࠪࠫࠬ࠭࢘࢜࢝࢞࢟࣊࣋࣌࣍࣎ࣔࣕࣖࣗࣘࣙࣚࣛࣜࣝࣞࣟ࣠࣡ࣳࣤࣥࣧࣨ࣪࣫࣬ࣴࣵࣷࣸࣻࣼࣽࣾࣿऀँं़ऺुूृॄॅॆेैॕ्॒॑॓॔ॖॗॢॣঁ়ুূৃৄ্ৢৣ৾ਁਂ਼ੁੂੇੈੋੌ੍ੑੰੱੵઁં઼ુૂૃૄૅેૈ્ૢૣૺૻૼ૽૾૿ଁ଼୕ୖିୁୂୃୄ୍ୢୣஂீ்ఀఄ఼ౕౖాిీెేైొోౌ్ౢౣಁ಼ೌ್ೢೣഀഁ഻഼ുൂൃൄ്ൢൣඁ්ිීුූัิีึืฺุู็่้๊๋์ํ๎ັິີຶືຸູົ຺ຼ໌ໍ໎྄່້໊໋ཱཱཱིིུུ༹༘༙༵༷࿆ྂྃ྆྇ྲྀཷླྀཹཱེཻོཽྀྀཾྍྎྏྐྑྒྒྷྔྕྖྗྙྚྛྜྜྷྞྟྠྡྡྷྣྤྥྦྦྷྨྩྪྫྫྷྭྮྯྰྱྲླྴྵྶྷྸྐྵྺྻྼိီုူဲဳဴဵံ့္်ွှၘၙၞၟၠၱၲၳၴႂႅႆႍႝ፝፞፟ᜒᜓ᜔ᜲᜳᝒᝓᝲᝳ឴឵ិីឹឺុូួំ់៌៍៎៏័៑្៝៉៊៓᠋᠌᠍᠏ᢅᢆᢩᤠᤡᤢᤧᤨᤲ᤻ᨘ᤹᤺ᨗᨛᩖᩘᩙᩚᩛᩜᩝᩞ᩠᩺᩻᩼ᩢᩥᩦᩧᩨᩩᩪᩫᩬᩳᩴ᩿᪵᪶᪷᪸᪹᪺᪽᩵᩶᩷᩸᩹᪰᪱᪲᪳᪴᪻᪼᪾ᪿᫀ᫃᫄᫊᫁᫂᫅᫆᫇᫈᫉᫋ᫌᫍᫎᬀᬁᬂᬃ᬴ᬶᬷᬸᬹᬺᬼᭂ᭬᭫᭭᭮᭯᭰᭱᭲᭳ᮀᮁᮢᮣᮬᮭᮤᮥᮨᮩ᯦᮫ᯨᯩᯭᯯᯰᯱᰬᰭᰮᰯᰰᰱᰲᰳᰶ᳔᳢᳣᳤᳥᳦᳧᳨⃒⃓⃘⃙⃚᰷᷐᷎᷺᳕᳖᳗᳘᳙᳜᳝᳞᳟᳭᷂᷊᷹᷽᷏᷿᷸᷷᳐᳑᳒᳚᳛᳴᳠᳸᳹᷀᷁᷃᷻᷄᷅᷆᷇᷈᷉᷋᷌᷑᷒ᷓᷔᷕᷖᷗᷘᷙᷚᷛᷜᷝᷞᷟᷠᷡᷢᷣᷤᷥᷦᷧᷨᷩᷪᷫᷬᷭᷮᷯᷰᷱᷲᷳᷴ᷵᷾⃐⃑⃔⃕⃖⃗⃛⃜᷶᷼᷍⃝⃞⃟⃠⃢⃣⃤⃥⃦⃪⃫゙゚⵿〪⃨⃬⃭⃮⃯〭〫⃡⃧⃩⃰⳯⳰⳱ⷠⷡⷢⷣⷤⷥⷦⷧⷨⷩⷪⷫⷬⷭⷮⷯⷰⷱⷲⷳⷴⷵⷶⷷⷸⷹⷺⷻⷼⷽⷾⷿ꙯〬꙰꙱꙲ꙴꙵꙶꙷꙸꙹꙺꙻ꙼꙽ꚞꚟ꛰꛱ꠂ꠆꠬ꠋꠥꠦ꣄ꣅ꣠꣡꣢꣣꣤꣥꣦꣧꣨꣩꣪꣫꣬꣭꣮꣯꣰꣱ꣿꤦꤧꤨꤩꤪ꤫꤬꤭ꥇꥈꥉꥊꥋꥌꥍꥎꥏꥐꥑꦀꦁꦂ꦳ꦶꦷꦸꦹꦼꦽꧥꨩꨪꨫꨬꨭꨮꨱꨲꨵꨶꩃꩌꩼꪴꪰꪲꪳꪷꪸꪾ꪿꫁ꫬꫭ꫶ꯥꯨ꯭ﬞ︀︁︂︃︄︅︆︇︈︉︊︋︌︍︎️︧︨︩︪︫︬︭𐇽𐋠︠︡︢︣︮︯︤︥︦𐍶𐍷𐍸𐍹𐍺𐨁𐨂𐨃𐨅𐨆𐨌𐨍𐨎𐨹𐨿𐨺𐫦𐻽𐻾𐻿𐽆𐽇𐽋𐽍𐽎𐽏𐽐𐾃𐾅𐨏𐨸𐫥𐴤𐴥𐴦𐴧𐺫𐺬𐽈𐽉𐽊𐽌𐾂𐾄𑀁𑀸𑀹𑀺𑀻𑀼𑀽𑀾𑀿𑁀𑁁𑁂𑁃𑁄𑁅𑁳𑁴𑁆𑁰𑁿𑂀𑂁𑂺𑂳𑂴𑂵𑂶𑂹𑃂𑄳𑄴𑄀𑄁𑄂𑄧𑄨𑄩𑄪𑄫𑄭𑄮𑄯𑄰𑄱𑄲𑅳𑆀𑆁𑆶𑆷𑆸𑆹𑆺𑆻𑆼𑆽𑆾𑇉𑇏𑇊𑇋𑇌𑈯𑈰𑈱𑉁𑈴𑈶𑈷𑈾𑋟𑋣𑋤𑋥𑋦𑋧𑋨𑋩𑋪𑌀𑌁𑌻𑌼𑍀𑍦𑍧𑍨𑍩𑍪𑍫𑍬𑍰𑍱𑍲𑍳𑍴𑐸𑐹𑐺𑐻𑐼𑐽𑐾𑐿𑑂𑑃𑑄𑑆𑑞𑒳𑒴𑒵𑒶𑒷𑒸𑒺𑒿𑓀𑓃𑓂𑖲𑖳𑖴𑖵𑖼𑖽𑗀𑖿𑗜𑗝𑘳𑘴𑘵𑘶𑘷𑘸𑘹𑘺𑘽𑘿𑙀𑚫𑚭𑚰𑚱𑚲𑚳𑚴𑚵𑚷𑜝𑜞𑜟𑜢𑜣𑜤𑜥𑜧𑜨𑜩𑜪𑜫𑠯𑠰𑠱𑠲𑠳𑠴𑠵𑠶𑠷𑠺𑠹𑤻𑤼𑥃𑤾𑧔𑧕𑧖𑧗𑧚𑧛𑧠𑨁𑨂𑨃𑨄𑨅𑨆𑨉𑨊𑨳𑨴𑨵𑨶𑨷𑨸𑨻𑨼𑨽𑨾𑩇𑩑𑩒𑩓𑩔𑩕𑩖𑩙𑩚𑩛𑪊𑪋𑪌𑪍𑪎𑪏𑪐𑪑𑪒𑪓𑪔𑪕𑪖𑪘𑪙𑰰𑰱𑰲𑰳𑰴𑰵𑰶𑰸𑰹𑰺𑰻𑰼𑰽𑲒𑲓𑲔𑲕𑲖𑲗𑲘𑲙𑲚𑲛𑲜𑲝𑲞𑲟𑲠𑲡𑲢𑲣𑲤𑲥𑲦𑲧𑲪𑲫𑲬𑲭𑲮𑲯𑲰𑲲𑲳𑲵𑲶𑴱𑴲𑴳𑴴𑴵𑴶𑴺𑴼𑴽𑴿𑵀𑵁𑵂𑵃𑵄𑵅𑵇𑶐𑶑𑶕𑶗𑻳𑻴𑼀𑼁𑼶𑼷𑼸𑼹𑼺𑽀𑽂𓑀𓑇𓑈𓑉𓑊𓑋𓑌𓑍𓑎𓑏𓑐𓑑𓑒𓑓𓑔𓑕𖫰𖫱𖫲𖫳𖫴𖬰𖬱𖬲𖬳𖬴𖬵𖬶𖽏𖾏𖾐𖾑𖾒𖿤𛲝𛲞𜼀𜼁𜼂𜼃𜼄𜼅𜼆𜼇𜼈𜼉𜼊𜼋𜼌𜼍𜼎𜼏𜼐𜼑𜼒𜼓𜼔𜼕𜼖𜼗𜼘𜼙𜼚𜼛𜼜𜼝𜼞𜼟𜼠𜼡𜼢𜼣𜼤𜼥𜼦𜼧𜼨𜼩𜼪𜼫𜼬𜼭𜼰𜼱𜼲𜼳𜼴𜼵𜼶𜼷𜼸𜼹𜼺𜼻𜼼𜼽𜼾𜼿𜽀𜽁𜽂𜽃𜽄𜽅𜽆𝅧𝅨𝅩𝅻𝅼𝅽𝅾𝅿𝆀𝆁𝆂𝆊𝆋𝆅𝆆𝆇𝆈𝆉𝆪𝆫𝆬𝆭𝉂𝉃𝉄𝨀𝨁𝨂𝨃𝨄𝨅𝨆𝨇𝨈𝨉𝨊𝨋𝨌𝨍𝨎𝨏𝨐𝨑𝨒𝨓𝨔𝨕𝨖𝨗𝨘𝨙𝨚𝨛𝨜𝨝𝨞𝨟𝨠𝨡𝨢𝨣𝨤𝨥𝨦𝨧𝨨𝨩𝨪𝨫𝨬𝨭𝨮𝨯𝨰𝨱𝨲𝨳𝨴𝨵𝨶𝨻𝨼𝨽𝨾𝨿𝩀𝩁𝩂𝩃𝩄𝩅𝩆𝩇𝩈𝩉𝩊𝩋𝩌𝩍𝩎𝩏𝩐𝩑𝩒𝩓𝩔𝩕𝩖𝩗𝩘𝩙𝩚𝩛𝩜𝩝𝩞𝩟𝩠𝩡𝩢𝩣𝩤𝩥𝩦𝩧𝩨𝩩𝩪𝩫𝩬𝩵𝪄𝪛𝪜𝪝𝪞𝪟𝪡𝪢𝪣𝪤𝪥𝪦𝪧𝪨𝪩𝪪𝪫𝪬𝪭𝪮𝪯𞥊𞓮𞣐𞣑𞣒𞣓𞣔𞣕𞣖𞀀𞀁𞀂𞀃𞀄𞀅𞀆𞀈𞀉𞀊𞀋𞀌𞀍𞀎𞀏𞀐𞀑𞀒𞀓𞀔𞀕𞀖𞀗𞀘𞀛𞀜𞀝𞀞𞀟𞀠𞀡𞀣𞀤𞀦𞀧𞀨𞀩𞀪𞂏𞄰𞄱𞄲𞄳𞄴𞄵𞄶𞊮𞋬𞋭𞋮𞋯𞓯𞥄𞥅𞥆𞥇𞥈𞥉𞓬𞓭󠄀󠄁󠄂󠄃󠄄󠄅󠄆󠄇󠄈󠄉󠄊󠄋󠄌󠄍󠄎󠄏󠄐󠄑󠄒󠄓󠄔󠄕󠄖󠄗󠄘󠄙󠄚󠄛󠄜󠄝󠄞󠄟󠄠󠄡󠄢󠄣󠄤󠄥󠄦󠄧󠄨󠄩󠄪󠄫󠄬󠄭󠄮󠄯󠄰󠄱󠄲󠄳󠄴󠄵󠄶󠄷󠄸󠄹󠄺󠄻󠄼󠄽󠄾󠄿󠅀󠅁󠅂󠅃󠅄󠅅󠅆󠅇󠅈󠅉󠅊󠅋󠅌󠅍󠅎󠅏󠅐󠅑󠅒󠅓󠅔󠅕󠅖󠅗󠅘󠅙󠅚󠅛󠅜󠅝󠅞󠅟󠅠󠅡󠅢󠅣󠅤󠅥󠅦󠅧󠅨󠅩󠅪󠅫󠅬󠅭󠅮󠅯󠅰󠅱󠅲󠅳󠅴󠅵󠅶󠅷󠅸󠅹󠅺󠅻󠅼󠅽󠅾󠅿󠆀󠆁󠆂󠆃󠆄󠆅󠆆󠆇󠆈󠆉󠆊󠆋󠆌󠆍󠆎󠆏󠆐󠆑󠆒󠆓󠆔󠆕󠆖󠆗󠆘󠆙󠆚󠆛󠆜󠆝󠆞󠆟󠆠󠆡󠆢󠆣󠆤󠆥󠆦󠆧󠆨󠆩󠆪󠆫󠆬󠆭󠆮󠆯󠆰󠆱󠆲󠆳󠆴󠆵󠆶󠆷󠆸󠆹󠆺󠆻󠆼󠆽󠆾󠆿󠇀󠇁󠇂󠇃󠇄󠇅󠇆󠇇󠇈󠇉󠇊󠇋󠇌󠇍󠇎󠇏󠇐󠇑󠇒󠇓󠇔󠇕󠇖󠇗󠇘󠇙󠇚󠇛󠇜󠇝󠇞󠇟󠇠󠇡󠇢󠇣󠇤󠇥󠇦󠇧󠇨󠇩󠇪
󠇫󠇬󠇭󠇮󠇯
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# See http://www.unicode.org/reports/tr39/#Email_Security_Profiles nonspacing marks.
|
2
|
+
class HomographicSpoofing::Detector::Rule::QuotedString::NonspacingMarks < HomographicSpoofing::Detector::Rule::Base
  # Reports an attack when the label matches the nonspacing marks regexp.
  def attack_detected?
    nonspacing_marks_regexp.match?(label)
  end

  private

  # Matches 5 or more nonspacing marks in a row, or the same nonspacing mark
  # twice in a row. Built once and shared through a class variable.
  def nonspacing_marks_regexp
    @@nonspacing_marks_regexp ||= /[#{nonspacing_marks}]{5,}|([#{nonspacing_marks}])\1/
  end

  def nonspacing_marks
    @nonspacing_marks ||= read_nonspacing_marks
  end

  # Built with script/development/generate_nonspacing_marks.rb
  #
  # The content is interpolated into a character class, so it is read as UTF-8
  # whatever Encoding.default_external is, and a trailing line terminator (if
  # the file has one) is dropped so that it does not count as a nonspacing mark.
  def read_nonspacing_marks
    File.read(File.join(__dir__, "data", "nonspacing_marks.txt"), encoding: "UTF-8").chomp
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# Base class of the sanitizers: runs a detector over a field and replaces every
# detected label with its punycode form. Subclasses provide detector_class and
# spoofing_type.
class HomographicSpoofing::Sanitizer::Base
  class_attribute :logger, default: HomographicSpoofing.logger

  # Convenience entry point: sanitizes the field with a new instance.
  def self.sanitize(field)
    new(field).sanitize
  end

  def initialize(field)
    @field = field
  end

  # Returns a copy of the field in which the label of every detection has been
  # replaced by its punycode form. Each detection is logged. The field itself
  # is not modified.
  def sanitize
    result = field.dup
    detector_class.new(field).detections.each do |detection|
      log(detection.reason, detection.label)
      result = punycode(result, detection.label)
    end
    result
  end

  private

  attr_reader :field

  # Replaces every occurrence of label in source with its punycode form.
  # The block form of gsub inserts the replacement literally; passed as a
  # String argument, backslash sequences in it would be interpreted as
  # back-references.
  def punycode(source, label)
    encoded = Dnsruby::Name.punycode(label)
    source.gsub(label) { encoded }
  end

  def detector_class
    raise NotImplementedError, "subclasses must override this"
  end

  # Logs the detection at info level when a logger is configured.
  def log(reason, label)
    current_logger = self.class.logger
    return unless current_logger

    current_logger.info("#{spoofing_type} Spoofing detected for: \"#{reason}\" on: \"#{label}\".")
  end

  def spoofing_type
    raise NotImplementedError, "subclasses must override this"
  end
end
|