unicode_script_detector 0.0.9 → 0.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6177527dc52eecfb0fbeb31e8f901d1138eb3fa5df70317e52cfe82d5d4a1071
4
- data.tar.gz: a4a289cb631b4099db822fa888183c29ef6c029b5a0d5f3e29fa177e243a5132
3
+ metadata.gz: 5552c7feac6b52cd98f1de0442768436f0cd801d8e491bffdac9509e87648490
4
+ data.tar.gz: 85ea21ac13d2c71e770802cd9a8259d74ee0542c86ee4d799470f311585c8dbe
5
5
  SHA512:
6
- metadata.gz: b395eb587afebc6fbeeb834792bf6eff0d93a0c771290fb7aa27d771fbb443fc53acb4b95c0fc0674cb09cb12a917d612fe81455725ac78be6dfc77222eda4a2
7
- data.tar.gz: 843f272dce94bcfc212965befc382e837d37c4a959389c836d829968fef6040de553b12b0e2c9b6da7d6625045877d9a6f310adb41f2c504cb99f730a8b07335
6
+ metadata.gz: 92f1a69dd82c2878830cd01d9370368d7590d074717df5f5fb61f05de946805a92c0a244b25f003d637f91ae9f754610141947de615fa8a8d9c4e8276cd3adc7
7
+ data.tar.gz: 3c0384fba23a30c4efd68eecba7a4e6db2396df46318c905af355cdf90aecfd8ffb29d634023bb8a9e18c33314d2ad8beb7410fe208edd708acd5a0159aaa1d8
data/README.md CHANGED
@@ -62,9 +62,38 @@ Latin: you (3 characters)
62
62
  Punctuation: ? (1 characters)
63
63
  ```
64
64
 
65
+ ## Get a homograph spoof analysis
66
+ ```ruby
67
+ UnicodeScriptDetector.spoof_analysis "Раypal"
68
+ =>
69
+ [
70
+ #<struct UnicodeScriptDetector::SpoofDetector::Detection
71
+ type=:confusable,
72
+ message="Found 2 character(s) from non-Latin scripts that visually resemble Latin letters",
73
+ characters=[
74
+ #<struct UnicodeScriptDetector::SpoofDetector::ConfusableChar char="Р", script="Cyrillic", looks_like="P", position=0>,
75
+ #<struct UnicodeScriptDetector::SpoofDetector::ConfusableChar char="а", script="Cyrillic", looks_like="a", position=1>
76
+ ],
77
+ severity=:high>,
78
+
79
+ #<struct UnicodeScriptDetector::SpoofDetector::Detection
80
+ type=:mixed_scripts,
81
+ message="Text contains a mix of 2 scripts: Cyrillic, Latin",
82
+ characters=["Cyrillic", "Latin"],
83
+ severity=:medium>
84
+ ]
85
+ ```
86
+
87
+ ## Check whether a homograph spoof is detected
88
+ ```ruby
89
+ UnicodeScriptDetector.spoofed? "Раypal"
90
+ => true
91
+ ```
92
+
65
93
  ## Development
66
- Start the console with `bin/console`.
67
- Run the tests with `bin/test`.
94
+ - Start the console with `bin/console`.
95
+ - Run the tests with `bin/test`.
96
+ - Update the confusables list from unicode.org with `rake update_confusables`.
68
97
 
69
98
  ## Contributing
70
99
  You're welcome to contribute to this project. See https://github.com/davidarendsen/unicode_script_detector.
@@ -0,0 +1,178 @@
1
+ require 'set'
2
+
3
module UnicodeScriptDetector
  # Lookup tables and predicates used to spot homograph spoofing: look-alike
  # letters, invisible characters and bidirectional formatting controls.
  module Confusables
    # Maps a non-Latin character to the Latin letter or digit it visually resembles.
    # AUTO-GENERATED from https://unicode.org/Public/security/latest/confusables.txt
    # Run `rake update_confusables` to regenerate.
    MAPPING = {

      # Cyrillic → Latin
      'Ѕ' => 'S', # U+0405
      'І' => 'l', # U+0406
      'Ј' => 'J', # U+0408
      'А' => 'A', # U+0410
      'В' => 'B', # U+0412
      'Е' => 'E', # U+0415
      'З' => '3', # U+0417
      'К' => 'K', # U+041A
      'М' => 'M', # U+041C
      'Н' => 'H', # U+041D
      'О' => 'O', # U+041E
      'Р' => 'P', # U+0420
      'С' => 'C', # U+0421
      'Т' => 'T', # U+0422
      'У' => 'Y', # U+0423
      'Х' => 'X', # U+0425
      'Ь' => 'b', # U+042C
      'а' => 'a', # U+0430
      'б' => '6', # U+0431
      'г' => 'r', # U+0433
      'е' => 'e', # U+0435
      'о' => 'o', # U+043E
      'р' => 'p', # U+0440
      'с' => 'c', # U+0441
      'у' => 'y', # U+0443
      'х' => 'x', # U+0445
      'ш' => 'w', # U+0448
      'ѕ' => 's', # U+0455
      'і' => 'i', # U+0456
      'ј' => 'j', # U+0458
      'ѡ' => 'w', # U+0461
      'Ѵ' => 'V', # U+0474
      'ѵ' => 'v', # U+0475
      'Ү' => 'Y', # U+04AE
      'ү' => 'y', # U+04AF
      'һ' => 'h', # U+04BB
      'ҽ' => 'e', # U+04BD
      'Ӏ' => 'l', # U+04C0
      'ӏ' => 'l', # U+04CF
      'Ӡ' => '3', # U+04E0
      'ԁ' => 'd', # U+0501
      'Ԍ' => 'G', # U+050C
      'ԛ' => 'q', # U+051B
      'Ԝ' => 'W', # U+051C
      'ԝ' => 'w', # U+051D

      # Greek → Latin
      'ͺ' => 'i', # U+037A
      'Ϳ' => 'J', # U+037F
      'Α' => 'A', # U+0391
      'Β' => 'B', # U+0392
      'Ε' => 'E', # U+0395
      'Ζ' => 'Z', # U+0396
      'Η' => 'H', # U+0397
      'Ι' => 'l', # U+0399
      'Κ' => 'K', # U+039A
      'Μ' => 'M', # U+039C
      'Ν' => 'N', # U+039D
      'Ο' => 'O', # U+039F
      'Ρ' => 'P', # U+03A1
      'Τ' => 'T', # U+03A4
      'Υ' => 'Y', # U+03A5
      'Χ' => 'X', # U+03A7
      'α' => 'a', # U+03B1
      'γ' => 'y', # U+03B3
      'ι' => 'i', # U+03B9
      'ν' => 'v', # U+03BD
      'ο' => 'o', # U+03BF
      'ρ' => 'p', # U+03C1
      'σ' => 'o', # U+03C3
      'υ' => 'u', # U+03C5
      'ϒ' => 'Y', # U+03D2
      'Ϝ' => 'F', # U+03DC
      'Ϩ' => '2', # U+03E8
      'Ϭ' => '6', # U+03EC
      'ϭ' => 'o', # U+03ED
      'ϱ' => 'p', # U+03F1
      'ϲ' => 'c', # U+03F2
      'ϳ' => 'j', # U+03F3
      'ϸ' => 'p', # U+03F8
      'Ϲ' => 'C', # U+03F9
      'Ϻ' => 'M', # U+03FA

      # Armenian → Latin
      'Ս' => 'U', # U+054D
      'Տ' => 'S', # U+054F
      'Օ' => 'O', # U+0555
      'ա' => 'w', # U+0561
      'գ' => 'q', # U+0563
      'զ' => 'q', # U+0566
      'հ' => 'h', # U+0570
      'ո' => 'n', # U+0578
      'ռ' => 'n', # U+057C
      'ս' => 'u', # U+057D
      'ց' => 'g', # U+0581
      'ւ' => 'i', # U+0582
      'ք' => 'f', # U+0584
      'օ' => 'o', # U+0585

      # Georgian → Latin
      'ყ' => 'y', # U+10E7
      'ჿ' => 'o', # U+10FF

      # Hebrew → Latin
      '׀' => 'l', # U+05C0
      'ו' => 'l', # U+05D5
      'ט' => 'v', # U+05D8
      'ן' => 'l', # U+05DF
      'ס' => 'o', # U+05E1

      # Ethiopic → Latin
      'ሀ' => 'U', # U+1200
      'ዐ' => 'O', # U+12D0
    }.freeze

    # Characters that occupy a position in the string but render nothing.
    INVISIBLE_CHARACTERS = [
      "\u200B", # Zero-width space
      "\u200C", # Zero-width non-joiner
      "\u200D", # Zero-width joiner
      "\u200E", # Left-to-right mark
      "\u200F", # Right-to-left mark
      "\uFEFF", # Zero-width no-break space (BOM)
      "\u2060", # Word joiner
      "\u00AD", # Soft hyphen
      "\u180E", # Mongolian vowel separator
      "\u2061", # Function application
      "\u2062", # Invisible times
      "\u2063", # Invisible separator
      "\u2064"  # Invisible plus
    ].freeze

    # Explicit bidirectional formatting controls. The isolate controls
    # (U+2066..U+2069) are listed next to the embedding/override controls
    # because they can reorder displayed text in the same way.
    DIRECTIONAL_OVERRIDES = [
      "\u202A", # Left-to-right embedding
      "\u202B", # Right-to-left embedding
      "\u202C", # Pop directional formatting
      "\u202D", # Left-to-right override
      "\u202E", # Right-to-left override
      "\u2066", # Left-to-right isolate
      "\u2067", # Right-to-left isolate
      "\u2068", # First strong isolate
      "\u2069"  # Pop directional isolate
    ].freeze

    # Script sets that may appear together without being reported as a
    # suspicious mix (a detected set only has to be a subset of one entry).
    SAFE_SCRIPT_COMBINATIONS = [
      Set[:Latin, :Han, :Hiragana, :Katakana],
      Set[:Latin, :Han, :Bopomofo],
      Set[:Latin, :Han, :Hangul],
      Set[:Hiragana, :Katakana, :Han],
      Set[:Latin, :Inherited],
      Set[:Latin, :Common],
      Set[:Latin, :Punctuation],
      Set[:Latin, :Digit],
      Set[:Latin, :Whitespace]
    ].freeze

    # @param char [String] a single character
    # @return [Boolean] true when +char+ is a key of MAPPING
    def self.confusable?(char)
      MAPPING.key?(char)
    end

    # @param char [String] a single character
    # @return [String, nil] the Latin look-alike for +char+, or nil when unmapped
    def self.looks_like(char)
      MAPPING[char]
    end

    # @param char [String] a single character
    # @return [Boolean] true when +char+ is listed in INVISIBLE_CHARACTERS
    def self.invisible?(char)
      INVISIBLE_CHARACTERS.include?(char)
    end

    # @param char [String] a single character
    # @return [Boolean] true when +char+ is listed in DIRECTIONAL_OVERRIDES
    def self.directional_override?(char)
      DIRECTIONAL_OVERRIDES.include?(char)
    end
  end
end
@@ -0,0 +1,172 @@
1
module UnicodeScriptDetector
  # Inspects a string for signs of homograph spoofing: look-alike characters,
  # invisible characters, bidirectional controls and suspicious script mixes.
  class SpoofDetector
    Detection = Struct.new(:type, :message, :characters, :severity, keyword_init: true)
    ConfusableChar = Struct.new(:char, :script, :looks_like, :position, keyword_init: true)
    InvisibleChar = Struct.new(:char, :codepoint, :name, :position, keyword_init: true)

    SEVERITY_HIGH = :high
    SEVERITY_MEDIUM = :medium
    SEVERITY_LOW = :low

    # Display names for invisible characters. Built once instead of on every
    # lookup.
    INVISIBLE_CHAR_NAMES = {
      "\u200B" => "Zero-width space",
      "\u200C" => "Zero-width non-joiner",
      "\u200D" => "Zero-width joiner",
      "\u200E" => "Left-to-right mark",
      "\u200F" => "Right-to-left mark",
      "\uFEFF" => "Zero-width no-break space",
      "\u2060" => "Word joiner",
      "\u00AD" => "Soft hyphen",
      "\u180E" => "Mongolian vowel separator",
      "\u2061" => "Function application",
      "\u2062" => "Invisible times",
      "\u2063" => "Invisible separator",
      "\u2064" => "Invisible plus"
    }.freeze
    private_constant :INVISIBLE_CHAR_NAMES

    # @param string [#to_s] text to analyse; nil is treated as an empty string
    def initialize(string)
      @string = string.to_s
      @detections = nil
    end

    # Returns all spoof detections found in the string (memoized).
    #
    # @return [Array<Detection>] at most one Detection per category, in the
    #   order confusable, invisible, directional_override, mixed_scripts
    def detections
      @detections ||= [
        detect_confusables,
        detect_invisible_characters,
        detect_directional_overrides,
        detect_mixed_scripts
      ].compact
    end

    # Returns true if any spoofing is detected
    def spoofed?
      detections.any?
    end

    # Returns only confusable character detections
    def confusables
      detections_of(:confusable)
    end

    # Returns only invisible character detections
    def invisible_characters
      detections_of(:invisible)
    end

    # Returns only bidirectional override detections
    def directional_overrides
      detections_of(:directional_override)
    end

    # Returns only mixed script detections
    def mixed_scripts
      detections_of(:mixed_scripts)
    end

    private

    attr_reader :string

    # Filters the memoized detections down to a single category.
    def detections_of(type)
      detections.select { |detection| detection.type == type }
    end

    # Yields every character with its index and collects the non-nil block
    # results, so each detector only states what it is looking for.
    def scan_chars
      string.each_char.with_index.filter_map { |char, idx| yield(char, idx) }
    end

    # Builds the :confusable detection, or nil when no look-alike is present.
    def detect_confusables
      found = scan_chars do |char, idx|
        next unless Confusables.confusable?(char)

        ConfusableChar.new(
          char: char,
          script: detect_script_for_char(char)[:name],
          looks_like: Confusables.looks_like(char),
          position: idx
        )
      end
      return nil if found.empty?

      Detection.new(
        type: :confusable,
        message: "Found #{found.length} character(s) from non-Latin scripts that visually resemble Latin letters",
        characters: found,
        severity: SEVERITY_HIGH
      )
    end

    # Builds the :invisible detection, or nil when no invisible character is present.
    def detect_invisible_characters
      found = scan_chars do |char, idx|
        next unless Confusables.invisible?(char)

        InvisibleChar.new(
          char: char,
          codepoint: format("U+%04X", char.ord),
          name: invisible_char_name(char),
          position: idx
        )
      end
      return nil if found.empty?

      Detection.new(
        type: :invisible,
        message: "Found #{found.length} invisible character(s)",
        characters: found,
        severity: SEVERITY_HIGH
      )
    end

    # Builds the :directional_override detection, or nil when no bidirectional
    # control is present.
    def detect_directional_overrides
      found = scan_chars do |char, idx|
        next unless Confusables.directional_override?(char)

        InvisibleChar.new(
          char: char,
          codepoint: format("U+%04X", char.ord),
          name: "Bidirectional override",
          position: idx
        )
      end
      return nil if found.empty?

      Detection.new(
        type: :directional_override,
        message: "Found #{found.length} bidirectional override character(s)",
        characters: found,
        severity: SEVERITY_HIGH
      )
    end

    # Builds the :mixed_scripts detection. Returns nil when, after dropping
    # the ignored pseudo-scripts, at most one script remains or the remaining
    # scripts all fit inside one known safe combination.
    def detect_mixed_scripts
      scripts = Detector.new(string).scripts.reject { |script| ignored_script?(script) }
      return nil if scripts.size <= 1

      script_set = scripts.to_set
      return nil if Confusables::SAFE_SCRIPT_COMBINATIONS.any? { |safe| script_set.subset?(safe) }

      names = scripts.map(&:to_s)
      Detection.new(
        type: :mixed_scripts,
        message: "Text contains a mix of #{scripts.size} scripts: #{names.join(", ")}",
        characters: names,
        severity: SEVERITY_MEDIUM
      )
    end

    # Returns the first Scripts::LIST entry whose :regex matches +char+, or an
    # "Other" placeholder entry when none matches.
    def detect_script_for_char(char)
      Scripts::LIST.find { |script_data| char.match?(script_data[:regex]) } ||
        { script: :Other, name: "Other" }
    end

    # Script categories that never count towards a script mix.
    def ignored_script?(script)
      %i[Common Inherited Whitespace Punctuation Digit New_Line Tab].include?(script)
    end

    # Display name for an invisible character, with a generic fallback.
    def invisible_char_name(char)
      INVISIBLE_CHAR_NAMES.fetch(char, "Unknown invisible character")
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module UnicodeScriptDetector
2
- VERSION = "0.0.9"
2
+ VERSION = "0.0.10"
3
3
  end
@@ -20,5 +20,13 @@ module UnicodeScriptDetector
20
20
  def contains_only?(string, scripts)
21
21
  UnicodeScriptDetector::Detector.new(string).contains_only?(scripts)
22
22
  end
23
+
24
# Builds a SpoofDetector for +string+ and returns its list of detections.
def spoof_analysis(string)
  detector = UnicodeScriptDetector::SpoofDetector.new(string)
  detector.detections
end
27
+
28
# Builds a SpoofDetector for +string+ and reports whether it found anything.
def spoofed?(string)
  detector = UnicodeScriptDetector::SpoofDetector.new(string)
  detector.spoofed?
end
23
31
  end
24
32
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: unicode_script_detector
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.9
4
+ version: 0.0.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - David Arendsen
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2026-01-09 00:00:00.000000000 Z
11
+ date: 2026-05-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: zeitwerk
@@ -54,9 +54,11 @@ files:
54
54
  - README.md
55
55
  - lib/unicode_script_detector.rb
56
56
  - lib/unicode_script_detector/character.rb
57
+ - lib/unicode_script_detector/confusables.rb
57
58
  - lib/unicode_script_detector/detector.rb
58
59
  - lib/unicode_script_detector/script_group.rb
59
60
  - lib/unicode_script_detector/scripts.rb
61
+ - lib/unicode_script_detector/spoof_detector.rb
60
62
  - lib/unicode_script_detector/version.rb
61
63
  homepage: https://github.com/davidarendsen/unicode_script_detector
62
64
  licenses:
@@ -73,7 +75,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
73
75
  requirements:
74
76
  - - ">="
75
77
  - !ruby/object:Gem::Version
76
- version: 3.1.0
78
+ version: 3.2.0
77
79
  required_rubygems_version: !ruby/object:Gem::Requirement
78
80
  requirements:
79
81
  - - ">="