unicode-confusable 1.11.0 → 1.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/Gemfile.lock +1 -1
- data/README.md +8 -4
- data/data/confusable.marshal.gz +0 -0
- data/lib/unicode/confusable/constants.rb +1 -1
- data/lib/unicode/confusable/ignorable.rb +9 -0
- data/lib/unicode/confusable.rb +8 -4
- data/spec/unicode_confusable_spec.rb +20 -8
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: decb589505c410acb8b064c0aa0b412f4178bac748a7b9eb0fb7a7d6d3b00da7
|
4
|
+
data.tar.gz: 5e60c81b398e29a1914cb356476b47c4c3b95312970d1c22064e7083c133c3b3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fc871a5e4a4291c95b0218e61f9a7125f872f5e27b94e1f63b5f33f36e6188bf90f147fcc82e24ea4d3c1153ba2cc364284124049f15a25914d6a520da33428c
|
7
|
+
data.tar.gz: de60105cb97d2255cf9dd930a767e019a7e5e0d1e6deb4d2201bc4e726c203cdf20ebf29417ae3c19d552bb93d0a9a90802a181513136e07701aec05c89fb885
|
data/CHANGELOG.md
CHANGED
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
# Unicode::Confusable [![[version]](https://badge.fury.io/rb/unicode-confusable.svg)](https://badge.fury.io/rb/unicode-confusable) [![[ci]](https://github.com/janlelis/unicode-confusable/workflows/Test/badge.svg)](https://github.com/janlelis/unicode-confusable/actions?query=workflow%3ATest)
|
2
2
|
|
3
|
-
Compares two strings if they are visually confusable as described in [Unicode® Technical Standard #39](https://www.unicode.org/reports/tr39/#Confusable_Detection): Both strings get transformed into a skeleton format before comparing them. The skeleton is generated by normalizing the string ([NFD](http://unicode.org/reports/tr15/#Norm_Forms)), replacing [confusable characters](https://unicode.org/Public/security/
|
3
|
+
Compares two strings if they are visually confusable as described in [Unicode® Technical Standard #39](https://www.unicode.org/reports/tr39/#Confusable_Detection): Both strings get transformed into a skeleton format before comparing them. The skeleton is generated by normalizing the string ([NFD](http://unicode.org/reports/tr15/#Norm_Forms)), removing ignorable characters, replacing [confusable characters](https://unicode.org/Public/security/16.0.0/confusables.txt), and normalizing the string again.
|
4
4
|
|
5
5
|
Unicode version: **16.0.0** (September 2024)
|
6
6
|
|
7
7
|
\* The Unicode normalization [depends on your Ruby version](https://idiosyncratic-ruby.com/73-unicode-version-mapping.html)
|
8
8
|
|
9
|
-
|
9
|
+
Please note: The TR39 standard now includes detection of confusables based on bidi formatting (i.e. right-to-left text). This is currently not supported by this gem.
|
10
10
|
|
11
|
-
|
11
|
+
Supported Rubies: **3.x** (might still work: **2.x**)
|
12
12
|
|
13
13
|
## Usage
|
14
14
|
|
@@ -49,6 +49,10 @@ Unicode::Confusable.list("o")
|
|
49
49
|
# => ["⒪", "ꜵ", "℅", "ᴔ", "ꭁ", "ꭂ", "ﷲ", "№", "ం", "ಂ", "ം", "ං", "०", "੦", "૦", "௦", "౦", "೦", "൦", "๐", "໐", "၀", "٥", "۵", "o", "ℴ", "𝐨", "𝑜", "𝒐", "𝓸", "𝔬", "𝕠", "𝖔", "𝗈", "𝗼", "𝘰", "𝙤", "𝚘", "ᴏ", "ᴑ", "ꬽ", "ο", "𝛐", "𝜊", "𝝄", "𝝾", "𝞸", "σ", "𝛔", "𝜎", "𝝈", "𝞂", "𝞼", "ⲟ", "о", "ჿ", "օ", "ס", "ه", "𞸤", "𞹤", "𞺄", "ﻫ", "ﻬ", "ﻪ", "ﻩ", "ھ", "ﮬ", "ﮭ", "ﮫ", "ﮪ", "ہ", "ﮨ", "ﮩ", "ﮧ", "ﮦ", "ە", "ഠ", "ဝ", "𐓪", "𑣈", "𑣗", "𐐬", "ۿ", "ø", "ꬾ", "ɵ", "ꝋ", "ө", "ѳ", "ꮎ", "ꮻ", "ꭴ", "ﳙ", "ơ", "œ", "ɶ", "∞", "ꝏ", "ꚙ", "ﳗ", "ﱑ", "ﳘ", "ﱒ", "ﶓ", "ﶔ", "ﱓ", "ﱔ", "ൟ", "တ", "ꭣ", "ﲠ", "ﳢ", "ﲥ", "ﳤ", "ﷻ", "ﴱ", "ﳨ", "ﴲ", "ﳪ", "ﷺ", "ﷷ", "ﳍ", "ﳖ", "ﳯ", "ﳞ", "ﳱ", "ﳦ", "ﲛ", "ﳠ", "ﯭ", "ﯬ"]
|
50
50
|
```
|
51
51
|
|
52
|
+
## No Bidi-Confusable Check
|
53
|
+
|
54
|
+
Testing for bidirectional confusables is currently not supported.
|
55
|
+
|
52
56
|
## No Advanced Detection
|
53
57
|
|
54
58
|
TR 39 also describes mechanisms for a more exact recognition of confusables, also within the same string:
|
@@ -57,7 +61,7 @@ TR 39 also describes mechanisms for a more exact recognition of confusables, als
|
|
57
61
|
- Mixed-script confusable
|
58
62
|
- Whole-script confusable
|
59
63
|
|
60
|
-
This is currently
|
64
|
+
This is currently not supported by this gem.
|
61
65
|
|
62
66
|
See [unicode-x](https://github.com/janlelis/unicode-x) for more Unicode related micro libraries.
|
63
67
|
|
data/data/confusable.marshal.gz
CHANGED
Binary file
|
data/lib/unicode/confusable.rb
CHANGED
@@ -4,6 +4,8 @@ require 'unicode_normalize/normalize'
|
|
4
4
|
|
5
5
|
module Unicode
|
6
6
|
module Confusable
|
7
|
+
autoload :IGNORABLE, File.expand_path('confusable/ignorable', __dir__)
|
8
|
+
|
7
9
|
def self.confusable?(string1, string2)
|
8
10
|
skeleton(string1) == skeleton(string2)
|
9
11
|
end
|
@@ -12,8 +14,10 @@ module Unicode
|
|
12
14
|
require_relative 'confusable/index' unless defined? ::Unicode::Confusable::INDEX
|
13
15
|
UnicodeNormalize.normalize(
|
14
16
|
UnicodeNormalize.normalize(string, :nfd).each_codepoint.map{ |codepoint|
|
15
|
-
|
16
|
-
|
17
|
+
unless IGNORABLE.include?(codepoint)
|
18
|
+
INDEX[:CONFUSABLE][codepoint] || codepoint
|
19
|
+
end
|
20
|
+
}.flatten.compact.pack("U*"), :nfd
|
17
21
|
)
|
18
22
|
end
|
19
23
|
|
@@ -21,9 +25,9 @@ module Unicode
|
|
21
25
|
require_relative 'confusable/index' unless defined? ::Unicode::Confusable::INDEX
|
22
26
|
codepoint = char.codepoints.first or raise ArgumentError, "no data given to Unicode::Confusable.list"
|
23
27
|
if partial_mapping_allowed
|
24
|
-
INDEX.select{ |k,v| v == codepoint || v.is_a?(Array) && v.include?(codepoint) }.keys.map{ |codepoint| [codepoint].pack("U*") }
|
28
|
+
INDEX[:CONFUSABLE].select{ |k,v| v == codepoint || v.is_a?(Array) && v.include?(codepoint) }.keys.map{ |codepoint| [codepoint].pack("U*") }
|
25
29
|
else
|
26
|
-
INDEX.select{ |k,v| v == codepoint }.keys.map{ |codepoint| [codepoint].pack("U") }
|
30
|
+
INDEX[:CONFUSABLE].select{ |k,v| v == codepoint }.keys.map{ |codepoint| [codepoint].pack("U") }
|
27
31
|
end
|
28
32
|
end
|
29
33
|
end
|
@@ -2,16 +2,28 @@ require_relative "../lib/unicode/confusable"
|
|
2
2
|
require "minitest/autorun"
|
3
3
|
|
4
4
|
describe Unicode::Confusable do
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
5
|
+
describe ".confusable?(string1, string2)" do
|
6
|
+
it "will detect official confusables" do
|
7
|
+
assert_equal true, Unicode::Confusable.confusable?("1", "l")
|
8
|
+
assert_equal true, Unicode::Confusable.confusable?("ℜ𝘂ᖯʏ", "Ruby")
|
9
|
+
assert_equal true, Unicode::Confusable.confusable?("Michael", "Michae1")
|
10
|
+
assert_equal true, Unicode::Confusable.confusable?("⁇", "??")
|
11
|
+
end
|
12
|
+
|
13
|
+
it "will return false for non-confusables" do
|
14
|
+
assert_equal false, Unicode::Confusable.confusable?("a", "b")
|
15
|
+
assert_equal false, Unicode::Confusable.confusable?("⁇", "?")
|
16
|
+
end
|
10
17
|
end
|
11
18
|
|
12
|
-
|
13
|
-
|
14
|
-
|
19
|
+
describe ".skeleton(string)" do
|
20
|
+
it "returns internal skeleton representation" do
|
21
|
+
assert_equal "Ruby", Unicode::Confusable.skeleton("ℜ𝘂ᖯʏ")
|
22
|
+
end
|
23
|
+
|
24
|
+
it "will remove default ignorable codepoints" do
|
25
|
+
assert_equal "ab", Unicode::Confusable.skeleton("a\u{FE0F}b")
|
26
|
+
end
|
15
27
|
end
|
16
28
|
|
17
29
|
describe ".list(char)" do
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: unicode-confusable
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.11.0
|
4
|
+
version: 1.12.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jan Lelis
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-10-30 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: "[Unicode 16.0.0] Compares two strings if they are visually confusable
|
14
14
|
as described in Unicode® Technical Standard #39: Both strings get transformed into
|
@@ -31,6 +31,7 @@ files:
|
|
31
31
|
- data/confusable.marshal.gz
|
32
32
|
- lib/unicode/confusable.rb
|
33
33
|
- lib/unicode/confusable/constants.rb
|
34
|
+
- lib/unicode/confusable/ignorable.rb
|
34
35
|
- lib/unicode/confusable/index.rb
|
35
36
|
- lib/unicode/confusable/string_ext.rb
|
36
37
|
- sig/unicode-confusable.rbs
|
@@ -56,7 +57,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
56
57
|
- !ruby/object:Gem::Version
|
57
58
|
version: '0'
|
58
59
|
requirements: []
|
59
|
-
rubygems_version: 3.5.
|
60
|
+
rubygems_version: 3.5.21
|
60
61
|
signing_key:
|
61
62
|
specification_version: 4
|
62
63
|
summary: Detect characters that look visually similar.
|