unisec 0.0.1 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/data/DerivedName.txt +44181 -0
- data/lib/unisec/cli/cli.rb +14 -8
- data/lib/unisec/cli/rugrep.rb +33 -0
- data/lib/unisec/cli/size.rb +38 -0
- data/lib/unisec/cli/versions.rb +38 -0
- data/lib/unisec/properties.rb +11 -2
- data/lib/unisec/rugrep.rb +126 -0
- data/lib/unisec/size.rb +171 -0
- data/lib/unisec/version.rb +1 -1
- data/lib/unisec/versions.rb +90 -0
- data/lib/unisec.rb +5 -2
- metadata +13 -4
data/lib/unisec/cli/cli.rb
CHANGED
@@ -1,9 +1,12 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require 'unisec/cli/
|
3
|
+
require 'unisec/cli/confusables'
|
4
4
|
require 'unisec/cli/hexdump'
|
5
5
|
require 'unisec/cli/properties'
|
6
|
-
require 'unisec/cli/
|
6
|
+
require 'unisec/cli/rugrep'
|
7
|
+
require 'unisec/cli/size'
|
8
|
+
require 'unisec/cli/surrogates'
|
9
|
+
require 'unisec/cli/versions'
|
7
10
|
|
8
11
|
module Unisec
|
9
12
|
# Module used to create the CLI for the executable
|
@@ -14,14 +17,17 @@ module Unisec
|
|
14
17
|
|
15
18
|
# Mapping between the (sub-)commands as seen by the user
|
16
19
|
# on the command-line interface and the CLI modules in the lib
|
17
|
-
register 'surrogates to', Surrogates::To
|
18
|
-
register 'surrogates from', Surrogates::From
|
19
|
-
register 'hexdump', Hexdump
|
20
|
-
register 'properties list', Properties::List
|
21
|
-
register 'properties codepoints', Properties::Codepoints
|
22
|
-
register 'properties char', Properties::Char
|
23
20
|
register 'confusables list', Confusables::List
|
24
21
|
register 'confusables randomize', Confusables::Randomize
|
22
|
+
register 'grep', Grep
|
23
|
+
register 'hexdump', Hexdump
|
24
|
+
register 'properties char', Properties::Char
|
25
|
+
register 'properties codepoints', Properties::Codepoints
|
26
|
+
register 'properties list', Properties::List
|
27
|
+
register 'size', Size
|
28
|
+
register 'surrogates from', Surrogates::From
|
29
|
+
register 'surrogates to', Surrogates::To
|
30
|
+
register 'versions', Versions
|
25
31
|
end
|
26
32
|
end
|
27
33
|
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'dry/cli'
|
4
|
+
require 'unisec'
|
5
|
+
|
6
|
+
module Unisec
|
7
|
+
module CLI
|
8
|
+
module Commands
|
9
|
+
# CLI command `unisec grep` for the class {Unisec::Rugrep} from the lib.
|
10
|
+
#
|
11
|
+
# Example:
|
12
|
+
#
|
13
|
+
# ```plaintext
|
14
|
+
# $ unisec grep 'FRENCH \w+'
|
15
|
+
# U+20A3 ₣ FRENCH FRANC SIGN
|
16
|
+
# U+1F35F 🍟 FRENCH FRIES
|
17
|
+
# ```
|
18
|
+
class Grep < Dry::CLI::Command
|
19
|
+
desc 'Search for Unicode code point names by regular expression'
|
20
|
+
|
21
|
+
argument :regexp, required: true,
|
22
|
+
desc: 'regular expression'
|
23
|
+
|
24
|
+
# Search for Unicode code point names by regular expression.
|
25
|
+
# @param regexp [Regexp] Regular expression without delimiters or modifiers.
|
26
|
+
# Supports everything Ruby Regexp supports
|
27
|
+
def call(regexp: nil, **)
|
28
|
+
puts Unisec::Rugrep.regrep_display(regexp)
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'dry/cli'
|
4
|
+
require 'unisec'
|
5
|
+
|
6
|
+
module Unisec
|
7
|
+
module CLI
|
8
|
+
module Commands
|
9
|
+
# CLI command `unisec size` for the class {Unisec::Size} from the lib.
|
10
|
+
#
|
11
|
+
# Example:
|
12
|
+
#
|
13
|
+
# ```plaintext
|
14
|
+
# $ unisec size 🧑🏼🔬
|
15
|
+
# Code point(s): 4
|
16
|
+
# Grapheme(s): 1
|
17
|
+
# UTF-8 byte(s): 15
|
18
|
+
# UTF-16 byte(s): 14
|
19
|
+
# UTF-32 byte(s): 16
|
20
|
+
# UTF-8 unit(s): 15
|
21
|
+
# UTF-16 unit(s): 7
|
22
|
+
# UTF-32 unit(s): 4
|
23
|
+
# ```
|
24
|
+
class Size < Dry::CLI::Command
|
25
|
+
desc 'All kinf of size information about a Unicode string'
|
26
|
+
|
27
|
+
argument :input, required: true,
|
28
|
+
desc: 'String input'
|
29
|
+
|
30
|
+
# All kinds of size information about a Unicode string.
|
31
|
+
# @param input [String] Input string we want to know the size of
|
32
|
+
def call(input: nil, **)
|
33
|
+
puts Unisec::Size.new(input).display
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'dry/cli'
|
4
|
+
require 'unisec'
|
5
|
+
|
6
|
+
module Unisec
|
7
|
+
module CLI
|
8
|
+
module Commands
|
9
|
+
# CLI command `unisec versions` for the class {Unisec::Versions} from the lib.
|
10
|
+
#
|
11
|
+
# Example:
|
12
|
+
#
|
13
|
+
# ```plaintext
|
14
|
+
# $ unisec versions
|
15
|
+
# Unicode:
|
16
|
+
# Unicode (Ruby) 15.0.0
|
17
|
+
# Unicode (twitter_cldr gem) 14.0.0
|
18
|
+
# Unicode (unicode-confusable gem) 15.0.0
|
19
|
+
# ICU (twitter_cldr gem) 70.1
|
20
|
+
# CLDR (twitter_cldr gem) 40
|
21
|
+
# Unicode emoji (Ruby) 15.0
|
22
|
+
#
|
23
|
+
# Gems:
|
24
|
+
# unisec 0.0.1
|
25
|
+
# twitter_cldr gem 6.11.5
|
26
|
+
# unicode-confusable gem 1.9.0
|
27
|
+
# ```
|
28
|
+
class Versions < Dry::CLI::Command
|
29
|
+
desc 'Version of anything related to Unicode as used in unisec'
|
30
|
+
|
31
|
+
# Version of anything related to Unicode as used in unisec.
|
32
|
+
def call(**)
|
33
|
+
puts Unisec::Versions.display
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
data/lib/unisec/properties.rb
CHANGED
@@ -50,7 +50,7 @@ module Unisec
|
|
50
50
|
def self.codepoints_display(prop)
|
51
51
|
codepoints = Properties.codepoints(prop)
|
52
52
|
codepoints.each do |cp|
|
53
|
-
puts "#{Properties.
|
53
|
+
puts "#{Properties.deccp2stdhexcp(cp[:codepoint]).ljust(7)} #{cp[:char].ljust(4)} #{cp[:name]}"
|
54
54
|
end
|
55
55
|
nil
|
56
56
|
end
|
@@ -158,7 +158,7 @@ module Unisec
|
|
158
158
|
# @example
|
159
159
|
# Unisec::Properties.char2codepoint('💎') # => "U+1F48E"
|
160
160
|
def self.char2codepoint(chr)
|
161
|
-
|
161
|
+
Properties.deccp2stdhexcp(chr.codepoints.first)
|
162
162
|
end
|
163
163
|
|
164
164
|
# Display the code points in Unicode format for the given characters (code points as string)
|
@@ -174,5 +174,14 @@ module Unisec
|
|
174
174
|
end
|
175
175
|
out.join(' ')
|
176
176
|
end
|
177
|
+
|
178
|
+
# Convert from decimal code point to standardized format hexadecimal code point
|
179
|
+
# @param int_cp [Integer] Code point in decimal format
|
180
|
+
# @return [String] code point in Unicode format
|
181
|
+
# @example
|
182
|
+
# Unisec::Properties.deccp2stdhexcp(128640) # => "U+1F680"
|
183
|
+
def self.deccp2stdhexcp(int_cp)
|
184
|
+
"U+#{format('%.4x', int_cp).upcase}"
|
185
|
+
end
|
177
186
|
end
|
178
187
|
end
|
@@ -0,0 +1,126 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'twitter_cldr'
|
4
|
+
require 'paint'
|
5
|
+
|
6
|
+
module Unisec
|
7
|
+
# Ruby grep : Ruby regular expression search for Unicode code point names
|
8
|
+
class Rugrep
|
9
|
+
# UCD Derived names file location
|
10
|
+
# @see https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedName.txt
|
11
|
+
UCD_DERIVEDNAME = File.join(__dir__, '../../data/DerivedName.txt')
|
12
|
+
|
13
|
+
# Search code points by (Ruby) regexp
|
14
|
+
# @param regexp [Regexp] Regular expression without delimiters or modifiers.
|
15
|
+
# Supports everything Ruby Regexp supports
|
16
|
+
# @return [Array<Hash>] Array of code points (`{char: String, codepoint: Integer, name: String}`)
|
17
|
+
# @example
|
18
|
+
# Unisec::Rugrep.regrep('snowman|snowflake')
|
19
|
+
# # =>
|
20
|
+
# # [{:char=>"☃", :codepoint=>9731, :name=>"SNOWMAN"},
|
21
|
+
# # {:char=>"⛄", :codepoint=>9924, :name=>"SNOWMAN WITHOUT SNOW"},
|
22
|
+
# # {:char=>"⛇", :codepoint=>9927, :name=>"BLACK SNOWMAN"},
|
23
|
+
# # {:char=>"❄", :codepoint=>10052, :name=>"SNOWFLAKE"},
|
24
|
+
# # {:char=>"❅", :codepoint=>10053, :name=>"TIGHT TRIFOLIATE SNOWFLAKE"},
|
25
|
+
# # {:char=>"❆", :codepoint=>10054, :name=>"HEAVY CHEVRON SNOWFLAKE"}]
|
26
|
+
# Unisec::Rugrep.regrep('greek small letter \w+')
|
27
|
+
# # =>
|
28
|
+
# # [{:char=>"ͱ", :codepoint=>881, :name=>"GREEK SMALL LETTER HETA"},
|
29
|
+
# # {:char=>"ͳ", :codepoint=>883, :name=>"GREEK SMALL LETTER ARCHAIC SAMPI"},
|
30
|
+
# # {:char=>"ͷ", :codepoint=>887, :name=>"GREEK SMALL LETTER PAMPHYLIAN DIGAMMA"},
|
31
|
+
# # …]
|
32
|
+
def self.regrep(regexp)
|
33
|
+
out = []
|
34
|
+
file = File.new(UCD_DERIVEDNAME)
|
35
|
+
file.each_line(chomp: true) do |line|
|
36
|
+
# Skip if the line is empty or a comment
|
37
|
+
next if line.empty? || line[0] == '#'
|
38
|
+
|
39
|
+
# parse the line to extract code point as integer and the name
|
40
|
+
cp_int, name = line.split(';')
|
41
|
+
cp_int = cp_int.chomp.to_i(16)
|
42
|
+
name.lstrip!
|
43
|
+
next unless /#{regexp}/i.match?(name) # compiling regexp once is surprisingly not faster
|
44
|
+
|
45
|
+
out << {
|
46
|
+
char: TwitterCldr::Utils::CodePoints.to_string([cp_int]),
|
47
|
+
codepoint: cp_int,
|
48
|
+
name: name
|
49
|
+
}
|
50
|
+
end
|
51
|
+
out
|
52
|
+
end
|
53
|
+
|
54
|
+
# Display a CLI-friendly output listing all code points corresponding to a regular expression.
|
55
|
+
# @example
|
56
|
+
# Unisec::Rugrep.regrep_display('snowman|snowflake')
|
57
|
+
# # =>
|
58
|
+
# # U+2603 ☃ SNOWMAN
|
59
|
+
# # U+26C4 ⛄ SNOWMAN WITHOUT SNOW
|
60
|
+
# # U+26C7 ⛇ BLACK SNOWMAN
|
61
|
+
# # U+2744 ❄ SNOWFLAKE
|
62
|
+
# # U+2745 ❅ TIGHT TRIFOLIATE SNOWFLAKE
|
63
|
+
# # U+2746 ❆ HEAVY CHEVRON SNOWFLAKE
|
64
|
+
def self.regrep_display(regexp)
|
65
|
+
codepoints = regrep(regexp)
|
66
|
+
codepoints.each do |cp|
|
67
|
+
puts "#{Properties.deccp2stdhexcp(cp[:codepoint]).ljust(7)} #{cp[:char].ljust(4)} #{cp[:name]}"
|
68
|
+
end
|
69
|
+
nil
|
70
|
+
end
|
71
|
+
|
72
|
+
# Returns the version of Unicode used in UCD local file (data/DerivedName.txt)
|
73
|
+
# @return [String] Unicode version
|
74
|
+
# @example
|
75
|
+
# Unisec::Rugrep.ucd_derivedname_version # => "15.1.0"
|
76
|
+
def self.ucd_derivedname_version
|
77
|
+
first_line = File.open(UCD_DERIVEDNAME, &:readline)
|
78
|
+
first_line.match(/-(\d+\.\d+\.\d+)\.txt/).captures.first
|
79
|
+
end
|
80
|
+
|
81
|
+
# Search code points by (Ruby) regexp
|
82
|
+
# @param regexp [Regexp] Regular expression without delimiters or modifiers
|
83
|
+
# @return [Array<Hash>] Array of code points (`{char: String, codepoint: Integer, name: String}`)
|
84
|
+
# @example
|
85
|
+
# Unisec::Rugrep.regrep_slow('snowman|snowflake')
|
86
|
+
# # =>
|
87
|
+
# # [{:char=>"☃", :codepoint=>9731, :name=>"SNOWMAN"},
|
88
|
+
# # {:char=>"⛄", :codepoint=>9924, :name=>"SNOWMAN WITHOUT SNOW"},
|
89
|
+
# # {:char=>"⛇", :codepoint=>9927, :name=>"BLACK SNOWMAN"},
|
90
|
+
# # {:char=>"❄", :codepoint=>10052, :name=>"SNOWFLAKE"},
|
91
|
+
# # {:char=>"❅", :codepoint=>10053, :name=>"TIGHT TRIFOLIATE SNOWFLAKE"},
|
92
|
+
# # {:char=>"❆", :codepoint=>10054, :name=>"HEAVY CHEVRON SNOWFLAKE"}]
|
93
|
+
# @note ⚠ This command is very time consuming (~ 1min) and unoptimized (execute one regexp per code point…)
|
94
|
+
def self.regrep_slow(regexp)
|
95
|
+
out = []
|
96
|
+
TwitterCldr::Shared::CodePoint.each do |cp|
|
97
|
+
next unless /#{regexp}/oi.match?(cp.name) # compiling regexp once is surprisingly not faster
|
98
|
+
|
99
|
+
out << {
|
100
|
+
char: TwitterCldr::Utils::CodePoints.to_string([cp.code_point]),
|
101
|
+
codepoint: cp.code_point,
|
102
|
+
name: cp.name
|
103
|
+
}
|
104
|
+
end
|
105
|
+
out
|
106
|
+
end
|
107
|
+
|
108
|
+
# Display a CLI-friendly output listing all code points corresponding to a regular expression.
|
109
|
+
# @example
|
110
|
+
# Unisec::Rugrep.regrep_display_slow('snowman|snowflake')
|
111
|
+
# # =>
|
112
|
+
# # U+2603 ☃ SNOWMAN
|
113
|
+
# # U+26C4 ⛄ SNOWMAN WITHOUT SNOW
|
114
|
+
# # U+26C7 ⛇ BLACK SNOWMAN
|
115
|
+
# # U+2744 ❄ SNOWFLAKE
|
116
|
+
# # U+2745 ❅ TIGHT TRIFOLIATE SNOWFLAKE
|
117
|
+
# # U+2746 ❆ HEAVY CHEVRON SNOWFLAKE
|
118
|
+
def self.regrep_display_slow(regexp)
|
119
|
+
codepoints = regrep_slow(regexp)
|
120
|
+
codepoints.each do |cp|
|
121
|
+
puts "#{Properties.deccp2stdhexcp(cp[:codepoint]).ljust(7)} #{cp[:char].ljust(4)} #{cp[:name]}"
|
122
|
+
end
|
123
|
+
nil
|
124
|
+
end
|
125
|
+
end
|
126
|
+
end
|
data/lib/unisec/size.rb
ADDED
@@ -0,0 +1,171 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'paint'
|
4
|
+
|
5
|
+
module Unisec
|
6
|
+
# All kinds of size information about a Unicode string
|
7
|
+
class Size
|
8
|
+
# Number of code points
|
9
|
+
# @return [Integer] number of code points
|
10
|
+
# @example
|
11
|
+
# us = Unisec::Size.new('👩❤️👩')
|
12
|
+
# us.code_points_size # => 6
|
13
|
+
attr_reader :code_points_size
|
14
|
+
|
15
|
+
# Number of graphemes
|
16
|
+
# @return [Integer] number of graphemes
|
17
|
+
# @example
|
18
|
+
# us = Unisec::Size.new('👩❤️👩')
|
19
|
+
# us.grapheme_size # => 1
|
20
|
+
attr_reader :grapheme_size
|
21
|
+
|
22
|
+
# UTF-8 size in bytes
|
23
|
+
# @return [Integer] UTF-8 size in bytes
|
24
|
+
# @example
|
25
|
+
# us = Unisec::Size.new('👩❤️👩')
|
26
|
+
# us.utf8_bytesize # => 20
|
27
|
+
attr_reader :utf8_bytesize
|
28
|
+
|
29
|
+
# UTF-16 size in bytes
|
30
|
+
# @return [Integer] UTF-16 size in bytes
|
31
|
+
# @example
|
32
|
+
# us = Unisec::Size.new('👩❤️👩')
|
33
|
+
# us.utf16_bytesize # => 16
|
34
|
+
attr_reader :utf16_bytesize
|
35
|
+
|
36
|
+
# UTF-32 size in bytes
|
37
|
+
# @return [Integer] UTF-32 size in bytes
|
38
|
+
# @example
|
39
|
+
# us = Unisec::Size.new('👩❤️👩')
|
40
|
+
# us.utf32_bytesize # => 24
|
41
|
+
attr_reader :utf32_bytesize
|
42
|
+
|
43
|
+
# Number of UTF-8 units
|
44
|
+
# @return [Integer] number of UTF-8 units
|
45
|
+
# @example
|
46
|
+
# us = Unisec::Size.new('👩❤️👩')
|
47
|
+
# us.utf8_unitsize # => 20
|
48
|
+
attr_reader :utf8_unitsize
|
49
|
+
|
50
|
+
# Number of UTF-16 units
|
51
|
+
# @return [Integer] number of UTF-16 units
|
52
|
+
# @example
|
53
|
+
# us = Unisec::Size.new('👩❤️👩')
|
54
|
+
# us.utf16_unitsize # => 8
|
55
|
+
attr_reader :utf16_unitsize
|
56
|
+
|
57
|
+
# Number of UTF-32 units
|
58
|
+
# @return [Integer] number of UTF-32 units
|
59
|
+
# @example
|
60
|
+
# us = Unisec::Size.new('👩❤️👩')
|
61
|
+
# us.utf32_unitsize # => 6
|
62
|
+
attr_reader :utf32_unitsize
|
63
|
+
|
64
|
+
def initialize(str)
|
65
|
+
@code_points_size = Size.code_points_size(str)
|
66
|
+
@grapheme_size = Size.grapheme_size(str)
|
67
|
+
@utf8_bytesize = Size.utf8_bytesize(str)
|
68
|
+
@utf16_bytesize = Size.utf16_bytesize(str)
|
69
|
+
@utf32_bytesize = Size.utf32_bytesize(str)
|
70
|
+
@utf8_unitsize = Size.utf8_unitsize(str)
|
71
|
+
@utf16_unitsize = Size.utf16_unitsize(str)
|
72
|
+
@utf32_unitsize = Size.utf32_unitsize(str)
|
73
|
+
end
|
74
|
+
|
75
|
+
# Number of code points
|
76
|
+
# @param str [String] Input string we want to know the size of
|
77
|
+
# @return [Integer] number of code points
|
78
|
+
# @example
|
79
|
+
# Unisec::Size.code_points_size('👩❤️👩') # => 6
|
80
|
+
def self.code_points_size(str)
|
81
|
+
str.size
|
82
|
+
end
|
83
|
+
|
84
|
+
# Number of graphemes
|
85
|
+
# @param str [String] Input string we want to know the size of
|
86
|
+
# @return [Integer] number of graphemes
|
87
|
+
# @example
|
88
|
+
# Unisec::Size.grapheme_size('👩❤️👩') # => 1
|
89
|
+
def self.grapheme_size(str)
|
90
|
+
str.grapheme_clusters.size
|
91
|
+
end
|
92
|
+
|
93
|
+
# UTF-8 size in bytes
|
94
|
+
# @param str [String] Input string we want to know the size of
|
95
|
+
# @return [Integer] UTF-8 size in bytes
|
96
|
+
# @example
|
97
|
+
# Unisec::Size.utf8_bytesize('👩❤️👩') # => 20
|
98
|
+
def self.utf8_bytesize(str)
|
99
|
+
str.bytesize
|
100
|
+
end
|
101
|
+
|
102
|
+
# UTF-16 size in bytes
|
103
|
+
# @param str [String] Input string we want to know the size of
|
104
|
+
# @return [Integer] UTF-16 size in bytes
|
105
|
+
# @example
|
106
|
+
# Unisec::Size.utf16_bytesize('👩❤️👩') # => 16
|
107
|
+
def self.utf16_bytesize(str)
|
108
|
+
str.encode('UTF-16BE').bytesize
|
109
|
+
end
|
110
|
+
|
111
|
+
# UTF-32 size in bytes
|
112
|
+
# @param str [String] Input string we want to know the size of
|
113
|
+
# @return [Integer] UTF-32 size in bytes
|
114
|
+
# @example
|
115
|
+
# Unisec::Size.utf32_bytesize('👩❤️👩') # => 24
|
116
|
+
def self.utf32_bytesize(str)
|
117
|
+
str.encode('UTF-32BE').bytesize
|
118
|
+
end
|
119
|
+
|
120
|
+
# Number of UTF-8 units
|
121
|
+
# @param str [String] Input string we want to know the size of
|
122
|
+
# @return [Integer] number of UTF-8 units
|
123
|
+
# @example
|
124
|
+
# Unisec::Size.utf8_unitsize('👩❤️👩') # => 20
|
125
|
+
def self.utf8_unitsize(str)
|
126
|
+
utf8_bytesize(str)
|
127
|
+
end
|
128
|
+
|
129
|
+
# Number of UTF-16 units
|
130
|
+
# @param str [String] Input string we want to know the size of
|
131
|
+
# @return [Integer] number of UTF-16 units
|
132
|
+
# @example
|
133
|
+
# Unisec::Size.utf16_unitsize('👩❤️👩') # => 8
|
134
|
+
def self.utf16_unitsize(str)
|
135
|
+
utf16_bytesize(str) / 2
|
136
|
+
end
|
137
|
+
|
138
|
+
# Number of UTF-32 units
|
139
|
+
# @param str [String] Input string we want to know the size of
|
140
|
+
# @return [Integer] number of UTF-32 units
|
141
|
+
# @example
|
142
|
+
# Unisec::Size.utf32_unitsize('👩❤️👩') # => 6
|
143
|
+
def self.utf32_unitsize(str)
|
144
|
+
utf32_bytesize(str) / 4
|
145
|
+
end
|
146
|
+
|
147
|
+
# Display a CLI-friendly output summarizing the size information about a Unicode string.
|
148
|
+
# @example
|
149
|
+
# Unisec::Size.new('👩❤️👨').display
|
150
|
+
# # =>
|
151
|
+
# # Code point(s): 6
|
152
|
+
# # Grapheme(s): 1
|
153
|
+
# # UTF-8 byte(s): 20
|
154
|
+
# # UTF-16 byte(s): 16
|
155
|
+
# # UTF-32 byte(s): 24
|
156
|
+
# # UTF-8 unit(s): 20
|
157
|
+
# # UTF-16 unit(s): 8
|
158
|
+
# # UTF-32 unit(s): 6
|
159
|
+
def display
|
160
|
+
display = ->(key, value) { puts Paint[key, :red, :bold].ljust(27) + " #{value}" }
|
161
|
+
display.call('Code point(s):', @code_points_size)
|
162
|
+
display.call('Grapheme(s):', @grapheme_size)
|
163
|
+
display.call('UTF-8 byte(s):', @utf8_bytesize)
|
164
|
+
display.call('UTF-16 byte(s):', @utf16_bytesize)
|
165
|
+
display.call('UTF-32 byte(s):', @utf32_bytesize)
|
166
|
+
display.call('UTF-8 unit(s):', @utf8_unitsize)
|
167
|
+
display.call('UTF-16 unit(s):', @utf16_unitsize)
|
168
|
+
display.call('UTF-32 unit(s):', @utf32_unitsize)
|
169
|
+
end
|
170
|
+
end
|
171
|
+
end
|
data/lib/unisec/version.rb
CHANGED
@@ -0,0 +1,90 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'twitter_cldr'
|
4
|
+
require 'unicode/confusable'
|
5
|
+
require 'paint'
|
6
|
+
|
7
|
+
module Unisec
|
8
|
+
# Version information related to Unicode used in Unisec
|
9
|
+
class Versions
|
10
|
+
# Version and label of anything related to Unicode used in Unisec
|
11
|
+
# @return [Hash] versions of each component
|
12
|
+
# @example
|
13
|
+
# Unisec::Versions.versions
|
14
|
+
# # =>
|
15
|
+
# # {:unisec=>{:version=>"0.0.1", :label=>"unisec"},
|
16
|
+
# # … }
|
17
|
+
def self.versions # rubocop:disable Metrics/MethodLength
|
18
|
+
{
|
19
|
+
unisec: {
|
20
|
+
version: Unisec::VERSION,
|
21
|
+
label: 'unisec'
|
22
|
+
},
|
23
|
+
ruby_unicode: {
|
24
|
+
version: RbConfig::CONFIG['UNICODE_VERSION'],
|
25
|
+
label: 'Unicode (Ruby)'
|
26
|
+
},
|
27
|
+
ruby_unicode_emoji: {
|
28
|
+
version: RbConfig::CONFIG['UNICODE_EMOJI_VERSION'],
|
29
|
+
label: 'Unicode emoji (Ruby)'
|
30
|
+
},
|
31
|
+
twittercldr_cldr: {
|
32
|
+
version: TwitterCldr::Versions::CLDR_VERSION,
|
33
|
+
label: 'CLDR (twitter_cldr gem)'
|
34
|
+
},
|
35
|
+
twittercldr_icu: {
|
36
|
+
version: TwitterCldr::Versions::ICU_VERSION,
|
37
|
+
label: 'ICU (twitter_cldr gem)'
|
38
|
+
},
|
39
|
+
twittercldr_unicode: {
|
40
|
+
version: TwitterCldr::Versions::UNICODE_VERSION,
|
41
|
+
label: 'Unicode (twitter_cldr gem)'
|
42
|
+
},
|
43
|
+
twittercldr: {
|
44
|
+
version: TwitterCldr::VERSION,
|
45
|
+
label: 'twitter_cldr gem'
|
46
|
+
},
|
47
|
+
unicodeconfusable: {
|
48
|
+
version: Unicode::Confusable::VERSION,
|
49
|
+
label: 'unicode-confusable gem'
|
50
|
+
},
|
51
|
+
unicodeconfusable_unicode: {
|
52
|
+
version: Unicode::Confusable::UNICODE_VERSION,
|
53
|
+
label: 'Unicode (unicode-confusable gem)'
|
54
|
+
},
|
55
|
+
ucd_derivedname: {
|
56
|
+
version: Unisec::Rugrep.ucd_derivedname_version,
|
57
|
+
label: 'UCD (data/DerivedName.txt)'
|
58
|
+
}
|
59
|
+
}
|
60
|
+
end
|
61
|
+
|
62
|
+
# Display a CLI-friendly output of the version of anything related to Unicode used in unisec
|
63
|
+
# @example
|
64
|
+
# Unisec::Versions.display
|
65
|
+
# # =>
|
66
|
+
# # Unicode:
|
67
|
+
# # Unicode (Ruby) 15.0.0
|
68
|
+
# # …
|
69
|
+
# #
|
70
|
+
# # Gems:
|
71
|
+
# # unisec 0.0.1
|
72
|
+
# # …
|
73
|
+
def self.display # rubocop:disable Metrics/AbcSize
|
74
|
+
data = versions
|
75
|
+
display = ->(node) { puts Paint[data[node][:label], :red, :bold].ljust(44) + " #{data[node][:version]}" }
|
76
|
+
puts Paint['Unicode:', :underline]
|
77
|
+
display.call(:ruby_unicode)
|
78
|
+
display.call(:twittercldr_unicode)
|
79
|
+
display.call(:unicodeconfusable_unicode)
|
80
|
+
display.call(:twittercldr_icu)
|
81
|
+
display.call(:twittercldr_cldr)
|
82
|
+
display.call(:ruby_unicode_emoji)
|
83
|
+
display.call(:ucd_derivedname)
|
84
|
+
puts Paint["\nGems:", :underline]
|
85
|
+
display.call(:unisec)
|
86
|
+
display.call(:twittercldr)
|
87
|
+
display.call(:unicodeconfusable)
|
88
|
+
end
|
89
|
+
end
|
90
|
+
end
|
data/lib/unisec.rb
CHANGED
@@ -2,7 +2,10 @@
|
|
2
2
|
|
3
3
|
require 'unisec/version'
|
4
4
|
|
5
|
-
require 'unisec/
|
5
|
+
require 'unisec/confusables'
|
6
6
|
require 'unisec/hexdump'
|
7
7
|
require 'unisec/properties'
|
8
|
-
require 'unisec/
|
8
|
+
require 'unisec/rugrep'
|
9
|
+
require 'unisec/size'
|
10
|
+
require 'unisec/surrogates'
|
11
|
+
require 'unisec/versions'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: unisec
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Alexandre ZANNI
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-10-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ctf-party
|
@@ -86,7 +86,9 @@ dependencies:
|
|
86
86
|
- - "~>"
|
87
87
|
- !ruby/object:Gem::Version
|
88
88
|
version: '1.9'
|
89
|
-
description: Toolkit for security research manipulating Unicode
|
89
|
+
description: 'Toolkit for security research manipulating Unicode: confusables, homoglyphs,
|
90
|
+
hexdump, code point, UTF-8, UTF-16, UTF-32, properties, regexp search, size, grapheme,
|
91
|
+
surrogates, version, ICU, CLDR, UCD'
|
90
92
|
email: alexandre.zanni@europe.com
|
91
93
|
executables:
|
92
94
|
- unisec
|
@@ -95,18 +97,25 @@ extra_rdoc_files: []
|
|
95
97
|
files:
|
96
98
|
- LICENSE
|
97
99
|
- bin/unisec
|
100
|
+
- data/DerivedName.txt
|
98
101
|
- lib/unisec.rb
|
99
102
|
- lib/unisec/cli/cli.rb
|
100
103
|
- lib/unisec/cli/confusables.rb
|
101
104
|
- lib/unisec/cli/hexdump.rb
|
102
105
|
- lib/unisec/cli/properties.rb
|
106
|
+
- lib/unisec/cli/rugrep.rb
|
107
|
+
- lib/unisec/cli/size.rb
|
103
108
|
- lib/unisec/cli/surrogates.rb
|
109
|
+
- lib/unisec/cli/versions.rb
|
104
110
|
- lib/unisec/confusables.rb
|
105
111
|
- lib/unisec/hexdump.rb
|
106
112
|
- lib/unisec/properties.rb
|
113
|
+
- lib/unisec/rugrep.rb
|
114
|
+
- lib/unisec/size.rb
|
107
115
|
- lib/unisec/surrogates.rb
|
108
116
|
- lib/unisec/utils.rb
|
109
117
|
- lib/unisec/version.rb
|
118
|
+
- lib/unisec/versions.rb
|
110
119
|
homepage: https://github.com/Acceis/unisec
|
111
120
|
licenses:
|
112
121
|
- MIT
|
@@ -136,7 +145,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
136
145
|
- !ruby/object:Gem::Version
|
137
146
|
version: '0'
|
138
147
|
requirements: []
|
139
|
-
rubygems_version: 3.4.
|
148
|
+
rubygems_version: 3.4.10
|
140
149
|
signing_key:
|
141
150
|
specification_version: 4
|
142
151
|
summary: Unicode Security Toolkit
|