unisec 0.0.1 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/data/DerivedName.txt +44181 -0
- data/lib/unisec/cli/cli.rb +14 -8
- data/lib/unisec/cli/rugrep.rb +33 -0
- data/lib/unisec/cli/size.rb +38 -0
- data/lib/unisec/cli/versions.rb +38 -0
- data/lib/unisec/properties.rb +11 -2
- data/lib/unisec/rugrep.rb +126 -0
- data/lib/unisec/size.rb +171 -0
- data/lib/unisec/version.rb +1 -1
- data/lib/unisec/versions.rb +90 -0
- data/lib/unisec.rb +5 -2
- metadata +13 -4
data/lib/unisec/cli/cli.rb
CHANGED
@@ -1,9 +1,12 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require 'unisec/cli/
|
3
|
+
require 'unisec/cli/confusables'
|
4
4
|
require 'unisec/cli/hexdump'
|
5
5
|
require 'unisec/cli/properties'
|
6
|
-
require 'unisec/cli/
|
6
|
+
require 'unisec/cli/rugrep'
|
7
|
+
require 'unisec/cli/size'
|
8
|
+
require 'unisec/cli/surrogates'
|
9
|
+
require 'unisec/cli/versions'
|
7
10
|
|
8
11
|
module Unisec
|
9
12
|
# Module used to create the CLI for the executable
|
@@ -14,14 +17,17 @@ module Unisec
|
|
14
17
|
|
15
18
|
# Mapping between the (sub-)commands as seen by the user
|
16
19
|
# on the command-line interface and the CLI modules in the lib
|
17
|
-
register 'surrogates to', Surrogates::To
|
18
|
-
register 'surrogates from', Surrogates::From
|
19
|
-
register 'hexdump', Hexdump
|
20
|
-
register 'properties list', Properties::List
|
21
|
-
register 'properties codepoints', Properties::Codepoints
|
22
|
-
register 'properties char', Properties::Char
|
23
20
|
register 'confusables list', Confusables::List
|
24
21
|
register 'confusables randomize', Confusables::Randomize
|
22
|
+
register 'grep', Grep
|
23
|
+
register 'hexdump', Hexdump
|
24
|
+
register 'properties char', Properties::Char
|
25
|
+
register 'properties codepoints', Properties::Codepoints
|
26
|
+
register 'properties list', Properties::List
|
27
|
+
register 'size', Size
|
28
|
+
register 'surrogates from', Surrogates::From
|
29
|
+
register 'surrogates to', Surrogates::To
|
30
|
+
register 'versions', Versions
|
25
31
|
end
|
26
32
|
end
|
27
33
|
end
|
data/lib/unisec/cli/rugrep.rb
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'dry/cli'
|
4
|
+
require 'unisec'
|
5
|
+
|
6
|
+
module Unisec
  module CLI
    module Commands
      # CLI command `unisec grep` for the class {Unisec::Rugrep} from the lib.
      #
      # Example:
      #
      # ```plaintext
      # $ unisec grep 'FRENCH \w+'
      # U+20A3  ₣    FRENCH FRANC SIGN
      # U+1F35F 🍟    FRENCH FRIES
      # ```
      class Grep < Dry::CLI::Command
        desc 'Search for Unicode code point names by regular expression'

        argument :regexp, required: true,
                          desc: 'regular expression'

        # Search Unicode code point names with a regular expression and print every match.
        # @param regexp [String] Regular expression source, without delimiters or modifiers.
        #   Supports everything Ruby Regexp supports
        def call(regexp: nil, **)
          # NOTE(review): regrep_display prints the matches itself and returns nil,
          # so this `puts` only appends an empty line after them.
          puts Unisec::Rugrep.regrep_display(regexp)
        end
      end
    end
  end
end
|
data/lib/unisec/cli/size.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'dry/cli'
|
4
|
+
require 'unisec'
|
5
|
+
|
6
|
+
module Unisec
  module CLI
    module Commands
      # CLI command `unisec size` for the class {Unisec::Size} from the lib.
      #
      # Example:
      #
      # ```plaintext
      # $ unisec size 🧑🏼‍🔬
      # Code point(s):   4
      # Grapheme(s):     1
      # UTF-8 byte(s):   15
      # UTF-16 byte(s):  14
      # UTF-32 byte(s):  16
      # UTF-8 unit(s):   15
      # UTF-16 unit(s):  7
      # UTF-32 unit(s):  4
      # ```
      class Size < Dry::CLI::Command
        desc 'All kinds of size information about a Unicode string'

        argument :input, required: true,
                         desc: 'String input'

        # Print all kinds of size information about a Unicode string.
        # @param input [String] Input string we want to know the size of
        def call(input: nil, **)
          # NOTE(review): Size#display prints the report itself and returns nil,
          # so this `puts` only appends an empty line after it.
          puts Unisec::Size.new(input).display
        end
      end
    end
  end
end
|
data/lib/unisec/cli/versions.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'dry/cli'
|
4
|
+
require 'unisec'
|
5
|
+
|
6
|
+
module Unisec
  module CLI
    module Commands
      # CLI command `unisec versions` for the class {Unisec::Versions} from the lib.
      #
      # Example:
      #
      # ```plaintext
      # $ unisec versions
      # Unicode:
      # Unicode (Ruby)                    15.0.0
      # Unicode (twitter_cldr gem)        14.0.0
      # Unicode (unicode-confusable gem)  15.0.0
      # ICU (twitter_cldr gem)            70.1
      # CLDR (twitter_cldr gem)           40
      # Unicode emoji (Ruby)              15.0
      # UCD (data/DerivedName.txt)        15.1.0
      #
      # Gems:
      # unisec                            0.0.1
      # twitter_cldr gem                  6.11.5
      # unicode-confusable gem            1.9.0
      # ```
      class Versions < Dry::CLI::Command
        desc 'Version of anything related to Unicode as used in unisec'

        # Print the version of anything related to Unicode as used in unisec.
        def call(**)
          # NOTE(review): Versions.display prints the report itself and returns nil,
          # so this `puts` only appends an empty line after it.
          puts Unisec::Versions.display
        end
      end
    end
  end
end
|
data/lib/unisec/properties.rb
CHANGED
@@ -50,7 +50,7 @@ module Unisec
|
|
50
50
|
def self.codepoints_display(prop)
|
51
51
|
codepoints = Properties.codepoints(prop)
|
52
52
|
codepoints.each do |cp|
|
53
|
-
puts "#{Properties.
|
53
|
+
puts "#{Properties.deccp2stdhexcp(cp[:codepoint]).ljust(7)} #{cp[:char].ljust(4)} #{cp[:name]}"
|
54
54
|
end
|
55
55
|
nil
|
56
56
|
end
|
@@ -158,7 +158,7 @@ module Unisec
|
|
158
158
|
# @example
|
159
159
|
# Unisec::Properties.char2codepoint('💎') # => "U+1F48E"
|
160
160
|
def self.char2codepoint(chr)
|
161
|
-
|
161
|
+
Properties.deccp2stdhexcp(chr.codepoints.first)
|
162
162
|
end
|
163
163
|
|
164
164
|
# Display the code points in Unicode format for the given characters (code points as string)
|
@@ -174,5 +174,14 @@ module Unisec
|
|
174
174
|
end
|
175
175
|
out.join(' ')
|
176
176
|
end
|
177
|
+
|
178
|
+
# Convert from decimal code point to standardized format hexadecimal code point
# @param int_cp [Integer] Code point in decimal format
# @return [String] code point in Unicode format: `U+` followed by the uppercase
#   hexadecimal value, zero-padded to at least 4 digits
# @example
#   Unisec::Properties.deccp2stdhexcp(128640) # => "U+1F680"
#   Unisec::Properties.deccp2stdhexcp(65) # => "U+0041"
def self.deccp2stdhexcp(int_cp)
  # %.4X = uppercase hexadecimal with a minimum of 4 digits
  format('U+%.4X', int_cp)
end
|
177
186
|
end
|
178
187
|
end
|
data/lib/unisec/rugrep.rb
ADDED
@@ -0,0 +1,126 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'twitter_cldr'
|
4
|
+
require 'paint'
|
5
|
+
|
6
|
+
module Unisec
  # Ruby grep : Ruby regular expression search for Unicode code point names
  class Rugrep
    # UCD Derived names file location
    # @see https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedName.txt
    UCD_DERIVEDNAME = File.join(__dir__, '../../data/DerivedName.txt')

    # Search code points by (Ruby) regexp, reading the names from the local UCD file ({UCD_DERIVEDNAME}).
    # The match is case-insensitive.
    # @param regexp [String] Regular expression source, without delimiters or modifiers.
    #   Supports everything Ruby Regexp supports
    # @return [Array<Hash>] Array of code points (`{char: String, codepoint: Integer, name: String}`)
    # @example
    #   Unisec::Rugrep.regrep('snowman|snowflake')
    #   # =>
    #   # [{:char=>"☃", :codepoint=>9731, :name=>"SNOWMAN"},
    #   #  {:char=>"⛄", :codepoint=>9924, :name=>"SNOWMAN WITHOUT SNOW"},
    #   #  {:char=>"⛇", :codepoint=>9927, :name=>"BLACK SNOWMAN"},
    #   #  {:char=>"❄", :codepoint=>10052, :name=>"SNOWFLAKE"},
    #   #  {:char=>"❅", :codepoint=>10053, :name=>"TIGHT TRIFOLIATE SNOWFLAKE"},
    #   #  {:char=>"❆", :codepoint=>10054, :name=>"HEAVY CHEVRON SNOWFLAKE"}]
    #   Unisec::Rugrep.regrep('greek small letter \w+')
    #   # =>
    #   # [{:char=>"ͱ", :codepoint=>881, :name=>"GREEK SMALL LETTER HETA"},
    #   #  {:char=>"ͳ", :codepoint=>883, :name=>"GREEK SMALL LETTER ARCHAIC SAMPI"},
    #   #  {:char=>"ͷ", :codepoint=>887, :name=>"GREEK SMALL LETTER PAMPHYLIAN DIGAMMA"},
    #   #  …]
    def self.regrep(regexp)
      # Built once per call: the pattern does not change from one line to the next
      pattern = /#{regexp}/i
      out = []
      # Block form so the file handle is closed even if an error is raised while reading
      File.open(UCD_DERIVEDNAME) do |file|
        file.each_line(chomp: true) do |line|
          # Skip if the line is empty or a comment
          next if line.empty? || line.start_with?('#')

          # parse the line to extract the hexadecimal code point and the name
          cp_hex, name = line.split(';')
          next if name.nil? # not a `code point ; name` entry

          name = name.lstrip
          next unless pattern.match?(name)

          cp_int = cp_hex.to_i(16)
          out << {
            char: TwitterCldr::Utils::CodePoints.to_string([cp_int]),
            codepoint: cp_int,
            name: name
          }
        end
      end
      out
    end

    # Display a CLI-friendly output listing all code points corresponding to a regular expression.
    # @param regexp [String] Regular expression source, without delimiters or modifiers
    # @return [nil]
    # @example
    #   Unisec::Rugrep.regrep_display('snowman|snowflake')
    #   # =>
    #   # U+2603  ☃    SNOWMAN
    #   # U+26C4  ⛄    SNOWMAN WITHOUT SNOW
    #   # U+26C7  ⛇    BLACK SNOWMAN
    #   # U+2744  ❄    SNOWFLAKE
    #   # U+2745  ❅    TIGHT TRIFOLIATE SNOWFLAKE
    #   # U+2746  ❆    HEAVY CHEVRON SNOWFLAKE
    def self.regrep_display(regexp)
      print_codepoints(regrep(regexp))
    end

    # Returns the version of Unicode used in UCD local file (data/DerivedName.txt).
    # The version is read from the first line of the file.
    # @return [String, nil] Unicode version, `nil` if the first line does not contain a
    #   `-<major>.<minor>.<patch>.txt` suffix
    # @example
    #   Unisec::Rugrep.ucd_derivedname_version # => "15.1.0"
    def self.ucd_derivedname_version
      first_line = File.open(UCD_DERIVEDNAME, &:readline)
      first_line[/-(\d+\.\d+\.\d+)\.txt/, 1]
    end

    # Search code points by (Ruby) regexp, iterating over the code points known by twitter_cldr.
    # The match is case-insensitive.
    # @param regexp [String] Regular expression source, without delimiters or modifiers
    # @return [Array<Hash>] Array of code points (`{char: String, codepoint: Integer, name: String}`)
    # @example
    #   Unisec::Rugrep.regrep_slow('snowman|snowflake')
    #   # =>
    #   # [{:char=>"☃", :codepoint=>9731, :name=>"SNOWMAN"},
    #   #  {:char=>"⛄", :codepoint=>9924, :name=>"SNOWMAN WITHOUT SNOW"},
    #   #  {:char=>"⛇", :codepoint=>9927, :name=>"BLACK SNOWMAN"},
    #   #  {:char=>"❄", :codepoint=>10052, :name=>"SNOWFLAKE"},
    #   #  {:char=>"❅", :codepoint=>10053, :name=>"TIGHT TRIFOLIATE SNOWFLAKE"},
    #   #  {:char=>"❆", :codepoint=>10054, :name=>"HEAVY CHEVRON SNOWFLAKE"}]
    # @note ⚠ This command is very time consuming (~ 1min) and unoptimized (execute one regexp per code point…)
    def self.regrep_slow(regexp)
      # Deliberately no `o` modifier: `o` interpolates only the first time the literal is
      # evaluated, which would make every later call reuse the regexp of the first call.
      pattern = /#{regexp}/i
      out = []
      TwitterCldr::Shared::CodePoint.each do |cp|
        next unless pattern.match?(cp.name)

        out << {
          char: TwitterCldr::Utils::CodePoints.to_string([cp.code_point]),
          codepoint: cp.code_point,
          name: cp.name
        }
      end
      out
    end

    # Display a CLI-friendly output listing all code points corresponding to a regular expression.
    # @param regexp [String] Regular expression source, without delimiters or modifiers
    # @return [nil]
    # @example
    #   Unisec::Rugrep.regrep_display_slow('snowman|snowflake')
    #   # =>
    #   # U+2603  ☃    SNOWMAN
    #   # U+26C4  ⛄    SNOWMAN WITHOUT SNOW
    #   # U+26C7  ⛇    BLACK SNOWMAN
    #   # U+2744  ❄    SNOWFLAKE
    #   # U+2745  ❅    TIGHT TRIFOLIATE SNOWFLAKE
    #   # U+2746  ❆    HEAVY CHEVRON SNOWFLAKE
    def self.regrep_display_slow(regexp)
      print_codepoints(regrep_slow(regexp))
    end

    # Print one `U+XXXX char name` line per code point (shared by the two display methods).
    # @param codepoints [Array<Hash>] code points as returned by {regrep} or {regrep_slow}
    # @return [nil]
    def self.print_codepoints(codepoints)
      codepoints.each do |cp|
        puts "#{Properties.deccp2stdhexcp(cp[:codepoint]).ljust(7)} #{cp[:char].ljust(4)} #{cp[:name]}"
      end
      nil
    end
    private_class_method :print_codepoints
  end
end
|
data/lib/unisec/size.rb
ADDED
@@ -0,0 +1,171 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'paint'
|
4
|
+
|
5
|
+
module Unisec
  # All kinds of size information about a Unicode string
  class Size
    # Number of code points
    # @return [Integer] number of code points
    # @example
    #   us = Unisec::Size.new('👩‍❤️‍👩')
    #   us.code_points_size # => 6
    attr_reader :code_points_size

    # Number of graphemes
    # @return [Integer] number of graphemes
    # @example
    #   us = Unisec::Size.new('👩‍❤️‍👩')
    #   us.grapheme_size # => 1
    attr_reader :grapheme_size

    # UTF-8 size in bytes
    # @return [Integer] UTF-8 size in bytes
    # @example
    #   us = Unisec::Size.new('👩‍❤️‍👩')
    #   us.utf8_bytesize # => 20
    attr_reader :utf8_bytesize

    # UTF-16 size in bytes
    # @return [Integer] UTF-16 size in bytes
    # @example
    #   us = Unisec::Size.new('👩‍❤️‍👩')
    #   us.utf16_bytesize # => 16
    attr_reader :utf16_bytesize

    # UTF-32 size in bytes
    # @return [Integer] UTF-32 size in bytes
    # @example
    #   us = Unisec::Size.new('👩‍❤️‍👩')
    #   us.utf32_bytesize # => 24
    attr_reader :utf32_bytesize

    # Number of UTF-8 units
    # @return [Integer] number of UTF-8 units
    # @example
    #   us = Unisec::Size.new('👩‍❤️‍👩')
    #   us.utf8_unitsize # => 20
    attr_reader :utf8_unitsize

    # Number of UTF-16 units
    # @return [Integer] number of UTF-16 units
    # @example
    #   us = Unisec::Size.new('👩‍❤️‍👩')
    #   us.utf16_unitsize # => 8
    attr_reader :utf16_unitsize

    # Number of UTF-32 units
    # @return [Integer] number of UTF-32 units
    # @example
    #   us = Unisec::Size.new('👩‍❤️‍👩')
    #   us.utf32_unitsize # => 6
    attr_reader :utf32_unitsize

    # Compute every size of the input once; the results are exposed through the readers.
    # @param str [String] Input string we want to know the size of
    def initialize(str)
      @code_points_size = Size.code_points_size(str)
      @grapheme_size = Size.grapheme_size(str)
      @utf8_bytesize = Size.utf8_bytesize(str)
      @utf16_bytesize = Size.utf16_bytesize(str)
      @utf32_bytesize = Size.utf32_bytesize(str)
      @utf8_unitsize = Size.utf8_unitsize(str)
      @utf16_unitsize = Size.utf16_unitsize(str)
      @utf32_unitsize = Size.utf32_unitsize(str)
    end

    # Number of code points
    # @param str [String] Input string we want to know the size of
    # @return [Integer] number of code points
    # @example
    #   Unisec::Size.code_points_size('👩‍❤️‍👩') # => 6
    def self.code_points_size(str)
      str.size
    end

    # Number of graphemes
    # @param str [String] Input string we want to know the size of
    # @return [Integer] number of graphemes
    # @example
    #   Unisec::Size.grapheme_size('👩‍❤️‍👩') # => 1
    def self.grapheme_size(str)
      str.grapheme_clusters.size
    end

    # UTF-8 size in bytes
    # @param str [String] Input string we want to know the size of
    # @return [Integer] UTF-8 size in bytes
    # @example
    #   Unisec::Size.utf8_bytesize('👩‍❤️‍👩') # => 20
    def self.utf8_bytesize(str)
      # Transcode first so the result is the UTF-8 size even when the input is stored in
      # another encoding (e.g. UTF-16LE); for a string already in UTF-8 this changes nothing.
      str.encode('UTF-8').bytesize
    end

    # UTF-16 size in bytes
    # @param str [String] Input string we want to know the size of
    # @return [Integer] UTF-16 size in bytes
    # @example
    #   Unisec::Size.utf16_bytesize('👩‍❤️‍👩') # => 16
    def self.utf16_bytesize(str)
      str.encode('UTF-16BE').bytesize
    end

    # UTF-32 size in bytes
    # @param str [String] Input string we want to know the size of
    # @return [Integer] UTF-32 size in bytes
    # @example
    #   Unisec::Size.utf32_bytesize('👩‍❤️‍👩') # => 24
    def self.utf32_bytesize(str)
      str.encode('UTF-32BE').bytesize
    end

    # Number of UTF-8 units (a UTF-8 code unit is 1 byte)
    # @param str [String] Input string we want to know the size of
    # @return [Integer] number of UTF-8 units
    # @example
    #   Unisec::Size.utf8_unitsize('👩‍❤️‍👩') # => 20
    def self.utf8_unitsize(str)
      utf8_bytesize(str)
    end

    # Number of UTF-16 units (a UTF-16 code unit is 2 bytes)
    # @param str [String] Input string we want to know the size of
    # @return [Integer] number of UTF-16 units
    # @example
    #   Unisec::Size.utf16_unitsize('👩‍❤️‍👩') # => 8
    def self.utf16_unitsize(str)
      utf16_bytesize(str) / 2
    end

    # Number of UTF-32 units (a UTF-32 code unit is 4 bytes)
    # @param str [String] Input string we want to know the size of
    # @return [Integer] number of UTF-32 units
    # @example
    #   Unisec::Size.utf32_unitsize('👩‍❤️‍👩') # => 6
    def self.utf32_unitsize(str)
      utf32_bytesize(str) / 4
    end

    # Display a CLI-friendly output summarizing the size information about a Unicode string.
    # @return [nil]
    # @example
    #   Unisec::Size.new('👩‍❤️‍👨').display
    #   # =>
    #   # Code point(s):   6
    #   # Grapheme(s):     1
    #   # UTF-8 byte(s):   20
    #   # UTF-16 byte(s):  16
    #   # UTF-32 byte(s):  24
    #   # UTF-8 unit(s):   20
    #   # UTF-16 unit(s):  8
    #   # UTF-32 unit(s):  6
    def display
      rows = {
        'Code point(s):' => @code_points_size,
        'Grapheme(s):' => @grapheme_size,
        'UTF-8 byte(s):' => @utf8_bytesize,
        'UTF-16 byte(s):' => @utf16_bytesize,
        'UTF-32 byte(s):' => @utf32_bytesize,
        'UTF-8 unit(s):' => @utf8_unitsize,
        'UTF-16 unit(s):' => @utf16_unitsize,
        'UTF-32 unit(s):' => @utf32_unitsize
      }
      rows.each do |label, value|
        # The padding is applied to the painted label, so the width of 27 also has to
        # cover whatever escape sequences Paint adds around the text.
        puts "#{Paint[label, :red, :bold].ljust(27)} #{value}"
      end
      nil
    end
  end
end
|
data/lib/unisec/version.rb
CHANGED
data/lib/unisec/versions.rb
ADDED
@@ -0,0 +1,90 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'twitter_cldr'
|
4
|
+
require 'unicode/confusable'
|
5
|
+
require 'paint'
|
6
|
+
|
7
|
+
module Unisec
  # Version information related to Unicode used in Unisec
  class Versions
    # Collect the version and the display label of every Unicode-related component used by Unisec.
    # @return [Hash] one entry per component, each of the form `{version: ..., label: String}`
    # @example
    #   Unisec::Versions.versions
    #   # =>
    #   # {:unisec=>{:version=>"0.0.1", :label=>"unisec"},
    #   # … }
    def self.versions # rubocop:disable Metrics/MethodLength
      components = [
        [:unisec, Unisec::VERSION, 'unisec'],
        [:ruby_unicode, RbConfig::CONFIG['UNICODE_VERSION'], 'Unicode (Ruby)'],
        [:ruby_unicode_emoji, RbConfig::CONFIG['UNICODE_EMOJI_VERSION'], 'Unicode emoji (Ruby)'],
        [:twittercldr_cldr, TwitterCldr::Versions::CLDR_VERSION, 'CLDR (twitter_cldr gem)'],
        [:twittercldr_icu, TwitterCldr::Versions::ICU_VERSION, 'ICU (twitter_cldr gem)'],
        [:twittercldr_unicode, TwitterCldr::Versions::UNICODE_VERSION, 'Unicode (twitter_cldr gem)'],
        [:twittercldr, TwitterCldr::VERSION, 'twitter_cldr gem'],
        [:unicodeconfusable, Unicode::Confusable::VERSION, 'unicode-confusable gem'],
        [:unicodeconfusable_unicode, Unicode::Confusable::UNICODE_VERSION, 'Unicode (unicode-confusable gem)'],
        [:ucd_derivedname, Unisec::Rugrep.ucd_derivedname_version, 'UCD (data/DerivedName.txt)']
      ]
      components.each_with_object({}) do |(key, version, label), table|
        table[key] = { version: version, label: label }
      end
    end

    # Print a CLI-friendly report of {versions}: the Unicode related versions first,
    # then the gem versions.
    # @return [nil]
    # @example
    #   Unisec::Versions.display
    #   # =>
    #   # Unicode:
    #   # Unicode (Ruby)                    15.0.0
    #   # …
    #   #
    #   # Gems:
    #   # unisec                            0.0.1
    #   # …
    def self.display
      data = versions
      show = lambda do |node|
        entry = data[node]
        puts Paint[entry[:label], :red, :bold].ljust(44) + " #{entry[:version]}"
      end

      puts Paint['Unicode:', :underline]
      %i[
        ruby_unicode twittercldr_unicode unicodeconfusable_unicode
        twittercldr_icu twittercldr_cldr ruby_unicode_emoji ucd_derivedname
      ].each { |node| show.call(node) }

      puts Paint["\nGems:", :underline]
      %i[unisec twittercldr unicodeconfusable].each { |node| show.call(node) }
      nil
    end
  end
end
|
data/lib/unisec.rb
CHANGED
@@ -2,7 +2,10 @@
|
|
2
2
|
|
3
3
|
require 'unisec/version'
|
4
4
|
|
5
|
-
require 'unisec/
|
5
|
+
require 'unisec/confusables'
|
6
6
|
require 'unisec/hexdump'
|
7
7
|
require 'unisec/properties'
|
8
|
-
require 'unisec/
|
8
|
+
require 'unisec/rugrep'
|
9
|
+
require 'unisec/size'
|
10
|
+
require 'unisec/surrogates'
|
11
|
+
require 'unisec/versions'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: unisec
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Alexandre ZANNI
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-10-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ctf-party
|
@@ -86,7 +86,9 @@ dependencies:
|
|
86
86
|
- - "~>"
|
87
87
|
- !ruby/object:Gem::Version
|
88
88
|
version: '1.9'
|
89
|
-
description: Toolkit for security research manipulating Unicode
|
89
|
+
description: 'Toolkit for security research manipulating Unicode: confusables, homoglyphs,
|
90
|
+
hexdump, code point, UTF-8, UTF-16, UTF-32, properties, regexp search, size, grapheme,
|
91
|
+
surrogates, version, ICU, CLDR, UCD'
|
90
92
|
email: alexandre.zanni@europe.com
|
91
93
|
executables:
|
92
94
|
- unisec
|
@@ -95,18 +97,25 @@ extra_rdoc_files: []
|
|
95
97
|
files:
|
96
98
|
- LICENSE
|
97
99
|
- bin/unisec
|
100
|
+
- data/DerivedName.txt
|
98
101
|
- lib/unisec.rb
|
99
102
|
- lib/unisec/cli/cli.rb
|
100
103
|
- lib/unisec/cli/confusables.rb
|
101
104
|
- lib/unisec/cli/hexdump.rb
|
102
105
|
- lib/unisec/cli/properties.rb
|
106
|
+
- lib/unisec/cli/rugrep.rb
|
107
|
+
- lib/unisec/cli/size.rb
|
103
108
|
- lib/unisec/cli/surrogates.rb
|
109
|
+
- lib/unisec/cli/versions.rb
|
104
110
|
- lib/unisec/confusables.rb
|
105
111
|
- lib/unisec/hexdump.rb
|
106
112
|
- lib/unisec/properties.rb
|
113
|
+
- lib/unisec/rugrep.rb
|
114
|
+
- lib/unisec/size.rb
|
107
115
|
- lib/unisec/surrogates.rb
|
108
116
|
- lib/unisec/utils.rb
|
109
117
|
- lib/unisec/version.rb
|
118
|
+
- lib/unisec/versions.rb
|
110
119
|
homepage: https://github.com/Acceis/unisec
|
111
120
|
licenses:
|
112
121
|
- MIT
|
@@ -136,7 +145,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
136
145
|
- !ruby/object:Gem::Version
|
137
146
|
version: '0'
|
138
147
|
requirements: []
|
139
|
-
rubygems_version: 3.4.
|
148
|
+
rubygems_version: 3.4.10
|
140
149
|
signing_key:
|
141
150
|
specification_version: 4
|
142
151
|
summary: Unicode Security Toolkit
|