unisec 0.0.5 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +2 -1
- data/data/Blocks.txt +383 -0
- data/data/DerivedName.txt +1659 -12
- data/lib/unisec/bidi.rb +8 -8
- data/lib/unisec/blocks.rb +209 -0
- data/lib/unisec/cli/blocks.rb +93 -0
- data/lib/unisec/cli/cli.rb +12 -3
- data/lib/unisec/cli/dump.rb +87 -0
- data/lib/unisec/cli/normalization.rb +71 -39
- data/lib/unisec/cli/planes.rb +99 -0
- data/lib/unisec/cli/rugrep.rb +1 -1
- data/lib/unisec/confusables.rb +3 -1
- data/lib/unisec/decdump.rb +118 -0
- data/lib/unisec/hexdump.rb +1 -1
- data/lib/unisec/normalization.rb +46 -1
- data/lib/unisec/planes.rb +224 -0
- data/lib/unisec/properties.rb +11 -42
- data/lib/unisec/rugrep.rb +3 -2
- data/lib/unisec/utils.rb +94 -0
- data/lib/unisec/version.rb +1 -1
- data/lib/unisec/versions.rb +5 -0
- data/lib/unisec.rb +3 -0
- metadata +24 -27
- data/lib/unisec/cli/hexdump.rb +0 -47
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'dry/cli'
|
|
4
|
+
require 'unisec'
|
|
5
|
+
require 'unisec/utils'
|
|
6
|
+
|
|
7
|
+
module Unisec
|
|
8
|
+
module CLI
|
|
9
|
+
module Commands
|
|
10
|
+
# CLI sub-commands `unisec planes xxx` for the class {Unisec::Planes} from the lib.
|
|
11
|
+
module Planes
|
|
12
|
+
# Command `unisec planes list`
|
|
13
|
+
#
|
|
14
|
+
# Example:
|
|
15
|
+
#
|
|
16
|
+
# ```plaintext
|
|
17
|
+
# $ unisec planes list
|
|
18
|
+
# Range: U+0000 - U+FFFF Name: Basic Multilingual Plane
|
|
19
|
+
# Range: U+10000 - U+1FFFF Name: Supplementary Multilingual Plane
|
|
20
|
+
# Range: U+20000 - U+2FFFF Name: Supplementary Ideographic Plane
|
|
21
|
+
# Range: U+30000 - U+3FFFF Name: Tertiary Ideographic Plane
|
|
22
|
+
# …
|
|
23
|
+
# $ unisec planes list --with-blocks=true
|
|
24
|
+
# Range: U+0000 - U+FFFF Name: Basic Multilingual Plane
|
|
25
|
+
# Blocks:
|
|
26
|
+
# Range: U+0000 - U+007F Name: Basic Latin
|
|
27
|
+
# Range: U+0080 - U+00FF Name: Latin-1 Supplement
|
|
28
|
+
# Range: U+0100 - U+017F Name: Latin Extended-A
|
|
29
|
+
# Range: U+0180 - U+024F Name: Latin Extended-B
|
|
30
|
+
# ```
|
|
31
|
+
class List < Dry::CLI::Command
  desc 'List all Unicode planes'

  option :with_blocks, default: 'false', values: %w[true false],
                       desc: 'display the blocks associated with each plane?'
  option :with_count, default: 'false', values: %w[true false],
                      desc: "calculate block's range size & char count?"

  # Print the Unicode planes by delegating to {Unisec::Planes.list_display}.
  # @param options [Hash] parsed CLI options; `:with_blocks` and `:with_count`
  #   are each converted with `to_bool` and forwarded as keyword arguments
  #   (NOTE(review): `to_bool` is not core Ruby, presumably supplied by a dependency - confirm)
  def call(**options)
    flags = %i[with_blocks with_count].to_h { |flag| [flag, options[flag].to_bool] }
    Unisec::Planes.list_display(**flags)
  end
end
|
|
45
|
+
|
|
46
|
+
# Command `unisec planes search`
|
|
47
|
+
#
|
|
48
|
+
# Example:
|
|
49
|
+
#
|
|
50
|
+
# ```plaintext
|
|
51
|
+
# $ unisec planes search 3
|
|
52
|
+
# Range: U+30000 - U+3FFFF Name: Tertiary Ideographic Plane
|
|
53
|
+
# $ unisec planes search 2 --with-blocks=true
|
|
54
|
+
# Range: U+20000 - U+2FFFF Name: Supplementary Ideographic Plane
|
|
55
|
+
# Blocks:
|
|
56
|
+
# Range: U+20000 - U+2A6DF Name: CJK Unified Ideographs Extension B
|
|
57
|
+
# Range: U+2A700 - U+2B73F Name: CJK Unified Ideographs Extension C
|
|
58
|
+
# Range: U+2B740 - U+2B81F Name: CJK Unified Ideographs Extension D
|
|
59
|
+
# Range: U+2B820 - U+2CEAF Name: CJK Unified Ideographs Extension E
|
|
60
|
+
# Range: U+2CEB0 - U+2EBEF Name: CJK Unified Ideographs Extension F
|
|
61
|
+
# Range: U+2EBF0 - U+2EE5F Name: CJK Unified Ideographs Extension I
|
|
62
|
+
# Range: U+2F800 - U+2FA1F Name: CJK Compatibility Ideographs Supplement
|
|
63
|
+
# $ unisec planes search 'basic multilingual plane'
|
|
64
|
+
# Range: U+0000 - U+FFFF Name: Basic Multilingual Plane
|
|
65
|
+
# $ unisec planes search 'unassigned'
|
|
66
|
+
# Range: U+40000 - U+4FFFF Name: unassigned
|
|
67
|
+
# Range: U+50000 - U+5FFFF Name: unassigned
|
|
68
|
+
# Range: U+60000 - U+6FFFF Name: unassigned
|
|
69
|
+
# Range: U+70000 - U+7FFFF Name: unassigned
|
|
70
|
+
# Range: U+80000 - U+8FFFF Name: unassigned
|
|
71
|
+
# Range: U+90000 - U+9FFFF Name: unassigned
|
|
72
|
+
# Range: U+A0000 - U+AFFFF Name: unassigned
|
|
73
|
+
# Range: U+B0000 - U+BFFFF Name: unassigned
|
|
74
|
+
# Range: U+C0000 - U+CFFFF Name: unassigned
|
|
75
|
+
# Range: U+D0000 - U+DFFFF Name: unassigned
|
|
76
|
+
# ```
|
|
77
|
+
class Search < Dry::CLI::Command
  desc 'Search for a specific plane'

  argument :plane_arg, required: true,
                       desc: 'Name or number of the plane'

  option :with_blocks, default: 'false', values: %w[true false],
                       desc: 'display the blocks associated with each plane?'
  option :with_count, default: 'false', values: %w[true false],
                      desc: "calculate block's range size & char count?"

  # Print the plane(s) matching a name or a number by delegating to
  # {Unisec::Planes.plane_display}.
  # @param plane_arg [String] plane name, or plane number written in decimal digits
  # @param options [Hash] parsed CLI options; `:with_blocks` and `:with_count`
  #   are each converted with `to_bool` and forwarded as keyword arguments
  #   (NOTE(review): `to_bool` is not core Ruby, presumably supplied by a dependency - confirm)
  def call(plane_arg: nil, **options)
    # A digits-only argument is cast to an Integer (plane number), anything else is passed as is
    target = /\A\d+\Z/.match?(plane_arg) ? plane_arg.to_i : plane_arg
    flags = %i[with_blocks with_count].to_h { |flag| [flag, options[flag].to_bool] }
    Unisec::Planes.plane_display(target, **flags)
  end
end
|
|
96
|
+
end
|
|
97
|
+
end
|
|
98
|
+
end
|
|
99
|
+
end
|
data/lib/unisec/cli/rugrep.rb
CHANGED
|
@@ -21,7 +21,7 @@ module Unisec
|
|
|
21
21
|
argument :regexp, required: true,
|
|
22
22
|
desc: 'regular expression'
|
|
23
23
|
|
|
24
|
-
#
|
|
24
|
+
# Unicode code point names matching regexp.
|
|
25
25
|
# @param regexp [Regexp] Regular expression without delimiters or modifiers.
|
|
26
26
|
# Supports everything Ruby Regexp supports
|
|
27
27
|
def call(regexp: nil, **)
|
data/lib/unisec/confusables.rb
CHANGED
|
@@ -2,6 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
require 'unicode/confusable'
|
|
4
4
|
require 'twitter_cldr'
|
|
5
|
+
require 'paint'
|
|
6
|
+
require 'unisec/utils'
|
|
5
7
|
|
|
6
8
|
module Unisec
|
|
7
9
|
# Operations about Unicode confusable characters (homoglyphs).
|
|
@@ -22,7 +24,7 @@ module Unisec
|
|
|
22
24
|
# @param map [Boolean] allows partial mapping, includes confusables where the given char is only a part of
|
|
23
25
|
def self.list_display(chr, map: true)
|
|
24
26
|
Confusables.list(chr, map: map).each do |confu|
|
|
25
|
-
puts "#{
|
|
27
|
+
puts "#{Utils::String.char2codepoint(confu).ljust(9)} #{confu.ljust(4)} " \
|
|
26
28
|
"#{TwitterCldr::Shared::CodePoint.get(confu.codepoints.first).name}"
|
|
27
29
|
end
|
|
28
30
|
nil
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'ctf_party'
|
|
4
|
+
require 'paint'
|
|
5
|
+
|
|
6
|
+
module Unisec
|
|
7
|
+
# Decimal dump (decdump) of all Unicode encodings.
|
|
8
|
+
class Decdump
  # UTF-8 decdump
  # @return [String] UTF-8 decdump
  attr_reader :utf8

  # UTF-16BE decdump
  # @return [String] UTF-16BE decdump
  attr_reader :utf16be

  # UTF-16LE decdump
  # @return [String] UTF-16LE decdump
  attr_reader :utf16le

  # UTF-32BE decdump
  # @return [String] UTF-32BE decdump
  attr_reader :utf32be

  # UTF-32LE decdump
  # @return [String] UTF-32LE decdump
  attr_reader :utf32le

  # Init the decdump: the input is dumped once in each of the five encodings.
  # @param str [String] Input string to encode
  # @example
  #   ded = Unisec::Decdump.new('I 💕 Ruby 💎')
  #   ded.utf8 # => "073 032 240 159 146 149 032 082 117 098 121 032 240 159 146 142"
  #   ded.utf16be # => "|000 073| |000 032| |216 061| |220 149| |000 032| |000 082| |000 117| |000 098| |000 121| |000 032| |216 061| |220 142|"
  #   ded.utf32be # => "|000 000 000 073| |000 000 000 032| |000 001 244 149| |000 000 000 032| |000 000 000 082| |000 000 000 117| |000 000 000 098| |000 000 000 121| |000 000 000 032| |000 001 244 142|"
  def initialize(str)
    @utf8 = Decdump.utf8(str)
    @utf16be = Decdump.utf16be(str)
    @utf16le = Decdump.utf16le(str)
    @utf32be = Decdump.utf32be(str)
    @utf32le = Decdump.utf32le(str)
  end

  # Encode to UTF-8 in decdump format (spaced at every code unit = every byte)
  # @param str [String] Input string to encode
  # @return [String] decdump (UTF-8 encoded)
  # @example
  #   Unisec::Decdump.utf8('🐋') # => "240 159 144 139"
  def self.utf8(str)
    decimal_bytes(str, 'UTF-8').join(' ')
  end

  # Encode to UTF-16BE in decdump format (packed by code unit = every 2 bytes)
  # @param str [String] Input string to encode
  # @return [String] decdump (UTF-16BE encoded)
  # @example
  #   Unisec::Decdump.utf16be('🐋') # => "|216 061| |220 011|"
  def self.utf16be(str)
    pack_units(decimal_bytes(str, 'UTF-16BE'), 2)
  end

  # Encode to UTF-16LE in decdump format (packed by code unit = every 2 bytes)
  # @param str [String] Input string to encode
  # @return [String] decdump (UTF-16LE encoded)
  # @example
  #   Unisec::Decdump.utf16le('🐋') # => "|061 216| |011 220|"
  def self.utf16le(str)
    pack_units(decimal_bytes(str, 'UTF-16LE'), 2)
  end

  # Encode to UTF-32BE in decdump format (packed by code unit = every 4 bytes)
  # @param str [String] Input string to encode
  # @return [String] decdump (UTF-32BE encoded)
  # @example
  #   Unisec::Decdump.utf32be('🐋') # => "|000 001 244 011|"
  def self.utf32be(str)
    pack_units(decimal_bytes(str, 'UTF-32BE'), 4)
  end

  # Encode to UTF-32LE in decdump format (packed by code unit = every 4 bytes)
  # @param str [String] Input string to encode
  # @return [String] decdump (UTF-32LE encoded)
  # @example
  #   Unisec::Decdump.utf32le('🐋') # => "|011 244 001 000|"
  def self.utf32le(str)
    pack_units(decimal_bytes(str, 'UTF-32LE'), 4)
  end

  # Display a CLI-friendly output summarizing the decdump in all Unicode encodings
  # (every `|` separator is passed through Paint with the `:red` style)
  # @return [String] CLI-ready output
  # @example
  #   puts Unisec::Decdump.new("\u212A").display # "\u212A" is KELVIN SIGN =>
  #   # UTF-8: 226 132 170
  #   # UTF-16BE: |033 042|
  #   # UTF-16LE: |042 033|
  #   # UTF-32BE: |000 000 033 042|
  #   # UTF-32LE: |042 033 000 000|
  def display
    "UTF-8: #{@utf8}\n" \
    "UTF-16BE: #{@utf16be}\n" \
    "UTF-16LE: #{@utf16le}\n" \
    "UTF-32BE: #{@utf32be}\n" \
    "UTF-32LE: #{@utf32le}".gsub('|', Paint['|', :red])
  end

  # Transcode a string and render each resulting byte as a 3-digit, zero-padded decimal
  # @param str [String] Input string to encode
  # @param encoding [String] target encoding name (e.g. 'UTF-16BE')
  # @return [Array<String>] one "000".."255" string per byte, in byte order
  def self.decimal_bytes(str, encoding)
    str.encode(encoding).bytes.map { |byte| format('%03d', byte) }
  end

  # Group decimal bytes by code unit, each unit being wrapped between pipes
  # @param decimals [Array<String>] bytes as returned by {decimal_bytes}
  # @param unit_size [Integer] number of bytes per code unit
  # @return [String] e.g. "|216 061| |220 011|"
  def self.pack_units(decimals, unit_size)
    decimals.each_slice(unit_size).map { |unit| "|#{unit.join(' ')}|" }.join(' ')
  end

  private_class_method :decimal_bytes, :pack_units
end
|
|
118
|
+
end
|
data/lib/unisec/hexdump.rb
CHANGED
data/lib/unisec/normalization.rb
CHANGED
|
@@ -1,10 +1,22 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require 'ctf_party'
|
|
4
|
+
require 'paint'
|
|
5
|
+
require 'unisec/utils'
|
|
4
6
|
|
|
5
7
|
module Unisec
|
|
6
8
|
# Normalization Forms
|
|
7
9
|
class Normalization
|
|
10
|
+
# HTML escapable characters mapped to Unicode look-alikes that turn back into
# the ASCII character once a compatibility normalization form (NFKC / NFKD) is applied.
# The look-alikes are written as \u escapes on purpose: typed literally they are
# easy to confuse with (or to get silently converted into) the ASCII character,
# which would make the substitution a no-op.
HTML_ESCAPE_BYPASS = {
  '<' => ["\uFE64", "\uFF1C"], # SMALL LESS-THAN SIGN, FULLWIDTH LESS-THAN SIGN
  '>' => ["\uFE65", "\uFF1E"], # SMALL GREATER-THAN SIGN, FULLWIDTH GREATER-THAN SIGN
  '"' => ["\uFF02"], # FULLWIDTH QUOTATION MARK
  "'" => ["\uFF07"], # FULLWIDTH APOSTROPHE
  '&' => ["\uFE60", "\uFF06"] # SMALL AMPERSAND, FULLWIDTH AMPERSAND
}.freeze
|
|
19
|
+
|
|
8
20
|
# Original input
|
|
9
21
|
# @return [String] untouched input
|
|
10
22
|
attr_reader :original
|
|
@@ -64,6 +76,25 @@ module Unisec
|
|
|
64
76
|
str.unicode_normalize(:nfkd)
|
|
65
77
|
end
|
|
66
78
|
|
|
79
|
+
# Swap every HTML escapable character of a string for one of the Unicode
# counterparts listed in {HTML_ESCAPE_BYPASS}, i.e. characters that cast back to
# the original one after applying normalization forms using compatibility mode.
# Useful for XSS, to bypass HTML escape.
# When a character has several counterparts, one is drawn at random and used for
# all the occurrences of that character.
# @param str [String] the target string (left untouched, a copy is modified)
# @return [String] copy of the input with the escapable characters replaced
def self.replace_bypass(str)
  HTML_ESCAPE_BYPASS.each_with_object(str.dup) do |(escapable, counterparts), payload|
    payload.gsub!(escapable, counterparts.sample)
  end
end

# Instance version of {Normalization.replace_bypass}, applied to the original input.
# @return [String] what {Normalization.replace_bypass} returns for `@original`
def replace_bypass
  Normalization.replace_bypass(@original)
end
|
|
97
|
+
|
|
67
98
|
# Display a CLI-friendly output summarizing all normalization forms
|
|
68
99
|
# @return [String] CLI-ready output
|
|
69
100
|
# @example
|
|
@@ -82,7 +113,7 @@ module Unisec
|
|
|
82
113
|
def display
|
|
83
114
|
colorize = lambda { |form_title, form_attr|
|
|
84
115
|
"#{Paint[form_title.to_s, :underline,
|
|
85
|
-
:bold]}: #{form_attr}\n #{Paint[Unisec::
|
|
116
|
+
:bold]}: #{form_attr}\n #{Paint[Unisec::Utils::String.chars2codepoints(form_attr), :red]}\n"
|
|
86
117
|
}
|
|
87
118
|
colorize.call('Original', @original) +
|
|
88
119
|
colorize.call('NFC', @nfc) +
|
|
@@ -90,5 +121,19 @@ module Unisec
|
|
|
90
121
|
colorize.call('NFD', @nfd) +
|
|
91
122
|
colorize.call('NFKD', @nfkd)
|
|
92
123
|
end
|
|
124
|
+
|
|
125
|
+
# Build a CLI-friendly report about the XSS payload meant to bypass HTML escape:
# the original input, the bypass payload, then the payload once normalized in
# NFKC & NFKD. Each section shows the text and, on the next line, the output of
# {Unisec::Utils::String.chars2codepoints} for that text.
# @return [String] CLI-ready output
def display_replace
  payload = replace_bypass
  sections = [
    ['Original', @original],
    ['Bypass payload', payload],
    ['NFKC', Normalization.nfkc(payload)],
    ['NFKD', Normalization.nfkd(payload)]
  ]
  rendered = sections.map do |title, text|
    heading = Paint[title.to_s, :underline, :bold]
    codepoints = Paint[Unisec::Utils::String.chars2codepoints(text), :red]
    "#{heading}: #{text}\n #{codepoints}\n"
  end
  rendered.join
end
|
|
93
138
|
end
|
|
94
139
|
end
|
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'paint'
|
|
4
|
+
require 'unisec/utils'
|
|
5
|
+
|
|
6
|
+
module Unisec
|
|
7
|
+
# Operations about Unicode planes
|
|
8
|
+
class Planes # rubocop:disable Metrics/ClassLength
|
|
9
|
+
# Data about the planes: one Hash (`:range`, `:name`) per plane, in plane order.
# Plane number n covers the 0x10000 code points starting at n * 0x10000.
PLANES = [
  'Basic Multilingual Plane',
  'Supplementary Multilingual Plane',
  'Supplementary Ideographic Plane',
  'Tertiary Ideographic Plane',
  *Array.new(10) { 'unassigned' }, # planes 4 to 13
  'Supplementary Special-purpose Plane',
  'supplementary Private Use Area planes', # plane 15
  'supplementary Private Use Area planes' # plane 16
].each_with_index.map { |name, number| { range: (number << 16)..((number << 16) | 0xffff), name: name } }.freeze
|
|
29
|
+
|
|
30
|
+
# List all Unicode planes together with the blocks found for each of them
# @param with_count [TrueClass|FalseClass] calculate block's range size & char count? (warning: very slow, very unoptimized, see {Unisec::Blocks.list})
# @return [Array<Hash>] one Hash per entry of {PLANES} (`:range`, `:name`) plus a
#   `:blocks` key holding what {plane2blocks} returned for that plane
# @example
#   Unisec::Planes.list # =>
#   # [{range: 0..65535,
#   #   name: "Basic Multilingual Plane",
#   #   blocks:
#   #    [{range: 0..127, name: "Basic Latin", range_size: nil, char_count: nil},
#   #     {range: 128..255, name: "Latin-1 Supplement", range_size: nil, char_count: nil},
#   #     […]
def self.list(with_count: false)
  blocks_per_plane = plane2blocks(PLANES, with_count: with_count)
  # merge returns new Hashes, the entries of PLANES are left as they are
  PLANES.each_with_index.map do |plane, number|
    plane.merge(blocks: blocks_per_plane[number])
  end
end
|
|
47
|
+
|
|
48
|
+
# List details about target plane including the list of associated blocks.
# The entries of {PLANES} are never modified: the result is made of new Hashes.
# @param plane_arg [String|Integer] name (case-insensitive) or number of the plane
# @param with_count [TrueClass|FalseClass] calculate block's range size & char count? (see {Unisec::Blocks.list})
# @return [Hash|Array<Hash>|nil] nil if no match (unknown name, number out of range
#   or negative), Hash of the plane if one match, Array of planes' Hash if several matches
# @raise [ArgumentError] if plane_arg is neither an Integer nor a String
# @example
#   Unisec::Planes.plane(3) # =>
#   # {range: 196608..262143,
#   #  name: "Tertiary Ideographic Plane",
#   #  blocks:
#   #   [{range: 196608..201551, name: "CJK Unified Ideographs Extension G", range_size: nil, char_count: nil},
#   #    {range: 201552..205743, name: "CJK Unified Ideographs Extension H", range_size: nil, char_count: nil},
#   #    {range: 205744..210047, name: "CJK Unified Ideographs Extension J", range_size: nil, char_count: nil}]}
#   Unisec::Planes.plane('Supplementary Ideographic Plane') # =>
#   # {range: 131072..196607,
#   #  name: "Supplementary Ideographic Plane",
#   #  blocks:
#   #   [{range: 131072..173791, name: "CJK Unified Ideographs Extension B", range_size: nil, char_count: nil},
#   #    {range: 173824..177983, name: "CJK Unified Ideographs Extension C", range_size: nil, char_count: nil},
#   #    {range: 177984..178207, name: "CJK Unified Ideographs Extension D", range_size: nil, char_count: nil},
#   #    {range: 178208..183983, name: "CJK Unified Ideographs Extension E", range_size: nil, char_count: nil},
#   #    {range: 183984..191471, name: "CJK Unified Ideographs Extension F", range_size: nil, char_count: nil},
#   #    {range: 191472..192095, name: "CJK Unified Ideographs Extension I", range_size: nil, char_count: nil},
#   #    {range: 194560..195103, name: "CJK Compatibility Ideographs Supplement", range_size: nil, char_count: nil}]}
#   Unisec::Planes.plane('unassigned') # =>
#   # [{range: 262144..327679, name: "unassigned", blocks: []},
#   #  {range: 327680..393215, name: "unassigned", blocks: []},
#   #  {range: 393216..458751, name: "unassigned", blocks: []},
#   #  {range: 458752..524287, name: "unassigned", blocks: []},
#   #  {range: 524288..589823, name: "unassigned", blocks: []},
#   #  {range: 589824..655359, name: "unassigned", blocks: []},
#   #  {range: 655360..720895, name: "unassigned", blocks: []},
#   #  {range: 720896..786431, name: "unassigned", blocks: []},
#   #  {range: 786432..851967, name: "unassigned", blocks: []},
#   #  {range: 851968..917503, name: "unassigned", blocks: []}]
def self.plane(plane_arg, with_count: false) # rubocop:disable Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/MethodLength
  case plane_arg
  when Integer # search by plane number
    # A negative index would silently count from the end of PLANES: treat it as "no such plane"
    res = plane_arg.negative? ? nil : PLANES[plane_arg]
  when String # search by plane name
    wanted = plane_arg.downcase
    res = PLANES.select { |plane| plane[:name].downcase == wanted }
    return nil if res.empty?

    res = res.first if res.size == 1 # Hash if one, Array of Hash if multiples
  else
    raise ArgumentError, "plane_arg must be an Integer or a String (got #{plane_arg.class})"
  end
  # Enrich plane data with blocks. merge (not []=) is used in both branches so that
  # the Hashes stored in PLANES do not get a :blocks key as a side effect.
  case res
  when Hash # When 1 plane
    res.merge(blocks: plane2blocks(res, with_count: with_count))
  when Array # When multiple planes
    res.zip(plane2blocks(res, with_count: with_count)).map do |base, extra|
      base.merge(blocks: extra)
    end
  end # res is nil for an invalid plane number: no branch matches and nil is returned
end
|
|
108
|
+
|
|
109
|
+
# Find the blocks included in a given plane.
# {Unisec::Blocks.list} is called at most once per call, however many planes are
# given, and not at all when there is no plane to look at.
# @param plane [Hash|Array<Hash>] plane hash or array of plane hash
# @param with_count [TrueClass|FalseClass] calculate block's range size & char count? (see {Unisec::Blocks.list})
# @return [Array<Hash>|Array<Array<Hash>>] for a plane Hash, the blocks whose range
#   is included in the plane's range; for an Array, one such list per element, in the same order
# @raise [ArgumentError] if plane (or an element of the Array) is neither a Hash nor an Array
# @example
#   Unisec::Planes.plane2blocks({ range: 0x20000..0x2ffff, name: 'Supplementary Ideographic Plane' }) # =>
#   # [{range: 131072..173791, name: "CJK Unified Ideographs Extension B", range_size: nil, char_count: nil},
#   #  {range: 173824..177983, name: "CJK Unified Ideographs Extension C", range_size: nil, char_count: nil},
#   #  {range: 177984..178207, name: "CJK Unified Ideographs Extension D", range_size: nil, char_count: nil},
#   #  {range: 178208..183983, name: "CJK Unified Ideographs Extension E", range_size: nil, char_count: nil},
#   #  {range: 183984..191471, name: "CJK Unified Ideographs Extension F", range_size: nil, char_count: nil},
#   #  {range: 191472..192095, name: "CJK Unified Ideographs Extension I", range_size: nil, char_count: nil},
#   #  {range: 194560..195103, name: "CJK Compatibility Ideographs Supplement", range_size: nil, char_count: nil}]
def self.plane2blocks(plane, with_count: false)
  all_blocks = nil # block list, fetched lazily and shared by every plane of this call
  collect = lambda do |target|
    case target
    when Hash
      all_blocks ||= Unisec::Blocks.list(with_count: with_count)
      # NOTE(review): Range#include_range? is not core Ruby, presumably defined by the project - confirm
      all_blocks.select { |block| target[:range].include_range?(block[:range]) }
    when Array
      target.map { |pl| collect.call(pl) }
    else
      raise ArgumentError, "expected a plane Hash or an Array of plane Hash (got #{target.class})"
    end
  end
  collect.call(plane)
end
|
|
138
|
+
|
|
139
|
+
# Abbreviate a plane name by keeping only its uppercase letters, in order
# @param name [String] plane name (as in {PLANES} `:name`)
# @return [String] plane abbreviation
# @example
#   Unisec::Planes.abbr('Basic Multilingual Plane') # => "BMP"
#   Unisec::Planes.abbr('supplementary Private Use Area planes') # => "PUA"
def self.abbr(name)
  name.each_char.grep(/\p{Upper}/).join
end
|
|
148
|
+
|
|
149
|
+
# Print a CLI-friendly listing of all planes (as returned by {list}) to the standard output
# @param with_blocks [TrueClass|FalseClass] display the blocks associated with each plane
# @param with_count [TrueClass|FalseClass] calculate block's range size & char count? (see {Unisec::Blocks.list})
# @return [nil]
# @example
#   Unisec::Planes.list_display(with_blocks: true, with_count: false)
#   # Range: U+0000 - U+FFFF Name: Basic Multilingual Plane
#   # Blocks:
#   # Range: U+0000 - U+007F Name: Basic Latin
#   # Range: U+0080 - U+00FF Name: Latin-1 Supplement
#   # Range: U+0100 - U+017F Name: Latin Extended-A
#   # […]
def self.list_display(with_blocks: false, with_count: false) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  # Prints a bold label in the given color followed by the value padded to `width` (no newline)
  field = lambda do |color, label, value, width|
    print Paint[label, color, :bold] + " #{value}".ljust(width)
  end
  list(with_count: with_count).each do |pl|
    field.call(:red, 'Range:', Utils::Range.range2codepoint_range(pl[:range]), 22)
    field.call(:red, 'Name:', pl[:name], 50)
    if with_blocks
      puts
      field.call(:red, ' Blocks:', "\n", 0)
      pl[:blocks].each do |blk|
        field.call(:magenta, ' Range:', Utils::Range.range2codepoint_range(blk[:range]), 22)
        field.call(:magenta, 'Name:', blk[:name], 50)
        if with_count
          field.call(:magenta, 'Range size:', blk[:range_size], 8)
          field.call(:magenta, 'Char count:', blk[:char_count], 0)
        end
        puts # ends the block line
      end
    end
    puts # ends the plane line (or separates planes when blocks are shown)
  end
  nil
end
|
|
185
|
+
|
|
186
|
+
# Display a CLI-friendly output searching for a plane.
# Nothing is printed when {plane} finds no match.
# @param plane_arg [String|Integer] name or number of the plane
# @param with_blocks [TrueClass|FalseClass] display the blocks associated with each plane
# @param with_count [TrueClass|FalseClass] calculate block's range size & char count? (see {Unisec::Blocks.list})
# @return [nil]
# @example
#   Unisec::Planes.plane_display(3, with_blocks: true)
#   # Range: U+30000 - U+3FFFF Name: Tertiary Ideographic Plane
#   # Blocks:
#   # Range: U+30000 - U+3134F Name: CJK Unified Ideographs Extension G
#   # Range: U+31350 - U+323AF Name: CJK Unified Ideographs Extension H
#   # Range: U+323B0 - U+3347F Name: CJK Unified Ideographs Extension J
def self.plane_display(plane_arg, with_blocks: false, with_count: false) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  planes = plane(plane_arg, with_count: with_count)
  # plane returns nil when nothing matches: bail out instead of calling each on nil
  return nil if planes.nil?

  planes = [planes] if planes.is_a?(Hash) # single match: iterate over it like a list
  display = ->(key, value, just) { print Paint[key, :red, :bold] + " #{value}".ljust(just) }
  display_blk = ->(key, value, just) { print Paint[key, :magenta, :bold] + " #{value}".ljust(just) }
  planes.each do |pla|
    display.call('Range:', Utils::Range.range2codepoint_range(pla[:range]), 22)
    display.call('Name:', pla[:name], 50)
    if with_blocks
      puts
      display.call(' Blocks:', "\n", 0)
      pla[:blocks].each do |block|
        display_blk.call(' Range:', Utils::Range.range2codepoint_range(block[:range]), 22)
        display_blk.call('Name:', block[:name], 50)
        if with_count
          display_blk.call('Range size:', block[:range_size], 8)
          display_blk.call('Char count:', block[:char_count], 0)
        end
        puts
      end
    end
    puts
  end
  nil
end
|
|
223
|
+
end
|
|
224
|
+
end
|
data/lib/unisec/properties.rb
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require 'twitter_cldr'
|
|
4
4
|
require 'paint'
|
|
5
|
+
require 'unisec/utils'
|
|
5
6
|
|
|
6
7
|
module Unisec
|
|
7
8
|
# Manipulate Unicode properties
|
|
@@ -50,7 +51,7 @@ module Unisec
|
|
|
50
51
|
# Print one line per entry returned by {Properties.codepoints}: the code point
# formatted by {Utils::Integer.deccp2stdhexcp}, the character and the name
# @param prop [String] value forwarded to {Properties.codepoints}
# @return [nil]
def self.codepoints_display(prop)
  Properties.codepoints(prop).each do |entry|
    hex_cp = Utils::Integer.deccp2stdhexcp(entry[:codepoint]).ljust(7)
    glyph = entry[:char].ljust(4)
    puts "#{hex_cp} #{glyph} #{entry[:name]}"
  end
  nil
end
|
|
@@ -77,7 +78,7 @@ module Unisec
|
|
|
77
78
|
block: props.block.join,
|
|
78
79
|
category: categories[1],
|
|
79
80
|
subcategory: categories[0],
|
|
80
|
-
codepoint:
|
|
81
|
+
codepoint: Utils::String.char2codepoint(chr),
|
|
81
82
|
name: cp.name,
|
|
82
83
|
script: props.script.join,
|
|
83
84
|
case: {
|
|
@@ -127,22 +128,22 @@ module Unisec
|
|
|
127
128
|
display.call('Since (age):', "Version #{data[:age]}")
|
|
128
129
|
puts
|
|
129
130
|
x = data.dig(:case, :twitter, :uppercase)
|
|
130
|
-
display.call('Uppercase:', x + " (#{
|
|
131
|
+
display.call('Uppercase:', x + " (#{Utils::String.char2codepoint(x)})")
|
|
131
132
|
x = data.dig(:case, :twitter, :lowercase)
|
|
132
|
-
display.call('Lowercase:', x + " (#{
|
|
133
|
+
display.call('Lowercase:', x + " (#{Utils::String.char2codepoint(x)})")
|
|
133
134
|
x = data.dig(:case, :twitter, :titlecase)
|
|
134
|
-
display.call('Titlecase:', x + " (#{
|
|
135
|
+
display.call('Titlecase:', x + " (#{Utils::String.char2codepoint(x)})")
|
|
135
136
|
x = data.dig(:case, :twitter, :casefold)
|
|
136
|
-
display.call('Casefold:', x + " (#{
|
|
137
|
+
display.call('Casefold:', x + " (#{Utils::String.char2codepoint(x)})")
|
|
137
138
|
puts
|
|
138
139
|
x = data.dig(:normalization, :twitter, :nfkd)
|
|
139
|
-
display.call('Normalization NFKD:', x + " (#{
|
|
140
|
+
display.call('Normalization NFKD:', x + " (#{Utils::String.chars2codepoints(x)})")
|
|
140
141
|
x = data.dig(:normalization, :twitter, :nfkc)
|
|
141
|
-
display.call('Normalization NFKC:', x + " (#{
|
|
142
|
+
display.call('Normalization NFKC:', x + " (#{Utils::String.chars2codepoints(x)})")
|
|
142
143
|
x = data.dig(:normalization, :twitter, :nfd)
|
|
143
|
-
display.call('Normalization NFD:', x + " (#{
|
|
144
|
+
display.call('Normalization NFD:', x + " (#{Utils::String.chars2codepoints(x)})")
|
|
144
145
|
x = data.dig(:normalization, :twitter, :nfc)
|
|
145
|
-
display.call('Normalization NFC:', x + " (#{
|
|
146
|
+
display.call('Normalization NFC:', x + " (#{Utils::String.chars2codepoints(x)})")
|
|
146
147
|
if extended
|
|
147
148
|
puts
|
|
148
149
|
data[:other_properties].each do |k, v|
|
|
@@ -151,37 +152,5 @@ module Unisec
|
|
|
151
152
|
end
|
|
152
153
|
nil
|
|
153
154
|
end
|
|
154
|
-
|
|
155
|
-
# Display the code point in Unicode format for a given character (code point as string)
|
|
156
|
-
# @param chr [String] Unicode code point (as character / string)
|
|
157
|
-
# @return [String] code point in Unicode format
|
|
158
|
-
# @example
|
|
159
|
-
# Unisec::Properties.char2codepoint('💎') # => "U+1F48E"
|
|
160
|
-
def self.char2codepoint(chr)
|
|
161
|
-
Properties.deccp2stdhexcp(chr.codepoints.first)
|
|
162
|
-
end
|
|
163
|
-
|
|
164
|
-
# Display the code points in Unicode format for the given characters (code points as string)
|
|
165
|
-
# @param chrs [String] Unicode code points (as characters / string)
|
|
166
|
-
# @return [String] code points in Unicode format
|
|
167
|
-
# @example
|
|
168
|
-
# Unisec::Properties.chars2codepoints("ỳ́") # => "U+0079 U+0300 U+0301"
|
|
169
|
-
# Unisec::Properties.chars2codepoints("🧑🌾") # => "U+1F9D1 U+200D U+1F33E"
|
|
170
|
-
def self.chars2codepoints(chrs)
|
|
171
|
-
out = []
|
|
172
|
-
chrs.each_char do |chr|
|
|
173
|
-
out << Properties.char2codepoint(chr)
|
|
174
|
-
end
|
|
175
|
-
out.join(' ')
|
|
176
|
-
end
|
|
177
|
-
|
|
178
|
-
# Convert from decimal code point to standardized format hexadecimal code point
|
|
179
|
-
# @param int_cp [Integer] Code point in decimal format
|
|
180
|
-
# @return [String] code point in Unicode format
|
|
181
|
-
# @example
|
|
182
|
-
# Unisec::Properties.intcp2stdhexcp(128640) # => "U+1F680"
|
|
183
|
-
def self.deccp2stdhexcp(int_cp)
|
|
184
|
-
"U+#{format('%.4x', int_cp).upcase}"
|
|
185
|
-
end
|
|
186
155
|
end
|
|
187
156
|
end
|
data/lib/unisec/rugrep.rb
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require 'twitter_cldr'
|
|
4
4
|
require 'paint'
|
|
5
|
+
require 'unisec/utils'
|
|
5
6
|
|
|
6
7
|
module Unisec
|
|
7
8
|
# Ruby grep : Ruby regular expression search for Unicode code point names
|
|
@@ -64,7 +65,7 @@ module Unisec
|
|
|
64
65
|
# Print one line per entry returned by {regrep}: the code point formatted by
# {Utils::Integer.deccp2stdhexcp}, the character and the name
# @param regexp value forwarded to {regrep}
# @return [nil]
def self.regrep_display(regexp)
  regrep(regexp).each do |entry|
    hex_cp = Utils::Integer.deccp2stdhexcp(entry[:codepoint]).ljust(7)
    glyph = entry[:char].ljust(4)
    puts "#{hex_cp} #{glyph} #{entry[:name]}"
  end
  nil
end
|
|
@@ -118,7 +119,7 @@ module Unisec
|
|
|
118
119
|
# Print one line per entry returned by {regrep_slow}: the code point formatted by
# {Utils::Integer.deccp2stdhexcp}, the character and the name
# @param regexp value forwarded to {regrep_slow}
# @return [nil]
def self.regrep_display_slow(regexp)
  regrep_slow(regexp).each do |entry|
    hex_cp = Utils::Integer.deccp2stdhexcp(entry[:codepoint]).ljust(7)
    glyph = entry[:char].ljust(4)
    puts "#{hex_cp} #{glyph} #{entry[:name]}"
  end
  nil
end
|