unisec 0.0.6 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +2 -1
- data/data/Blocks.txt +383 -0
- data/data/DerivedName.txt +1659 -12
- data/lib/unisec/bidi.rb +8 -8
- data/lib/unisec/blocks.rb +209 -0
- data/lib/unisec/cli/blocks.rb +93 -0
- data/lib/unisec/cli/cli.rb +10 -2
- data/lib/unisec/cli/dump.rb +87 -0
- data/lib/unisec/cli/planes.rb +99 -0
- data/lib/unisec/cli/rugrep.rb +1 -1
- data/lib/unisec/confusables.rb +3 -1
- data/lib/unisec/decdump.rb +118 -0
- data/lib/unisec/hexdump.rb +1 -1
- data/lib/unisec/normalization.rb +4 -2
- data/lib/unisec/planes.rb +224 -0
- data/lib/unisec/properties.rb +11 -42
- data/lib/unisec/rugrep.rb +3 -2
- data/lib/unisec/utils.rb +94 -0
- data/lib/unisec/version.rb +1 -1
- data/lib/unisec/versions.rb +5 -0
- data/lib/unisec.rb +3 -0
- metadata +24 -27
- data/lib/unisec/cli/hexdump.rb +0 -47
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'ctf_party'
|
|
4
|
+
require 'paint'
|
|
5
|
+
|
|
6
|
+
module Unisec
  # Decimal dump (decdump) of all Unicode encodings.
  #
  # Every byte of the encoded string is rendered as a zero-padded, 3-digit
  # decimal number. For UTF-16 and UTF-32 the bytes of one code unit are
  # wrapped between pipes (`|`).
  class Decdump
    # UTF-8 decdump
    # @return [String] UTF-8 decdump
    attr_reader :utf8

    # UTF-16BE decdump
    # @return [String] UTF-16BE decdump
    attr_reader :utf16be

    # UTF-16LE decdump
    # @return [String] UTF-16LE decdump
    attr_reader :utf16le

    # UTF-32BE decdump
    # @return [String] UTF-32BE decdump
    attr_reader :utf32be

    # UTF-32LE decdump
    # @return [String] UTF-32LE decdump
    attr_reader :utf32le

    # Init the decdump: the input is dumped once in each supported encoding.
    # @param str [String] Input string to encode
    # @example
    #   ded = Unisec::Decdump.new('I 💕 Ruby 💎')
    #   ded.utf8 # => "073 032 240 159 146 149 032 082 117 098 121 032 240 159 146 142"
    #   ded.utf16be # => "|000 073| |000 032| |216 061| |220 149| |000 032| |000 082| |000 117| |000 098| |000 121| |000 032| |216 061| |220 142|"
    #   ded.utf32be # => "|000 000 000 073| |000 000 000 032| |000 001 244 149| |000 000 000 032| |000 000 000 082| |000 000 000 117| |000 000 000 098| |000 000 000 121| |000 000 000 032| |000 001 244 142|"
    def initialize(str)
      @utf8 = Decdump.utf8(str)
      @utf16be = Decdump.utf16be(str)
      @utf16le = Decdump.utf16le(str)
      @utf32be = Decdump.utf32be(str)
      @utf32le = Decdump.utf32le(str)
    end

    # Encode to UTF-8 in decdump format (spaced at every code unit = every byte)
    # @param str [String] Input string to encode
    # @return [String] decdump (UTF-8 encoded)
    # @example
    #   Unisec::Decdump.utf8('🐋') # => "240 159 144 139"
    def self.utf8(str)
      dec_bytes(str, 'UTF-8').join(' ')
    end

    # Encode to UTF-16BE in decdump format (packed by code unit = every 2 bytes)
    # @param str [String] Input string to encode
    # @return [String] decdump (UTF-16BE encoded)
    # @example
    #   Unisec::Decdump.utf16be('🐋') # => "|216 061| |220 011|"
    def self.utf16be(str)
      pack_code_units(dec_bytes(str, 'UTF-16BE'), 2)
    end

    # Encode to UTF-16LE in decdump format (packed by code unit = every 2 bytes)
    # @param str [String] Input string to encode
    # @return [String] decdump (UTF-16LE encoded)
    # @example
    #   Unisec::Decdump.utf16le('🐋') # => "|061 216| |011 220|"
    def self.utf16le(str)
      pack_code_units(dec_bytes(str, 'UTF-16LE'), 2)
    end

    # Encode to UTF-32BE in decdump format (packed by code unit = every 4 bytes)
    # @param str [String] Input string to encode
    # @return [String] decdump (UTF-32BE encoded)
    # @example
    #   Unisec::Decdump.utf32be('🐋') # => "|000 001 244 011|"
    def self.utf32be(str)
      pack_code_units(dec_bytes(str, 'UTF-32BE'), 4)
    end

    # Encode to UTF-32LE in decdump format (packed by code unit = every 4 bytes)
    # @param str [String] Input string to encode
    # @return [String] decdump (UTF-32LE encoded)
    # @example
    #   Unisec::Decdump.utf32le('🐋') # => "|011 244 001 000|"
    def self.utf32le(str)
      pack_code_units(dec_bytes(str, 'UTF-32LE'), 4)
    end

    # Display a CLI-friendly output summarizing the decdump in all Unicode encodings
    # @return [String] CLI-ready output (every pipe is colorized in red)
    # @example
    #   puts Unisec::Decdump.new("\u212A").display # => (U+212A KELVIN SIGN)
    #   # UTF-8: 226 132 170
    #   # UTF-16BE: |033 042|
    #   # UTF-16LE: |042 033|
    #   # UTF-32BE: |000 000 033 042|
    #   # UTF-32LE: |042 033 000 000|
    def display
      lines = [
        "UTF-8: #{@utf8}",
        "UTF-16BE: #{@utf16be}",
        "UTF-16LE: #{@utf16le}",
        "UTF-32BE: #{@utf32be}",
        "UTF-32LE: #{@utf32le}"
      ]
      # The colorization applies to the whole output, not only to the last line.
      lines.join("\n").gsub('|', Paint['|', :red])
    end

    # Transcode a string and render each resulting byte as a 3-digit decimal.
    # @param str [String] Input string to encode
    # @param encoding [String] Name of the target encoding (e.g. `'UTF-16BE'`)
    # @return [Array<String>] one zero-padded decimal string per byte
    def self.dec_bytes(str, encoding)
      str.encode(encoding).bytes.map { |byte| format('%03d', byte) }
    end
    private_class_method :dec_bytes

    # Group decimal bytes by code unit and wrap each unit between pipes.
    # @param bytes [Array<String>] decimal bytes as returned by {dec_bytes}
    # @param unit_size [Integer] number of bytes per code unit
    # @return [String] code units such as `|216 061|`, separated by a space
    def self.pack_code_units(bytes, unit_size)
      bytes.each_slice(unit_size).map { |unit| "|#{unit.join(' ')}|" }.join(' ')
    end
    private_class_method :pack_code_units
  end
end
|
data/lib/unisec/hexdump.rb
CHANGED
data/lib/unisec/normalization.rb
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require 'ctf_party'
|
|
4
|
+
require 'paint'
|
|
5
|
+
require 'unisec/utils'
|
|
4
6
|
|
|
5
7
|
module Unisec
|
|
6
8
|
# Normalization Forms
|
|
@@ -111,7 +113,7 @@ module Unisec
|
|
|
111
113
|
def display
|
|
112
114
|
colorize = lambda { |form_title, form_attr|
|
|
113
115
|
"#{Paint[form_title.to_s, :underline,
|
|
114
|
-
:bold]}: #{form_attr}\n #{Paint[Unisec::
|
|
116
|
+
:bold]}: #{form_attr}\n #{Paint[Unisec::Utils::String.chars2codepoints(form_attr), :red]}\n"
|
|
115
117
|
}
|
|
116
118
|
colorize.call('Original', @original) +
|
|
117
119
|
colorize.call('NFC', @nfc) +
|
|
@@ -125,7 +127,7 @@ module Unisec
|
|
|
125
127
|
def display_replace
|
|
126
128
|
colorize = lambda { |form_title, form_attr|
|
|
127
129
|
"#{Paint[form_title.to_s, :underline,
|
|
128
|
-
:bold]}: #{form_attr}\n #{Paint[Unisec::
|
|
130
|
+
:bold]}: #{form_attr}\n #{Paint[Unisec::Utils::String.chars2codepoints(form_attr), :red]}\n"
|
|
129
131
|
}
|
|
130
132
|
payload = replace_bypass
|
|
131
133
|
colorize.call('Original', @original) +
|
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'paint'
|
|
4
|
+
require 'unisec/utils'
|
|
5
|
+
|
|
6
|
+
module Unisec
  # Operations about Unicode planes
  class Planes # rubocop:disable Metrics/ClassLength
    # Data about the planes: one Hash (`:range`, `:name`) per plane, indexed by
    # plane number.
    PLANES = [
      { range: 0x0..0xffff, name: 'Basic Multilingual Plane' },
      { range: 0x10000..0x1ffff, name: 'Supplementary Multilingual Plane' },
      { range: 0x20000..0x2ffff, name: 'Supplementary Ideographic Plane' },
      { range: 0x30000..0x3ffff, name: 'Tertiary Ideographic Plane' },
      { range: 0x40000..0x4ffff, name: 'unassigned' },
      { range: 0x50000..0x5ffff, name: 'unassigned' },
      { range: 0x60000..0x6ffff, name: 'unassigned' },
      { range: 0x70000..0x7ffff, name: 'unassigned' },
      { range: 0x80000..0x8ffff, name: 'unassigned' },
      { range: 0x90000..0x9ffff, name: 'unassigned' },
      { range: 0xa0000..0xaffff, name: 'unassigned' },
      { range: 0xb0000..0xbffff, name: 'unassigned' },
      { range: 0xc0000..0xcffff, name: 'unassigned' },
      { range: 0xd0000..0xdffff, name: 'unassigned' },
      { range: 0xe0000..0xeffff, name: 'Supplementary Special-purpose Plane' },
      { range: 0xf0000..0xfffff, name: 'supplementary Private Use Area planes' },
      { range: 0x100000..0x10ffff, name: 'supplementary Private Use Area planes' }
    ].freeze

    # List all Unicode planes together with the blocks they contain
    # @param with_count [TrueClass|FalseClass] calculate block's range size & char count? (warning: very slow, very unoptimized, see {Unisec::Blocks.list})
    # @return [Array<Hash>] one Hash per plane with its `:range`, `:name` and
    #   `:blocks` (Array of block Hashes as returned by {Unisec::Blocks.list})
    # @example
    #   Unisec::Planes.list # =>
    #   # [{range: 0..65535,
    #   #   name: "Basic Multilingual Plane",
    #   #   blocks:
    #   #    [{range: 0..127, name: "Basic Latin", range_size: nil, char_count: nil},
    #   #     {range: 128..255, name: "Latin-1 Supplement", range_size: nil, char_count: nil},
    #   # […]
    def self.list(with_count: false)
      PLANES.zip(plane2blocks(PLANES, with_count: with_count)).map do |base, extra|
        base.merge(blocks: extra)
      end
    end

    # List details about target plane including the list of associated blocks
    # @param plane_arg [String|Integer] name (case-insensitive) or number of the
    #   plane. The number is used as an index in {PLANES}.
    # @param with_count [TrueClass|FalseClass] calculate block's range size & char count? (see {Unisec::Blocks.list})
    # @return [Hash|Array<Hash>|nil] nil if no match, Hash of the plane if one match,
    #   Array of planes' Hash if several matches
    # @raise [ArgumentError] if `plane_arg` is neither an Integer nor a String
    # @example
    #   Unisec::Planes.plane(3) # =>
    #   # {range: 196608..262143,
    #   #  name: "Tertiary Ideographic Plane",
    #   #  blocks:
    #   #   [{range: 196608..201551, name: "CJK Unified Ideographs Extension G", range_size: nil, char_count: nil},
    #   #    {range: 201552..205743, name: "CJK Unified Ideographs Extension H", range_size: nil, char_count: nil},
    #   #    {range: 205744..210047, name: "CJK Unified Ideographs Extension J", range_size: nil, char_count: nil}]}
    #   Unisec::Planes.plane('Supplementary Ideographic Plane') # =>
    #   # {range: 131072..196607,
    #   #  name: "Supplementary Ideographic Plane",
    #   #  blocks:
    #   #   [{range: 131072..173791, name: "CJK Unified Ideographs Extension B", range_size: nil, char_count: nil},
    #   #    {range: 173824..177983, name: "CJK Unified Ideographs Extension C", range_size: nil, char_count: nil},
    #   #    {range: 177984..178207, name: "CJK Unified Ideographs Extension D", range_size: nil, char_count: nil},
    #   #    {range: 178208..183983, name: "CJK Unified Ideographs Extension E", range_size: nil, char_count: nil},
    #   #    {range: 183984..191471, name: "CJK Unified Ideographs Extension F", range_size: nil, char_count: nil},
    #   #    {range: 191472..192095, name: "CJK Unified Ideographs Extension I", range_size: nil, char_count: nil},
    #   #    {range: 194560..195103, name: "CJK Compatibility Ideographs Supplement", range_size: nil, char_count: nil}]}
    #   Unisec::Planes.plane('unassigned') # =>
    #   # [{range: 262144..327679, name: "unassigned", blocks: []},
    #   #  {range: 327680..393215, name: "unassigned", blocks: []},
    #   #  {range: 393216..458751, name: "unassigned", blocks: []},
    #   #  {range: 458752..524287, name: "unassigned", blocks: []},
    #   #  {range: 524288..589823, name: "unassigned", blocks: []},
    #   #  {range: 589824..655359, name: "unassigned", blocks: []},
    #   #  {range: 655360..720895, name: "unassigned", blocks: []},
    #   #  {range: 720896..786431, name: "unassigned", blocks: []},
    #   #  {range: 786432..851967, name: "unassigned", blocks: []},
    #   #  {range: 851968..917503, name: "unassigned", blocks: []}]
    def self.plane(plane_arg, with_count: false)
      found = lookup(plane_arg)
      # Enrich plane data with blocks. `merge` builds new Hashes so the entries
      # of the PLANES constant are never modified.
      case found
      when Hash # When 1 plane
        found.merge(blocks: plane2blocks(found, with_count: with_count))
      when Array # When multiple planes
        found.zip(plane2blocks(found, with_count: with_count)).map do |base, extra|
          base.merge(blocks: extra)
        end
      end # nil (no match / invalid search term) falls through and returns nil
    end

    # Find the blocks included in a given plane
    # @param plane [Hash|Array<Hash>] plane hash or array of plane hash
    # @param with_count [TrueClass|FalseClass] calculate block's range size & char count? (see {Unisec::Blocks.list})
    # @return [Array<Hash>|Array<Array<Hash>>] blocks of the plane when a Hash is
    #   given, one Array of blocks per plane (same order) when an Array is given
    # @raise [ArgumentError] if `plane` (or one of its elements) is neither a Hash nor an Array
    # @example
    #   Unisec::Planes.plane2blocks({ range: 0x20000..0x2ffff, name: 'Supplementary Ideographic Plane' }) # =>
    #   # [{range: 131072..173791, name: "CJK Unified Ideographs Extension B", range_size: nil, char_count: nil},
    #   #  {range: 173824..177983, name: "CJK Unified Ideographs Extension C", range_size: nil, char_count: nil},
    #   #  {range: 177984..178207, name: "CJK Unified Ideographs Extension D", range_size: nil, char_count: nil},
    #   #  {range: 178208..183983, name: "CJK Unified Ideographs Extension E", range_size: nil, char_count: nil},
    #   #  {range: 183984..191471, name: "CJK Unified Ideographs Extension F", range_size: nil, char_count: nil},
    #   #  {range: 191472..192095, name: "CJK Unified Ideographs Extension I", range_size: nil, char_count: nil},
    #   #  {range: 194560..195103, name: "CJK Compatibility Ideographs Supplement", range_size: nil, char_count: nil}]
    def self.plane2blocks(plane, with_count: false)
      unless plane.is_a?(Hash) || plane.is_a?(Array)
        raise ArgumentError, "plane must be a Hash or an Array of Hash, got #{plane.class}"
      end

      # The block list is loaded only once, even when several planes are given.
      blocks_within(plane, Unisec::Blocks.list(with_count: with_count))
    end

    # Abbreviate a plane name (based on uppercase letters)
    # @param name [String] plane name (as in {PLANES} `:name`)
    # @return [String] plane abbreviation
    # @example
    #   Unisec::Planes.abbr('Basic Multilingual Plane') # => "BMP"
    #   Unisec::Planes.abbr('supplementary Private Use Area planes') # => "PUA"
    def self.abbr(name)
      name.scan(/\p{Upper}/).join
    end

    # Display a CLI-friendly output listing all planes
    # @param with_blocks [TrueClass|FalseClass] display the blocks associated with each plane
    # @param with_count [TrueClass|FalseClass] calculate block's range size & char count? (see {Unisec::Blocks.list})
    # @return [nil]
    # @example
    #   Unisec::Planes.list_display(with_blocks: true, with_count: false)
    #   # Range: U+0000 - U+FFFF Name: Basic Multilingual Plane
    #   # Blocks:
    #   # Range: U+0000 - U+007F Name: Basic Latin
    #   # Range: U+0080 - U+00FF Name: Latin-1 Supplement
    #   # Range: U+0100 - U+017F Name: Latin Extended-A
    #   # […]
    def self.list_display(with_blocks: false, with_count: false)
      print_planes(list(with_count: with_count), with_blocks: with_blocks, with_count: with_count)
    end

    # Display a CLI-friendly output searching for a plane
    # @param plane_arg [String|Integer] name or number of the plane
    # @param with_blocks [TrueClass|FalseClass] display the blocks associated with each plane
    # @param with_count [TrueClass|FalseClass] calculate block's range size & char count? (see {Unisec::Blocks.list})
    # @return [nil] nothing is printed when no plane matches
    # @example
    #   Unisec::Planes.plane_display(3, with_blocks: true)
    #   # Range: U+30000 - U+3FFFF Name: Tertiary Ideographic Plane
    #   # Blocks:
    #   # Range: U+30000 - U+3134F Name: CJK Unified Ideographs Extension G
    #   # Range: U+31350 - U+323AF Name: CJK Unified Ideographs Extension H
    #   # Range: U+323B0 - U+3347F Name: CJK Unified Ideographs Extension J
    def self.plane_display(plane_arg, with_blocks: false, with_count: false)
      planes = plane(plane_arg, with_count: with_count)
      return nil if planes.nil? # no match: there is nothing to iterate over

      planes = [planes] if planes.is_a?(Hash)
      print_planes(planes, with_blocks: with_blocks, with_count: with_count)
    end

    # Resolve a search term to the raw {PLANES} entries (without blocks).
    # @param plane_arg [String|Integer] name or number of the plane
    # @return [Hash|Array<Hash>|nil] Hash if one match, Array if several, nil if none
    # @raise [ArgumentError] if `plane_arg` is neither an Integer nor a String
    def self.lookup(plane_arg)
      case plane_arg
      when Integer # search by plane number
        PLANES[plane_arg]
      when String # search by plane name
        wanted = plane_arg.downcase
        matches = PLANES.select { |plane| plane[:name].downcase == wanted }
        # Hash if one, Array of Hash if multiples (`first` is nil when empty)
        matches.size > 1 ? matches : matches.first
      else
        raise ArgumentError, "plane_arg must be an Integer or a String, got #{plane_arg.class}"
      end
    end
    private_class_method :lookup

    # Select, among already loaded blocks, the ones included in the plane(s).
    # @param plane [Hash|Array<Hash>] plane hash or array of plane hash
    # @param all_blocks [Array<Hash>] blocks as returned by {Unisec::Blocks.list}
    # @return [Array<Hash>|Array<Array<Hash>>] see {plane2blocks}
    def self.blocks_within(plane, all_blocks)
      case plane
      when Hash
        all_blocks.select { |block| plane[:range].include_range?(block[:range]) }
      when Array
        plane.map { |pl| blocks_within(pl, all_blocks) }
      else
        raise ArgumentError, "plane must be a Hash or an Array of Hash, got #{plane.class}"
      end
    end
    private_class_method :blocks_within

    # Print planes (and optionally their blocks) in a CLI-friendly format.
    # @param planes [Array<Hash>] planes enriched with `:blocks`
    # @param with_blocks [TrueClass|FalseClass] display the blocks associated with each plane
    # @param with_count [TrueClass|FalseClass] display block's range size & char count
    # @return [nil]
    def self.print_planes(planes, with_blocks:, with_count:)
      planes.each do |pla|
        print_field('Range:', Utils::Range.range2codepoint_range(pla[:range]), 22, :red)
        print_field('Name:', pla[:name], 50, :red)
        print_blocks(pla[:blocks], with_count: with_count) if with_blocks
        puts
      end
      nil
    end
    private_class_method :print_planes

    # Print the blocks of one plane, one block per line.
    # @param blocks [Array<Hash>] blocks of the plane
    # @param with_count [TrueClass|FalseClass] display block's range size & char count
    # @return [void]
    def self.print_blocks(blocks, with_count:)
      puts
      # NOTE(review): label strings are reproduced as visible in the reviewed
      # diff; confirm their leading indentation against the repository.
      print_field(' Blocks:', "\n", 0, :red)
      blocks.each do |block|
        print_field(' Range:', Utils::Range.range2codepoint_range(block[:range]), 22, :magenta)
        print_field('Name:', block[:name], 50, :magenta)
        if with_count
          print_field('Range size:', block[:range_size], 8, :magenta)
          print_field('Char count:', block[:char_count], 0, :magenta)
        end
        puts
      end
    end
    private_class_method :print_blocks

    # Print a bold colored label followed by its left-justified value (no newline).
    # @param key [String] label
    # @param value [Object] value displayed after the label
    # @param just [Integer] width the value (with its leading space) is padded to
    # @param color [Symbol] color of the label
    # @return [void]
    def self.print_field(key, value, just, color)
      print Paint[key, color, :bold] + " #{value}".ljust(just)
    end
    private_class_method :print_field
  end
end
|
data/lib/unisec/properties.rb
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require 'twitter_cldr'
|
|
4
4
|
require 'paint'
|
|
5
|
+
require 'unisec/utils'
|
|
5
6
|
|
|
6
7
|
module Unisec
|
|
7
8
|
# Manipulate Unicode properties
|
|
@@ -50,7 +51,7 @@ module Unisec
|
|
|
50
51
|
def self.codepoints_display(prop)
|
|
51
52
|
codepoints = Properties.codepoints(prop)
|
|
52
53
|
codepoints.each do |cp|
|
|
53
|
-
puts "#{
|
|
54
|
+
puts "#{Utils::Integer.deccp2stdhexcp(cp[:codepoint]).ljust(7)} #{cp[:char].ljust(4)} #{cp[:name]}"
|
|
54
55
|
end
|
|
55
56
|
nil
|
|
56
57
|
end
|
|
@@ -77,7 +78,7 @@ module Unisec
|
|
|
77
78
|
block: props.block.join,
|
|
78
79
|
category: categories[1],
|
|
79
80
|
subcategory: categories[0],
|
|
80
|
-
codepoint:
|
|
81
|
+
codepoint: Utils::String.char2codepoint(chr),
|
|
81
82
|
name: cp.name,
|
|
82
83
|
script: props.script.join,
|
|
83
84
|
case: {
|
|
@@ -127,22 +128,22 @@ module Unisec
|
|
|
127
128
|
display.call('Since (age):', "Version #{data[:age]}")
|
|
128
129
|
puts
|
|
129
130
|
x = data.dig(:case, :twitter, :uppercase)
|
|
130
|
-
display.call('Uppercase:', x + " (#{
|
|
131
|
+
display.call('Uppercase:', x + " (#{Utils::String.char2codepoint(x)})")
|
|
131
132
|
x = data.dig(:case, :twitter, :lowercase)
|
|
132
|
-
display.call('Lowercase:', x + " (#{
|
|
133
|
+
display.call('Lowercase:', x + " (#{Utils::String.char2codepoint(x)})")
|
|
133
134
|
x = data.dig(:case, :twitter, :titlecase)
|
|
134
|
-
display.call('Titlecase:', x + " (#{
|
|
135
|
+
display.call('Titlecase:', x + " (#{Utils::String.char2codepoint(x)})")
|
|
135
136
|
x = data.dig(:case, :twitter, :casefold)
|
|
136
|
-
display.call('Casefold:', x + " (#{
|
|
137
|
+
display.call('Casefold:', x + " (#{Utils::String.char2codepoint(x)})")
|
|
137
138
|
puts
|
|
138
139
|
x = data.dig(:normalization, :twitter, :nfkd)
|
|
139
|
-
display.call('Normalization NFKD:', x + " (#{
|
|
140
|
+
display.call('Normalization NFKD:', x + " (#{Utils::String.chars2codepoints(x)})")
|
|
140
141
|
x = data.dig(:normalization, :twitter, :nfkc)
|
|
141
|
-
display.call('Normalization NFKC:', x + " (#{
|
|
142
|
+
display.call('Normalization NFKC:', x + " (#{Utils::String.chars2codepoints(x)})")
|
|
142
143
|
x = data.dig(:normalization, :twitter, :nfd)
|
|
143
|
-
display.call('Normalization NFD:', x + " (#{
|
|
144
|
+
display.call('Normalization NFD:', x + " (#{Utils::String.chars2codepoints(x)})")
|
|
144
145
|
x = data.dig(:normalization, :twitter, :nfc)
|
|
145
|
-
display.call('Normalization NFC:', x + " (#{
|
|
146
|
+
display.call('Normalization NFC:', x + " (#{Utils::String.chars2codepoints(x)})")
|
|
146
147
|
if extended
|
|
147
148
|
puts
|
|
148
149
|
data[:other_properties].each do |k, v|
|
|
@@ -151,37 +152,5 @@ module Unisec
|
|
|
151
152
|
end
|
|
152
153
|
nil
|
|
153
154
|
end
|
|
154
|
-
|
|
155
|
-
# Display the code point in Unicode format for a given character (code point as string)
|
|
156
|
-
# @param chr [String] Unicode code point (as character / string)
|
|
157
|
-
# @return [String] code point in Unicode format
|
|
158
|
-
# @example
|
|
159
|
-
# Unisec::Properties.char2codepoint('💎') # => "U+1F48E"
|
|
160
|
-
def self.char2codepoint(chr)
|
|
161
|
-
Properties.deccp2stdhexcp(chr.codepoints.first)
|
|
162
|
-
end
|
|
163
|
-
|
|
164
|
-
# Display the code points in Unicode format for the given characters (code points as string)
|
|
165
|
-
# @param chrs [String] Unicode code points (as characters / string)
|
|
166
|
-
# @return [String] code points in Unicode format
|
|
167
|
-
# @example
|
|
168
|
-
# Unisec::Properties.chars2codepoints("ỳ́") # => "U+0079 U+0300 U+0301"
|
|
169
|
-
# Unisec::Properties.chars2codepoints("🧑🌾") # => "U+1F9D1 U+200D U+1F33E"
|
|
170
|
-
def self.chars2codepoints(chrs)
|
|
171
|
-
out = []
|
|
172
|
-
chrs.each_char do |chr|
|
|
173
|
-
out << Properties.char2codepoint(chr)
|
|
174
|
-
end
|
|
175
|
-
out.join(' ')
|
|
176
|
-
end
|
|
177
|
-
|
|
178
|
-
# Convert from decimal code point to standardized format hexadecimal code point
|
|
179
|
-
# @param int_cp [Integer] Code point in decimal format
|
|
180
|
-
# @return [String] code point in Unicode format
|
|
181
|
-
# @example
|
|
182
|
-
# Unisec::Properties.intcp2stdhexcp(128640) # => "U+1F680"
|
|
183
|
-
def self.deccp2stdhexcp(int_cp)
|
|
184
|
-
"U+#{format('%.4x', int_cp).upcase}"
|
|
185
|
-
end
|
|
186
155
|
end
|
|
187
156
|
end
|
data/lib/unisec/rugrep.rb
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require 'twitter_cldr'
|
|
4
4
|
require 'paint'
|
|
5
|
+
require 'unisec/utils'
|
|
5
6
|
|
|
6
7
|
module Unisec
|
|
7
8
|
# Ruby grep : Ruby regular expression search for Unicode code point names
|
|
@@ -64,7 +65,7 @@ module Unisec
|
|
|
64
65
|
def self.regrep_display(regexp)
|
|
65
66
|
codepoints = regrep(regexp)
|
|
66
67
|
codepoints.each do |cp|
|
|
67
|
-
puts "#{
|
|
68
|
+
puts "#{Utils::Integer.deccp2stdhexcp(cp[:codepoint]).ljust(7)} #{cp[:char].ljust(4)} #{cp[:name]}"
|
|
68
69
|
end
|
|
69
70
|
nil
|
|
70
71
|
end
|
|
@@ -118,7 +119,7 @@ module Unisec
|
|
|
118
119
|
def self.regrep_display_slow(regexp)
|
|
119
120
|
codepoints = regrep_slow(regexp)
|
|
120
121
|
codepoints.each do |cp|
|
|
121
|
-
puts "#{
|
|
122
|
+
puts "#{Utils::Integer.deccp2stdhexcp(cp[:codepoint]).ljust(7)} #{cp[:char].ljust(4)} #{cp[:name]}"
|
|
122
123
|
end
|
|
123
124
|
nil
|
|
124
125
|
end
|
data/lib/unisec/utils.rb
CHANGED
|
@@ -20,6 +20,35 @@ class Integer
|
|
|
20
20
|
end
|
|
21
21
|
end
|
|
22
22
|
|
|
23
|
+
class String
  # Convert a string to a boolean.
  #
  # A trailing newline is ignored and the comparison is case-insensitive.
  # `true`, `yes`, `y` and `1` are truthy; `false`, `no`, `n` and `0` are falsy.
  # @return [TrueClass|FalseClass]
  # @raise [ArgumentError] if the string is not one of the accepted values
  # @example
  #   "true".to_bool # => true
  #   "No".to_bool # => false
  def to_bool
    case chomp.downcase
    when 'true', 'yes', 'y', '1'
      true
    when 'false', 'no', 'n', '0'
      false
    else
      # The receiver itself is reported: there is no `str` variable in scope here.
      raise ArgumentError, "invalid value for Boolean: #{inspect}"
    end
  end
end
|
|
39
|
+
|
|
40
|
+
class Range
  # Is a range included in another range? Are all values of range B included in range A?
  #
  # Only the bounds are compared, so both ranges are expected to have a begin
  # and an end (no beginless / endless range).
  # @param range [Range] range B, the receiver being range A
  # @return [TrueClass|FalseClass]
  # @example
  #   (1..10).include_range?(2..11) # => false
  #   (1..10).include_range?(2..4) # => true
  #   (1...10).include_range?(2..10) # => false
  def include_range?(range)
    return false unless self.begin <= range.begin

    if exclude_end? && !range.exclude_end?
      # The receiver stops right before its end, so it cannot hold an inclusive
      # range that ends on that very value.
      self.end > range.end
    else
      self.end >= range.end
    end
  end
end
|
|
51
|
+
|
|
23
52
|
module Unisec
|
|
24
53
|
# Generic stuff not Unicode-related that can be re-used.
|
|
25
54
|
module Utils
|
|
@@ -108,6 +137,71 @@ module Unisec
|
|
|
108
137
|
def self.grapheme_reverse(str)
|
|
109
138
|
str.grapheme_clusters.reverse.join
|
|
110
139
|
end
|
|
140
|
+
|
|
141
|
+
# Give the code point, in Unicode format, of a character (code point as string).
# Only the first code point of the string is used.
# @param chr [String] Unicode code point (as character / string)
# @return [String] code point in Unicode format
# @example
#   Unisec::Utils::String.char2codepoint('💎') # => "U+1F48E"
def self.char2codepoint(chr)
  first_cp = chr.codepoints.first
  Integer.deccp2stdhexcp(first_cp)
end
|
|
149
|
+
|
|
150
|
+
# Give the code points, in Unicode format, of every character of a string
# (code points as string).
# @param chrs [String] Unicode code points (as characters / string)
# @return [String] code points in Unicode format, separated by a space
# @example
#   Unisec::Utils::String.chars2codepoints("ỳ́") # => "U+0079 U+0300 U+0301"
#   Unisec::Utils::String.chars2codepoints("🧑‍🌾") # => "U+1F9D1 U+200D U+1F33E"
def self.chars2codepoints(chrs)
  chrs.each_char.map { |chr| char2codepoint(chr) }.join(' ')
end
|
|
163
|
+
|
|
164
|
+
# Convert a string of hex encoded Unicode code points range to actual
# integer Ruby range.
# @param range_str [String] Unicode code points range as in data/Blocks.txt
#   (two hexadecimal bounds separated by `..`)
# @return [Range] inclusive range of Integer code points
# @example
#   Unisec::Utils::String.to_range('0080..00FF') # => 128..255
def self.to_range(range_str)
  # String#hex parses the hexadecimal bound straight to an Integer.
  ::Range.new(*range_str.split('..').map(&:hex))
end
|
|
173
|
+
|
|
174
|
+
# Convert from standardized format hexadecimal code point to decimal code point
# @param std_hex_cp [String] Code point in standardized hexadecimal format
#   (its first two characters, the `U+` prefix, are dropped)
# @return [Integer] Code point in decimal format
# @example
#   Unisec::Utils::String.stdhexcp2deccp('U+2026') # => 8230
def self.stdhexcp2deccp(std_hex_cp)
  hex_digits = std_hex_cp[2..] # everything after the U+ prefix
  convert_to_integer("0x#{hex_digits}")
end
|
|
183
|
+
end
|
|
184
|
+
|
|
185
|
+
module Integer
  # Convert from decimal code point to standardized format hexadecimal code point
  # @param int_cp [Integer] Code point in decimal format
  # @return [String] code point in Unicode format: `U+` followed by at least
  #   4 uppercase hexadecimal digits
  # @example
  #   Unisec::Utils::Integer.deccp2stdhexcp(128640) # => "U+1F680"
  def self.deccp2stdhexcp(int_cp)
    hex_digits = format('%.4x', int_cp)
    "U+#{hex_digits.upcase}"
  end
end
|
|
195
|
+
|
|
196
|
+
module Range
  # Convert a (integer) range to a range of Unicode code points
  # @param range [::Range] range whose begin and end are decimal code points
  # @return [String] both bounds in Unicode format, separated by ` - `
  # @example
  #   Unisec::Utils::Range.range2codepoint_range(1048576..1114111) # => "U+100000 - U+10FFFF"
  def self.range2codepoint_range(range)
    bounds = [range.begin, range.end].map { |cp| Integer.deccp2stdhexcp(cp) }
    bounds.join(' - ')
  end
end
|
|
112
206
|
end
|
|
113
207
|
end
|
data/lib/unisec/version.rb
CHANGED
data/lib/unisec/versions.rb
CHANGED
|
@@ -55,6 +55,10 @@ module Unisec
|
|
|
55
55
|
ucd_derivedname: {
|
|
56
56
|
version: Unisec::Rugrep.ucd_derivedname_version,
|
|
57
57
|
label: 'UCD (data/DerivedName.txt)'
|
|
58
|
+
},
|
|
59
|
+
ucd_blocks: {
|
|
60
|
+
version: Unisec::Blocks.ucd_blocks_version,
|
|
61
|
+
label: 'UCD (data/Blocks.txt)'
|
|
58
62
|
}
|
|
59
63
|
}
|
|
60
64
|
end
|
|
@@ -81,6 +85,7 @@ module Unisec
|
|
|
81
85
|
colorize.call(:twittercldr_cldr) +
|
|
82
86
|
colorize.call(:ruby_unicode_emoji) +
|
|
83
87
|
colorize.call(:ucd_derivedname) +
|
|
88
|
+
colorize.call(:ucd_blocks) +
|
|
84
89
|
Paint["\nGems:\n", :underline] +
|
|
85
90
|
colorize.call(:unisec) +
|
|
86
91
|
colorize.call(:twittercldr) +
|
data/lib/unisec.rb
CHANGED
|
@@ -3,9 +3,12 @@
|
|
|
3
3
|
require 'unisec/version'
|
|
4
4
|
|
|
5
5
|
require 'unisec/bidi'
|
|
6
|
+
require 'unisec/blocks'
|
|
6
7
|
require 'unisec/confusables'
|
|
8
|
+
require 'unisec/decdump'
|
|
7
9
|
require 'unisec/hexdump'
|
|
8
10
|
require 'unisec/normalization'
|
|
11
|
+
require 'unisec/planes'
|
|
9
12
|
require 'unisec/properties'
|
|
10
13
|
require 'unisec/rugrep'
|
|
11
14
|
require 'unisec/size'
|