unisec 0.0.6 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +2 -1
- data/data/Blocks.txt +383 -0
- data/data/DerivedName.txt +1659 -12
- data/lib/unisec/bidi.rb +8 -8
- data/lib/unisec/blocks.rb +209 -0
- data/lib/unisec/cli/blocks.rb +93 -0
- data/lib/unisec/cli/cli.rb +10 -2
- data/lib/unisec/cli/dump.rb +87 -0
- data/lib/unisec/cli/planes.rb +99 -0
- data/lib/unisec/cli/rugrep.rb +1 -1
- data/lib/unisec/confusables.rb +3 -1
- data/lib/unisec/decdump.rb +118 -0
- data/lib/unisec/hexdump.rb +1 -1
- data/lib/unisec/normalization.rb +4 -2
- data/lib/unisec/planes.rb +224 -0
- data/lib/unisec/properties.rb +11 -42
- data/lib/unisec/rugrep.rb +3 -2
- data/lib/unisec/utils.rb +94 -0
- data/lib/unisec/version.rb +1 -1
- data/lib/unisec/versions.rb +5 -0
- data/lib/unisec.rb +3 -0
- metadata +24 -27
- data/lib/unisec/cli/hexdump.rb +0 -47
data/lib/unisec/bidi.rb
CHANGED
|
@@ -18,10 +18,10 @@ module Unisec
|
|
|
18
18
|
# @param input [String] the target string
|
|
19
19
|
# @param opts [Hash] optional parameters, see {Spoof.bidi_affix}
|
|
20
20
|
# @return [String] the target string
|
|
21
|
-
def set_target_display(input, **
|
|
21
|
+
def set_target_display(input, **)
|
|
22
22
|
@target_display = input
|
|
23
|
-
@spoof_string = reverse(**
|
|
24
|
-
@spoof_payload = bidi_affix(**
|
|
23
|
+
@spoof_string = reverse(**)
|
|
24
|
+
@spoof_payload = bidi_affix(**)
|
|
25
25
|
@target_display
|
|
26
26
|
end
|
|
27
27
|
|
|
@@ -66,8 +66,8 @@ module Unisec
|
|
|
66
66
|
end
|
|
67
67
|
|
|
68
68
|
# Call {Spoof.reverse} with `@target_display` as default input (target).
|
|
69
|
-
def reverse(**
|
|
70
|
-
Spoof.reverse(@target_display, **
|
|
69
|
+
def reverse(**)
|
|
70
|
+
Spoof.reverse(@target_display, **)
|
|
71
71
|
end
|
|
72
72
|
|
|
73
73
|
# Inject BiDi characters into the input string
|
|
@@ -121,8 +121,8 @@ module Unisec
|
|
|
121
121
|
end
|
|
122
122
|
|
|
123
123
|
# Call {Spoof.bidi_affix} with `@spoof_string` as input.
|
|
124
|
-
def bidi_affix(**
|
|
125
|
-
Spoof.bidi_affix(@spoof_string, **
|
|
124
|
+
def bidi_affix(**)
|
|
125
|
+
Spoof.bidi_affix(@spoof_string, **)
|
|
126
126
|
end
|
|
127
127
|
|
|
128
128
|
# Display a CLI-friendly output summarizing the spoof payload
|
|
@@ -157,7 +157,7 @@ module Unisec
|
|
|
157
157
|
"Spoof payload (hex, escaped): #{@spoof_payload.to_hex(prefixall: '\\x')}\n" \
|
|
158
158
|
"Spoof payload (base64): #{@spoof_payload.to_b64}\n" \
|
|
159
159
|
"Spoof payload (urlencode): #{@spoof_payload.urlencode}\n" \
|
|
160
|
-
"Spoof payload (code points): #{Unisec::
|
|
160
|
+
"Spoof payload (code points): #{Unisec::Utils::String.chars2codepoints(@spoof_payload)}\n" \
|
|
161
161
|
"\n\n\n" \
|
|
162
162
|
'⚠: for the spoof payload to display correctly, be sure your VTE has RTL support, ' \
|
|
163
163
|
"e.g. see https://wiki.archlinux.org/title/Bidirectional_text#Terminal.\n" \
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'paint'
|
|
4
|
+
require 'unisec/utils'
|
|
5
|
+
|
|
6
|
+
module Unisec
|
|
7
|
+
# Operations about Unicode blocks
|
|
8
|
+
class Blocks # rubocop:disable Metrics/ClassLength
|
|
9
|
+
# UCD Blocks file location
|
|
10
|
+
# @see https://www.unicode.org/Public/UCD/latest/ucd/Blocks.txt
|
|
11
|
+
UCD_BLOCKS = File.join(__dir__, '../../data/Blocks.txt')
|
|
12
|
+
|
|
13
|
+
# List of invalid, private, reserved ranges. Unassigned, unallocated ranges are calculated dynamically in {list_unassigned}.
|
|
14
|
+
INVALID_RANGES = [
|
|
15
|
+
{ range: 0xd800..0xdfff, name: 'Surrogates (invalid outside UTF-16)' },
|
|
16
|
+
{ range: 0xe000..0xf8ff, name: 'Private Use Area (located in BMP)' },
|
|
17
|
+
{ range: 0xf0000..0xfffff, name: 'Supplementary Private Use Area-A' },
|
|
18
|
+
{ range: 0x100000..0x10ffff, name: 'Supplementary Private Use Area-B' }
|
|
19
|
+
].freeze
|
|
20
|
+
|
|
21
|
+
# Returns the version of Unicode used in UCD local file (data/Blocks.txt)
|
|
22
|
+
# @return [String] Unicode version
|
|
23
|
+
# @example
|
|
24
|
+
# Unisec::Blocks.ucd_blocks_version # => "17.0.0"
|
|
25
|
+
def self.ucd_blocks_version
|
|
26
|
+
first_line = File.open(UCD_BLOCKS, &:readline)
|
|
27
|
+
first_line.match(/-(\d+\.\d+\.\d+)\.txt/).captures.first
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
# List Unicode blocks name
|
|
31
|
+
# ⚠️ Char count value may be wrong for CJK UNIFIED IDEOGRAPH because they are poorly described in DerivedName.txt.
|
|
32
|
+
# ⚠️ Populating char_count is slow and can take a few seconds.
|
|
33
|
+
# @param with_count [TrueClass|FalseClass] calculate block's range size & char count?
|
|
34
|
+
# @return [Array<Hash>] List of blocks (block name, range and count)
|
|
35
|
+
# @example
|
|
36
|
+
# Unisec::Blocks.list # => [{range: 0..127, name: "Basic Latin", range_size: nil, char_count: nil}, … ]
|
|
37
|
+
# Unisec::Blocks.list(with_count: true) # => [{range: 0..127, name: "Basic Latin", range_size: 128, char_count: 95}, … ]
|
|
38
|
+
def self.list(with_count: false)
|
|
39
|
+
out = []
|
|
40
|
+
file = File.new(UCD_BLOCKS)
|
|
41
|
+
file.each_line(chomp: true) do |line|
|
|
42
|
+
# Skip if the line is empty or a comment
|
|
43
|
+
next if line.empty? || line[0] == '#'
|
|
44
|
+
|
|
45
|
+
# parse the line to extract code point range and the name
|
|
46
|
+
blk_range, blk_name = line.split(';')
|
|
47
|
+
blk_range = Unisec::Utils::String.to_range(blk_range)
|
|
48
|
+
blk_name.lstrip!
|
|
49
|
+
out << {
|
|
50
|
+
range: blk_range,
|
|
51
|
+
name: blk_name,
|
|
52
|
+
range_size: with_count ? blk_range.size : nil,
|
|
53
|
+
char_count: with_count ? count_char_in_block(blk_range) : nil
|
|
54
|
+
}
|
|
55
|
+
end
|
|
56
|
+
out
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
# Count the number of characters allocated in a block.
|
|
60
|
+
# ⚠️ Char count value may be wrong for CJK UNIFIED IDEOGRAPH because they are poorly described in DerivedName.txt.
|
|
61
|
+
# @param range [Range] Block code point range
|
|
62
|
+
# @return [Integer] number of code points in the block
|
|
63
|
+
# @example
|
|
64
|
+
# Unisec::Blocks::count_char_in_block(0xAC00..0xD7AF) # => 11172
|
|
65
|
+
def self.count_char_in_block(range) # rubocop:disable Metrics/AbcSize
|
|
66
|
+
counter = 0
|
|
67
|
+
file = File.new(Rugrep::UCD_DERIVEDNAME)
|
|
68
|
+
file.each_line(chomp: true) do |line|
|
|
69
|
+
# Skip if the line is empty or a comment
|
|
70
|
+
next if line.empty? || line[0] == '#'
|
|
71
|
+
|
|
72
|
+
# parse the line to extract code point as integer and the name
|
|
73
|
+
cp_int, _name = line.split(';')
|
|
74
|
+
if cp_int.include?('..') # handle ranges in DerivedName.txt
|
|
75
|
+
ucd_range = Utils::String.to_range(cp_int)
|
|
76
|
+
next unless range.include_range?(ucd_range)
|
|
77
|
+
|
|
78
|
+
counter += ucd_range.size
|
|
79
|
+
next
|
|
80
|
+
end
|
|
81
|
+
cp_int = cp_int.chomp.to_i(16)
|
|
82
|
+
next unless range.include?(cp_int)
|
|
83
|
+
|
|
84
|
+
counter += 1
|
|
85
|
+
break if cp_int == range.end
|
|
86
|
+
end
|
|
87
|
+
counter
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
# Find the block including the target character or code point, or matching the provided name.
|
|
91
|
+
# @param block_arg [Integer|String] Decimal code point or standardized hexadecimal codepoint or string character (only one, so be careful with emojis, composed or joint characters using several units) or directly look for the block name (case insensitive).
|
|
92
|
+
# @param with_count [TrueClass|FalseClass] calculate block's range size & char count?
|
|
93
|
+
# @return [Hash|nil] Matching block (block name, range and count) or nil if not found
|
|
94
|
+
# @example
|
|
95
|
+
# Unisec::Blocks.block(65, with_count:true) # => {range: 0..127, name: "Basic Latin", range_size: 128, char_count: 95}
|
|
96
|
+
# Unisec::Blocks.block("U+1f4a9") # => {range: 127744..128511, name: "Miscellaneous Symbols and Pictographs", range_size: nil, char_count: nil}
|
|
97
|
+
# Unisec::Blocks.block("…", with_count:true) # => {range: 8192..8303, name: "General Punctuation", range_size: 112, char_count: 111}
|
|
98
|
+
# Unisec::Blocks.block("javanese") # => {range: 43392..43487, name: "Javanese", range_size: nil, char_count: nil}
|
|
99
|
+
def self.block(block_arg, with_count: false) # rubocop:disable Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/MethodLength,Metrics/PerceivedComplexity
|
|
100
|
+
file = File.new(UCD_BLOCKS)
|
|
101
|
+
found = false
|
|
102
|
+
file.each_line(chomp: true) do |line|
|
|
103
|
+
# Skip if the line is empty or a comment
|
|
104
|
+
next if line.empty? || line[0] == '#'
|
|
105
|
+
|
|
106
|
+
# parse the line to extract code point range and the name
|
|
107
|
+
blk_range, blk_name = line.split(';')
|
|
108
|
+
blk_range = Unisec::Utils::String.to_range(blk_range)
|
|
109
|
+
blk_name.lstrip!
|
|
110
|
+
case block_arg
|
|
111
|
+
when Integer # block_arg is an intgeger code point
|
|
112
|
+
found = true if blk_range.include?(block_arg)
|
|
113
|
+
when String # can be a char or block name or a string code point
|
|
114
|
+
if block_arg.size == 1 # is a char (1 code unit, not one grapheme)
|
|
115
|
+
found = true if blk_range.include?(Utils::String.convert_to_integer(block_arg))
|
|
116
|
+
elsif block_arg.start_with?('U+') # string code point
|
|
117
|
+
found = true if blk_range.include?(Utils::String.stdhexcp2deccp(block_arg))
|
|
118
|
+
elsif blk_name.downcase == block_arg.downcase # block name
|
|
119
|
+
found = true
|
|
120
|
+
end
|
|
121
|
+
end
|
|
122
|
+
if found
|
|
123
|
+
return {
|
|
124
|
+
range: blk_range,
|
|
125
|
+
name: blk_name,
|
|
126
|
+
range_size: with_count ? blk_range.size : nil,
|
|
127
|
+
char_count: with_count ? count_char_in_block(blk_range) : nil
|
|
128
|
+
}
|
|
129
|
+
end
|
|
130
|
+
end
|
|
131
|
+
nil # not found
|
|
132
|
+
end
|
|
133
|
+
|
|
134
|
+
# List unassigned, unallocated ranges.
|
|
135
|
+
# @return [Array<Range>] List of unassigned (code-point) ranges
|
|
136
|
+
# @example
|
|
137
|
+
# Unisec::Blocks.list_unassigned # => [12256..12271, 66048..66175, …]
|
|
138
|
+
def self.list_unassigned # rubocop:disable Metrics/AbcSize
|
|
139
|
+
base = (0x0000..0x10ffff)
|
|
140
|
+
assigned = Unisec::Blocks.list.map { |b| b[:range] }
|
|
141
|
+
|
|
142
|
+
unassigned = []
|
|
143
|
+
cursor = base.begin
|
|
144
|
+
|
|
145
|
+
assigned.each do |r|
|
|
146
|
+
unassigned << (cursor..(r.begin - 1)) if cursor < r.begin
|
|
147
|
+
cursor = r.end + 1
|
|
148
|
+
break if cursor > base.end
|
|
149
|
+
end
|
|
150
|
+
|
|
151
|
+
unassigned << (cursor..base.end) if cursor <= base.end
|
|
152
|
+
|
|
153
|
+
unassigned
|
|
154
|
+
end
|
|
155
|
+
|
|
156
|
+
# Display a CLI-friendly output listing all blocks
|
|
157
|
+
# @param with_count [TrueClass|FalseClass] calculate block's range size & char count?
|
|
158
|
+
def self.list_display(with_count: false) # rubocop:disable Metrics/AbcSize
|
|
159
|
+
blocks = list(with_count: with_count)
|
|
160
|
+
display = ->(key, value, just) { print Paint[key, :red, :bold] + " #{value}".ljust(just) }
|
|
161
|
+
blocks.each do |blk|
|
|
162
|
+
display.call('Range:', Utils::Range.range2codepoint_range(blk[:range]), 22)
|
|
163
|
+
display.call('Name:', blk[:name], 50)
|
|
164
|
+
if with_count
|
|
165
|
+
display.call('Range size:', blk[:range_size], 8)
|
|
166
|
+
display.call('Char count:', blk[:char_count], 0)
|
|
167
|
+
end
|
|
168
|
+
puts
|
|
169
|
+
end
|
|
170
|
+
nil
|
|
171
|
+
end
|
|
172
|
+
|
|
173
|
+
# Display a CLI-friendly output detailing the searched block
|
|
174
|
+
# @param block_arg [Integer|String] Decimal code point or standardized hexadecimal codepoint or string character (only one, so be careful with emojis, composed or joint characters using several units) or directly look for the block name (case insensitive).
|
|
175
|
+
# @param with_count [TrueClass|FalseClass] calculate block's range size & char count?
|
|
176
|
+
def self.block_display(block_arg, with_count: false)
|
|
177
|
+
blk = block(block_arg, with_count: with_count)
|
|
178
|
+
if blk.nil?
|
|
179
|
+
puts "no block found with #{block_arg}"
|
|
180
|
+
else
|
|
181
|
+
display = ->(key, value) { puts Paint[key, :red, :bold] + " #{value}" }
|
|
182
|
+
display.call('Range:', Utils::Range.range2codepoint_range(blk[:range]))
|
|
183
|
+
display.call('Name:', blk[:name])
|
|
184
|
+
if with_count
|
|
185
|
+
display.call('Range size:', blk[:range_size])
|
|
186
|
+
display.call('Char count:', blk[:char_count])
|
|
187
|
+
end
|
|
188
|
+
end
|
|
189
|
+
nil
|
|
190
|
+
end
|
|
191
|
+
|
|
192
|
+
# Display a CLI-friendly output listing all invalid and unassigned ranges.
|
|
193
|
+
def self.list_invalid_display # rubocop:disable Metrics/AbcSize
|
|
194
|
+
display = ->(key, value, just) { print Paint[key, :red, :bold] + " #{value}".ljust(just) }
|
|
195
|
+
puts '(Assigned) invalid, private, reserved ranges:'
|
|
196
|
+
INVALID_RANGES.each do |blk|
|
|
197
|
+
display.call('Range:', Utils::Range.range2codepoint_range(blk[:range]), 22)
|
|
198
|
+
display.call('Name:', blk[:name], 50)
|
|
199
|
+
puts
|
|
200
|
+
end
|
|
201
|
+
puts "\nUnasigned, unallocated ranges:"
|
|
202
|
+
list_unassigned.each do |blk|
|
|
203
|
+
display.call('Range:', Utils::Range.range2codepoint_range(blk), 22)
|
|
204
|
+
puts
|
|
205
|
+
end
|
|
206
|
+
nil
|
|
207
|
+
end
|
|
208
|
+
end
|
|
209
|
+
end
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'dry/cli'
|
|
4
|
+
require 'unisec'
|
|
5
|
+
require 'unisec/utils'
|
|
6
|
+
|
|
7
|
+
module Unisec
|
|
8
|
+
module CLI
|
|
9
|
+
module Commands
|
|
10
|
+
# CLI sub-commands `unisec blocks xxx` for the class {Unisec::Blocks} from the lib.
|
|
11
|
+
module Blocks
|
|
12
|
+
# Command `unisec blocks list`
|
|
13
|
+
#
|
|
14
|
+
# Example:
|
|
15
|
+
#
|
|
16
|
+
# ```plaintext
|
|
17
|
+
# $ unisec blocks list
|
|
18
|
+
# Range: U+0000 - U+007F Name: Basic Latin
|
|
19
|
+
# Range: U+0080 - U+00FF Name: Latin-1 Supplement
|
|
20
|
+
# …
|
|
21
|
+
# ```
|
|
22
|
+
class List < Dry::CLI::Command
|
|
23
|
+
desc 'List all Unicode blocks'
|
|
24
|
+
|
|
25
|
+
option :with_count, default: 'false', values: %w[true false],
|
|
26
|
+
desc: "calculate block's range size & char count?"
|
|
27
|
+
|
|
28
|
+
# List Unicode blocks
|
|
29
|
+
def call(**options)
|
|
30
|
+
Unisec::Blocks.list_display(with_count: options[:with_count].to_bool)
|
|
31
|
+
end
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
# Command `unisec blocks search`
|
|
35
|
+
#
|
|
36
|
+
# Example:
|
|
37
|
+
#
|
|
38
|
+
# ```plaintext
|
|
39
|
+
# $ unisec blocks search 127745
|
|
40
|
+
# $ unisec blocks search U+1f4a9
|
|
41
|
+
# $ unisec blocks search …
|
|
42
|
+
# $ unisec blocks search javanese
|
|
43
|
+
# ```
|
|
44
|
+
class Search < Dry::CLI::Command
|
|
45
|
+
desc 'Search for a specific block'
|
|
46
|
+
|
|
47
|
+
argument :block_arg, required: true,
|
|
48
|
+
desc: 'Decimal code point | standardized hexadecimal codepoint | string character ' \
|
|
49
|
+
'(only one, so be careful with emojis, composed or joint characters using ' \
|
|
50
|
+
'several units) | block name (case insensitive)'
|
|
51
|
+
|
|
52
|
+
option :with_count, default: 'false', values: %w[true false],
|
|
53
|
+
desc: "calculate block's range size & char count?"
|
|
54
|
+
|
|
55
|
+
# Display a block matching a decimal code point, standardized hexadecimal codepoint, string character or block name
|
|
56
|
+
# @param block_arg [Integer|String] Decimal code point or standardized hexadecimal codepoint or string character (only one, so be careful with emojis, composed or joint characters using several units) or directly look for the block name (case insensitive).
|
|
57
|
+
def call(block_arg: nil, **options)
|
|
58
|
+
block_arg = block_arg.to_i if /\A\d+\Z/.match?(block_arg) # cast decimal string to integer
|
|
59
|
+
Unisec::Blocks.block_display(block_arg, with_count: options[:with_count].to_bool)
|
|
60
|
+
end
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
# Command `unisec blocks invalid`
|
|
64
|
+
#
|
|
65
|
+
# Example:
|
|
66
|
+
#
|
|
67
|
+
# ```plaintext
|
|
68
|
+
# $ unisec blocks invalid
|
|
69
|
+
# (Assigned) invalid, private, reserved ranges:
|
|
70
|
+
# Range: U+D800 - U+DFFF Name: Surrogates (invalid outside UTF-16)
|
|
71
|
+
# Range: U+E000 - U+F8FF Name: Private Use Area (located in BMP)
|
|
72
|
+
# Range: U+F0000 - U+FFFFF Name: Supplementary Private Use Area-A
|
|
73
|
+
# Range: U+100000 - U+10FFFF Name: Supplementary Private Use Area-B
|
|
74
|
+
#
|
|
75
|
+
# Unasigned, unallocated ranges:
|
|
76
|
+
# Range: U+2FE0 - U+2FEF
|
|
77
|
+
# Range: U+10200 - U+1027F
|
|
78
|
+
# Range: U+103E0 - U+103FF
|
|
79
|
+
# Range: U+107C0 - U+107FF
|
|
80
|
+
# …
|
|
81
|
+
# ```
|
|
82
|
+
class Invalid < Dry::CLI::Command
|
|
83
|
+
desc 'List all invalid and unsassigned ranges'
|
|
84
|
+
|
|
85
|
+
# List all invalid and unassigned ranges
|
|
86
|
+
def call(**)
|
|
87
|
+
Unisec::Blocks.list_invalid_display
|
|
88
|
+
end
|
|
89
|
+
end
|
|
90
|
+
end
|
|
91
|
+
end
|
|
92
|
+
end
|
|
93
|
+
end
|
data/lib/unisec/cli/cli.rb
CHANGED
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require 'unisec/cli/bidi'
|
|
4
|
+
require 'unisec/cli/blocks'
|
|
4
5
|
require 'unisec/cli/confusables'
|
|
5
|
-
require 'unisec/cli/
|
|
6
|
+
require 'unisec/cli/dump'
|
|
6
7
|
require 'unisec/cli/normalization'
|
|
8
|
+
require 'unisec/cli/planes'
|
|
7
9
|
require 'unisec/cli/properties'
|
|
8
10
|
require 'unisec/cli/rugrep'
|
|
9
11
|
require 'unisec/cli/size'
|
|
@@ -20,12 +22,18 @@ module Unisec
|
|
|
20
22
|
# Mapping between the (sub-)commands as seen by the user
|
|
21
23
|
# on the command-line interface and the CLI modules in the lib
|
|
22
24
|
register 'bidi spoof', Bidi::Spoof
|
|
25
|
+
register 'blocks invalid', Blocks::Invalid
|
|
26
|
+
register 'blocks list', Blocks::List
|
|
27
|
+
register 'blocks search', Blocks::Search
|
|
23
28
|
register 'confusables list', Confusables::List
|
|
24
29
|
register 'confusables randomize', Confusables::Randomize
|
|
30
|
+
register 'dump dec', Dump::Dec
|
|
31
|
+
register 'dump hex', Dump::Hex
|
|
25
32
|
register 'grep', Grep
|
|
26
|
-
register 'hexdump', Hexdump
|
|
27
33
|
register 'normalize all', Normalize::All
|
|
28
34
|
register 'normalize replace', Normalize::Replace
|
|
35
|
+
register 'planes list', Planes::List
|
|
36
|
+
register 'planes search', Planes::Search
|
|
29
37
|
register 'properties char', Properties::Char
|
|
30
38
|
register 'properties codepoints', Properties::Codepoints
|
|
31
39
|
register 'properties list', Properties::List
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'dry/cli'
|
|
4
|
+
require 'unisec'
|
|
5
|
+
|
|
6
|
+
module Unisec
|
|
7
|
+
module CLI
|
|
8
|
+
module Commands
|
|
9
|
+
# CLI sub-commands `unisec dump xxx` for several dump classes like {Unisec::Hexdump} or {Unisec::Decdump} from the lib.
|
|
10
|
+
module Dump
|
|
11
|
+
# CLI command `unisec dump hex` for the class {Unisec::Hexdump} from the lib.
|
|
12
|
+
#
|
|
13
|
+
# Example:
|
|
14
|
+
#
|
|
15
|
+
# ```plaintext
|
|
16
|
+
# $ unisec dump hex "ACCEIS"
|
|
17
|
+
# UTF-8: 41 43 43 45 49 53
|
|
18
|
+
# UTF-16BE: 0041 0043 0043 0045 0049 0053
|
|
19
|
+
# UTF-16LE: 4100 4300 4300 4500 4900 5300
|
|
20
|
+
# UTF-32BE: 00000041 00000043 00000043 00000045 00000049 00000053
|
|
21
|
+
# UTF-32LE: 41000000 43000000 43000000 45000000 49000000 53000000
|
|
22
|
+
#
|
|
23
|
+
# $ unisec dump hex "ACCEIS" --enc utf16le
|
|
24
|
+
# 4100 4300 4300 4500 4900 5300
|
|
25
|
+
# ```
|
|
26
|
+
class Hex < Dry::CLI::Command
|
|
27
|
+
desc 'Hexadecimal dump (hexdump) in all Unicode encodings'
|
|
28
|
+
|
|
29
|
+
argument :input, required: true,
|
|
30
|
+
desc: 'String input. Read from STDIN if equal to -.'
|
|
31
|
+
|
|
32
|
+
option :enc, default: nil, values: %w[utf8 utf16be utf16le utf32be utf32le],
|
|
33
|
+
desc: 'Output only in the specified encoding.'
|
|
34
|
+
|
|
35
|
+
# Hexdump of all Unicode encodings.
|
|
36
|
+
# @param input [String] Input string to encode
|
|
37
|
+
def call(input: nil, **options)
|
|
38
|
+
input = $stdin.read.chomp if input == '-'
|
|
39
|
+
if options[:enc].nil?
|
|
40
|
+
puts Unisec::Hexdump.new(input).display
|
|
41
|
+
else
|
|
42
|
+
# using send() is safe here thanks to the value whitelist
|
|
43
|
+
puts Unisec::Hexdump.send(options[:enc], input)
|
|
44
|
+
end
|
|
45
|
+
end
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
# CLI command `unisec dump dec` for the class {Unisec::Decdump} from the lib.
|
|
49
|
+
#
|
|
50
|
+
# Example:
|
|
51
|
+
#
|
|
52
|
+
# ```plaintext
|
|
53
|
+
# $ unisec dump dec "noraj"
|
|
54
|
+
# UTF-8: 110 111 114 097 106
|
|
55
|
+
# UTF-16BE: |000 110| |000 111| |000 114| |000 097| |000 106|
|
|
56
|
+
# UTF-16LE: |110 000| |111 000| |114 000| |097 000| |106 000|
|
|
57
|
+
# UTF-32BE: |000 000 000 110| |000 000 000 111| |000 000 000 114| |000 000 000 097| |000 000 000 106|
|
|
58
|
+
# UTF-32LE: |110 000 000 000| |111 000 000 000| |114 000 000 000| |097 000 000 000| |106 000 000 000|
|
|
59
|
+
#
|
|
60
|
+
# $ unisec dump dec "noraj" --enc utf16le
|
|
61
|
+
# |110 000| |111 000| |114 000| |097 000| |106 000|
|
|
62
|
+
# ```
|
|
63
|
+
class Dec < Dry::CLI::Command
|
|
64
|
+
desc 'Decimal dump (decdump) in all Unicode encodings'
|
|
65
|
+
|
|
66
|
+
argument :input, required: true,
|
|
67
|
+
desc: 'String input. Read from STDIN if equal to -.'
|
|
68
|
+
|
|
69
|
+
option :enc, default: nil, values: %w[utf8 utf16be utf16le utf32be utf32le],
|
|
70
|
+
desc: 'Output only in the specified encoding.'
|
|
71
|
+
|
|
72
|
+
# Decdump of all Unicode encodings.
|
|
73
|
+
# @param input [String] Input string to encode
|
|
74
|
+
def call(input: nil, **options)
|
|
75
|
+
input = $stdin.read.chomp if input == '-'
|
|
76
|
+
if options[:enc].nil?
|
|
77
|
+
puts Unisec::Decdump.new(input).display
|
|
78
|
+
else
|
|
79
|
+
# using send() is safe here thanks to the value whitelist
|
|
80
|
+
puts Unisec::Decdump.send(options[:enc], input)
|
|
81
|
+
end
|
|
82
|
+
end
|
|
83
|
+
end
|
|
84
|
+
end
|
|
85
|
+
end
|
|
86
|
+
end
|
|
87
|
+
end
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'dry/cli'
|
|
4
|
+
require 'unisec'
|
|
5
|
+
require 'unisec/utils'
|
|
6
|
+
|
|
7
|
+
module Unisec
|
|
8
|
+
module CLI
|
|
9
|
+
module Commands
|
|
10
|
+
# CLI sub-commands `unisec planes xxx` for the class {Unisec::Planes} from the lib.
|
|
11
|
+
module Planes
|
|
12
|
+
# Command `unisec planes list`
|
|
13
|
+
#
|
|
14
|
+
# Example:
|
|
15
|
+
#
|
|
16
|
+
# ```plaintext
|
|
17
|
+
# $ unisec planes list
|
|
18
|
+
# Range: U+0000 - U+FFFF Name: Basic Multilingual Plane
|
|
19
|
+
# Range: U+10000 - U+1FFFF Name: Supplementary Multilingual Plane
|
|
20
|
+
# Range: U+20000 - U+2FFFF Name: Supplementary Ideographic Plane
|
|
21
|
+
# Range: U+30000 - U+3FFFF Name: Tertiary Ideographic Plane
|
|
22
|
+
# …
|
|
23
|
+
# $ unisec planes list --with-blocks=true
|
|
24
|
+
# Range: U+0000 - U+FFFF Name: Basic Multilingual Plane
|
|
25
|
+
# Blocks:
|
|
26
|
+
# Range: U+0000 - U+007F Name: Basic Latin
|
|
27
|
+
# Range: U+0080 - U+00FF Name: Latin-1 Supplement
|
|
28
|
+
# Range: U+0100 - U+017F Name: Latin Extended-A
|
|
29
|
+
# Range: U+0180 - U+024F Name: Latin Extended-B
|
|
30
|
+
# ```
|
|
31
|
+
class List < Dry::CLI::Command
|
|
32
|
+
desc 'List all Unicode planes'
|
|
33
|
+
|
|
34
|
+
option :with_blocks, default: 'false', values: %w[true false],
|
|
35
|
+
desc: 'display the blocks associated with each plane?'
|
|
36
|
+
option :with_count, default: 'false', values: %w[true false],
|
|
37
|
+
desc: "calculate block's range size & char count?"
|
|
38
|
+
|
|
39
|
+
# List Unicode planes
|
|
40
|
+
def call(**options)
|
|
41
|
+
Unisec::Planes.list_display(with_blocks: options[:with_blocks].to_bool,
|
|
42
|
+
with_count: options[:with_count].to_bool)
|
|
43
|
+
end
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
# Command `unisec planes search`
|
|
47
|
+
#
|
|
48
|
+
# Example:
|
|
49
|
+
#
|
|
50
|
+
# ```plaintext
|
|
51
|
+
# $ unisec planes search 3
|
|
52
|
+
# Range: U+30000 - U+3FFFF Name: Tertiary Ideographic Plane
|
|
53
|
+
# $ unisec planes search 2 --with-blocks=true
|
|
54
|
+
# Range: U+20000 - U+2FFFF Name: Supplementary Ideographic Plane
|
|
55
|
+
# Blocks:
|
|
56
|
+
# Range: U+20000 - U+2A6DF Name: CJK Unified Ideographs Extension B
|
|
57
|
+
# Range: U+2A700 - U+2B73F Name: CJK Unified Ideographs Extension C
|
|
58
|
+
# Range: U+2B740 - U+2B81F Name: CJK Unified Ideographs Extension D
|
|
59
|
+
# Range: U+2B820 - U+2CEAF Name: CJK Unified Ideographs Extension E
|
|
60
|
+
# Range: U+2CEB0 - U+2EBEF Name: CJK Unified Ideographs Extension F
|
|
61
|
+
# Range: U+2EBF0 - U+2EE5F Name: CJK Unified Ideographs Extension I
|
|
62
|
+
# Range: U+2F800 - U+2FA1F Name: CJK Compatibility Ideographs Supplement
|
|
63
|
+
# $ unisec planes search 'basic multilingual plane'
|
|
64
|
+
# Range: U+0000 - U+FFFF Name: Basic Multilingual Plane
|
|
65
|
+
# $ unisec planes search 'unassigned'
|
|
66
|
+
# Range: U+40000 - U+4FFFF Name: unassigned
|
|
67
|
+
# Range: U+50000 - U+5FFFF Name: unassigned
|
|
68
|
+
# Range: U+60000 - U+6FFFF Name: unassigned
|
|
69
|
+
# Range: U+70000 - U+7FFFF Name: unassigned
|
|
70
|
+
# Range: U+80000 - U+8FFFF Name: unassigned
|
|
71
|
+
# Range: U+90000 - U+9FFFF Name: unassigned
|
|
72
|
+
# Range: U+A0000 - U+AFFFF Name: unassigned
|
|
73
|
+
# Range: U+B0000 - U+BFFFF Name: unassigned
|
|
74
|
+
# Range: U+C0000 - U+CFFFF Name: unassigned
|
|
75
|
+
# Range: U+D0000 - U+DFFFF Name: unassigned
|
|
76
|
+
# ```
|
|
77
|
+
class Search < Dry::CLI::Command
|
|
78
|
+
desc 'Search for a specific plane'
|
|
79
|
+
|
|
80
|
+
argument :plane_arg, required: true,
|
|
81
|
+
desc: 'Name or number of the plane'
|
|
82
|
+
|
|
83
|
+
option :with_blocks, default: 'false', values: %w[true false],
|
|
84
|
+
desc: 'display the blocks associated with each plane?'
|
|
85
|
+
option :with_count, default: 'false', values: %w[true false],
|
|
86
|
+
desc: "calculate block's range size & char count?"
|
|
87
|
+
|
|
88
|
+
# Display a plane matching a plane name or plane number
|
|
89
|
+
# @param plane_arg [String|Integer] name or number of the plane
|
|
90
|
+
def call(plane_arg: nil, **options)
|
|
91
|
+
plane_arg = plane_arg.to_i if /\A\d+\Z/.match?(plane_arg) # cast decimal string to integer
|
|
92
|
+
Unisec::Planes.plane_display(plane_arg, with_blocks: options[:with_blocks].to_bool,
|
|
93
|
+
with_count: options[:with_count].to_bool)
|
|
94
|
+
end
|
|
95
|
+
end
|
|
96
|
+
end
|
|
97
|
+
end
|
|
98
|
+
end
|
|
99
|
+
end
|
data/lib/unisec/cli/rugrep.rb
CHANGED
|
@@ -21,7 +21,7 @@ module Unisec
|
|
|
21
21
|
argument :regexp, required: true,
|
|
22
22
|
desc: 'regular expression'
|
|
23
23
|
|
|
24
|
-
#
|
|
24
|
+
# Unicode code point names matching regexp.
|
|
25
25
|
# @param regexp [Regexp] Regular expression without delimiters or modifiers.
|
|
26
26
|
# Supports everything Ruby Regexp supports
|
|
27
27
|
def call(regexp: nil, **)
|
data/lib/unisec/confusables.rb
CHANGED
|
@@ -2,6 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
require 'unicode/confusable'
|
|
4
4
|
require 'twitter_cldr'
|
|
5
|
+
require 'paint'
|
|
6
|
+
require 'unisec/utils'
|
|
5
7
|
|
|
6
8
|
module Unisec
|
|
7
9
|
# Operations about Unicode confusable characters (homoglyphs).
|
|
@@ -22,7 +24,7 @@ module Unisec
|
|
|
22
24
|
# @param map [Boolean] allows partial mapping, includes confusables where the given character is a part of
|
|
23
25
|
def self.list_display(chr, map: true)
|
|
24
26
|
Confusables.list(chr, map: map).each do |confu|
|
|
25
|
-
puts "#{
|
|
27
|
+
puts "#{Utils::String.char2codepoint(confu).ljust(9)} #{confu.ljust(4)} " \
|
|
26
28
|
"#{TwitterCldr::Shared::CodePoint.get(confu.codepoints.first).name}"
|
|
27
29
|
end
|
|
28
30
|
nil
|