unisec 0.0.7 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9081ac95de968e70cd91438a73b185efee13e33ccf16c40a2791da5963d3d67c
4
- data.tar.gz: 99a651d4efc5f6b36ae088ec254e0dc950b5d84db9105dc851f29d51609615f8
3
+ metadata.gz: e1c859ae327cc9381cc578456525a9fc0d6e68299f10bce6cd4f6439431a7fc0
4
+ data.tar.gz: 8c091df7ffc3e8f720ca9e5cee3d022e4cba4876530727150cc8277d61509f7c
5
5
  SHA512:
6
- metadata.gz: ab342720e300cd25e167385f70402e00ef240ed6c422f7aeea666774b4e477f423164fe3de49146bc0f4a3f2565f86effb1465f112085568c65b1625b9d911e5
7
- data.tar.gz: bc105f1430c812711727600365db10871e1bd69ad274c7ccc0a2b5e1362676304ce27e5e85c6a988986fe9a82bc6189bf3c14e6ecb577f327dc41bd6923f241d
6
+ metadata.gz: 7981fd667521cbccf1c3fdfda8610722fdf9892392568be8bacdd36719109982e07d906c9c4b5c3aff4c90d10252b93460698a3f404348d5dcbd8783124e77cb
7
+ data.tar.gz: 3b32516d01be17f5d462acade421755c5420f1f2f7d596f972c87d17425a64e06cee0fc7963d916113ea970f6a8882b47aa4e84113d31c992f8cc115c2ea5f59
data/lib/unisec/bidi.rb CHANGED
@@ -18,10 +18,10 @@ module Unisec
18
18
  # @param input [String] the target string
19
19
  # @param opts [Hash] optional parameters, see {Spoof.bidi_affix}
20
20
  # @return [String] the target string
21
- def set_target_display(input, **)
21
+ def set_target_display(input, **opts)
22
22
  @target_display = input
23
- @spoof_string = reverse(**)
24
- @spoof_payload = bidi_affix(**)
23
+ @spoof_string = reverse(**opts)
24
+ @spoof_payload = bidi_affix(**opts)
25
25
  @target_display
26
26
  end
27
27
 
data/lib/unisec/blocks.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'paint'
4
+ require 'twitter_cldr'
4
5
  require 'unisec/utils'
5
6
 
6
7
  module Unisec
@@ -114,7 +115,7 @@ module Unisec
114
115
  if block_arg.size == 1 # is a char (1 code unit, not one grapheme)
115
116
  found = true if blk_range.include?(Utils::String.convert_to_integer(block_arg))
116
117
  elsif block_arg.start_with?('U+') # string code point
117
- found = true if blk_range.include?(Utils::String.stdhexcp2deccp(block_arg))
118
+ found = true if blk_range.include?(Utils::String.convert(block_arg, :integer))
118
119
  elsif blk_name.downcase == block_arg.downcase # block name
119
120
  found = true
120
121
  end
@@ -205,5 +206,38 @@ module Unisec
205
206
  end
206
207
  nil
207
208
  end
209
+
210
+ # Returns the name of the Unicode block containing the given character.
211
+ # @param char [String] Single character (only one code unit, so be careful with
212
+ # emojis, composed or joint characters using several units, only the first
213
+ # code unit will be kept).
214
+ # @return [String] Block name or empty string if not found.
215
+ # @example
216
+ # Unisec::Blocks.reverse('…') # => "General Punctuation"
217
+ # Unisec::Blocks.reverse('A') # => "Basic Latin"
218
+ # Unisec::Blocks.reverse('💩') # => "Miscellaneous Symbols and Pictographs"
219
+ # Unisec::Blocks.reverse('🇫🇷') # => "Enclosed Alphanumeric Supplement" (only first unit is kept)
220
+ def self.reverse(char)
221
+ cp_num = TwitterCldr::Utils::CodePoints.from_string(char)
222
+ cp = TwitterCldr::Shared::CodePoint.get(cp_num.first)
223
+ props = cp.properties
224
+ props.block.join
225
+ rescue NoMethodError # in case of invalid character where CodePoint.get() => nil
226
+ ''
227
+ end
228
+
229
+ # Display a CLI-friendly output showing the block name for a given character.
230
+ # @param char [String] Single character (only one code unit, so be careful with
231
+ # emojis, composed or joint characters using several units, only the first
232
+ # code unit will be kept).
233
+ def self.reverse_display(char)
234
+ blk_name = reverse(char)
235
+ if blk_name.empty?
236
+ puts "no block found for #{char.inspect}"
237
+ else
238
+ puts blk_name
239
+ end
240
+ nil
241
+ end
208
242
  end
209
243
  end
@@ -60,6 +60,34 @@ module Unisec
60
60
  end
61
61
  end
62
62
 
63
+ # Command `unisec blocks reverse`
64
+ #
65
+ # Example:
66
+ #
67
+ # ```plaintext
68
+ # $ unisec blocks reverse '…'
69
+ # General Punctuation
70
+ # $ unisec blocks reverse 'A'
71
+ # Basic Latin
72
+ # $ unisec blocks reverse '💩'
73
+ # Miscellaneous Symbols and Pictographs
74
+ # $ unisec blocks reverse '🇫🇷'
75
+ # Enclosed Alphanumeric Supplement
76
+ # ```
77
+ class Reverse < Dry::CLI::Command
78
+ desc 'Search in which Unicode block a given character is'
79
+
80
+ argument :char, required: true,
81
+ desc: 'Single character (only one code unit, so be careful with emojis, composed or ' \
82
+ 'joint characters using several units, only the first code unit will be kept)'
83
+
84
+ # Display the Unicode block name for a given character
85
+ # @param char [String] Single character (only one code unit, so be careful with emojis, composed or joint characters using several units, only the first code unit will be kept).
86
+ def call(char: nil, **)
87
+ Unisec::Blocks.reverse_display(char)
88
+ end
89
+ end
90
+
63
91
  # Command `unisec blocks invalid`
64
92
  #
65
93
  # Example:
@@ -1,5 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'dry/cli/completion/command'
3
4
  require 'unisec/cli/bidi'
4
5
  require 'unisec/cli/blocks'
5
6
  require 'unisec/cli/confusables'
@@ -24,15 +25,23 @@ module Unisec
24
25
  register 'bidi spoof', Bidi::Spoof
25
26
  register 'blocks invalid', Blocks::Invalid
26
27
  register 'blocks list', Blocks::List
28
+ register 'blocks reverse', Blocks::Reverse
27
29
  register 'blocks search', Blocks::Search
30
+ register 'completion', Dry::CLI::Completion::Command[self]
28
31
  register 'confusables list', Confusables::List
29
32
  register 'confusables randomize', Confusables::Randomize
33
+ register 'dump codepoints integer', Dump::Codepoints::Integer
34
+ register 'dump codepoints standard', Dump::Codepoints::Standard
30
35
  register 'dump dec', Dump::Dec
31
36
  register 'dump hex', Dump::Hex
37
+ register 'dump rev', Dump::Reverse
32
38
  register 'grep', Grep
33
39
  register 'normalize all', Normalize::All
34
40
  register 'normalize replace', Normalize::Replace
41
+ register 'normalize reverse', Normalize::Reverse
42
+ register 'planes block', Planes::Block
35
43
  register 'planes list', Planes::List
44
+ register 'planes reverse', Planes::Reverse
36
45
  register 'planes search', Planes::Search
37
46
  register 'properties char', Properties::Char
38
47
  register 'properties codepoints', Properties::Codepoints
@@ -81,6 +81,87 @@ module Unisec
81
81
  end
82
82
  end
83
83
  end
84
+
85
+ module Codepoints
86
+ # CLI command `unisec dump codepoints standard`.
87
+ #
88
+ # Example:
89
+ #
90
+ # ```plaintext
91
+ # $ unisec dump codepoints standard "unicode"
92
+ # U+0075 U+006E U+0069 U+0063 U+006F U+0064 U+0065
93
+ # ```
94
+ class Standard < Dry::CLI::Command
95
+ desc 'Code point dump (standard format)'
96
+
97
+ argument :input, required: true,
98
+ desc: 'String input. Read from STDIN if equal to -.'
99
+
100
+ # Code point dump (standard format).
101
+ # @param input [String] Input string to encode
102
+ def call(input: nil)
103
+ input = $stdin.read.chomp if input == '-'
104
+ puts Unisec::Utils::String.chars2codepoints(input)
105
+ end
106
+ end
107
+
108
+ # CLI command `unisec dump codepoints integer`.
109
+ #
110
+ # Example:
111
+ #
112
+ # ```plaintext
113
+ # $ unisec dump codepoints integer 'I 💕 Ruby 💎'
114
+ # 73 32 128149 32 82 117 98 121 32 128142
115
+ # ```
116
+ class Integer < Dry::CLI::Command
117
+ desc 'Code point dump (integer format)'
118
+
119
+ argument :input, required: true,
120
+ desc: 'String input. Read from STDIN if equal to -.'
121
+
122
+ # Code point dump (integer format).
123
+ # @param input [String] Input string to encode
124
+ def call(input: nil)
125
+ input = $stdin.read.chomp if input == '-'
126
+ puts Unisec::Utils::String.chars2intcodepoints(input)
127
+ end
128
+ end
129
+ end
130
+
131
+ # CLI command `unisec dump rev` for the method {Unisec::Hexdump.reverse} from the lib.
132
+ #
133
+ # Example:
134
+ #
135
+ # ```plaintext
136
+ # $ unisec dump rev 0a0d --enc=utf16be
137
+ # ਍ (U+0A0D) - 0a0d
138
+ #
139
+ # $ unisec dump rev 808080 --enc=utf8 --exact=false
140
+ # 񀀀 (U+40000) - f1 80 80 80
141
+ # 򀀀 (U+80000) - f2 80 80 80
142
+ # 󀀀 (U+C0000) - f3 80 80 80
143
+ # 􀀀 (U+100000) - f4 80 80 80
144
+ # ```
145
+ class Reverse < Dry::CLI::Command
146
+ desc 'Reverse search in hexadecimal dump'
147
+
148
+ argument :hexbytes, required: true,
149
+ desc: 'Byte(s) in hexadecimal to search for. Read from STDIN if equal to -.'
150
+
151
+ option :enc, default: 'utf8', values: %w[utf8 utf16be utf16le utf32be utf32le],
152
+ desc: 'The target encoding in which to search.'
153
+
154
+ option :exact, default: 'true', values: %w[true false],
155
+ desc: 'true (default) = exact search, false = "sub-string" search / the value is included ' \
156
+ 'in the encoded value'
157
+
158
+ # Search X byte(s) hexadecimal value in Y encoding, basically which characters will give this resulting encoded value
159
+ # @param hexbytes [String] Byte(s) in hexadecimal to search for. Read from STDIN if equal to -.
160
+ def call(hexbytes: nil, **options)
161
+ hexbytes = $stdin.read.chomp if hexbytes == '-'
162
+ puts Unisec::Hexdump.display_reverse(hexbytes, options[:enc], exact: options[:exact].to_bool)
163
+ end
164
+ end
84
165
  end
85
166
  end
86
167
  end
@@ -81,6 +81,37 @@ module Unisec
81
81
  puts Unisec::Normalization.new(input).display_replace
82
82
  end
83
83
  end
84
+
85
+ # Command `unisec normalize reverse '<'`
86
+ #
87
+ # Example:
88
+ #
89
+ # ```plaintext
90
+ # $ unisec normalize reverse '"' --forms 'nfkc,nfkd'
91
+ # Original:
92
+ # " (U+0022)
93
+ # NFKC
94
+ # " (U+FF02)
95
+ # NFKD
96
+ # " (U+FF02)
97
+ # ```
98
+ class Reverse < Dry::CLI::Command
99
+ desc 'List reverse normalization candidates (what characters will transform into target after normalization)'
100
+
101
+ argument :target, required: true,
102
+ desc: 'Normalization target. Read from STDIN if equal to -.'
103
+
104
+ option :forms, default: %i[nfc nfd nfkc nfkd],
105
+ desc: 'Output only in the specified normalization form(s). ' \
106
+ 'Separate by comma if multiple values.'
107
+
108
+ # Reverse normalize
109
+ # @param target [String] Normalization target
110
+ def call(target: nil, **options)
111
+ target = $stdin.read.chomp if target == '-'
112
+ puts Unisec::Normalization.display_reverse_normalize(target, forms: options[:forms])
113
+ end
114
+ end
84
115
  end
85
116
  end
86
117
  end
@@ -93,6 +93,58 @@ module Unisec
93
93
  with_count: options[:with_count].to_bool)
94
94
  end
95
95
  end
96
+
97
+ # Command `unisec planes reverse`
98
+ #
99
+ # Example:
100
+ #
101
+ # ```plaintext
102
+ # $ unisec planes reverse '…'
103
+ # Basic Multilingual Plane
104
+ # $ unisec planes reverse '🨂'
105
+ # Supplementary Multilingual Plane
106
+ # $ unisec planes reverse '𠀀'
107
+ # Supplementary Ideographic Plane
108
+ # $ unisec planes reverse '🇫🇷'
109
+ # Supplementary Multilingual Plane
110
+ # ```
111
+ class Reverse < Dry::CLI::Command
112
+ desc 'Search in which Unicode plane a given character is'
113
+
114
+ argument :char, required: true,
115
+ desc: 'Single character (only one code unit, so be careful with emojis, composed or joint ' \
116
+ 'characters using several units), only the first code unit will be kept).'
117
+
118
+ # Display the Unicode plane name for a given character
119
+ # @param char [String] Single character (only one code unit, so be careful with emojis,
120
+ # composed or joint characters using several units, only the first code unit will be kept).
121
+ def call(char: nil, **)
122
+ Unisec::Planes.reverse_display(char)
123
+ end
124
+ end
125
+
126
+ # Command `unisec planes block`
127
+ #
128
+ # Example:
129
+ #
130
+ # ```plaintext
131
+ # $ unisec planes block 'Basic Latin'
132
+ # Basic Multilingual Plane
133
+ # $ unisec planes block 'Miscellaneous Symbols and Pictographs'
134
+ # Supplementary Multilingual Plane
135
+ # ```
136
+ class Block < Dry::CLI::Command
137
+ desc 'Search in which Unicode plane a block is'
138
+
139
+ argument :block_arg, required: true,
140
+ desc: 'Block name (case insensitive)'
141
+
142
+ # Display the Unicode plane name for a given block
143
+ # @param block_arg [String] Block name (case insensitive).
144
+ def call(block_arg: nil, **)
145
+ Unisec::Planes.block_display(block_arg)
146
+ end
147
+ end
96
148
  end
97
149
  end
98
150
  end
@@ -85,6 +85,33 @@ module Unisec
85
85
  str.encode('UTF-32LE').to_hex.scan(/.{8}/).join(' ')
86
86
  end
87
87
 
88
+ # Search X byte(s) hexadecimal value in Y encoding, basically which characters will give this resulting encoded value
89
+ # @param hexbytes [String] Byte(s) in hexadecimal to search for
90
+ # @param enc [String] The target encoding in which to search. It uses Unisec CLI argument values (utf8 utf16be utf16le utf32be utf32le).
91
+ # @param exact [TrueClass|FalseClass] true (default) = exact search, false = "sub-string" search / the value is included in the encoded value
92
+ # @return [Array<String>] all matching source characters
93
+ # @example
94
+ # Unisec::Hexdump.reverse('61', 'utf8') # => ["a"]
95
+ # Unisec::Hexdump.reverse('a6', 'utf8', exact: true) # => []
96
+ # Unisec::Hexdump.reverse('a6', 'utf8', exact: false) # => ["¦", "æ", "Ħ", "Ŧ", "Ʀ", "Ǧ", … ]
97
+ # Unisec::Hexdump.reverse('0d0a', 'utf16be', exact: true) # => ["\u0D0A"] (ഊ)
98
+ def self.reverse(hexbytes, enc, exact: true)
99
+ chars = []
100
+ (0x000000..0x10FFFF).each do |i|
101
+ char = i.chr(Unisec::Utils::Arguments.argenc2enc(enc, target: 'class'))
102
+ encoded_value = Unisec::Hexdump.send(enc, char).delete(' ')
103
+ if exact && encoded_value == hexbytes # exact match
104
+ chars << char
105
+ break
106
+ elsif !exact && encoded_value.include?(hexbytes) # includes value
107
+ chars << char
108
+ end
109
+ rescue RangeError # skip invalid code points for selected encoding
110
+ next
111
+ end
112
+ chars
113
+ end
114
+
88
115
  # Display a CLI-friendly output summarizing the hexdump in all Unicode encodings
89
116
  # @return [String] CLI-ready output
90
117
  # @example
@@ -101,5 +128,29 @@ module Unisec
101
128
  "UTF-32BE: #{@utf32be}\n" \
102
129
  "UTF-32LE: #{@utf32le}"
103
130
  end
131
+
132
+ # Display a CLI-friendly output summarizing the reverse hexdump search results
133
+ # @param hexbytes [String] see {Unisec::Hexdump.reverse}
134
+ # @param enc [String] see {Unisec::Hexdump.reverse}
135
+ # @param exact [TrueClass|FalseClass] see {Unisec::Hexdump.reverse}
136
+ # @return [String] CLI-ready output
137
+ # @example
138
+ # puts Unisec::Hexdump.display_reverse('0d0a', 'utf16be', exact: true)
139
+ # # ഊ (U+0D0A) - 0d0a
140
+ # puts Unisec::Hexdump.display_reverse('808080', 'utf8', exact: false)
141
+ # # 񀀀 (U+40000) - f1 80 80 80
142
+ # # 򀀀 (U+80000) - f2 80 80 80
143
+ # # 󀀀 (U+C0000) - f3 80 80 80
144
+ # # 􀀀 (U+100000) - f4 80 80 80
145
+ def self.display_reverse(hexbytes, enc, exact: true)
146
+ res = Unisec::Hexdump.reverse(hexbytes, enc, exact: exact)
147
+ out = ''
148
+ res.each do |char|
149
+ cp = Utils::String.char2codepoint(char)
150
+ hxd = Unisec::Hexdump.send(enc, char)
151
+ out += "#{char.encode('UTF-8')} (#{cp}) - #{hxd}\n"
152
+ end
153
+ out
154
+ end
104
155
  end
105
156
  end
@@ -95,6 +95,35 @@ module Unisec
95
95
  Normalization.replace_bypass(@original)
96
96
  end
97
97
 
98
+ # Find the list of symbols that will transform into a given symbol after normalization
99
+ # @param target [String]
100
+ # @param forms [String|Symbol|Array<Symbol>]
101
+ # @return [Hash] (results won't include input)
102
+ # @example
103
+ # Unisec::Normalization.reverse_normalize('<') # => {nfc: [], nfd: [], nfkc: ["﹤", "<"], nfkd: ["﹤", "<"]}
104
+ # Unisec::Normalization.reverse_normalize('.', forms: [:nfkc, :nfkd]) # => {nfkc: ["․", "﹒", "."], nfkd: ["․", "﹒", "."]}
105
+ # Unisec::Normalization.reverse_normalize('ffi', forms: :nfkc) # => {nfkc: ["ffi"]}
106
+ # Unisec::Normalization.reverse_normalize('≯', forms: 'nfd') # => {nfd: ["≯"]}
107
+ # Unisec::Normalization.reverse_normalize('ô', forms: 'nfc,nfd') # => {nfc: [], nfd: []}
108
+ def self.reverse_normalize(target, forms: %i[nfc nfd nfkc nfkd])
109
+ forms = Utils::Arguments.to_array_of_sym(forms)
110
+ result = {}
111
+ forms.each do |form|
112
+ result[form] = []
113
+ end
114
+
115
+ (0x000000..0x10FFFF).each do |codepoint|
116
+ char = codepoint.chr(Encoding::UTF_8)
117
+ forms.each do |form|
118
+ result[form] << char if (char.unicode_normalize(form) == target) && (char != target)
119
+ end
120
+ rescue RangeError # skip UTF-16 surrogates and potential other invalid code points
121
+ next
122
+ end
123
+
124
+ result
125
+ end
126
+
98
127
  # Display a CLI-friendly output summarizing all normalization forms
99
128
  # @return [String] CLI-ready output
100
129
  # @example
@@ -124,6 +153,18 @@ module Unisec
124
153
 
125
154
  # Display a CLI-friendly output of the XSS payload to bypass HTML escape and
126
155
  # what it does once normalized in NFKC & NFKD.
156
+ # @return [String] CLI-ready output
157
+ # @example
158
+ # $ puts Unisec::Normalization.new('<script>').display_replace
159
+ # # =>
160
+ # # Original: <script>
161
+ # # U+003C U+0073 U+0063 U+0072 U+0069 U+0070 U+0074 U+003E
162
+ # # Bypass payload: <script>
163
+ # # U+FF1C U+0073 U+0063 U+0072 U+0069 U+0070 U+0074 U+FF1E
164
+ # # NFKC: <script>
165
+ # # U+003C U+0073 U+0063 U+0072 U+0069 U+0070 U+0074 U+003E
166
+ # # NFKD: <script>
167
+ # # U+003C U+0073 U+0063 U+0072 U+0069 U+0070 U+0074 U+003E
127
168
  def display_replace
128
169
  colorize = lambda { |form_title, form_attr|
129
170
  "#{Paint[form_title.to_s, :underline,
@@ -135,5 +176,36 @@ module Unisec
135
176
  colorize.call('NFKC', Normalization.nfkc(payload)) +
136
177
  colorize.call('NFKD', Normalization.nfkd(payload))
137
178
  end
179
+
180
+ # Display a CLI-friendly output of the reverse normalization results
181
+ # @param target [String] see {Unisec::Normalization.reverse_normalize}
182
+ # @param forms [String|Symbol|Array<Symbol>] see {Unisec::Normalization.reverse_normalize}
183
+ # @return [String] CLI-ready output
184
+ # @example
185
+ # puts Unisec::Normalization.display_reverse_normalize('<')
186
+ # # =>
187
+ # # Original:
188
+ # # < (U+003C)
189
+ # # NFKC
190
+ # # ﹤ (U+FE64)
191
+ # # < (U+FF1C)
192
+ # # NFKD
193
+ # # ﹤ (U+FE64)
194
+ # # < (U+FF1C)
195
+ def self.display_reverse_normalize(target, forms: %i[nfc nfd nfkc nfkd]) # rubocop:disable Metrics/AbcSize
196
+ colorize_form = ->(form_title) { Paint[form_title, :underline, :bold] }
197
+ colorize_char = ->(char) { " #{char} (#{Paint[Unisec::Utils::String.chars2codepoints(char), :red]})\n" }
198
+ out = "#{colorize_form.call('Original')}:\n#{colorize_char.call(target)}"
199
+ res = Unisec::Normalization.reverse_normalize(target, forms: forms) # => {nfc: [], nfd: [], nfkc: ["﹤", "<"], nfkd: ["﹤", "<"]}
200
+ res.each_key do |k|
201
+ next if res[k].empty?
202
+
203
+ out += "#{colorize_form.call(k.to_s.upcase)}\n"
204
+ res[k].each do |v|
205
+ out += colorize_char.call(v)
206
+ end
207
+ end
208
+ out
209
+ end
138
210
  end
139
211
  end
data/lib/unisec/planes.rb CHANGED
@@ -220,5 +220,71 @@ module Unisec
220
220
  end
221
221
  nil
222
222
  end
223
+
224
+ # Returns the name of the Unicode plane containing the given character.
225
+ # @param char [String] Single character (only one code unit, so be careful with
226
+ # emojis, composed or joint characters using several units, only the first
227
+ # code unit will be kept).
228
+ # @return [String] Plane name or empty string if not found.
229
+ # @example
230
+ # Unisec::Planes.reverse('…') # => "Basic Multilingual Plane"
231
+ # Unisec::Planes.reverse('🨂') # => "Supplementary Multilingual Plane"
232
+ # Unisec::Planes.reverse('𠀀') # => "Supplementary Ideographic Plane"
233
+ # Unisec::Planes.reverse('🇫🇷') # => "Supplementary Multilingual Plane" (first unit kept)
234
+ def self.reverse(char)
235
+ return '' unless char.is_a?(String)
236
+
237
+ cp = Utils::String.convert_to_integer(char[0])
238
+ PLANES.each do |plane|
239
+ return plane[:name] if plane[:range].include?(cp)
240
+ end
241
+ '' # not found
242
+ end
243
+
244
+ # Display a CLI-friendly output showing the plane name for a given character.
245
+ # @param char [String] Single character (only one code unit, so be careful with
246
+ # emojis, composed or joint characters using several units, only the first
247
+ # code unit will be kept).
248
+ def self.reverse_display(char)
249
+ plane_name = reverse(char)
250
+ if plane_name.empty?
251
+ puts "no plane found for #{char.inspect}"
252
+ else
253
+ puts plane_name
254
+ end
255
+ nil
256
+ end
257
+
258
+ # Returns the name of the Unicode plane containing the given block.
259
+ # @param block_arg [String] Block name (case insensitive).
260
+ # @return [String] Plane name or empty string if not found.
261
+ # @example
262
+ # Unisec::Planes.block('Basic Latin') # => "Basic Multilingual Plane"
263
+ # Unisec::Planes.block('Miscellaneous Symbols and Pictographs') # => "Supplementary Multilingual Plane"
264
+ def self.block(block_arg) # rubocop:disable Metrics/CyclomaticComplexity
265
+ # support only search by block name
266
+ return '' if block_arg.is_a?(Integer)
267
+ return '' if block_arg.is_a?(String) && (block_arg.size == 1 || block_arg.start_with?('U+'))
268
+
269
+ blk = Blocks.block(block_arg, with_count: false)
270
+ return '' unless blk # block name not found
271
+
272
+ PLANES.each do |plane|
273
+ return plane[:name] if plane[:range].cover?(blk[:range])
274
+ end
275
+ '' # not found
276
+ end
277
+
278
+ # Display a CLI-friendly output showing the plane name for a given block.
279
+ # @param block_arg [String] Block name (case insensitive).
280
+ def self.block_display(block_arg)
281
+ plane_name = block(block_arg)
282
+ if plane_name.empty?
283
+ puts "no plane found for block #{block_arg.inspect}"
284
+ else
285
+ puts plane_name
286
+ end
287
+ nil
288
+ end
223
289
  end
224
290
  end
@@ -75,9 +75,10 @@ module Unisec
75
75
  end
76
76
  {
77
77
  age: props.age.join,
78
+ plane: Unisec::Planes.reverse(chr),
78
79
  block: props.block.join,
79
80
  category: categories[1],
80
- subcategory: categories[0],
81
+ subcategory: "#{categories[0]} (#{cp.category})",
81
82
  codepoint: Utils::String.char2codepoint(chr),
82
83
  name: cp.name,
83
84
  script: props.script.join,
@@ -119,8 +120,9 @@ module Unisec
119
120
  data = Properties.char(chr)
120
121
  display = ->(key, value) { puts Paint[key, :red, :bold].ljust(30) + " #{value}" }
121
122
  display.call('Name:', data[:name])
122
- display.call('Code Point:', data[:codepoint])
123
+ display.call('Code Point:', data[:codepoint] + " (#{Utils::String.convert(chr, :integer)})")
123
124
  puts
125
+ display.call('Plane', data[:plane])
124
126
  display.call('Block:', data[:block])
125
127
  display.call('Category:', data[:category])
126
128
  display.call('Sub-Category:', data[:subcategory])
data/lib/unisec/utils.rb CHANGED
@@ -55,27 +55,31 @@ module Unisec
55
55
  # About string conversion and manipulation.
56
56
  module String
57
57
  # Convert a string input into the chosen type.
58
- # @param input [String] If the target type is `:integer`, the string must represent a number encoded in
59
- # hexadecimal, decimal, binary. If it's a Unicode string, only the first code point will be taken into account.
60
- # @param target_type [Symbol] Convert to the chosen type. Currently only supports `:integer`.
58
+ # @param input [String] If the input is a Unicode string, only the first code point will be taken into account.
59
+ # The input must represent a character encoded in hexadecimal, decimal, binary or standard code point format.
60
+ # See {convert_to_integer} and {convert_to_char} for detailed examples.
61
+ # @param target_type [Symbol] Convert to the chosen type. Currently only supports `:integer` and `:char`.
61
62
  # @return [Variable] The type of the output depends on the chosen `target_type`.
62
63
  # @example
63
64
  # Unisec::Utils::String.convert('0x1f4a9', :integer) # => 128169
65
+ # Unisec::Utils::String.convert('0x1f4a9', :char) # => "💩"
64
66
  def self.convert(input, target_type)
65
67
  case target_type
66
68
  when :integer
67
69
  convert_to_integer(input)
70
+ when :char
71
+ convert_to_char(input)
68
72
  else
69
73
  raise TypeError, "Target type \"#{target_type}\" not avaible"
70
74
  end
71
75
  end
72
76
 
73
- # Internal method used for {.convert}.
77
+ # Internal method used for {convert}.
74
78
  #
75
79
  # Convert a string input into integer.
76
- # @param input [String] The string must represent a number encoded in hexadecimal, decimal, binary. If it's a
77
- # Unicode string, only the first code point will be taken into account. The input type is determined
78
- # automatically based on the prefix.
80
+ # @param input [String] If the input is a Unicode string, only the first code point will be taken into account.
81
+ # The input must represent a character encoded in hexadecimal, decimal, binary, standard code point format.
82
+ # The input type is determined automatically based on the prefix.
79
83
  # @return [Integer]
80
84
  # @example
81
85
  # # Hexadecimal
@@ -86,10 +90,14 @@ module Unisec
86
90
  # Unisec::Utils::String.convert_to_integer('0b11111010010101001') # => 128169
87
91
  # # Unicode string
88
92
  # Unisec::Utils::String.convert_to_integer('💩') # => 128169
93
+ # # Standardized format of hexadecimal code point
94
+ # Unisec::Utils::String.convert_to_integer('U+1F4A9') # => 128169
89
95
  def self.convert_to_integer(input)
90
96
  case autodetect(input)
91
97
  when :hexadecimal
92
98
  input.hex2dec(prefix: '0x').to_i
99
+ when :stdcp
100
+ input.hex2dec(prefix: 'U+').to_i
93
101
  when :decimal
94
102
  input.to_i
95
103
  when :binary
@@ -101,11 +109,38 @@ module Unisec
101
109
  end
102
110
  end
103
111
 
112
+ # Internal method used for {convert}.
113
+ #
114
+ # Convert a string input into a character.
115
+ # @param input [String] If the input is a Unicode string, only the first code point will be taken into account.
116
+ # The input must represent a character encoded in hexadecimal, decimal, binary, standard code point format.
117
+ # The input type is determined automatically based on the prefix.
118
+ # @return [String]
119
+ # @example
120
+ # # Hexadecimal
121
+ # Unisec::Utils::String.convert_to_char('0x1f4a9') # => "💩"
122
+ # # Decimal
123
+ # Unisec::Utils::String.convert_to_char('0d128169') # => "💩"
124
+ # # Binary
125
+ # Unisec::Utils::String.convert_to_char('0b11111010010101001') # => "💩"
126
+ # # Unicode string
127
+ # Unisec::Utils::String.convert_to_char('💩') # => "💩"
128
+ # # Standardized format of hexadecimal code point
129
+ # Unisec::Utils::String.convert_to_char('U+1F4A9') # => "💩"
130
+ def self.convert_to_char(input)
131
+ case autodetect(input)
132
+ when :hexadecimal, :stdcp, :decimal, :binary, :string
133
+ [convert(input, :integer)].pack('U')
134
+ else
135
+ raise TypeError, "Input \"#{input}\" is not of the expected type"
136
+ end
137
+ end
138
+
104
139
  # Internal method used for {.convert}.
105
140
  #
106
141
  # Autodetect the representation type of the string input.
107
142
  # @param str [String] Input.
108
- # @return [Symbol] the detected type: `:hexadecimal`, `:decimal`, `:binary`, `:string`.
143
+ # @return [Symbol] the detected type: `:hexadecimal`, `:decimal`, `:binary`, `:string`, `:stdcp`.
109
144
  # @example
110
145
  # # Hexadecimal
111
146
  # Unisec::Utils::String.autodetect('0x1f4a9') # => :hexadecimal
@@ -115,10 +150,14 @@ module Unisec
115
150
  # Unisec::Utils::String.autodetect('0b11111010010101001') # => :binary
116
151
  # # Unicode string
117
152
  # Unisec::Utils::String.autodetect('💩') # => :string
153
+ # # Standardized format of hexadecimal code point
154
+ # Unisec::Utils::String.autodetect('U+1F4A9') # => :stdcp
118
155
  def self.autodetect(str)
119
156
  case str
120
- when /0x[0-9a-fA-F]/
157
+ when /0x[0-9a-fA-F]+/
121
158
  :hexadecimal
159
+ when /U\+[0-9A-F]+/
160
+ :stdcp
122
161
  when /0d[0-9]+/
123
162
  :decimal
124
163
  when /0b[0-1]+/
@@ -141,8 +180,9 @@ module Unisec
141
180
  # Display the code point in Unicode format for a given character (code point as string)
142
181
  # @param chr [String] Unicode code point (as character / string)
143
182
  # @return [String] code point in Unicode format
183
+ # @todo Replace this method by target type :stdcp in String.convert()
144
184
  # @example
145
- # Unisec::Properties.char2codepoint('💎') # => "U+1F48E"
185
+ # Unisec::Utils::String.char2codepoint('💎') # => "U+1F48E"
146
186
  def self.char2codepoint(chr)
147
187
  Integer.deccp2stdhexcp(chr.codepoints.first)
148
188
  end
@@ -151,8 +191,8 @@ module Unisec
151
191
  # @param chrs [String] Unicode code points (as characters / string)
152
192
  # @return [String] code points in Unicode format
153
193
  # @example
154
- # Unisec::Properties.chars2codepoints("ỳ́") # => "U+0079 U+0300 U+0301"
155
- # Unisec::Properties.chars2codepoints("🧑‍🌾") # => "U+1F9D1 U+200D U+1F33E"
194
+ # Unisec::Utils::String.chars2codepoints("ỳ́") # => "U+0079 U+0300 U+0301"
195
+ # Unisec::Utils::String.chars2codepoints("🧑‍🌾") # => "U+1F9D1 U+200D U+1F33E"
156
196
  def self.chars2codepoints(chrs)
157
197
  out = []
158
198
  chrs.each_char do |chr|
@@ -161,6 +201,15 @@ module Unisec
161
201
  out.join(' ')
162
202
  end
163
203
 
204
+ # Display the code points in integer format for the given characters (code points as string)
205
+ # @param chrs [String] Unicode code points (as characters / string)
206
+ # @return [String] code points in integer format
207
+ # @example
208
+ # Unisec::Utils::String.chars2intcodepoints('I 💕 Ruby 💎') # => "73 32 128149 32 82 117 98 121 32 128142"
209
+ def self.chars2intcodepoints(chrs)
210
+ chrs.codepoints.join(' ')
211
+ end
212
+
164
213
  # Convert a string of hex encoded Unicode code points range to actual
165
214
  # integer Ruby range.
166
215
  # @param range_str [String] Unicode code points range as in data/Blocks.txt
@@ -170,22 +219,13 @@ module Unisec
170
219
  def self.to_range(range_str)
171
220
  ::Range.new(*range_str.split('..').map { |x| x.hex2dec.to_i })
172
221
  end
173
-
174
- # Convert from standardized format hexadecimal code point to decimal code point
175
- # @param std_hex_cp [String] Code point in standardized hexadecimal format
176
- # @return [Integer] Code point in decimal format
177
- # @example
178
- # Unisec::Utils::String.stdhexcp2deccp('U+2026') # => 8230
179
- def self.stdhexcp2deccp(std_hex_cp)
180
- hex = "0x#{std_hex_cp[2..]}" # replace U+ prefix with 0x
181
- convert_to_integer(hex)
182
- end
183
222
  end
184
223
 
185
224
  module Integer
186
225
  # Convert from decimal code point to standardized format hexadecimal code point
187
226
  # @param int_cp [Integer] Code point in decimal format
188
227
  # @return [String] code point in Unicode format
228
+ # @todo Replace this method by the Integer.convert()
189
229
  # @example
190
230
  # Unisec::Utils::Integer.deccp2stdhexcp(128640) # => "U+1F680"
191
231
  def self.deccp2stdhexcp(int_cp)
@@ -196,12 +236,54 @@ module Unisec
196
236
  module Range
197
237
  # Convert a (integer) range to a range of Unicode code points
198
238
  # @param range [::Range]
199
- # @return [String]
239
+ # @return [::String]
200
240
  # @example
201
241
  # Unisec::Utils::Range.range2codepoint_range(1048576..1114111) # => "U+100000 - U+10FFFF"
202
242
  def self.range2codepoint_range(range)
203
243
  "#{Integer.deccp2stdhexcp(range.begin)} - #{Integer.deccp2stdhexcp(range.end)}"
204
244
  end
205
245
  end
246
+
247
+ module Arguments
248
+ # Converts an argument that is a string, a string of arguments separated by comma, a symbol to an array of symbol.
249
+ # Useful for methods that are expected to work on array of symbols but can receive various format of imputs (e.g. from CLI).
250
+ # @param input [::String|Symbol] (anything else will be returned untransformed)
251
+ # @return [Array<Symbol>] (or anything else if input type is not respected)
252
+ # @example
253
+ # Unisec::Utils::Arguments.to_array_of_sym("arg") # => [:arg]
254
+ # Unisec::Utils::Arguments.to_array_of_sym("a,b,c") # => [:a, :b, :c]
255
+ # Unisec::Utils::Arguments.to_array_of_sym(:snake) # => [:snake]
256
+ # Unisec::Utils::Arguments.to_array_of_sym([:a, :b, :c]) # => [:a, :b, :c]
257
+ def self.to_array_of_sym(input)
258
+ case input
259
+ when ::String # a,b,c => [:a, :b, :c]
260
+ input.split(',').map(&:to_sym)
261
+ when ::Symbol # :a => [:a]
262
+ [input]
263
+ else
264
+ input
265
+ end
266
+ end
267
+
268
+ # Converts encoding name from CLI to encoding name in standard format or Ruby Class
269
+ # @param argenc [::String] Encoding name as used as argument in Unisec CLI (authorized values are: utf8 utf16be utf16le utf32be utf32le).
270
+ # @param target [::String] 'standard' for standard encoding name, 'class' for Ruby class naming
271
+ # @return [::String|Class]
272
+ # @example
273
+ # Unisec::Utils::Arguments.argenc2enc('utf8', target: 'standard') # => "UTF-8"
274
+ # Unisec::Utils::Arguments.argenc2enc('utf16be', target: 'class') # => #<Encoding:UTF-16BE (autoload)>
275
+ def self.argenc2enc(argenc, target: 'standard')
276
+ argument_encodings = %w[utf8 utf16be utf16le utf32be utf32le]
277
+ raise ArgumentError unless argument_encodings.include?(argenc)
278
+
279
+ if target == 'standard'
280
+ argenc.upcase.insert(3, '-')
281
+ elsif target == 'class'
282
+ Encoding.const_get(argenc.upcase.insert(3, '_')) # const_get safe thanks to input whitelist
283
+ else
284
+ raise ArgumentError
285
+ end
286
+ end
287
+ end
206
288
  end
207
289
  end
@@ -2,5 +2,5 @@
2
2
 
3
3
  module Unisec
4
4
  # Version of unisec library and app
5
- VERSION = '0.0.7'
5
+ VERSION = '0.0.9'
6
6
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: unisec
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.7
4
+ version: 0.0.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Alexandre ZANNI
@@ -29,14 +29,28 @@ dependencies:
29
29
  requirements:
30
30
  - - "~>"
31
31
  - !ruby/object:Gem::Version
32
- version: '1.0'
32
+ version: '1.4'
33
33
  type: :runtime
34
34
  prerelease: false
35
35
  version_requirements: !ruby/object:Gem::Requirement
36
36
  requirements:
37
37
  - - "~>"
38
38
  - !ruby/object:Gem::Version
39
- version: '1.0'
39
+ version: '1.4'
40
+ - !ruby/object:Gem::Dependency
41
+ name: dry-cli-completion
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: 2.0.0
47
+ type: :runtime
48
+ prerelease: false
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: 2.0.0
40
54
  - !ruby/object:Gem::Dependency
41
55
  name: paint
42
56
  requirement: !ruby/object:Gem::Requirement
@@ -71,14 +85,14 @@ dependencies:
71
85
  requirements:
72
86
  - - "~>"
73
87
  - !ruby/object:Gem::Version
74
- version: '1.12'
88
+ version: '1.13'
75
89
  type: :runtime
76
90
  prerelease: false
77
91
  version_requirements: !ruby/object:Gem::Requirement
78
92
  requirements:
79
93
  - - "~>"
80
94
  - !ruby/object:Gem::Version
81
- version: '1.12'
95
+ version: '1.13'
82
96
  description: 'Toolkit for security research manipulating Unicode: confusables, homoglyphs,
83
97
  hexdump, code point, UTF-8, UTF-16, UTF-32, properties, regexp search, size, grapheme,
84
98
  surrogates, version, ICU, CLDR, UCD, BiDi, normalization'
@@ -137,7 +151,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
137
151
  requirements:
138
152
  - - ">="
139
153
  - !ruby/object:Gem::Version
140
- version: 3.2.0
154
+ version: 3.3.0
141
155
  - - "<"
142
156
  - !ruby/object:Gem::Version
143
157
  version: '5.0'
@@ -147,7 +161,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
147
161
  - !ruby/object:Gem::Version
148
162
  version: '0'
149
163
  requirements: []
150
- rubygems_version: 4.0.3
164
+ rubygems_version: 4.0.10
151
165
  specification_version: 4
152
166
  summary: Unicode Security Toolkit
153
167
  test_files: []