unisec 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,11 +1,12 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'unisec/cli/surrogates'
3
+ require 'unisec/cli/confusables'
4
4
  require 'unisec/cli/hexdump'
5
5
  require 'unisec/cli/properties'
6
- require 'unisec/cli/confusables'
7
- require 'unisec/cli/versions'
6
+ require 'unisec/cli/rugrep'
8
7
  require 'unisec/cli/size'
8
+ require 'unisec/cli/surrogates'
9
+ require 'unisec/cli/versions'
9
10
 
10
11
  module Unisec
11
12
  # Module used to create the CLI for the executable
@@ -16,16 +17,17 @@ module Unisec
16
17
 
17
18
  # Mapping between the (sub-)commands as seen by the user
18
19
  # on the command-line interface and the CLI modules in the lib
19
- register 'surrogates to', Surrogates::To
20
- register 'surrogates from', Surrogates::From
21
- register 'hexdump', Hexdump
22
- register 'properties list', Properties::List
23
- register 'properties codepoints', Properties::Codepoints
24
- register 'properties char', Properties::Char
25
20
  register 'confusables list', Confusables::List
26
21
  register 'confusables randomize', Confusables::Randomize
27
- register 'versions', Versions
22
+ register 'grep', Grep
23
+ register 'hexdump', Hexdump
24
+ register 'properties char', Properties::Char
25
+ register 'properties codepoints', Properties::Codepoints
26
+ register 'properties list', Properties::List
28
27
  register 'size', Size
28
+ register 'surrogates from', Surrogates::From
29
+ register 'surrogates to', Surrogates::To
30
+ register 'versions', Versions
29
31
  end
30
32
  end
31
33
  end
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'dry/cli'
4
+ require 'unisec'
5
+
6
+ module Unisec
7
+ module CLI
8
+ module Commands
9
+ # CLI command `unisec grep` for the class {Unisec::Rugrep} from the lib.
10
+ #
11
+ # Example:
12
+ #
13
+ # ```plaintext
14
+ # $ unisec grep 'FRENCH \w+'
15
+ # U+20A3 ₣ FRENCH FRANC SIGN
16
+ # U+1F35F 🍟 FRENCH FRIES
17
+ # ```
18
+ class Grep < Dry::CLI::Command
19
+ desc 'Search for Unicode code point names by regular expression'
20
+
21
+ argument :regexp, required: true,
22
+ desc: 'regular expression'
23
+
24
+ # Search Unicode code point names by regular expression and display the matches.
25
+ # @param regexp [Regexp] Regular expression without delimiters or modifiers.
26
+ # Supports everything Ruby Regexp supports
27
+ def call(regexp: nil, **)
28
+ puts Unisec::Rugrep.regrep_display(regexp)
29
+ end
30
+ end
31
+ end
32
+ end
33
+ end
@@ -50,7 +50,7 @@ module Unisec
50
50
  def self.codepoints_display(prop)
51
51
  codepoints = Properties.codepoints(prop)
52
52
  codepoints.each do |cp|
53
- puts "#{Properties.char2codepoint(cp[:char]).ljust(7)} #{cp[:char].ljust(4)} #{cp[:name]}"
53
+ puts "#{Properties.deccp2stdhexcp(cp[:codepoint]).ljust(7)} #{cp[:char].ljust(4)} #{cp[:name]}"
54
54
  end
55
55
  nil
56
56
  end
@@ -158,7 +158,7 @@ module Unisec
158
158
  # @example
159
159
  # Unisec::Properties.char2codepoint('💎') # => "U+1F48E"
160
160
  def self.char2codepoint(chr)
161
- "U+#{format('%.4x', chr.codepoints.first).upcase}"
161
+ Properties.deccp2stdhexcp(chr.codepoints.first)
162
162
  end
163
163
 
164
164
  # Display the code points in Unicode format for the given characters (code points as string)
@@ -174,5 +174,14 @@ module Unisec
174
174
  end
175
175
  out.join(' ')
176
176
  end
177
+
178
+ # Convert from decimal code point to standardized format hexadecimal code point
179
+ # @param int_cp [Integer] Code point in decimal format
180
+ # @return [String] code point in Unicode format
181
+ # @example
182
+ # Unisec::Properties.deccp2stdhexcp(128640) # => "U+1F680"
183
+ def self.deccp2stdhexcp(int_cp)
184
+ "U+#{format('%.4x', int_cp).upcase}"
185
+ end
177
186
  end
178
187
  end
@@ -0,0 +1,126 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'twitter_cldr'
4
+ require 'paint'
5
+
6
+ module Unisec
7
+ # Ruby grep : Ruby regular expression search for Unicode code point names
8
+ class Rugrep
9
+ # UCD Derived names file location
10
+ # @see https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedName.txt
11
+ UCD_DERIVEDNAME = File.join(__dir__, '../../data/DerivedName.txt')
12
+
13
+ # Search code points by (Ruby) regexp
14
+ # @param regexp [Regexp] Regular expression without delimiters or modifiers.
15
+ # Supports everything Ruby Regexp supports
16
+ # @return [Array<Hash>] Array of code points (`{char: String, codepoint: Integer, name: String}`)
17
+ # @example
18
+ # Unisec::Rugrep.regrep('snowman|snowflake')
19
+ # # =>
20
+ # # [{:char=>"☃", :codepoint=>9731, :name=>"SNOWMAN"},
21
+ # # {:char=>"⛄", :codepoint=>9924, :name=>"SNOWMAN WITHOUT SNOW"},
22
+ # # {:char=>"⛇", :codepoint=>9927, :name=>"BLACK SNOWMAN"},
23
+ # # {:char=>"❄", :codepoint=>10052, :name=>"SNOWFLAKE"},
24
+ # # {:char=>"❅", :codepoint=>10053, :name=>"TIGHT TRIFOLIATE SNOWFLAKE"},
25
+ # # {:char=>"❆", :codepoint=>10054, :name=>"HEAVY CHEVRON SNOWFLAKE"}]
26
+ # Unisec::Rugrep.regrep('greek small letter \w+')
27
+ # # =>
28
+ # # [{:char=>"ͱ", :codepoint=>881, :name=>"GREEK SMALL LETTER HETA"},
29
+ # # {:char=>"ͳ", :codepoint=>883, :name=>"GREEK SMALL LETTER ARCHAIC SAMPI"},
30
+ # # {:char=>"ͷ", :codepoint=>887, :name=>"GREEK SMALL LETTER PAMPHYLIAN DIGAMMA"},
31
+ # # …]
32
+ def self.regrep(regexp)
33
+ out = []
34
+ file = File.new(UCD_DERIVEDNAME)
35
+ file.each_line(chomp: true) do |line|
36
+ # Skip if the line is empty or a comment
37
+ next if line.empty? || line[0] == '#'
38
+
39
+ # parse the line to extract code point as integer and the name
40
+ cp_int, name = line.split(';')
41
+ cp_int = cp_int.chomp.to_i(16)
42
+ name.lstrip!
43
+ next unless /#{regexp}/i.match?(name) # compiling regexp once is surprisingly not faster
44
+
45
+ out << {
46
+ char: TwitterCldr::Utils::CodePoints.to_string([cp_int]),
47
+ codepoint: cp_int,
48
+ name: name
49
+ }
50
+ end
51
+ out
52
+ end
53
+
54
+ # Display a CLI-friendly output listing all code points corresponding to a regular expression.
55
+ # @example
56
+ # Unisec::Rugrep.regrep_display('snowman|snowflake')
57
+ # # =>
58
+ # # U+2603 ☃ SNOWMAN
59
+ # # U+26C4 ⛄ SNOWMAN WITHOUT SNOW
60
+ # # U+26C7 ⛇ BLACK SNOWMAN
61
+ # # U+2744 ❄ SNOWFLAKE
62
+ # # U+2745 ❅ TIGHT TRIFOLIATE SNOWFLAKE
63
+ # # U+2746 ❆ HEAVY CHEVRON SNOWFLAKE
64
+ def self.regrep_display(regexp)
65
+ codepoints = regrep(regexp)
66
+ codepoints.each do |cp|
67
+ puts "#{Properties.deccp2stdhexcp(cp[:codepoint]).ljust(7)} #{cp[:char].ljust(4)} #{cp[:name]}"
68
+ end
69
+ nil
70
+ end
71
+
72
+ # Returns the version of Unicode used in UCD local file (data/DerivedName.txt)
73
+ # @return [String] Unicode version
74
+ # @example
75
+ # Unisec::Rugrep.ucd_derivedname_version # => "15.1.0"
76
+ def self.ucd_derivedname_version
77
+ first_line = File.open(UCD_DERIVEDNAME, &:readline)
78
+ first_line.match(/-(\d+\.\d+\.\d+)\.txt/).captures.first
79
+ end
80
+
81
+ # Search code points by (Ruby) regexp
82
+ # @param regexp [Regexp] Regular expression without delimiters or modifiers
83
+ # @return [Array<Hash>] Array of code points (`{char: String, codepoint: Integer, name: String}`)
84
+ # @example
85
+ # Unisec::Rugrep.regrep_slow('snowman|snowflake')
86
+ # # =>
87
+ # # [{:char=>"☃", :codepoint=>9731, :name=>"SNOWMAN"},
88
+ # # {:char=>"⛄", :codepoint=>9924, :name=>"SNOWMAN WITHOUT SNOW"},
89
+ # # {:char=>"⛇", :codepoint=>9927, :name=>"BLACK SNOWMAN"},
90
+ # # {:char=>"❄", :codepoint=>10052, :name=>"SNOWFLAKE"},
91
+ # # {:char=>"❅", :codepoint=>10053, :name=>"TIGHT TRIFOLIATE SNOWFLAKE"},
92
+ # # {:char=>"❆", :codepoint=>10054, :name=>"HEAVY CHEVRON SNOWFLAKE"}]
93
+ # @note ⚠ This command is very time consuming (~ 1min) and unoptimized (execute one regexp per code point…)
94
+ def self.regrep_slow(regexp)
95
+ out = []
96
+ TwitterCldr::Shared::CodePoint.each do |cp|
97
+ next unless /#{regexp}/oi.match?(cp.name) # compiling regexp once is surprisingly not faster
98
+
99
+ out << {
100
+ char: TwitterCldr::Utils::CodePoints.to_string([cp.code_point]),
101
+ codepoint: cp.code_point,
102
+ name: cp.name
103
+ }
104
+ end
105
+ out
106
+ end
107
+
108
+ # Display a CLI-friendly output listing all code points corresponding to a regular expression.
109
+ # @example
110
+ # Unisec::Rugrep.regrep_display_slow('snowman|snowflake')
111
+ # # =>
112
+ # # U+2603 ☃ SNOWMAN
113
+ # # U+26C4 ⛄ SNOWMAN WITHOUT SNOW
114
+ # # U+26C7 ⛇ BLACK SNOWMAN
115
+ # # U+2744 ❄ SNOWFLAKE
116
+ # # U+2745 ❅ TIGHT TRIFOLIATE SNOWFLAKE
117
+ # # U+2746 ❆ HEAVY CHEVRON SNOWFLAKE
118
+ def self.regrep_display_slow(regexp)
119
+ codepoints = regrep_slow(regexp)
120
+ codepoints.each do |cp|
121
+ puts "#{Properties.deccp2stdhexcp(cp[:codepoint]).ljust(7)} #{cp[:char].ljust(4)} #{cp[:name]}"
122
+ end
123
+ nil
124
+ end
125
+ end
126
+ end
@@ -2,5 +2,5 @@
2
2
 
3
3
  module Unisec
4
4
  # Version of unisec library and app
5
- VERSION = '0.0.2'
5
+ VERSION = '0.0.3'
6
6
  end
@@ -51,6 +51,10 @@ module Unisec
51
51
  unicodeconfusable_unicode: {
52
52
  version: Unicode::Confusable::UNICODE_VERSION,
53
53
  label: 'Unicode (unicode-confusable gem)'
54
+ },
55
+ ucd_derivedname: {
56
+ version: Unisec::Rugrep.ucd_derivedname_version,
57
+ label: 'UCD (data/DerivedName.txt)'
54
58
  }
55
59
  }
56
60
  end
@@ -76,6 +80,7 @@ module Unisec
76
80
  display.call(:twittercldr_icu)
77
81
  display.call(:twittercldr_cldr)
78
82
  display.call(:ruby_unicode_emoji)
83
+ display.call(:ucd_derivedname)
79
84
  puts Paint["\nGems:", :underline]
80
85
  display.call(:unisec)
81
86
  display.call(:twittercldr)
data/lib/unisec.rb CHANGED
@@ -2,9 +2,10 @@
2
2
 
3
3
  require 'unisec/version'
4
4
 
5
- require 'unisec/surrogates'
5
+ require 'unisec/confusables'
6
6
  require 'unisec/hexdump'
7
7
  require 'unisec/properties'
8
- require 'unisec/confusables'
9
- require 'unisec/versions'
8
+ require 'unisec/rugrep'
10
9
  require 'unisec/size'
10
+ require 'unisec/surrogates'
11
+ require 'unisec/versions'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: unisec
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Alexandre ZANNI
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-08-18 00:00:00.000000000 Z
11
+ date: 2023-10-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ctf-party
@@ -86,7 +86,9 @@ dependencies:
86
86
  - - "~>"
87
87
  - !ruby/object:Gem::Version
88
88
  version: '1.9'
89
- description: Toolkit for security research manipulating Unicode
89
+ description: 'Toolkit for security research manipulating Unicode: confusables, homoglyphs,
90
+ hexdump, code point, UTF-8, UTF-16, UTF-32, properties, regexp search, size, grapheme,
91
+ surrogates, version, ICU, CLDR, UCD'
90
92
  email: alexandre.zanni@europe.com
91
93
  executables:
92
94
  - unisec
@@ -95,17 +97,20 @@ extra_rdoc_files: []
95
97
  files:
96
98
  - LICENSE
97
99
  - bin/unisec
100
+ - data/DerivedName.txt
98
101
  - lib/unisec.rb
99
102
  - lib/unisec/cli/cli.rb
100
103
  - lib/unisec/cli/confusables.rb
101
104
  - lib/unisec/cli/hexdump.rb
102
105
  - lib/unisec/cli/properties.rb
106
+ - lib/unisec/cli/rugrep.rb
103
107
  - lib/unisec/cli/size.rb
104
108
  - lib/unisec/cli/surrogates.rb
105
109
  - lib/unisec/cli/versions.rb
106
110
  - lib/unisec/confusables.rb
107
111
  - lib/unisec/hexdump.rb
108
112
  - lib/unisec/properties.rb
113
+ - lib/unisec/rugrep.rb
109
114
  - lib/unisec/size.rb
110
115
  - lib/unisec/surrogates.rb
111
116
  - lib/unisec/utils.rb
@@ -140,7 +145,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
140
145
  - !ruby/object:Gem::Version
141
146
  version: '0'
142
147
  requirements: []
143
- rubygems_version: 3.4.1
148
+ rubygems_version: 3.4.10
144
149
  signing_key:
145
150
  specification_version: 4
146
151
  summary: Unicode Security Toolkit