unisec 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,11 +1,12 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'unisec/cli/surrogates'
3
+ require 'unisec/cli/confusables'
4
4
  require 'unisec/cli/hexdump'
5
5
  require 'unisec/cli/properties'
6
- require 'unisec/cli/confusables'
7
- require 'unisec/cli/versions'
6
+ require 'unisec/cli/rugrep'
8
7
  require 'unisec/cli/size'
8
+ require 'unisec/cli/surrogates'
9
+ require 'unisec/cli/versions'
9
10
 
10
11
  module Unisec
11
12
  # Module used to create the CLI for the executable
@@ -16,16 +17,17 @@ module Unisec
16
17
 
17
18
  # Mapping between the (sub-)commands as seen by the user
18
19
  # on the command-line interface and the CLI modules in the lib
19
- register 'surrogates to', Surrogates::To
20
- register 'surrogates from', Surrogates::From
21
- register 'hexdump', Hexdump
22
- register 'properties list', Properties::List
23
- register 'properties codepoints', Properties::Codepoints
24
- register 'properties char', Properties::Char
25
20
  register 'confusables list', Confusables::List
26
21
  register 'confusables randomize', Confusables::Randomize
27
- register 'versions', Versions
22
+ register 'grep', Grep
23
+ register 'hexdump', Hexdump
24
+ register 'properties char', Properties::Char
25
+ register 'properties codepoints', Properties::Codepoints
26
+ register 'properties list', Properties::List
28
27
  register 'size', Size
28
+ register 'surrogates from', Surrogates::From
29
+ register 'surrogates to', Surrogates::To
30
+ register 'versions', Versions
29
31
  end
30
32
  end
31
33
  end
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'dry/cli'
4
+ require 'unisec'
5
+
6
+ module Unisec
7
+ module CLI
8
+ module Commands
9
+ # CLI command `unisec grep` for the class {Unisec::Rugrep} from the lib.
10
+ #
11
+ # Example:
12
+ #
13
+ # ```plaintext
14
+ # $ unisec grep 'FRENCH \w+'
15
+ # U+20A3 ₣ FRENCH FRANC SIGN
16
+ # U+1F35F 🍟 FRENCH FRIES
17
+ # ```
18
+ class Grep < Dry::CLI::Command
19
+ desc 'Search for Unicode code point names by regular expression'
20
+
21
+ argument :regexp, required: true,
22
+ desc: 'regular expression'
23
+
24
+ # Search for Unicode code point names matching a regular expression and print them.
25
+ # @param regexp [Regexp] Regular expression without delimiters or modifiers.
26
+ # Supports everything Ruby Regexp supports
27
+ def call(regexp: nil, **)
28
+ puts Unisec::Rugrep.regrep_display(regexp)
29
+ end
30
+ end
31
+ end
32
+ end
33
+ end
@@ -50,7 +50,7 @@ module Unisec
50
50
  def self.codepoints_display(prop)
51
51
  codepoints = Properties.codepoints(prop)
52
52
  codepoints.each do |cp|
53
- puts "#{Properties.char2codepoint(cp[:char]).ljust(7)} #{cp[:char].ljust(4)} #{cp[:name]}"
53
+ puts "#{Properties.deccp2stdhexcp(cp[:codepoint]).ljust(7)} #{cp[:char].ljust(4)} #{cp[:name]}"
54
54
  end
55
55
  nil
56
56
  end
@@ -158,7 +158,7 @@ module Unisec
158
158
  # @example
159
159
  # Unisec::Properties.char2codepoint('💎') # => "U+1F48E"
160
160
  def self.char2codepoint(chr)
161
- "U+#{format('%.4x', chr.codepoints.first).upcase}"
161
+ Properties.deccp2stdhexcp(chr.codepoints.first)
162
162
  end
163
163
 
164
164
  # Display the code points in Unicode format for the given characters (code points as string)
@@ -174,5 +174,14 @@ module Unisec
174
174
  end
175
175
  out.join(' ')
176
176
  end
177
+
178
+ # Convert from decimal code point to standardized format hexadecimal code point
179
+ # @param int_cp [Integer] Code point in decimal format
180
+ # @return [String] code point in Unicode format
181
+ # @example
182
+ # Unisec::Properties.deccp2stdhexcp(128640) # => "U+1F680"
183
+ def self.deccp2stdhexcp(int_cp)
184
+ "U+#{format('%.4x', int_cp).upcase}"
185
+ end
177
186
  end
178
187
  end
@@ -0,0 +1,126 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'twitter_cldr'
4
+ require 'paint'
5
+
6
+ module Unisec
7
+ # Ruby grep : Ruby regular expression search for Unicode code point names
8
+ class Rugrep
9
+ # UCD Derived names file location
10
+ # @see https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedName.txt
11
+ UCD_DERIVEDNAME = File.join(__dir__, '../../data/DerivedName.txt')
12
+
13
+ # Search code points by (Ruby) regexp
14
+ # @param regexp [Regexp] Regular expression without delimiters or modifiers.
15
+ # Supports everything Ruby Regexp supports
16
+ # @return [Array<Hash>] Array of code points (`{char: String, codepoint: Integer, name: String}`)
17
+ # @example
18
+ # Unisec::Rugrep.regrep('snowman|snowflake')
19
+ # # =>
20
+ # # [{:char=>"☃", :codepoint=>9731, :name=>"SNOWMAN"},
21
+ # # {:char=>"⛄", :codepoint=>9924, :name=>"SNOWMAN WITHOUT SNOW"},
22
+ # # {:char=>"⛇", :codepoint=>9927, :name=>"BLACK SNOWMAN"},
23
+ # # {:char=>"❄", :codepoint=>10052, :name=>"SNOWFLAKE"},
24
+ # # {:char=>"❅", :codepoint=>10053, :name=>"TIGHT TRIFOLIATE SNOWFLAKE"},
25
+ # # {:char=>"❆", :codepoint=>10054, :name=>"HEAVY CHEVRON SNOWFLAKE"}]
26
+ # Unisec::Rugrep.regrep('greek small letter \w+')
27
+ # # =>
28
+ # # [{:char=>"ͱ", :codepoint=>881, :name=>"GREEK SMALL LETTER HETA"},
29
+ # # {:char=>"ͳ", :codepoint=>883, :name=>"GREEK SMALL LETTER ARCHAIC SAMPI"},
30
+ # # {:char=>"ͷ", :codepoint=>887, :name=>"GREEK SMALL LETTER PAMPHYLIAN DIGAMMA"},
31
+ # # …]
32
+ def self.regrep(regexp)
33
+ out = []
34
+ file = File.new(UCD_DERIVEDNAME)
35
+ file.each_line(chomp: true) do |line|
36
+ # Skip if the line is empty or a comment
37
+ next if line.empty? || line[0] == '#'
38
+
39
+ # parse the line to extract code point as integer and the name
40
+ cp_int, name = line.split(';')
41
+ cp_int = cp_int.chomp.to_i(16)
42
+ name.lstrip!
43
+ next unless /#{regexp}/i.match?(name) # compiling regexp once is surprisingly not faster
44
+
45
+ out << {
46
+ char: TwitterCldr::Utils::CodePoints.to_string([cp_int]),
47
+ codepoint: cp_int,
48
+ name: name
49
+ }
50
+ end
51
+ out
52
+ end
53
+
54
+ # Display a CLI-friendly output listing all code points corresponding to a regular expression.
55
+ # @example
56
+ # Unisec::Rugrep.regrep_display('snowman|snowflake')
57
+ # # =>
58
+ # # U+2603 ☃ SNOWMAN
59
+ # # U+26C4 ⛄ SNOWMAN WITHOUT SNOW
60
+ # # U+26C7 ⛇ BLACK SNOWMAN
61
+ # # U+2744 ❄ SNOWFLAKE
62
+ # # U+2745 ❅ TIGHT TRIFOLIATE SNOWFLAKE
63
+ # # U+2746 ❆ HEAVY CHEVRON SNOWFLAKE
64
+ def self.regrep_display(regexp)
65
+ codepoints = regrep(regexp)
66
+ codepoints.each do |cp|
67
+ puts "#{Properties.deccp2stdhexcp(cp[:codepoint]).ljust(7)} #{cp[:char].ljust(4)} #{cp[:name]}"
68
+ end
69
+ nil
70
+ end
71
+
72
+ # Returns the version of Unicode used in UCD local file (data/DerivedName.txt)
73
+ # @return [String] Unicode version
74
+ # @example
75
+ # Unisec::Rugrep.ucd_derivedname_version # => "15.1.0"
76
+ def self.ucd_derivedname_version
77
+ first_line = File.open(UCD_DERIVEDNAME, &:readline)
78
+ first_line.match(/-(\d+\.\d+\.\d+)\.txt/).captures.first
79
+ end
80
+
81
+ # Search code points by (Ruby) regexp
82
+ # @param regexp [Regexp] Regular expression without delimiters or modifiers
83
+ # @return [Array<Hash>] Array of code points (`{char: String, codepoint: Integer, name: String}`)
84
+ # @example
85
+ # Unisec::Rugrep.regrep_slow('snowman|snowflake')
86
+ # # =>
87
+ # # [{:char=>"☃", :codepoint=>9731, :name=>"SNOWMAN"},
88
+ # # {:char=>"⛄", :codepoint=>9924, :name=>"SNOWMAN WITHOUT SNOW"},
89
+ # # {:char=>"⛇", :codepoint=>9927, :name=>"BLACK SNOWMAN"},
90
+ # # {:char=>"❄", :codepoint=>10052, :name=>"SNOWFLAKE"},
91
+ # # {:char=>"❅", :codepoint=>10053, :name=>"TIGHT TRIFOLIATE SNOWFLAKE"},
92
+ # # {:char=>"❆", :codepoint=>10054, :name=>"HEAVY CHEVRON SNOWFLAKE"}]
93
+ # @note ⚠ This command is very time consuming (~ 1min) and unoptimized (execute one regexp per code point…)
94
+ def self.regrep_slow(regexp)
95
+ out = []
96
+ TwitterCldr::Shared::CodePoint.each do |cp|
97
+ next unless /#{regexp}/oi.match?(cp.name) # compiling regexp once is surprisingly not faster
98
+
99
+ out << {
100
+ char: TwitterCldr::Utils::CodePoints.to_string([cp.code_point]),
101
+ codepoint: cp.code_point,
102
+ name: cp.name
103
+ }
104
+ end
105
+ out
106
+ end
107
+
108
+ # Display a CLI-friendly output listing all code points corresponding to a regular expression.
109
+ # @example
110
+ # Unisec::Rugrep.regrep_display_slow('snowman|snowflake')
111
+ # # =>
112
+ # # U+2603 ☃ SNOWMAN
113
+ # # U+26C4 ⛄ SNOWMAN WITHOUT SNOW
114
+ # # U+26C7 ⛇ BLACK SNOWMAN
115
+ # # U+2744 ❄ SNOWFLAKE
116
+ # # U+2745 ❅ TIGHT TRIFOLIATE SNOWFLAKE
117
+ # # U+2746 ❆ HEAVY CHEVRON SNOWFLAKE
118
+ def self.regrep_display_slow(regexp)
119
+ codepoints = regrep_slow(regexp)
120
+ codepoints.each do |cp|
121
+ puts "#{Properties.deccp2stdhexcp(cp[:codepoint]).ljust(7)} #{cp[:char].ljust(4)} #{cp[:name]}"
122
+ end
123
+ nil
124
+ end
125
+ end
126
+ end
@@ -2,5 +2,5 @@
2
2
 
3
3
  module Unisec
4
4
  # Version of unisec library and app
5
- VERSION = '0.0.2'
5
+ VERSION = '0.0.3'
6
6
  end
@@ -51,6 +51,10 @@ module Unisec
51
51
  unicodeconfusable_unicode: {
52
52
  version: Unicode::Confusable::UNICODE_VERSION,
53
53
  label: 'Unicode (unicode-confusable gem)'
54
+ },
55
+ ucd_derivedname: {
56
+ version: Unisec::Rugrep.ucd_derivedname_version,
57
+ label: 'UCD (data/DerivedName.txt)'
54
58
  }
55
59
  }
56
60
  end
@@ -76,6 +80,7 @@ module Unisec
76
80
  display.call(:twittercldr_icu)
77
81
  display.call(:twittercldr_cldr)
78
82
  display.call(:ruby_unicode_emoji)
83
+ display.call(:ucd_derivedname)
79
84
  puts Paint["\nGems:", :underline]
80
85
  display.call(:unisec)
81
86
  display.call(:twittercldr)
data/lib/unisec.rb CHANGED
@@ -2,9 +2,10 @@
2
2
 
3
3
  require 'unisec/version'
4
4
 
5
- require 'unisec/surrogates'
5
+ require 'unisec/confusables'
6
6
  require 'unisec/hexdump'
7
7
  require 'unisec/properties'
8
- require 'unisec/confusables'
9
- require 'unisec/versions'
8
+ require 'unisec/rugrep'
10
9
  require 'unisec/size'
10
+ require 'unisec/surrogates'
11
+ require 'unisec/versions'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: unisec
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Alexandre ZANNI
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-08-18 00:00:00.000000000 Z
11
+ date: 2023-10-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ctf-party
@@ -86,7 +86,9 @@ dependencies:
86
86
  - - "~>"
87
87
  - !ruby/object:Gem::Version
88
88
  version: '1.9'
89
- description: Toolkit for security research manipulating Unicode
89
+ description: 'Toolkit for security research manipulating Unicode: confusables, homoglyphs,
90
+ hexdump, code point, UTF-8, UTF-16, UTF-32, properties, regexp search, size, grapheme,
91
+ surrogates, version, ICU, CLDR, UCD'
90
92
  email: alexandre.zanni@europe.com
91
93
  executables:
92
94
  - unisec
@@ -95,17 +97,20 @@ extra_rdoc_files: []
95
97
  files:
96
98
  - LICENSE
97
99
  - bin/unisec
100
+ - data/DerivedName.txt
98
101
  - lib/unisec.rb
99
102
  - lib/unisec/cli/cli.rb
100
103
  - lib/unisec/cli/confusables.rb
101
104
  - lib/unisec/cli/hexdump.rb
102
105
  - lib/unisec/cli/properties.rb
106
+ - lib/unisec/cli/rugrep.rb
103
107
  - lib/unisec/cli/size.rb
104
108
  - lib/unisec/cli/surrogates.rb
105
109
  - lib/unisec/cli/versions.rb
106
110
  - lib/unisec/confusables.rb
107
111
  - lib/unisec/hexdump.rb
108
112
  - lib/unisec/properties.rb
113
+ - lib/unisec/rugrep.rb
109
114
  - lib/unisec/size.rb
110
115
  - lib/unisec/surrogates.rb
111
116
  - lib/unisec/utils.rb
@@ -140,7 +145,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
140
145
  - !ruby/object:Gem::Version
141
146
  version: '0'
142
147
  requirements: []
143
- rubygems_version: 3.4.1
148
+ rubygems_version: 3.4.10
144
149
  signing_key:
145
150
  specification_version: 4
146
151
  summary: Unicode Security Toolkit