unisec 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/data/DerivedName.txt +44181 -0
- data/lib/unisec/cli/cli.rb +12 -10
- data/lib/unisec/cli/rugrep.rb +33 -0
- data/lib/unisec/properties.rb +11 -2
- data/lib/unisec/rugrep.rb +126 -0
- data/lib/unisec/version.rb +1 -1
- data/lib/unisec/versions.rb +5 -0
- data/lib/unisec.rb +4 -3
- metadata +9 -4
data/lib/unisec/cli/cli.rb
CHANGED
@@ -1,11 +1,12 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require 'unisec/cli/
|
3
|
+
require 'unisec/cli/confusables'
|
4
4
|
require 'unisec/cli/hexdump'
|
5
5
|
require 'unisec/cli/properties'
|
6
|
-
require 'unisec/cli/
|
7
|
-
require 'unisec/cli/versions'
|
6
|
+
require 'unisec/cli/rugrep'
|
8
7
|
require 'unisec/cli/size'
|
8
|
+
require 'unisec/cli/surrogates'
|
9
|
+
require 'unisec/cli/versions'
|
9
10
|
|
10
11
|
module Unisec
|
11
12
|
# Module used to create the CLI for the executable
|
@@ -16,16 +17,17 @@ module Unisec
|
|
16
17
|
|
17
18
|
# Mapping between the (sub-)commands as seen by the user
|
18
19
|
# on the command-line interface and the CLI modules in the lib
|
19
|
-
register 'surrogates to', Surrogates::To
|
20
|
-
register 'surrogates from', Surrogates::From
|
21
|
-
register 'hexdump', Hexdump
|
22
|
-
register 'properties list', Properties::List
|
23
|
-
register 'properties codepoints', Properties::Codepoints
|
24
|
-
register 'properties char', Properties::Char
|
25
20
|
register 'confusables list', Confusables::List
|
26
21
|
register 'confusables randomize', Confusables::Randomize
|
27
|
-
register '
|
22
|
+
register 'grep', Grep
|
23
|
+
register 'hexdump', Hexdump
|
24
|
+
register 'properties char', Properties::Char
|
25
|
+
register 'properties codepoints', Properties::Codepoints
|
26
|
+
register 'properties list', Properties::List
|
28
27
|
register 'size', Size
|
28
|
+
register 'surrogates from', Surrogates::From
|
29
|
+
register 'surrogates to', Surrogates::To
|
30
|
+
register 'versions', Versions
|
29
31
|
end
|
30
32
|
end
|
31
33
|
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'dry/cli'
|
4
|
+
require 'unisec'
|
5
|
+
|
6
|
+
module Unisec
|
7
|
+
module CLI
|
8
|
+
module Commands
|
9
|
+
# CLI command `unisec grep` for the class {Unisec::Rugrep} from the lib.
|
10
|
+
#
|
11
|
+
# Example:
|
12
|
+
#
|
13
|
+
# ```plaintext
|
14
|
+
# $ unisec grep 'FRENCH \w+'
|
15
|
+
# U+20A3 ₣ FRENCH FRANC SIGN
|
16
|
+
# U+1F35F 🍟 FRENCH FRIES
|
17
|
+
# ```
|
18
|
+
class Grep < Dry::CLI::Command
|
19
|
+
desc 'Search for Unicode code point names by regular expression'
|
20
|
+
|
21
|
+
argument :regexp, required: true,
|
22
|
+
desc: 'regular expression'
|
23
|
+
|
24
|
+
# Search Unicode code point names by regular expression and display the matches.
|
25
|
+
# @param regexp [Regexp] Regular expression without delimiters or modifiers.
|
26
|
+
# Supports everything Ruby Regexp supports
|
27
|
+
def call(regexp: nil, **)
|
28
|
+
puts Unisec::Rugrep.regrep_display(regexp)
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
data/lib/unisec/properties.rb
CHANGED
@@ -50,7 +50,7 @@ module Unisec
|
|
50
50
|
def self.codepoints_display(prop)
|
51
51
|
codepoints = Properties.codepoints(prop)
|
52
52
|
codepoints.each do |cp|
|
53
|
-
puts "#{Properties.
|
53
|
+
puts "#{Properties.deccp2stdhexcp(cp[:codepoint]).ljust(7)} #{cp[:char].ljust(4)} #{cp[:name]}"
|
54
54
|
end
|
55
55
|
nil
|
56
56
|
end
|
@@ -158,7 +158,7 @@ module Unisec
|
|
158
158
|
# @example
|
159
159
|
# Unisec::Properties.char2codepoint('💎') # => "U+1F48E"
|
160
160
|
def self.char2codepoint(chr)
|
161
|
-
|
161
|
+
Properties.deccp2stdhexcp(chr.codepoints.first)
|
162
162
|
end
|
163
163
|
|
164
164
|
# Display the code points in Unicode format for the given characters (code points as string)
|
@@ -174,5 +174,14 @@ module Unisec
|
|
174
174
|
end
|
175
175
|
out.join(' ')
|
176
176
|
end
|
177
|
+
|
178
|
+
# Convert from decimal code point to standardized format hexadecimal code point
|
179
|
+
# @param int_cp [Integer] Code point in decimal format
|
180
|
+
# @return [String] code point in Unicode format
|
181
|
+
# @example
|
182
|
+
# Unisec::Properties.deccp2stdhexcp(128640) # => "U+1F680"
|
183
|
+
def self.deccp2stdhexcp(int_cp)
|
184
|
+
"U+#{format('%.4x', int_cp).upcase}"
|
185
|
+
end
|
177
186
|
end
|
178
187
|
end
|
@@ -0,0 +1,126 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'twitter_cldr'
|
4
|
+
require 'paint'
|
5
|
+
|
6
|
+
module Unisec
|
7
|
+
# Ruby grep : Ruby regular expression search for Unicode code point names
|
8
|
+
class Rugrep
|
9
|
+
# UCD Derived names file location
|
10
|
+
# @see https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedName.txt
|
11
|
+
UCD_DERIVEDNAME = File.join(__dir__, '../../data/DerivedName.txt')
|
12
|
+
|
13
|
+
# Search code points by (Ruby) regexp
|
14
|
+
# @param regexp [Regexp] Regular expression without delimiters or modifiers.
|
15
|
+
# Supports everything Ruby Regexp supports
|
16
|
+
# @return [Array<Hash>] Array of code points (`{char: String, codepoint: Integer, name: String}`)
|
17
|
+
# @example
|
18
|
+
# Unisec::Rugrep.regrep('snowman|snowflake')
|
19
|
+
# # =>
|
20
|
+
# # [{:char=>"☃", :codepoint=>9731, :name=>"SNOWMAN"},
|
21
|
+
# # {:char=>"⛄", :codepoint=>9924, :name=>"SNOWMAN WITHOUT SNOW"},
|
22
|
+
# # {:char=>"⛇", :codepoint=>9927, :name=>"BLACK SNOWMAN"},
|
23
|
+
# # {:char=>"❄", :codepoint=>10052, :name=>"SNOWFLAKE"},
|
24
|
+
# # {:char=>"❅", :codepoint=>10053, :name=>"TIGHT TRIFOLIATE SNOWFLAKE"},
|
25
|
+
# # {:char=>"❆", :codepoint=>10054, :name=>"HEAVY CHEVRON SNOWFLAKE"}]
|
26
|
+
# Unisec::Rugrep.regrep('greek small letter \w+')
|
27
|
+
# # =>
|
28
|
+
# # [{:char=>"ͱ", :codepoint=>881, :name=>"GREEK SMALL LETTER HETA"},
|
29
|
+
# # {:char=>"ͳ", :codepoint=>883, :name=>"GREEK SMALL LETTER ARCHAIC SAMPI"},
|
30
|
+
# # {:char=>"ͷ", :codepoint=>887, :name=>"GREEK SMALL LETTER PAMPHYLIAN DIGAMMA"},
|
31
|
+
# # …]
|
32
|
+
def self.regrep(regexp)
|
33
|
+
out = []
|
34
|
+
file = File.new(UCD_DERIVEDNAME)
|
35
|
+
file.each_line(chomp: true) do |line|
|
36
|
+
# Skip if the line is empty or a comment
|
37
|
+
next if line.empty? || line[0] == '#'
|
38
|
+
|
39
|
+
# parse the line to extract code point as integer and the name
|
40
|
+
cp_int, name = line.split(';')
|
41
|
+
cp_int = cp_int.chomp.to_i(16)
|
42
|
+
name.lstrip!
|
43
|
+
next unless /#{regexp}/i.match?(name) # compiling regexp once is surprisingly not faster
|
44
|
+
|
45
|
+
out << {
|
46
|
+
char: TwitterCldr::Utils::CodePoints.to_string([cp_int]),
|
47
|
+
codepoint: cp_int,
|
48
|
+
name: name
|
49
|
+
}
|
50
|
+
end
|
51
|
+
out
|
52
|
+
end
|
53
|
+
|
54
|
+
# Display a CLI-friendly output listing all code points corresponding to a regular expression.
|
55
|
+
# @example
|
56
|
+
# Unisec::Rugrep.regrep_display('snowman|snowflake')
|
57
|
+
# # =>
|
58
|
+
# # U+2603 ☃ SNOWMAN
|
59
|
+
# # U+26C4 ⛄ SNOWMAN WITHOUT SNOW
|
60
|
+
# # U+26C7 ⛇ BLACK SNOWMAN
|
61
|
+
# # U+2744 ❄ SNOWFLAKE
|
62
|
+
# # U+2745 ❅ TIGHT TRIFOLIATE SNOWFLAKE
|
63
|
+
# # U+2746 ❆ HEAVY CHEVRON SNOWFLAKE
|
64
|
+
def self.regrep_display(regexp)
|
65
|
+
codepoints = regrep(regexp)
|
66
|
+
codepoints.each do |cp|
|
67
|
+
puts "#{Properties.deccp2stdhexcp(cp[:codepoint]).ljust(7)} #{cp[:char].ljust(4)} #{cp[:name]}"
|
68
|
+
end
|
69
|
+
nil
|
70
|
+
end
|
71
|
+
|
72
|
+
# Returns the version of Unicode used in UCD local file (data/DerivedName.txt)
|
73
|
+
# @return [String] Unicode version
|
74
|
+
# @example
|
75
|
+
# Unisec::Rugrep.ucd_derivedname_version # => "15.1.0"
|
76
|
+
def self.ucd_derivedname_version
|
77
|
+
first_line = File.open(UCD_DERIVEDNAME, &:readline)
|
78
|
+
first_line.match(/-(\d+\.\d+\.\d+)\.txt/).captures.first
|
79
|
+
end
|
80
|
+
|
81
|
+
# Search code points by (Ruby) regexp
|
82
|
+
# @param regexp [Regexp] Regular expression without delimiters or modifiers
|
83
|
+
# @return [Array<Hash>] Array of code points (`{char: String, codepoint: Integer, name: String}`)
|
84
|
+
# @example
|
85
|
+
# Unisec::Rugrep.regrep_slow('snowman|snowflake')
|
86
|
+
# # =>
|
87
|
+
# # [{:char=>"☃", :codepoint=>9731, :name=>"SNOWMAN"},
|
88
|
+
# # {:char=>"⛄", :codepoint=>9924, :name=>"SNOWMAN WITHOUT SNOW"},
|
89
|
+
# # {:char=>"⛇", :codepoint=>9927, :name=>"BLACK SNOWMAN"},
|
90
|
+
# # {:char=>"❄", :codepoint=>10052, :name=>"SNOWFLAKE"},
|
91
|
+
# # {:char=>"❅", :codepoint=>10053, :name=>"TIGHT TRIFOLIATE SNOWFLAKE"},
|
92
|
+
# # {:char=>"❆", :codepoint=>10054, :name=>"HEAVY CHEVRON SNOWFLAKE"}]
|
93
|
+
# @note ⚠ This command is very time consuming (~ 1min) and unoptimized (executes one regexp per code point…)
|
94
|
+
def self.regrep_slow(regexp)
|
95
|
+
out = []
|
96
|
+
TwitterCldr::Shared::CodePoint.each do |cp|
|
97
|
+
next unless /#{regexp}/oi.match?(cp.name) # compiling regexp once is surprisingly not faster
|
98
|
+
|
99
|
+
out << {
|
100
|
+
char: TwitterCldr::Utils::CodePoints.to_string([cp.code_point]),
|
101
|
+
codepoint: cp.code_point,
|
102
|
+
name: cp.name
|
103
|
+
}
|
104
|
+
end
|
105
|
+
out
|
106
|
+
end
|
107
|
+
|
108
|
+
# Display a CLI-friendly output listing all code points corresponding to a regular expression.
|
109
|
+
# @example
|
110
|
+
# Unisec::Rugrep.regrep_display_slow('snowman|snowflake')
|
111
|
+
# # =>
|
112
|
+
# # U+2603 ☃ SNOWMAN
|
113
|
+
# # U+26C4 ⛄ SNOWMAN WITHOUT SNOW
|
114
|
+
# # U+26C7 ⛇ BLACK SNOWMAN
|
115
|
+
# # U+2744 ❄ SNOWFLAKE
|
116
|
+
# # U+2745 ❅ TIGHT TRIFOLIATE SNOWFLAKE
|
117
|
+
# # U+2746 ❆ HEAVY CHEVRON SNOWFLAKE
|
118
|
+
def self.regrep_display_slow(regexp)
|
119
|
+
codepoints = regrep_slow(regexp)
|
120
|
+
codepoints.each do |cp|
|
121
|
+
puts "#{Properties.deccp2stdhexcp(cp[:codepoint]).ljust(7)} #{cp[:char].ljust(4)} #{cp[:name]}"
|
122
|
+
end
|
123
|
+
nil
|
124
|
+
end
|
125
|
+
end
|
126
|
+
end
|
data/lib/unisec/version.rb
CHANGED
data/lib/unisec/versions.rb
CHANGED
@@ -51,6 +51,10 @@ module Unisec
|
|
51
51
|
unicodeconfusable_unicode: {
|
52
52
|
version: Unicode::Confusable::UNICODE_VERSION,
|
53
53
|
label: 'Unicode (unicode-confusable gem)'
|
54
|
+
},
|
55
|
+
ucd_derivedname: {
|
56
|
+
version: Unisec::Rugrep.ucd_derivedname_version,
|
57
|
+
label: 'UCD (data/DerivedName.txt)'
|
54
58
|
}
|
55
59
|
}
|
56
60
|
end
|
@@ -76,6 +80,7 @@ module Unisec
|
|
76
80
|
display.call(:twittercldr_icu)
|
77
81
|
display.call(:twittercldr_cldr)
|
78
82
|
display.call(:ruby_unicode_emoji)
|
83
|
+
display.call(:ucd_derivedname)
|
79
84
|
puts Paint["\nGems:", :underline]
|
80
85
|
display.call(:unisec)
|
81
86
|
display.call(:twittercldr)
|
data/lib/unisec.rb
CHANGED
@@ -2,9 +2,10 @@
|
|
2
2
|
|
3
3
|
require 'unisec/version'
|
4
4
|
|
5
|
-
require 'unisec/
|
5
|
+
require 'unisec/confusables'
|
6
6
|
require 'unisec/hexdump'
|
7
7
|
require 'unisec/properties'
|
8
|
-
require 'unisec/
|
9
|
-
require 'unisec/versions'
|
8
|
+
require 'unisec/rugrep'
|
10
9
|
require 'unisec/size'
|
10
|
+
require 'unisec/surrogates'
|
11
|
+
require 'unisec/versions'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: unisec
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Alexandre ZANNI
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-10-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ctf-party
|
@@ -86,7 +86,9 @@ dependencies:
|
|
86
86
|
- - "~>"
|
87
87
|
- !ruby/object:Gem::Version
|
88
88
|
version: '1.9'
|
89
|
-
description: Toolkit for security research manipulating Unicode
|
89
|
+
description: 'Toolkit for security research manipulating Unicode: confusables, homoglyphs,
|
90
|
+
hexdump, code point, UTF-8, UTF-16, UTF-32, properties, regexp search, size, grapheme,
|
91
|
+
surrogates, version, ICU, CLDR, UCD'
|
90
92
|
email: alexandre.zanni@europe.com
|
91
93
|
executables:
|
92
94
|
- unisec
|
@@ -95,17 +97,20 @@ extra_rdoc_files: []
|
|
95
97
|
files:
|
96
98
|
- LICENSE
|
97
99
|
- bin/unisec
|
100
|
+
- data/DerivedName.txt
|
98
101
|
- lib/unisec.rb
|
99
102
|
- lib/unisec/cli/cli.rb
|
100
103
|
- lib/unisec/cli/confusables.rb
|
101
104
|
- lib/unisec/cli/hexdump.rb
|
102
105
|
- lib/unisec/cli/properties.rb
|
106
|
+
- lib/unisec/cli/rugrep.rb
|
103
107
|
- lib/unisec/cli/size.rb
|
104
108
|
- lib/unisec/cli/surrogates.rb
|
105
109
|
- lib/unisec/cli/versions.rb
|
106
110
|
- lib/unisec/confusables.rb
|
107
111
|
- lib/unisec/hexdump.rb
|
108
112
|
- lib/unisec/properties.rb
|
113
|
+
- lib/unisec/rugrep.rb
|
109
114
|
- lib/unisec/size.rb
|
110
115
|
- lib/unisec/surrogates.rb
|
111
116
|
- lib/unisec/utils.rb
|
@@ -140,7 +145,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
140
145
|
- !ruby/object:Gem::Version
|
141
146
|
version: '0'
|
142
147
|
requirements: []
|
143
|
-
rubygems_version: 3.4.
|
148
|
+
rubygems_version: 3.4.10
|
144
149
|
signing_key:
|
145
150
|
specification_version: 4
|
146
151
|
summary: Unicode Security Toolkit
|