RubyGems - unisec - Versions diffs - 0.0.2 → 0.0.3 - Mend

unisec 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

data/lib/unisec/cli/cli.rb CHANGED Viewed

@@ -1,11 +1,12 @@
 # frozen_string_literal: true
-require 'unisec/cli/surrogates'
+require 'unisec/cli/confusables'
 require 'unisec/cli/hexdump'
 require 'unisec/cli/properties'
-require 'unisec/cli/confusables'
-require 'unisec/cli/versions'
+require 'unisec/cli/rugrep'
 require 'unisec/cli/size'
+require 'unisec/cli/surrogates'
+require 'unisec/cli/versions'
 module Unisec
   # Module used to create the CLI for the executable
@@ -16,16 +17,17 @@ module Unisec
       # Mapping between the (sub-)commands as seen by the user
       # on the command-line interface and the CLI modules in the lib
-      register 'surrogates to', Surrogates::To
-      register 'surrogates from', Surrogates::From
-      register 'hexdump', Hexdump
-      register 'properties list', Properties::List
-      register 'properties codepoints', Properties::Codepoints
-      register 'properties char', Properties::Char
       register 'confusables list', Confusables::List
       register 'confusables randomize', Confusables::Randomize
-      register 'versions', Versions
+      register 'grep', Grep
+      register 'hexdump', Hexdump
+      register 'properties char', Properties::Char
+      register 'properties codepoints', Properties::Codepoints
+      register 'properties list', Properties::List
       register 'size', Size
+      register 'surrogates from', Surrogates::From
+      register 'surrogates to', Surrogates::To
+      register 'versions', Versions
     end
   end
 end

data/lib/unisec/cli/rugrep.rb ADDED Viewed

@@ -0,0 +1,33 @@
+# frozen_string_literal: true
+require 'dry/cli'
+require 'unisec'
+module Unisec
+  module CLI
+    module Commands
+      # CLI command `unisec grep` for the class {Unisec::Rugrep} from the lib.
+      #
+      # Example:
+      #
+      # ```plaintext
+      # $ unisec grep 'FRENCH \w+'
+      # U+20A3  ₣    FRENCH FRANC SIGN
+      # U+1F35F 🍟    FRENCH FRIES
+      # ```
+      class Grep < Dry::CLI::Command
+        desc 'Search for Unicode code point names by regular expression'
+        argument :regexp, required: true,
+                          desc: 'regular expression'
+        # Search Unicode code point names by regular expression and print the matches.
+        # @param regexp [String] Regular expression source, without delimiters or modifiers.
+        #   Supports everything Ruby Regexp supports
+        def call(regexp: nil, **)
+          puts Unisec::Rugrep.regrep_display(regexp)
+        end
+      end
+    end
+  end
+end

data/lib/unisec/properties.rb CHANGED Viewed

@@ -50,7 +50,7 @@ module Unisec
     def self.codepoints_display(prop)
       codepoints = Properties.codepoints(prop)
       codepoints.each do |cp|
-        puts "#{Properties.char2codepoint(cp[:char]).ljust(7)} #{cp[:char].ljust(4)} #{cp[:name]}"
+        puts "#{Properties.deccp2stdhexcp(cp[:codepoint]).ljust(7)} #{cp[:char].ljust(4)} #{cp[:name]}"
       end
       nil
     end
@@ -158,7 +158,7 @@ module Unisec
     # @example
     #   Unisec::Properties.char2codepoint('💎') # => "U+1F48E"
     def self.char2codepoint(chr)
-      "U+#{format('%.4x', chr.codepoints.first).upcase}"
+      Properties.deccp2stdhexcp(chr.codepoints.first)
     end
     # Display the code points in Unicode format for the given characters (code points as string)
@@ -174,5 +174,14 @@ module Unisec
       end
       out.join(' ')
     end
+    # Convert from decimal code point to standardized format hexadecimal code point
+    # @param int_cp [Integer] Code point in decimal format
+    # @return [String] code point in Unicode format
+    # @example
+    #   Unisec::Properties.deccp2stdhexcp(128640) # => "U+1F680"
+    def self.deccp2stdhexcp(int_cp)
+      "U+#{format('%.4x', int_cp).upcase}"
+    end
   end
 end

data/lib/unisec/rugrep.rb ADDED Viewed

@@ -0,0 +1,126 @@
+# frozen_string_literal: true
+require 'twitter_cldr'
+require 'paint'
+module Unisec
+  # Ruby grep: Ruby regular expression search for Unicode code point names
+  class Rugrep
+    # UCD Derived names file location
+    # @see https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedName.txt
+    UCD_DERIVEDNAME = File.join(__dir__, '../../data/DerivedName.txt')
+    # Search code points by (Ruby) regexp
+    # @param regexp [Regexp] Regular expression without delimiters or modifiers.
+    #   Supports everything Ruby Regexp supports
+    # @return [Array<Hash>] Array of code points (`{char: String, codepoint: Integer, name: String}`)
+    # @example
+    #   Unisec::Rugrep.regrep('snowman|snowflake')
+    #   # =>
+    #   # [{:char=>"☃", :codepoint=>9731, :name=>"SNOWMAN"},
+    #   #  {:char=>"⛄", :codepoint=>9924, :name=>"SNOWMAN WITHOUT SNOW"},
+    #   #  {:char=>"⛇", :codepoint=>9927, :name=>"BLACK SNOWMAN"},
+    #   #  {:char=>"❄", :codepoint=>10052, :name=>"SNOWFLAKE"},
+    #   #  {:char=>"❅", :codepoint=>10053, :name=>"TIGHT TRIFOLIATE SNOWFLAKE"},
+    #   #  {:char=>"❆", :codepoint=>10054, :name=>"HEAVY CHEVRON SNOWFLAKE"}]
+    #   Unisec::Rugrep.regrep('greek small letter \w+')
+    #   # =>
+    #   # [{:char=>"ͱ", :codepoint=>881, :name=>"GREEK SMALL LETTER HETA"},
+    #   #  {:char=>"ͳ", :codepoint=>883, :name=>"GREEK SMALL LETTER ARCHAIC SAMPI"},
+    #   #  {:char=>"ͷ", :codepoint=>887, :name=>"GREEK SMALL LETTER PAMPHYLIAN DIGAMMA"},
+    #   #  …]
+    def self.regrep(regexp)
+      out = []
+      file = File.new(UCD_DERIVEDNAME)
+      file.each_line(chomp: true) do |line|
+        # Skip if the line is empty or a comment
+        next if line.empty? || line[0] == '#'
+        # parse the line to extract code point as integer and the name
+        cp_int, name = line.split(';')
+        cp_int = cp_int.chomp.to_i(16)
+        name.lstrip!
+        next unless /#{regexp}/i.match?(name) # compiling regexp once is surprisingly not faster
+        out << {
+          char: TwitterCldr::Utils::CodePoints.to_string([cp_int]),
+          codepoint: cp_int,
+          name: name
+        }
+      end
+      out
+    end
+    # Display a CLI-friendly output listing all code points corresponding to a regular expression.
+    # @example
+    #   Unisec::Rugrep.regrep_display('snowman|snowflake')
+    #   # =>
+    #   # U+2603  ☃    SNOWMAN
+    #   # U+26C4  ⛄    SNOWMAN WITHOUT SNOW
+    #   # U+26C7  ⛇    BLACK SNOWMAN
+    #   # U+2744  ❄    SNOWFLAKE
+    #   # U+2745  ❅    TIGHT TRIFOLIATE SNOWFLAKE
+    #   # U+2746  ❆    HEAVY CHEVRON SNOWFLAKE
+    def self.regrep_display(regexp)
+      codepoints = regrep(regexp)
+      codepoints.each do |cp|
+        puts "#{Properties.deccp2stdhexcp(cp[:codepoint]).ljust(7)} #{cp[:char].ljust(4)} #{cp[:name]}"
+      end
+      nil
+    end
+    # Returns the Unicode version of the local UCD file (data/DerivedName.txt)
+    # @return [String] Unicode version
+    # @example
+    #   Unisec::Rugrep.ucd_derivedname_version # => "15.1.0"
+    def self.ucd_derivedname_version
+      first_line = File.open(UCD_DERIVEDNAME, &:readline)
+      first_line.match(/-(\d+\.\d+\.\d+)\.txt/).captures.first
+    end
+    # Search code points by (Ruby) regexp
+    # @param regexp [Regexp] Regular expression without delimiters or modifiers
+    # @return [Array<Hash>] Array of code points (`{char: String, codepoint: Integer, name: String}`)
+    # @example
+    #   Unisec::Rugrep.regrep_slow('snowman|snowflake')
+    #   # =>
+    #   # [{:char=>"☃", :codepoint=>9731, :name=>"SNOWMAN"},
+    #   #  {:char=>"⛄", :codepoint=>9924, :name=>"SNOWMAN WITHOUT SNOW"},
+    #   #  {:char=>"⛇", :codepoint=>9927, :name=>"BLACK SNOWMAN"},
+    #   #  {:char=>"❄", :codepoint=>10052, :name=>"SNOWFLAKE"},
+    #   #  {:char=>"❅", :codepoint=>10053, :name=>"TIGHT TRIFOLIATE SNOWFLAKE"},
+    #   #  {:char=>"❆", :codepoint=>10054, :name=>"HEAVY CHEVRON SNOWFLAKE"}]
+    # @note ⚠ This method is very time-consuming (~1 min) and unoptimized (it executes one regexp match per code point…)
+    def self.regrep_slow(regexp)
+      out = []
+      TwitterCldr::Shared::CodePoint.each do |cp|
+        next unless /#{regexp}/oi.match?(cp.name) # compiling regexp once is surprisingly not faster
+        out << {
+          char: TwitterCldr::Utils::CodePoints.to_string([cp.code_point]),
+          codepoint: cp.code_point,
+          name: cp.name
+        }
+      end
+      out
+    end
+    # Display a CLI-friendly output listing all code points corresponding to a regular expression.
+    # @example
+    #   Unisec::Rugrep.regrep_display_slow('snowman|snowflake')
+    #   # =>
+    #   # U+2603  ☃    SNOWMAN
+    #   # U+26C4  ⛄    SNOWMAN WITHOUT SNOW
+    #   # U+26C7  ⛇    BLACK SNOWMAN
+    #   # U+2744  ❄    SNOWFLAKE
+    #   # U+2745  ❅    TIGHT TRIFOLIATE SNOWFLAKE
+    #   # U+2746  ❆    HEAVY CHEVRON SNOWFLAKE
+    def self.regrep_display_slow(regexp)
+      codepoints = regrep_slow(regexp)
+      codepoints.each do |cp|
+        puts "#{Properties.deccp2stdhexcp(cp[:codepoint]).ljust(7)} #{cp[:char].ljust(4)} #{cp[:name]}"
+      end
+      nil
+    end
+  end
+end

data/lib/unisec/version.rb CHANGED Viewed

@@ -2,5 +2,5 @@
 module Unisec
   # Version of unisec library and app
-  VERSION = '0.0.2'
+  VERSION = '0.0.3'
 end

data/lib/unisec/versions.rb CHANGED Viewed

@@ -51,6 +51,10 @@ module Unisec
         unicodeconfusable_unicode: {
           version: Unicode::Confusable::UNICODE_VERSION,
           label: 'Unicode (unicode-confusable gem)'
+        },
+        ucd_derivedname: {
+          version: Unisec::Rugrep.ucd_derivedname_version,
+          label: 'UCD (data/DerivedName.txt)'
         }
       }
     end
@@ -76,6 +80,7 @@ module Unisec
       display.call(:twittercldr_icu)
       display.call(:twittercldr_cldr)
       display.call(:ruby_unicode_emoji)
+      display.call(:ucd_derivedname)
       puts Paint["\nGems:", :underline]
       display.call(:unisec)
       display.call(:twittercldr)

data/lib/unisec.rb CHANGED Viewed

@@ -2,9 +2,10 @@
 require 'unisec/version'
-require 'unisec/surrogates'
+require 'unisec/confusables'
 require 'unisec/hexdump'
 require 'unisec/properties'
-require 'unisec/confusables'
-require 'unisec/versions'
+require 'unisec/rugrep'
 require 'unisec/size'
+require 'unisec/surrogates'
+require 'unisec/versions'

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: unisec
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.3
 platform: ruby
 authors:
 - Alexandre ZANNI
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-08-18 00:00:00.000000000 Z
+date: 2023-10-18 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: ctf-party
@@ -86,7 +86,9 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '1.9'
-description: Toolkit for security research manipulating Unicode
+description: 'Toolkit for security research manipulating Unicode: confusables, homoglyphs,
+  hexdump, code point, UTF-8, UTF-16, UTF-32, properties, regexp search, size, grapheme,
+  surrogates, version, ICU, CLDR, UCD'
 email: alexandre.zanni@europe.com
 executables:
 - unisec
@@ -95,17 +97,20 @@ extra_rdoc_files: []
 files:
 - LICENSE
 - bin/unisec
+- data/DerivedName.txt
 - lib/unisec.rb
 - lib/unisec/cli/cli.rb
 - lib/unisec/cli/confusables.rb
 - lib/unisec/cli/hexdump.rb
 - lib/unisec/cli/properties.rb
+- lib/unisec/cli/rugrep.rb
 - lib/unisec/cli/size.rb
 - lib/unisec/cli/surrogates.rb
 - lib/unisec/cli/versions.rb
 - lib/unisec/confusables.rb
 - lib/unisec/hexdump.rb
 - lib/unisec/properties.rb
+- lib/unisec/rugrep.rb
 - lib/unisec/size.rb
 - lib/unisec/surrogates.rb
 - lib/unisec/utils.rb
@@ -140,7 +145,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.4.1
+rubygems_version: 3.4.10
 signing_key:
 specification_version: 4
 summary: Unicode Security Toolkit