RubyGems - unicode-namecode - Versions diffs - 0.1.0 - Mend

unicode-namecode 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

checksums.yaml +7 -0
data/bin/unicode-namecode +243 -0
data/data/NameAliases.txt +575 -0
data/data/UnicodeData.txt +40116 -0
data/data/emoji-test.txt +5331 -0
data/data/unicode_trie.cache +0 -0
data/lib/unicode_namecode/aliases.rb +58 -0
data/lib/unicode_namecode/data_loader.rb +97 -0
data/lib/unicode_namecode/emoji.rb +68 -0
data/lib/unicode_namecode/fuzzy.rb +60 -0
data/lib/unicode_namecode/trie.rb +69 -0
data/lib/unicode_namecode.rb +104 -0
metadata +71 -0

data/data/unicode_trie.cache ADDED Viewed

Binary file

data/lib/unicode_namecode/aliases.rb ADDED Viewed

@@ -0,0 +1,58 @@
+# frozen_string_literal: true
module UnicodeNamecode
  # Lookup tables for the alternative character names listed in
  # NameAliases.txt (data lines look like "0000;NULL;control").
  #
  # The file is parsed lazily on the first query. Alias names are stored in
  # upper case, so every lookup is case-insensitive.
  module Aliases
    ALIASES_PATH = File.expand_path('../../../data/NameAliases.txt', __FILE__)
    # One data line: hex codepoint, alias name, alias type.
    ALIAS_LINE = /\A([0-9A-F]+);([^;]+);(.+)\z/

    @alias_to_codepoint = nil
    @codepoint_to_aliases = nil

    class << self
      attr_reader :alias_to_codepoint, :codepoint_to_aliases

      # Parse NameAliases.txt and (re)build both lookup tables.
      #
      # A missing data file is not an error: both tables are simply left
      # empty. Always returns nil.
      def load_aliases_data
        by_alias = {}
        by_codepoint = {}

        if File.exist?(ALIASES_PATH)
          File.foreach(ALIASES_PATH, encoding: 'UTF-8') do |raw_line|
            # strip also removes a trailing "\r", so CRLF checkouts of the
            # data file do not leak a carriage return into the alias type.
            line = raw_line.strip
            next if line.empty? || line.start_with?('#')

            fields = ALIAS_LINE.match(line)
            next unless fields

            codepoint = fields[1].to_i(16)
            alias_upper = fields[2].upcase
            by_alias[alias_upper] = codepoint
            (by_codepoint[codepoint] ||= []) << { name: alias_upper, type: fields[3] }
          end
        end

        # Publish the tables only once they are complete, so a reader never
        # observes a non-nil but still half-filled table.
        @alias_to_codepoint = by_alias
        @codepoint_to_aliases = by_codepoint
        nil
      end

      # Get the codepoint (Integer) for an alias name, or nil when the name
      # is not a known alias. The name is matched case-insensitively; nil is
      # treated like an unknown name.
      def codepoint_for_alias(alias_name)
        load_aliases_data unless @alias_to_codepoint
        @alias_to_codepoint[alias_name.to_s.upcase]
      end

      # Get all aliases for a codepoint as an array of
      # { name:, type: } hashes (empty when there are none).
      def aliases_for_codepoint(codepoint)
        load_aliases_data unless @codepoint_to_aliases
        @codepoint_to_aliases[codepoint] || []
      end

      # Check if a name is an alias (case-insensitive).
      def is_alias?(name)
        load_aliases_data unless @alias_to_codepoint
        @alias_to_codepoint.key?(name.to_s.upcase)
      end
    end
  end
end

data/lib/unicode_namecode/data_loader.rb ADDED Viewed

@@ -0,0 +1,97 @@
+# frozen_string_literal: true
+require 'fileutils'
+require 'fuzzy_match'
+require 'etc'
+require_relative 'trie'
module UnicodeNamecode
  # Builds and holds the in-memory lookup structures for official Unicode
  # names: the name Trie, the flat list of names, the fuzzy matcher and the
  # reverse codepoint -> name map.
  #
  # The Trie is read from a Marshal cache when a usable one exists; otherwise
  # UnicodeData.txt is parsed and the cache is (re)written on a best-effort
  # basis.
  module DataLoader
    DATA_PATH = File.expand_path('../../../data/UnicodeData.txt', __FILE__)
    CACHE_PATH = File.expand_path('../../../data/unicode_trie.cache', __FILE__)

    @trie = nil
    @all_names = nil
    @fuzzy = nil
    @codepoint_to_name = nil

    class << self
      attr_reader :trie, :all_names, :fuzzy, :codepoint_to_name

      # Main data loading method - handles cache loading and fresh parsing.
      #
      # Populates trie, all_names, fuzzy and codepoint_to_name. The return
      # value is not meaningful (always nil).
      def load_data
        @trie = load_cached_trie || build_trie
        @all_names = collect_all_names
        @fuzzy = FuzzyMatch.new(@all_names)
        # The reverse map is always derived from the Trie, so a cached load
        # and a fresh parse produce it the same way.
        @codepoint_to_name = {}
        collect_codepoint_to_name(@trie.instance_variable_get(:@root), "")
        nil
      end

      # Collect all Unicode names from the Trie for fuzzy matching
      def collect_all_names
        names = []
        collect_names_recursive(@trie.instance_variable_get(:@root), "", names)
        names
      end

      # Recursively traverse the Trie, appending every complete name found
      # at or below +node+ to +names+. +current+ is the text spelled so far.
      def collect_names_recursive(node, current, names)
        names << current if node.is_end
        node.children.each do |char, child|
          collect_names_recursive(child, current + char, names)
        end
      end

      # Build the reverse lookup map: codepoint -> Unicode name.
      # Writes into @codepoint_to_name, which must already be a Hash.
      def collect_codepoint_to_name(node, current)
        if node.is_end && node.codepoint
          @codepoint_to_name[node.codepoint] = current.upcase
        end
        node.children.each do |char, child|
          collect_codepoint_to_name(child, current + char)
        end
      end

      private

      # Return the Trie stored in the cache file, or nil when there is no
      # cache or it cannot be used (empty, truncated, written by an
      # incompatible version, unreadable). Returning nil makes load_data
      # fall back to parsing UnicodeData.txt instead of failing.
      def load_cached_trie
        return nil unless File.exist?(CACHE_PATH)

        # NOTE(security): Marshal.load can instantiate arbitrary objects.
        # This is only acceptable because CACHE_PATH is a file shipped with
        # or written by this gem; never point it at untrusted data.
        cached = File.open(CACHE_PATH, 'rb') { |io| Marshal.load(io) }
        cached.is_a?(Trie) ? cached : nil
      rescue TypeError, ArgumentError, EOFError, SystemCallError
        nil
      end

      # Parse UnicodeData.txt into a new Trie and try to cache it.
      def build_trie
        trie = Trie.new
        parse_unicode_data.each { |name, codepoint| trie.insert(name, codepoint) }
        write_cache(trie)
        trie
      end

      # Read UnicodeData.txt and return [NAME, codepoint] pairs in file order.
      #
      # The lines are split into chunks that are parsed in separate threads.
      # Each thread only builds and returns its own array (collected through
      # Thread#value), so no state is mutated from more than one thread.
      def parse_unicode_data
        lines = File.readlines(DATA_PATH)
        return [] if lines.empty? # each_slice(0) would raise

        n_threads = [Etc.nprocessors, 2].max
        chunk_size = (lines.size.to_f / n_threads).ceil
        lines
          .each_slice(chunk_size)
          .map { |chunk| Thread.new { parse_chunk(chunk) } }
          .flat_map(&:value)
      end

      # Turn raw "codepoint;name;..." lines into [NAME, codepoint] pairs.
      # Blank or malformed lines and placeholder names such as "<control>"
      # are skipped.
      def parse_chunk(lines)
        lines.filter_map do |line|
          codepoint_hex, name = line.chomp.split(';', 3)
          next if name.nil? || name.empty? || name.match?(/<.*>/)

          [name.upcase, codepoint_hex.to_i(16)]
        end
      end

      # Best-effort cache write. The data is written to a temporary file and
      # renamed into place so a concurrent reader never sees a partial cache.
      # Failures (for example a read-only gem directory) are ignored: the
      # Trie is already built in memory and the cache is only an optimisation.
      def write_cache(trie)
        tmp_path = "#{CACHE_PATH}.#{Process.pid}.tmp"
        File.open(tmp_path, 'wb') { |io| Marshal.dump(trie, io) }
        File.rename(tmp_path, CACHE_PATH)
      rescue SystemCallError, IOError
        FileUtils.rm_f(tmp_path)
        nil
      end
    end
  end
end

data/lib/unicode_namecode/emoji.rb ADDED Viewed

@@ -0,0 +1,68 @@
+# frozen_string_literal: true
module UnicodeNamecode
  # Bidirectional emoji lookup built from emoji-test.txt.
  #
  # Only "fully-qualified" entries are indexed. An emoji made of a single
  # codepoint is keyed/stored as an Integer; a multi-codepoint sequence is
  # keyed/stored as an Array of Integers. Data is parsed lazily on the first
  # query.
  module Emoji
    EMOJI_PATH = File.expand_path('../../../data/emoji-test.txt', __FILE__)
    # Matches lines like:
    # "1F60A ; fully-qualified # 😊 E1.0 smiling face with smiling eyes"
    # Captures: hex codepoint list, the emoji itself, its name.
    FULLY_QUALIFIED_LINE = /\A([0-9A-F ]+)\s*;\s*fully-qualified\s*#\s*(\S+)\s+E[0-9.]+\s+(.+)\z/

    @emoji_to_codepoints = nil
    @emoji_to_name = nil
    @codepoint_to_emoji = nil

    class << self
      attr_reader :emoji_to_codepoints, :emoji_to_name, :codepoint_to_emoji

      # Parse emoji-test.txt and (re)build the lookup tables.
      #
      # A missing data file is not an error: the tables are left empty.
      # Always returns nil.
      def load_emoji_data
        to_codepoints = {}
        to_name = {}
        to_emoji = {}

        if File.exist?(EMOJI_PATH)
          # The file contains the emoji themselves, so it must be read as
          # UTF-8 regardless of the process locale.
          File.foreach(EMOJI_PATH, encoding: 'UTF-8') do |raw_line|
            # strip also drops a trailing "\r" from CRLF files.
            line = raw_line.strip
            next if line.empty? || line.start_with?('#')

            fields = FULLY_QUALIFIED_LINE.match(line)
            next unless fields

            emoji = fields[2]
            key = lookup_key(fields[1].split.map { |hex| hex.to_i(16) })
            to_codepoints[emoji] = key
            to_name[emoji] = fields[3].upcase
            to_emoji[key] = emoji
          end
        end

        # Publish only complete tables.
        @emoji_to_codepoints = to_codepoints
        @emoji_to_name = to_name
        @codepoint_to_emoji = to_emoji
        nil
      end

      # Get the codepoint(s) for an emoji character: an Integer for a
      # single-codepoint emoji, an Array of Integers for a sequence, nil when
      # the emoji is unknown.
      def codepoint_for_emoji(emoji)
        load_emoji_data unless @emoji_to_codepoints
        @emoji_to_codepoints[emoji]
      end

      # Get the upper-cased name for an emoji character, or nil.
      def name_for_emoji(emoji)
        load_emoji_data unless @emoji_to_name
        @emoji_to_name[emoji]
      end

      # Get the emoji character for a codepoint or codepoint sequence.
      #
      # Accepts a single value or an Array; every element is converted with
      # to_i. A one-element Array is treated like the bare codepoint, because
      # single-codepoint emoji are indexed under an Integer key.
      def emoji_for_codepoint(codepoint_or_array)
        load_emoji_data unless @codepoint_to_emoji
        @codepoint_to_emoji[lookup_key(Array(codepoint_or_array).map(&:to_i))]
      end

      private

      # Table key for a list of codepoints: the bare Integer when there is
      # exactly one, the Array itself otherwise.
      def lookup_key(codepoints)
        codepoints.length == 1 ? codepoints.first : codepoints
      end
    end
  end
end

data/lib/unicode_namecode/fuzzy.rb ADDED Viewed

@@ -0,0 +1,60 @@
+# frozen_string_literal: true
+require 'fuzzy_match'
module UnicodeNamecode
  # Handles fuzzy matching for typo-tolerant Unicode name searches
  module Fuzzy
    # Find Unicode names similar to the given name using fuzzy matching.
    #
    # fuzzy_matcher must respond to find_all_with_score(needle, options) and
    # return an array whose entries start with [name, score, ...]; nil means
    # "no matcher available" and yields an empty result.
    # NOTE(review): the :limit option is passed through as in the original
    # code; whether the matcher honours it is not visible here, so the
    # result is always truncated locally as well.
    #
    # Returns at most +limit+ hashes of the form { name:, similarity: } whose
    # score is at least +similarity_threshold+, in the matcher's order.
    def self.fuzzy_search(fuzzy_matcher, name, limit = 5, similarity_threshold = 0.3)
      return [] unless fuzzy_matcher

      fuzzy_matcher
        .find_all_with_score(name.to_s.upcase, limit: limit * 2)
        .select { |_candidate, score| score >= similarity_threshold }
        .first(limit)
        .map { |candidate, score| { name: candidate, similarity: score } }
    end

    # Similarity of two strings in the range 0.0..1.0, defined as
    # 1 - (Levenshtein distance / length of the longer string).
    # Identical strings score 1.0; if exactly one string is empty the score
    # is 0.0.
    def self.calculate_similarity(str1, str2)
      return 1.0 if str1 == str2
      return 0.0 if str1.empty? || str2.empty?

      longest = [str1.length, str2.length].max
      1.0 - (levenshtein_distance(str1, str2).to_f / longest)
    end

    # Calculate the Levenshtein distance (number of single-character
    # insertions, deletions and substitutions) between two strings.
    #
    # Only the previous and the current row of the dynamic-programming table
    # are kept, and both strings are split into character arrays once, since
    # indexing a multibyte String by character position is not constant
    # time.
    def self.levenshtein_distance(str1, str2)
      source = str1.chars
      target = str2.chars

      # Row 0: turning "" into target[0, j] takes j insertions.
      previous = (0..target.length).to_a

      source.each_with_index do |source_char, i|
        # Column 0: turning source[0, i + 1] into "" takes i + 1 deletions.
        current = [i + 1]
        target.each_with_index do |target_char, j|
          current << if source_char == target_char
                       previous[j]
                     else
                       [
                         previous[j + 1], # deletion
                         current[j],      # insertion
                         previous[j]      # substitution
                       ].min + 1
                     end
        end
        previous = current
      end

      previous.last
    end
  end
end

data/lib/unicode_namecode/trie.rb ADDED Viewed

@@ -0,0 +1,69 @@
+# frozen_string_literal: true
module UnicodeNamecode
  # Trie (prefix tree) for efficient Unicode name lookups.
  #
  # Instances are serialised with Marshal by the data loader, so the class
  # names and the instance variable names (@root, @children, @codepoint,
  # @is_end) are part of the on-disk cache format and must not be renamed.
  class Trie
    def initialize
      @root = TrieNode.new
    end

    # Insert a Unicode name and its codepoint into the Trie.
    # Inserting an existing name overwrites its codepoint.
    def insert(name, codepoint)
      terminal = name.each_char.reduce(@root) do |node, char|
        node.children[char] ||= TrieNode.new
      end
      terminal.codepoint = codepoint
      terminal.is_end = true
    end

    # Find the exact codepoint for a Unicode name (nil when absent).
    # Matching is exact and case-sensitive.
    def find(name)
      find_node(name)&.codepoint
    end

    # Find all Unicode names that start with the given prefix.
    #
    # Returns up to +limit+ hashes of the form { name:, codepoint: } in
    # depth-first insertion order; an empty array when nothing matches.
    def prefix_search(prefix, limit = 100)
      start = find_node(prefix)
      return [] unless start

      matches = []
      collect_words(start, prefix, matches, limit)
      matches
    end

    private

    # Walk down from the root along +name+; nil as soon as a character has
    # no matching child. An empty name resolves to the root node.
    def find_node(name)
      name.each_char.reduce(@root) do |node, char|
        node.children[char] || (return nil)
      end
    end

    # Depth-first collection of complete words at or below +node+ into
    # +results+, never exceeding +limit+ entries.
    def collect_words(node, current_word, results, limit)
      return if results.length >= limit

      results << { name: current_word, codepoint: node.codepoint } if node.is_end
      node.children.each do |char, child_node|
        # Stop visiting siblings once the limit is reached instead of
        # descending into each of them only to return immediately.
        break if results.length >= limit

        collect_words(child_node, current_word + char, results, limit)
      end
    end
  end

  # Represents a single node in the Trie: child nodes keyed by character,
  # plus the codepoint when a complete name ends at this node.
  class TrieNode
    attr_accessor :children, :codepoint, :is_end

    def initialize
      @children = {}
      @codepoint = nil
      @is_end = false
    end
  end
end

data/lib/unicode_namecode.rb ADDED Viewed

@@ -0,0 +1,104 @@
+# frozen_string_literal: true
+require_relative 'unicode_namecode/trie'
+require_relative 'unicode_namecode/emoji'
+require_relative 'unicode_namecode/data_loader'
+require_relative 'unicode_namecode/fuzzy'
+require_relative 'unicode_namecode/aliases'
module UnicodeNamecode
  VERSION = "0.1.0"

  # Look up a Unicode character's codepoint by its official name or alias.
  #
  # The name is stripped and matched case-insensitively. Official names are
  # tried first, then aliases. Returns an Integer, or nil when nothing
  # matches (including nil or blank input).
  def self.codepoint(name)
    query = name.to_s.strip
    return nil if query.empty?

    DataLoader.load_data unless DataLoader.trie
    DataLoader.trie.find(query.upcase) || Aliases.codepoint_for_alias(query)
  end

  # Like codepoint, but returns Unicode format ("U+XXXX") or nil.
  def self.lookup(name)
    found = codepoint(name)
    found && format_codepoint(found)
  end

  # Get the official Unicode name of a character (the first character when a
  # longer string is given). Returns nil for nil/empty input or when the
  # codepoint has no entry in the name table.
  def self.of(character)
    found = codepoint_of(character)
    found && name_for_codepoint(found)
  end

  # Get the codepoint of a character (nil for nil or an empty string).
  def self.codepoint_of(character)
    return nil if character.nil? || character == ''

    character.ord
  end

  # Get the Unicode format ("U+XXXX") of a character (nil for nil or an
  # empty string).
  def self.unicode_of(character)
    found = codepoint_of(character)
    found && format_codepoint(found)
  end

  # Find all Unicode names that start with the given prefix
  # (case-insensitive). Returns up to +limit+ { name:, codepoint: } hashes.
  def self.prefix_search(prefix, limit = 100)
    DataLoader.load_data unless DataLoader.trie
    DataLoader.trie.prefix_search(prefix.to_s.upcase, limit)
  end

  # Find Unicode names similar to the given name (for typos/partial matches)
  def self.fuzzy_search(name, limit = 5, similarity_threshold = 0.3)
    DataLoader.load_data unless DataLoader.fuzzy
    Fuzzy.fuzzy_search(DataLoader.fuzzy, name, limit, similarity_threshold)
  end

  # === EMOJI API METHODS ===

  # Codepoint (Integer) or codepoint sequence (Array) for an emoji.
  def self.codepoint_for_emoji(emoji)
    Emoji.codepoint_for_emoji(emoji)
  end

  # Name recorded for an emoji character.
  def self.name_for_emoji(emoji)
    Emoji.name_for_emoji(emoji)
  end

  # Emoji character for a codepoint or codepoint sequence.
  def self.emoji_for_codepoint(codepoint_or_array)
    Emoji.emoji_for_codepoint(codepoint_or_array)
  end

  # === REVERSE LOOKUP METHODS ===

  # Official Unicode name for an Integer codepoint, or nil.
  def self.name_for_codepoint(codepoint)
    DataLoader.load_data unless DataLoader.codepoint_to_name
    DataLoader.codepoint_to_name[codepoint]
  end

  # === ALIAS API METHODS ===

  # Codepoint for an alias name, or nil.
  def self.codepoint_for_alias(alias_name)
    Aliases.codepoint_for_alias(alias_name)
  end

  # All aliases recorded for a codepoint.
  def self.aliases_for_codepoint(codepoint)
    Aliases.aliases_for_codepoint(codepoint)
  end

  # Whether the given name is a known alias.
  def self.is_alias?(name)
    Aliases.is_alias?(name)
  end

  # === UTILITY METHODS ===

  # Similarity of two strings (see Fuzzy.calculate_similarity).
  def self.calculate_similarity(str1, str2)
    Fuzzy.calculate_similarity(str1, str2)
  end

  # Levenshtein distance of two strings (see Fuzzy.levenshtein_distance).
  def self.levenshtein_distance(str1, str2)
    Fuzzy.levenshtein_distance(str1, str2)
  end

  # Render an Integer codepoint as "U+XXXX": upper-case hex, zero-padded to
  # at least four digits.
  def self.format_codepoint(codepoint)
    format('U+%04X', codepoint)
  end
  private_class_method :format_codepoint
end

metadata ADDED Viewed

@@ -0,0 +1,71 @@
+--- !ruby/object:Gem::Specification
+name: unicode-namecode
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Mikkal Mullen
+bindir: bin
+cert_chain: []
+date: 1980-01-02 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: fuzzy_match
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.0'
+description: Fast Unicode character lookups by name, alias, or emoji. Features Trie-based
+  searches, fuzzy matching, prefix search, and comprehensive CLI tools.
+email:
+- mikkalmp@gmail.com
+executables:
+- unicode-namecode
+extensions: []
+extra_rdoc_files: []
+files:
+- bin/unicode-namecode
+- data/NameAliases.txt
+- data/UnicodeData.txt
+- data/emoji-test.txt
+- data/unicode_trie.cache
+- lib/unicode_namecode.rb
+- lib/unicode_namecode/aliases.rb
+- lib/unicode_namecode/data_loader.rb
+- lib/unicode_namecode/emoji.rb
+- lib/unicode_namecode/fuzzy.rb
+- lib/unicode_namecode/trie.rb
+homepage: https://github.com/Aeroswift/unicode-namecode
+licenses:
+- MIT
+metadata:
+  homepage_uri: https://github.com/Aeroswift/unicode-namecode
+  source_code_uri: https://github.com/Aeroswift/unicode-namecode
+  changelog_uri: https://github.com/Aeroswift/unicode-namecode/blob/main/CHANGELOG.md
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: 3.0.0
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubygems_version: 3.6.9
+specification_version: 4
+summary: A powerful Ruby gem for Unicode character lookups with support for official
+  names, aliases, emojis, and fuzzy matching.
+test_files: []