unicode-namecode 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Binary file
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
module UnicodeNamecode
  # Handles Unicode name aliases (alternative character names).
  #
  # The lookup tables are built lazily from the file at ALIASES_PATH the first
  # time one of the query methods is called.
  module Aliases
    ALIASES_PATH = File.expand_path('../../../data/NameAliases.txt', __FILE__)

    # Matches one (already trimmed) data row such as "0000;NULL;control".
    ALIAS_LINE = /\A(?<codepoint>[0-9A-F]+);(?<name>[^;]+);(?<type>.+)\z/.freeze
    private_constant :ALIAS_LINE

    @alias_to_codepoint = nil
    @codepoint_to_aliases = nil

    class << self
      # alias_to_codepoint:   Hash of upper-cased alias name => Integer codepoint
      # codepoint_to_aliases: Hash of Integer codepoint => Array of
      #                       { name: String, type: String }
      # Both are nil until load_aliases_data has run.
      attr_reader :alias_to_codepoint, :codepoint_to_aliases

      # Parse the alias file and (re)build both lookup tables.
      #
      # A missing file is not an error: both tables are simply left empty.
      # Rows are trimmed before matching so files with CRLF line endings do not
      # leak a trailing "\r" into the alias type. The tables are built in
      # locals and only published once complete, so a failure while reading
      # never leaves half-filled tables behind.
      #
      # @return [nil]
      def load_aliases_data
        by_alias = {}
        by_codepoint = {}

        if File.exist?(ALIASES_PATH)
          File.foreach(ALIASES_PATH) do |raw_line|
            line = raw_line.strip
            next if line.empty? || line.start_with?('#')

            row = ALIAS_LINE.match(line)
            next unless row

            codepoint = row[:codepoint].to_i(16)
            alias_upper = row[:name].strip.upcase

            # When the same alias appears twice, the later row wins.
            by_alias[alias_upper] = codepoint
            (by_codepoint[codepoint] ||= []) << { name: alias_upper, type: row[:type].strip }
          end
        end

        @alias_to_codepoint = by_alias
        @codepoint_to_aliases = by_codepoint
        nil
      end

      # Get the codepoint for an alias name.
      #
      # @param alias_name [#to_s] alias to look up; case and surrounding
      #   whitespace are ignored
      # @return [Integer, nil] the codepoint, or nil when the alias is unknown
      def codepoint_for_alias(alias_name)
        load_aliases_data unless @alias_to_codepoint
        @alias_to_codepoint[normalize(alias_name)]
      end

      # Get all aliases for a codepoint.
      #
      # @param codepoint [Integer]
      # @return [Array<Hash>] { name:, type: } entries in file order; an empty
      #   array when the codepoint has no aliases
      def aliases_for_codepoint(codepoint)
        load_aliases_data unless @codepoint_to_aliases
        @codepoint_to_aliases[codepoint] || []
      end

      # Check if a name is a known alias.
      #
      # @param name [#to_s] case and surrounding whitespace are ignored
      # @return [Boolean]
      def is_alias?(name)
        load_aliases_data unless @alias_to_codepoint
        @alias_to_codepoint.key?(normalize(name))
      end

      private

      # Canonical form used for the keys of alias_to_codepoint.
      def normalize(name)
        name.to_s.strip.upcase
      end
    end
  end
end
@@ -0,0 +1,97 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'fileutils'
4
+ require 'fuzzy_match'
5
+ require 'etc'
6
+ require_relative 'trie'
7
+
8
module UnicodeNamecode
  # Loads the Unicode name data: restores a marshalled Trie from CACHE_PATH
  # when possible, otherwise parses DATA_PATH and writes a fresh cache.
  module DataLoader
    DATA_PATH = File.expand_path('../../../data/UnicodeData.txt', __FILE__)
    CACHE_PATH = File.expand_path('../../../data/unicode_trie.cache', __FILE__)
    @trie = nil
    @all_names = nil
    @fuzzy = nil
    @codepoint_to_name = nil

    class << self
      # trie:              the Trie of upper-cased names
      # all_names:         Array of every name stored in the Trie
      # fuzzy:             FuzzyMatch instance built over all_names
      # codepoint_to_name: Hash of Integer codepoint => upper-cased name
      # All four are nil until load_data has completed successfully.
      attr_reader :trie, :all_names, :fuzzy, :codepoint_to_name

      # Main data loading method - handles cache loading and fresh parsing.
      #
      # Everything is built in locals and published only at the very end, with
      # @trie last. Callers use `DataLoader.trie` as their "already loaded"
      # flag, so a failure part-way through (for example a missing data file)
      # must not leave an empty Trie behind that suppresses every later retry.
      #
      # @return [nil]
      # @raise [SystemCallError] when there is no usable cache and DATA_PATH
      #   cannot be read
      def load_data
        trie = load_cached_trie
        unless trie
          trie = build_trie
          write_cache(trie)
        end

        names = []
        names_by_codepoint = {}
        index_trie(trie.instance_variable_get(:@root), '', names, names_by_codepoint)
        matcher = FuzzyMatch.new(names)

        @all_names = names
        @codepoint_to_name = names_by_codepoint
        @fuzzy = matcher
        @trie = trie
        nil
      end

      # Collect all Unicode names from the currently loaded Trie.
      #
      # @return [Array<String>]
      def collect_all_names
        names = []
        collect_names_recursive(@trie.instance_variable_get(:@root), '', names)
        names
      end

      # Recursively traverse the Trie below +node+, appending every complete
      # name to +names+. +current+ is the text spelled out on the way to +node+.
      def collect_names_recursive(node, current, names)
        names << current if node.is_end
        node.children.each do |char, child|
          collect_names_recursive(child, current + char, names)
        end
      end

      # Recursively fill @codepoint_to_name (which must already be a Hash) with
      # codepoint => name pairs found below +node+.
      def collect_codepoint_to_name(node, current)
        @codepoint_to_name[node.codepoint] = current.upcase if node.is_end && node.codepoint
        node.children.each do |char, child|
          collect_codepoint_to_name(child, current + char)
        end
      end

      private

      # Restore the Trie from CACHE_PATH.
      #
      # SECURITY NOTE(review): Marshal.load can instantiate arbitrary objects
      # and must only ever be pointed at a trusted file. Confirm that CACHE_PATH
      # cannot be written by untrusted parties in your deployment.
      #
      # @return [Trie, nil] nil when the cache is absent, truncated, corrupt or
      #   does not contain a Trie, so that the caller rebuilds from DATA_PATH
      def load_cached_trie
        return nil unless File.exist?(CACHE_PATH)

        cached = File.open(CACHE_PATH, 'rb') { |io| Marshal.load(io) }
        cached.is_a?(Trie) ? cached : nil
      rescue TypeError, ArgumentError, EOFError
        nil
      end

      # Persist +trie+ for faster future loads. The cache is only an
      # optimisation, so an unwritable location (e.g. a read-only gem
      # directory) is ignored instead of aborting the load.
      def write_cache(trie)
        File.open(CACHE_PATH, 'wb') { |io| Marshal.dump(trie, io) }
      rescue SystemCallError
        nil
      end

      # Build a new Trie from the [name, codepoint] pairs parsed from DATA_PATH.
      def build_trie
        trie = Trie.new
        parse_unicode_data.each { |name, codepoint| trie.insert(name, codepoint) }
        trie
      end

      # Parse DATA_PATH into [upper-cased name, codepoint] pairs, in file order.
      #
      # The rows are split into chunks that are parsed in separate threads.
      # Each thread returns its own array through Thread#value; no state is
      # shared between the workers.
      def parse_unicode_data
        lines = File.readlines(DATA_PATH)
        return [] if lines.empty? # each_slice(0) would raise

        n_threads = [Etc.nprocessors, 2].max
        chunk_size = (lines.size.to_f / n_threads).ceil

        lines.each_slice(chunk_size)
             .map { |chunk| Thread.new { parse_chunk(chunk) } }
             .flat_map(&:value)
      end

      # Turn raw "HEX;NAME;..." rows into [NAME, codepoint] pairs. Rows without
      # a name field and rows whose name contains a <...> placeholder (such as
      # "<control>") are skipped.
      def parse_chunk(chunk)
        chunk.filter_map do |line|
          codepoint_hex, name = line.chomp.split(';', 3)
          next if name.nil? || name.empty? || name.match?(/<.*>/)

          [name.upcase, codepoint_hex.to_i(16)]
        end
      end

      # Walk the Trie once, collecting every complete name into +names+ and
      # every codepoint => name pair into +names_by_codepoint+.
      def index_trie(node, prefix, names, names_by_codepoint)
        if node.is_end
          names << prefix
          names_by_codepoint[node.codepoint] = prefix.upcase if node.codepoint
        end
        node.children.each do |char, child|
          index_trie(child, prefix + char, names, names_by_codepoint)
        end
      end
    end
  end
end
@@ -0,0 +1,68 @@
1
+ # frozen_string_literal: true
2
+
3
module UnicodeNamecode
  # Handles emoji-related functionality and bidirectional lookup between emoji
  # strings, their codepoints and their names.
  #
  # The lookup tables are built lazily from the file at EMOJI_PATH the first
  # time one of the query methods is called.
  module Emoji
    EMOJI_PATH = File.expand_path('../../../data/emoji-test.txt', __FILE__)

    # Matches one (already trimmed) row such as
    # "1F60A ; fully-qualified # <emoji> E1.0 smiling face with smiling eyes".
    # Captures: hex codepoint list, the emoji itself, the name.
    FULLY_QUALIFIED_LINE =
      /\A([0-9A-F ]+)\s*;\s*fully-qualified\s*#\s*(\S+)\s+E[0-9.]+\s+(.+)\z/.freeze
    private_constant :FULLY_QUALIFIED_LINE

    @emoji_to_codepoints = nil
    @emoji_to_name = nil
    @codepoint_to_emoji = nil

    class << self
      # emoji_to_codepoints: emoji String => Integer (single codepoint) or
      #                      Array<Integer> (sequence)
      # emoji_to_name:       emoji String => upper-cased name
      # codepoint_to_emoji:  Integer or Array<Integer> => emoji String
      # All three are nil until load_emoji_data has run.
      attr_reader :emoji_to_codepoints, :emoji_to_name, :codepoint_to_emoji

      # Parse the emoji data file and (re)build the lookup tables.
      #
      # Only rows marked "fully-qualified" are indexed. A missing file is not
      # an error: the tables are simply left empty.
      #
      # The file is read explicitly as UTF-8: under a C/POSIX locale Ruby's
      # default external encoding is US-ASCII, and emoji read that way could
      # neither be matched nor compared with UTF-8 strings supplied by callers.
      # Rows are trimmed first so CRLF files do not leave "\r" in the names.
      #
      # @return [nil]
      def load_emoji_data
        to_codepoints = {}
        to_name = {}
        to_emoji = {}

        if File.exist?(EMOJI_PATH)
          File.foreach(EMOJI_PATH, encoding: 'UTF-8') do |raw_line|
            line = raw_line.strip
            next if line.empty? || line.start_with?('#')

            row = FULLY_QUALIFIED_LINE.match(line)
            next unless row

            codepoints_hex, emoji, name = row.captures
            codepoints = codepoints_hex.split.map { |hex| hex.to_i(16) }

            # A lone codepoint is stored as a bare Integer, a sequence as an Array.
            key = codepoints.length == 1 ? codepoints.first : codepoints

            to_codepoints[emoji] = key
            to_name[emoji] = name.upcase
            to_emoji[key] = emoji
          end
        end

        @emoji_to_codepoints = to_codepoints
        @emoji_to_name = to_name
        @codepoint_to_emoji = to_emoji
        nil
      end

      # Get the codepoint(s) for an emoji character.
      #
      # @param emoji [String]
      # @return [Integer, Array<Integer>, nil] nil when the emoji is unknown
      def codepoint_for_emoji(emoji)
        load_emoji_data unless @emoji_to_codepoints
        @emoji_to_codepoints[emoji]
      end

      # Get the upper-cased name recorded for an emoji character.
      #
      # @param emoji [String]
      # @return [String, nil] nil when the emoji is unknown
      def name_for_emoji(emoji)
        load_emoji_data unless @emoji_to_name
        @emoji_to_name[emoji]
      end

      # Get the emoji character for a codepoint or codepoint sequence.
      #
      # A one-element Array is treated like the bare codepoint it contains,
      # because single-codepoint emoji are indexed by Integer.
      #
      # @param codepoint_or_array [#to_i, Array<#to_i>]
      # @return [String, nil] nil when nothing matches
      def emoji_for_codepoint(codepoint_or_array)
        load_emoji_data unless @codepoint_to_emoji
        return @codepoint_to_emoji[codepoint_or_array.to_i] unless codepoint_or_array.is_a?(Array)

        sequence = codepoint_or_array.map(&:to_i)
        @codepoint_to_emoji[sequence.length == 1 ? sequence.first : sequence]
      end
    end
  end
end
@@ -0,0 +1,60 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'fuzzy_match'
4
+
5
module UnicodeNamecode
  # Handles fuzzy matching for typo-tolerant Unicode name searches
  module Fuzzy
    # Find Unicode names similar to the given name using fuzzy matching.
    #
    # @param fuzzy_matcher [#find_all_with_score, nil] object answering
    #   find_all_with_score(needle, limit:) with rows whose first two elements
    #   are the matched record and its score (presumably a FuzzyMatch
    #   instance - verify against the caller)
    # @param name [#to_s] text to search for; it is upper-cased before matching
    # @param limit [Integer] maximum number of results; twice as many
    #   candidates are requested so threshold filtering can still fill the list
    # @param similarity_threshold [Numeric] minimum score a candidate must reach
    # @return [Array<Hash>] { name:, similarity: } entries in matcher order;
    #   empty when no matcher is given
    def self.fuzzy_search(fuzzy_matcher, name, limit = 5, similarity_threshold = 0.3)
      return [] unless fuzzy_matcher

      fuzzy_matcher
        .find_all_with_score(name.to_s.upcase, limit: limit * 2)
        .select { |_record, score| score >= similarity_threshold }
        .first(limit)
        .map { |record, score| { name: record, similarity: score } }
    end

    # Calculate similarity between two strings using Levenshtein distance.
    #
    # @param str1 [String]
    # @param str2 [String]
    # @return [Float] 1.0 for identical strings, 0.0 when exactly one of them
    #   is empty, otherwise 1 - distance / length of the longer string
    def self.calculate_similarity(str1, str2)
      return 1.0 if str1 == str2
      return 0.0 if str1.empty? || str2.empty?

      longest = [str1.length, str2.length].max
      1.0 - (levenshtein_distance(str1, str2).to_f / longest)
    end

    # Calculate the Levenshtein distance between two strings.
    #
    # Only the previous and the current row of the dynamic-programming table
    # are kept, and both strings are split into characters once up front so
    # the inner loop never has to index into a String.
    #
    # @param str1 [String]
    # @param str2 [String]
    # @return [Integer] number of single-character insertions, deletions and
    #   substitutions needed to turn str1 into str2
    def self.levenshtein_distance(str1, str2)
      source = str1.chars
      target = str2.chars
      return target.length if source.empty?
      return source.length if target.empty?

      # previous[j] is the distance between the processed prefix of source and
      # the first j characters of target.
      previous = (0..target.length).to_a

      source.each_with_index do |source_char, row|
        current = [row + 1]
        target.each_with_index do |target_char, col|
          current << if source_char == target_char
                       previous[col]
                     else
                       [
                         previous[col + 1], # deletion
                         current[col],      # insertion
                         previous[col]      # substitution
                       ].min + 1
                     end
        end
        previous = current
      end

      previous.last
    end
  end
end
@@ -0,0 +1,69 @@
1
+ # frozen_string_literal: true
2
+
3
module UnicodeNamecode
  # Trie (prefix tree) for efficient Unicode name lookups.
  #
  # Instances are serialised with Marshal by the data loader, so the instance
  # variable names of Trie and TrieNode are part of the on-disk cache format
  # and are deliberately left unchanged.
  class Trie
    def initialize
      @root = TrieNode.new
    end

    # Insert a Unicode name and its codepoint into the Trie. Inserting a name
    # that is already present overwrites its codepoint.
    #
    # @param name [String]
    # @param codepoint [Integer]
    def insert(name, codepoint)
      node = @root
      name.each_char do |char|
        node = (node.children[char] ||= TrieNode.new)
      end
      node.codepoint = codepoint
      node.is_end = true
    end

    # Find the exact codepoint for a Unicode name.
    #
    # @param name [String]
    # @return [Integer, nil] nil when the name is absent or is only a prefix of
    #   longer names
    def find(name)
      find_node(name)&.codepoint
    end

    # Find all Unicode names that start with the given prefix.
    #
    # @param prefix [String]
    # @param limit [Integer] maximum number of results
    # @return [Array<Hash>] { name:, codepoint: } entries in depth-first order,
    #   siblings in insertion order
    def prefix_search(prefix, limit = 100)
      start = find_node(prefix)
      return [] unless start

      results = []
      collect_words(start, prefix, results, limit)
      results
    end

    private

    # Walk down from the root along +name+; nil as soon as a character has no
    # matching child.
    def find_node(name)
      node = @root
      name.each_char do |char|
        node = node.children[char]
        return nil unless node
      end
      node
    end

    # Depth-first collection of complete names below +node+. The limit is
    # re-checked before every child so that, once it is reached, the walk
    # unwinds immediately instead of visiting (and building strings for) the
    # remaining siblings at every level.
    def collect_words(node, current_word, results, limit)
      return if results.length >= limit

      results << { name: current_word, codepoint: node.codepoint } if node.is_end

      node.children.each do |char, child_node|
        break if results.length >= limit

        collect_words(child_node, current_word + char, results, limit)
      end
    end
  end

  # Represents a single node in the Trie
  class TrieNode
    # children:  Hash of single character => TrieNode
    # codepoint: codepoint stored for the name ending here, or nil
    # is_end:    true when a complete name ends at this node
    attr_accessor :children, :codepoint, :is_end

    def initialize
      @children = {}
      @codepoint = nil
      @is_end = false
    end
  end
end
@@ -0,0 +1,104 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'unicode_namecode/trie'
4
+ require_relative 'unicode_namecode/emoji'
5
+ require_relative 'unicode_namecode/data_loader'
6
+ require_relative 'unicode_namecode/fuzzy'
7
+ require_relative 'unicode_namecode/aliases'
8
+
9
# Public entry points of the gem. Every method delegates to DataLoader,
# Aliases, Emoji or Fuzzy; the underlying data is loaded lazily on first use.
module UnicodeNamecode
  VERSION = "0.1.0"

  # Look up a Unicode character's codepoint by its official name or alias.
  # Case and surrounding whitespace are ignored; an exact name match takes
  # precedence over an alias.
  #
  # @param name [String]
  # @return [Integer, nil] nil when neither a name nor an alias matches
  def self.codepoint(name)
    DataLoader.load_data unless DataLoader.trie

    query = name.strip
    DataLoader.trie.find(query.upcase) || Aliases.codepoint_for_alias(query)
  end

  # Like .codepoint, but returns the result in Unicode notation (U+XXXX).
  #
  # @param name [String]
  # @return [String, nil] e.g. "U+0041"; nil when the name is unknown
  def self.lookup(name)
    found = codepoint(name)
    return nil unless found

    format_codepoint(found)
  end

  # Get the Unicode name of a character. String#ord is used, so only the
  # first character of a longer string is considered.
  #
  # @param character [String]
  # @return [String, nil]
  def self.of(character)
    name_for_codepoint(character.ord)
  end

  # Get the codepoint of a character (first character of the string).
  #
  # @param character [String]
  # @return [Integer]
  def self.codepoint_of(character)
    character.ord
  end

  # Get the Unicode notation (U+XXXX) of a character.
  #
  # @param character [String]
  # @return [String] e.g. "U+1F60A"
  def self.unicode_of(character)
    format_codepoint(character.ord)
  end

  # Find all Unicode names that start with the given prefix (case-insensitive).
  #
  # @param prefix [String]
  # @param limit [Integer] maximum number of results
  # @return [Array] whatever the Trie's prefix_search returns
  def self.prefix_search(prefix, limit = 100)
    DataLoader.load_data unless DataLoader.trie
    DataLoader.trie.prefix_search(prefix.upcase, limit)
  end

  # Find Unicode names similar to the given name (for typos/partial matches).
  #
  # @param name [String]
  # @param limit [Integer] maximum number of results
  # @param similarity_threshold [Numeric] minimum score a match must reach
  def self.fuzzy_search(name, limit = 5, similarity_threshold = 0.3)
    DataLoader.load_data unless DataLoader.fuzzy
    Fuzzy.fuzzy_search(DataLoader.fuzzy, name, limit, similarity_threshold)
  end

  # === EMOJI API METHODS ===

  # Codepoint(s) of an emoji; delegates to Emoji.codepoint_for_emoji.
  def self.codepoint_for_emoji(emoji)
    Emoji.codepoint_for_emoji(emoji)
  end

  # Name of an emoji; delegates to Emoji.name_for_emoji.
  def self.name_for_emoji(emoji)
    Emoji.name_for_emoji(emoji)
  end

  # Emoji for a codepoint or codepoint sequence; delegates to
  # Emoji.emoji_for_codepoint.
  def self.emoji_for_codepoint(codepoint_or_array)
    Emoji.emoji_for_codepoint(codepoint_or_array)
  end

  # === REVERSE LOOKUP METHODS ===

  # Name recorded for a codepoint.
  #
  # @param codepoint [Integer]
  # @return [String, nil]
  def self.name_for_codepoint(codepoint)
    DataLoader.load_data unless DataLoader.codepoint_to_name
    DataLoader.codepoint_to_name[codepoint]
  end

  # === ALIAS API METHODS ===

  # Codepoint for an alias; delegates to Aliases.codepoint_for_alias.
  def self.codepoint_for_alias(alias_name)
    Aliases.codepoint_for_alias(alias_name)
  end

  # Aliases of a codepoint; delegates to Aliases.aliases_for_codepoint.
  def self.aliases_for_codepoint(codepoint)
    Aliases.aliases_for_codepoint(codepoint)
  end

  # Whether a name is an alias; delegates to Aliases.is_alias?.
  def self.is_alias?(name)
    Aliases.is_alias?(name)
  end

  # === UTILITY METHODS ===

  # Similarity of two strings; delegates to Fuzzy.calculate_similarity.
  def self.calculate_similarity(str1, str2)
    Fuzzy.calculate_similarity(str1, str2)
  end

  # Edit distance of two strings; delegates to Fuzzy.levenshtein_distance.
  def self.levenshtein_distance(str1, str2)
    Fuzzy.levenshtein_distance(str1, str2)
  end

  # Render a codepoint as "U+" followed by upper-case hex, zero-padded to at
  # least four digits. Shared by .lookup and .unicode_of.
  def self.format_codepoint(codepoint)
    format('U+%04X', codepoint)
  end
  private_class_method :format_codepoint
end
metadata ADDED
@@ -0,0 +1,71 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: unicode-namecode
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Mikkal Mullen
8
+ bindir: bin
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: fuzzy_match
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - "~>"
17
+ - !ruby/object:Gem::Version
18
+ version: '2.0'
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - "~>"
24
+ - !ruby/object:Gem::Version
25
+ version: '2.0'
26
+ description: Fast Unicode character lookups by name, alias, or emoji. Features Trie-based
27
+ searches, fuzzy matching, prefix search, and comprehensive CLI tools.
28
+ email:
29
+ - mikkalmp@gmail.com
30
+ executables:
31
+ - unicode-namecode
32
+ extensions: []
33
+ extra_rdoc_files: []
34
+ files:
35
+ - bin/unicode-namecode
36
+ - data/NameAliases.txt
37
+ - data/UnicodeData.txt
38
+ - data/emoji-test.txt
39
+ - data/unicode_trie.cache
40
+ - lib/unicode_namecode.rb
41
+ - lib/unicode_namecode/aliases.rb
42
+ - lib/unicode_namecode/data_loader.rb
43
+ - lib/unicode_namecode/emoji.rb
44
+ - lib/unicode_namecode/fuzzy.rb
45
+ - lib/unicode_namecode/trie.rb
46
+ homepage: https://github.com/Aeroswift/unicode-namecode
47
+ licenses:
48
+ - MIT
49
+ metadata:
50
+ homepage_uri: https://github.com/Aeroswift/unicode-namecode
51
+ source_code_uri: https://github.com/Aeroswift/unicode-namecode
52
+ changelog_uri: https://github.com/Aeroswift/unicode-namecode/blob/main/CHANGELOG.md
53
+ rdoc_options: []
54
+ require_paths:
55
+ - lib
56
+ required_ruby_version: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: 3.0.0
61
+ required_rubygems_version: !ruby/object:Gem::Requirement
62
+ requirements:
63
+ - - ">="
64
+ - !ruby/object:Gem::Version
65
+ version: '0'
66
+ requirements: []
67
+ rubygems_version: 3.6.9
68
+ specification_version: 4
69
+ summary: A powerful Ruby gem for Unicode character lookups with support for official
70
+ names, aliases, emojis, and fuzzy matching.
71
+ test_files: []