unicoder 1.0.0 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 21168580053326da8f495794ab5133fc9fa3daa3fa66209cf5ddbb0ac68de923
4
- data.tar.gz: 3829094ce2cfb8322d79f4a11ef2015235f2b7e602097ab3defb7c8cc4f825de
3
+ metadata.gz: f531e7ea0b5d27ea2bcb05bba6ddd93e0b7325d3d097abaf9ed815cc11d9a197
4
+ data.tar.gz: 8c3d133fb02f3b3d516c9a07390f768509eb3b22f2ae2850ad134819b0390462
5
5
  SHA512:
6
- metadata.gz: e5a9b3d54c062817485b0b66b34015d1d12536fd4e82a601d604302d73bf06c0963dd3e8ca0b99bc9989ec62799c025b702218dbde445c46f7aa42671ccb4ae0
7
- data.tar.gz: ac856c57cc3a9bd7fb8c0e65f282858b760ce42be93411fc5ae8cc2d3007cd9cc33b3e422cbdf8fa8f95316cf84994d328ed6679ec3ccd09d7515f49f30aa6a1
6
+ metadata.gz: 05bd755d93de557ca8786c740675e88847da99c31c07a1517040f51fc0e9846537eeb9317087d955f4cc65d4c6d2434cd83a5216ad21dba8f583edd1166eb10a
7
+ data.tar.gz: 1644c4b6dee2db05f6d7ec91a5250798045e358e3580e141327f39d8c150cce5ad7d687cce8ffca796ca1de728151bb0a29b9d8084ae7fe6b87ec4e080fb95cb
data/CHANGELOG.md CHANGED
@@ -1,5 +1,16 @@
1
1
  ## CHANGELOG
2
2
 
3
+ ### 1.1.1
4
+
5
+ - Fix bug related to unsafe characters
6
+ - Fix squared CJK
7
+ - Small adjustments for scripts and blocks index builders
8
+
9
+ ### 1.1.0
10
+
11
+ - Improve name index size: Support ranges
12
+ - Improve name index size: Replace common words
13
+
3
14
  ### 1.0.0
4
15
 
5
16
  With the first 1.0 release, unicoder supports 10 indexes:
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- unicoder (1.0.0)
4
+ unicoder (1.1.1)
5
5
  oga (~> 2.9)
6
6
  rationalist (~> 2.0)
7
7
  rubyzip (~> 1.2)
data/README.md CHANGED
@@ -37,8 +37,9 @@ types | [unicode-types](https://github.com/janlelis/unicode-types)
37
37
 
38
38
  Index Name | Module
39
39
  --------------|----
40
- numeric\_value| [unicode-numeric_value](https://github.com/janlelis/unicode-number.js)
41
- name, sequence\_name, type | [unicode-name](https://github.com/janlelis/unicode-name)
40
+ name, sequence\_name, type | [unicode-name.js](https://github.com/janlelis/unicode-name.js)
41
+ numeric\_value| [unicode-number.js](https://github.com/janlelis/unicode-number.js)
42
+ scripts | [unicode-scripts.js](https://github.com/janlelis/unicode-scripts.js)
42
43
 
43
44
  ## MIT License
44
45
 
@@ -4,12 +4,14 @@ module Unicoder
4
4
  include Builder
5
5
 
6
6
  def initialize_index
7
- @index = []
7
+ @index = {
8
+ BLOCKS: []
9
+ }
8
10
  end
9
11
 
10
12
  def parse!
11
13
  parse_file :blocks, :line, regex: /^(?<from>\S+?)\.\.(?<to>\S+);\s(?<name>.+)$/ do |line|
12
- @index << [line["from"].to_i(16), line["to"].to_i(16), line["name"]]
14
+ @index[:BLOCKS] << [line["from"].to_i(16), line["to"].to_i(16), line["name"]]
13
15
  end
14
16
  end
15
17
  end
@@ -1,19 +1,34 @@
1
1
  module Unicoder
2
2
  module Builder
3
3
  class Name
4
+
4
5
  include Builder
6
+ include ReplaceCommonWords
5
7
 
6
8
  JAMO_INITIAL = 4352
7
9
  JAMO_MEDIAL = 4449
8
10
  JAMO_FINAL = 4520
9
11
  JAMO_END = 4697
10
12
 
13
+ CJK = "CJK UNIFIED IDEOGRAPH-"
14
+ TANGUT = "TANGUT IDEOGRAPH-"
15
+
16
+ REPLACE_COUNT = 500
17
+ REPLACE_BASE = ?[.ord
18
+
11
19
  def initialize_index
12
20
  @index = {
13
21
  NAMES: {},
14
22
  ALIASES: {},
15
- CJK: [],
16
- HANGUL: [],
23
+ # HANGUL: [],
24
+ CP_RANGES: {
25
+ CJK => [], # filled while parsing
26
+ TANGUT => [], # filled while parsing
27
+ "EGYPTIAN HIEROGLYPH-" => [[0x13460, 0x143FA]],
28
+ "KHITAN SMALL SCRIPT CHARACTER-" => [[0x18B00, 0x18CFF]],
29
+ "NUSHU CHARACTER-" => [[0x1B170, 0x1B2FB]],
30
+ "CJK COMPATIBILITY IDEOGRAPH-" => [[0x2F800, 0x2FA1D]],
31
+ },
17
32
  # see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_Syllables_Area
18
33
  JAMO: {
19
34
  INITIAL: [],
@@ -21,6 +36,7 @@ module Unicoder
21
36
  FINAL: [""],
22
37
  },
23
38
  }
39
+ @words = []
24
40
  @range_start = nil
25
41
  end
26
42
 
@@ -36,22 +52,32 @@ module Unicoder
36
52
  if line["name"] =~ /First/
37
53
  @range_start = line["codepoint"].to_i(16)
38
54
  elsif line["name"] =~ /Last/ && @range_start
39
- if line["name"] =~ /Hangul/
40
- @index[:HANGUL] << [@range_start, line["codepoint"].to_i(16)]
41
- elsif line["name"] =~ /CJK/
42
- @index[:CJK] << [@range_start, line["codepoint"].to_i(16)]
55
+ case line["name"]
56
+ when /Hangul/
57
+ # currently not necessary
58
+ # @index[:HANGUL] << [@range_start, line["codepoint"].to_i(16)]
59
+ when /CJK/
60
+ @index[:CP_RANGES][CJK] << [@range_start, line["codepoint"].to_i(16)]
61
+ when /Tangut/
62
+ @index[:CP_RANGES][TANGUT] << [@range_start, line["codepoint"].to_i(16)]
43
63
  else
44
64
  # no name
65
+ warn "ignoring range: #{line["name"]}"
45
66
  end
46
67
  @range_start = nil
47
68
  elsif line["name"] != "<control>"
48
69
  raise ArgumentError, "inconsistent range found in data, don't know what to do"
49
70
  end
71
+ elsif line["name"] =~ Regexp.union(@index[:CP_RANGES].keys.map{/^#{_1}/})
72
+ # ignore
50
73
  else
51
74
  assign :NAMES, line["codepoint"].to_i(16), line["name"]
75
+ @words += line["name"].split
52
76
  end
53
77
  end
54
78
 
79
+ replace_common_words! :NAMES, @words, REPLACE_COUNT, REPLACE_BASE
80
+
55
81
  parse_file :name_aliases, :line, regex: /^(?<codepoint>.+?);(?<alias>.+?);(?<type>.*)$/ do |line|
56
82
  @index[:ALIASES][get_key[line["codepoint"].to_i(16)]] ||= {}
57
83
  @index[:ALIASES][get_key[line["codepoint"].to_i(16)]][line["type"].to_sym] ||= []
@@ -10,6 +10,12 @@ module Unicoder
10
10
  SCRIPT_EXTENSIONS: {},
11
11
  SCRIPT_ALIASES: {},
12
12
  SCRIPT_NAMES: [],
13
+ OFFSETS: [
14
+ 0x10000,
15
+ 0x1000,
16
+ 0x100,
17
+ 0x10
18
+ ],
13
19
  }
14
20
  @reverse_script_names = {}
15
21
  @reverse_script_extension_names = {}
@@ -21,6 +27,17 @@ module Unicoder
21
27
  }
22
28
  end
23
29
 
30
# TODO refactor how multiple indexes are organized
#
# Stores +value+ in the sub-index <tt>@index[sub_index_name]</tt>.
# The entry is keyed by the codepoint packed into a UTF-8 string when
# +option+ matches /charkeys/, and by the integer codepoint otherwise.
def assign_classic(sub_index_name, codepoint, value)
  sub_index = @index[sub_index_name]
  key = (option =~ /charkeys/) ? [codepoint].pack("U*") : codepoint
  sub_index[key] = value
end
40
+
24
41
  def parse!
25
42
  parse_file :property_value_aliases, :line, regex: /^sc ; (?<short>\S+?)\s*; (?<long>\S+?)(?:\s*; (?<short2>\S+))?$/ do |line|
26
43
  @index[:SCRIPT_NAMES] << line["long"]
@@ -47,10 +64,10 @@ module Unicoder
47
64
  parse_file :script_extensions, :line, regex: /^(?<from>\S+?)(\.\.(?<to>\S+))?\s+; (?<scripts>.+?) #.*$/ do |line|
48
65
  if line["to"]
49
66
  (line["from"].to_i(16)..line["to"].to_i(16)).each{ |codepoint|
50
- @index[:SCRIPT_EXTENSIONS][codepoint] = lookup_extension_names(line["scripts"])
67
+ assign_classic :SCRIPT_EXTENSIONS, codepoint, lookup_extension_names(line["scripts"])
51
68
  }
52
69
  else
53
- @index[:SCRIPT_EXTENSIONS][line["from"].to_i(16)] = lookup_extension_names(line["scripts"])
70
+ assign_classic :SCRIPT_EXTENSIONS, line["from"].to_i(16), lookup_extension_names(line["scripts"])
54
71
  end
55
72
  end
56
73
  end
@@ -2,11 +2,18 @@ module Unicoder
2
2
  module Builder
3
3
  class SequenceName
4
4
  include Builder
5
+ include ReplaceCommonWords
6
+
7
+ REPLACE_COUNT = 100
8
+ REPLACE_BASE = ?{.ord
9
+ REPLACE_MIN_WORD_LENGTH = 3
5
10
 
6
11
  def initialize_index
7
12
  @index = {
8
13
  SEQUENCES: {},
14
+ SEQUENCES_NOT_QUALIFIED: {},
9
15
  }
16
+ @words = []
10
17
  end
11
18
 
12
19
  def assign_codepoint(codepoints, value, idx = @index[:SEQUENCES], combine: false)
@@ -25,6 +32,8 @@ module Unicoder
25
32
  else
26
33
  idx[key] = value
27
34
  end
35
+
36
+ @words += value.split
28
37
  end
29
38
 
30
39
  def parse!
@@ -61,10 +70,28 @@ module Unicoder
61
70
  assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, name
62
71
  end
63
72
 
64
- parse_file :emoji_zwj_sequences, :line, regex: /^(?<codepoints>.+?)\s*;.*?; (?<name>.+?)\s*#/ do |line|
73
+ parse_file :emoji_zwj_sequences, :line, regex: /^(?!#)(?<codepoints>.+?)\s*;.*?; (?<name>.+?)\s*#/ do |line|
65
74
  name = line["name"].gsub(/\\x{(\h+)}/){ [$1.to_i(16)].pack("U") }.upcase
66
- assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, name
75
+ codepoints = line["codepoints"].split.map{|cp| cp.to_i(16) }
76
+ assign_codepoint codepoints, name
77
+ if codepoints.include?(0xFE0F)
78
+ # Build all combinations of VS16 present and missing
79
+ codepoints.slice_after(0xFE0F).reduce([[]]){|acc,cur|
80
+ if cur.include? 0xFE0F
81
+ acc.flat_map{|prev| [prev + (cur - [0xFE0F]), prev + cur] }
82
+ else
83
+ acc.map{|prev| prev + cur}
84
+ end
85
+ }.
86
+ select {|sub_codepoints| sub_codepoints != codepoints }.
87
+ each { |sub_codepoints|
88
+ assign_codepoint (sub_codepoints), name, @index[:SEQUENCES_NOT_QUALIFIED]
89
+ }
90
+ end
67
91
  end
92
+
93
+ replace_common_words! :SEQUENCES, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
94
+ replace_common_words! :SEQUENCES_NOT_QUALIFIED, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
68
95
  end
69
96
  end
70
97
  end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Unicoder
4
- VERSION = "1.0.0"
4
+ VERSION = "1.1.1"
5
5
 
6
6
  UNICODE_VERSIONS = %w[
7
7
  16.0.0
@@ -0,0 +1,21 @@
1
+ require "json"
2
+
3
module Unicoder
  # Mixin for index builders: shrinks a sub-index of names by swapping its
  # most frequent words for single placeholder characters.
  #
  # The including object must provide @index (a Hash of sub-indexes).
  module ReplaceCommonWords
    # Replaces each of the +count+ most common words (together with the space
    # that follows it) in every value of <tt>@index[which_index]</tt> by one
    # character. The word at position +i+ of <tt>@index[:COMMON_WORDS]</tt>
    # becomes the codepoint <tt>base + i</tt>, where +base+ is one above the
    # highest character occurring in +words+.
    #
    # Writes <tt>@index[:REPLACE_BASE]</tt> and <tt>@index[:COMMON_WORDS]</tt>
    # and mutates the values of the sub-index in place.
    #
    # @param which_index [Symbol] key of the sub-index in @index to rewrite
    # @param words [Array<String>] words to rank and to derive the base from
    # @param count [Integer] how many of the most frequent words to replace
    # @param fallback_base [Integer] base used only when +words+ holds no characters
    # @param min_word_length [Integer] shorter words are never replaced
    # @return the sub-index that was rewritten
    def replace_common_words!(which_index, words, count = 500, fallback_base = ?[.ord, min_word_length = 4)
      # Derive the base from the words handed in rather than from includer
      # state, and do not crash when there are no words at all.
      highest_char = words.join.each_char.max
      base = highest_char ? highest_char.ord + 1 : fallback_base
      puts "Starting to replace the #{count} most common words (replace base: #{base})"
      @index[:REPLACE_BASE] = base
      @index[:COMMON_WORDS] = words.
        select{_1.size >= min_word_length}.
        tally.
        max_by(count){_2}.
        map(&:first)

      # Search strings and placeholders are the same for every name, so build
      # them once. Order is kept: more frequent words are replaced first.
      replacements = @index[:COMMON_WORDS].each_with_index.map{|word, index|
        [word + " ", [base + index].pack("U")]
      }

      @index[which_index].each{|_, name|
        replacements.each{|search, placeholder|
          name.gsub! search, placeholder
        }
      }
    end
  end
end
data/lib/unicoder.rb CHANGED
@@ -2,6 +2,7 @@ require_relative "unicoder/constants"
2
2
  require_relative "unicoder/downloader"
3
3
  require_relative "unicoder/builder"
4
4
  require_relative "unicoder/multi_dimensional_array_builder"
5
+ require_relative "unicoder/replace_common_words"
5
6
 
6
7
  if defined?(Rake)
7
8
  Rake.add_rakelib(File.expand_path('../unicoder', __FILE__))
data/unicoder.gemspec CHANGED
@@ -17,7 +17,7 @@ Gem::Specification.new do |gem|
17
17
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
18
18
  gem.require_paths = ["lib"]
19
19
 
20
- gem.required_ruby_version = ">= 2.0", "< 4.0"
20
+ gem.required_ruby_version = ">= 3.0", "< 4.0"
21
21
  gem.add_dependency "rationalist", "~> 2.0"
22
22
  gem.add_dependency "rubyzip", "~> 1.2"
23
23
  gem.add_dependency "oga", "~> 2.9"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: unicoder
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jan Lelis
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-10-04 00:00:00.000000000 Z
11
+ date: 2024-10-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rationalist
@@ -85,6 +85,7 @@ files:
85
85
  - lib/unicoder/constants.rb
86
86
  - lib/unicoder/downloader.rb
87
87
  - lib/unicoder/multi_dimensional_array_builder.rb
88
+ - lib/unicoder/replace_common_words.rb
88
89
  - lib/unicoder/tasks.rake
89
90
  - unicoder.gemspec
90
91
  homepage: https://github.com/janlelis/unicoder
@@ -99,7 +100,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
99
100
  requirements:
100
101
  - - ">="
101
102
  - !ruby/object:Gem::Version
102
- version: '2.0'
103
+ version: '3.0'
103
104
  - - "<"
104
105
  - !ruby/object:Gem::Version
105
106
  version: '4.0'