unicoder 1.0.0 → 1.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 21168580053326da8f495794ab5133fc9fa3daa3fa66209cf5ddbb0ac68de923
4
- data.tar.gz: 3829094ce2cfb8322d79f4a11ef2015235f2b7e602097ab3defb7c8cc4f825de
3
+ metadata.gz: f531e7ea0b5d27ea2bcb05bba6ddd93e0b7325d3d097abaf9ed815cc11d9a197
4
+ data.tar.gz: 8c3d133fb02f3b3d516c9a07390f768509eb3b22f2ae2850ad134819b0390462
5
5
  SHA512:
6
- metadata.gz: e5a9b3d54c062817485b0b66b34015d1d12536fd4e82a601d604302d73bf06c0963dd3e8ca0b99bc9989ec62799c025b702218dbde445c46f7aa42671ccb4ae0
7
- data.tar.gz: ac856c57cc3a9bd7fb8c0e65f282858b760ce42be93411fc5ae8cc2d3007cd9cc33b3e422cbdf8fa8f95316cf84994d328ed6679ec3ccd09d7515f49f30aa6a1
6
+ metadata.gz: 05bd755d93de557ca8786c740675e88847da99c31c07a1517040f51fc0e9846537eeb9317087d955f4cc65d4c6d2434cd83a5216ad21dba8f583edd1166eb10a
7
+ data.tar.gz: 1644c4b6dee2db05f6d7ec91a5250798045e358e3580e141327f39d8c150cce5ad7d687cce8ffca796ca1de728151bb0a29b9d8084ae7fe6b87ec4e080fb95cb
data/CHANGELOG.md CHANGED
@@ -1,5 +1,16 @@
1
1
  ## CHANGELOG
2
2
 
3
+ ### 1.1.1
4
+
5
+ - Fix bug related to unsafe characters
6
+ - Fix squared CJK
7
+ - Small adjustments for scripts and blocks index builders
8
+
9
+ ### 1.1.0
10
+
11
+ - Improve name index size: Support ranges
12
+ - Improve name index size: Replace common words
13
+
3
14
  ### 1.0.0
4
15
 
5
16
  With the first 1.0 release, unicoder supports 10 indexes:
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- unicoder (1.0.0)
4
+ unicoder (1.1.1)
5
5
  oga (~> 2.9)
6
6
  rationalist (~> 2.0)
7
7
  rubyzip (~> 1.2)
data/README.md CHANGED
@@ -37,8 +37,9 @@ types | [unicode-types](https://github.com/janlelis/unicode-types)
37
37
 
38
38
  Index Name | Module
39
39
  --------------|----
40
- numeric\_value| [unicode-numeric_value](https://github.com/janlelis/unicode-number.js)
41
- name, sequence\_name, type | [unicode-name](https://github.com/janlelis/unicode-name)
40
+ name, sequence\_name, type | [unicode-name.js](https://github.com/janlelis/unicode-name.js)
41
+ numeric\_value| [unicode-number.js](https://github.com/janlelis/unicode-number.js)
42
+ scripts | [unicode-scripts.js](https://github.com/janlelis/unicode-scripts.js)
42
43
 
43
44
  ## MIT License
44
45
 
@@ -4,12 +4,14 @@ module Unicoder
4
4
  include Builder
5
5
 
6
6
  def initialize_index
7
- @index = []
7
+ @index = {
8
+ BLOCKS: []
9
+ }
8
10
  end
9
11
 
10
12
  def parse!
11
13
  parse_file :blocks, :line, regex: /^(?<from>\S+?)\.\.(?<to>\S+);\s(?<name>.+)$/ do |line|
12
- @index << [line["from"].to_i(16), line["to"].to_i(16), line["name"]]
14
+ @index[:BLOCKS] << [line["from"].to_i(16), line["to"].to_i(16), line["name"]]
13
15
  end
14
16
  end
15
17
  end
@@ -1,19 +1,34 @@
1
1
  module Unicoder
2
2
  module Builder
3
3
  class Name
4
+
4
5
  include Builder
6
+ include ReplaceCommonWords
5
7
 
6
8
  JAMO_INITIAL = 4352
7
9
  JAMO_MEDIAL = 4449
8
10
  JAMO_FINAL = 4520
9
11
  JAMO_END = 4697
10
12
 
13
+ CJK = "CJK UNIFIED IDEOGRAPH-"
14
+ TANGUT = "TANGUT IDEOGRAPH-"
15
+
16
+ REPLACE_COUNT = 500
17
+ REPLACE_BASE = ?[.ord
18
+
11
19
  def initialize_index
12
20
  @index = {
13
21
  NAMES: {},
14
22
  ALIASES: {},
15
- CJK: [],
16
- HANGUL: [],
23
+ # HANGUL: [],
24
+ CP_RANGES: {
25
+ CJK => [], # filled while parsing
26
+ TANGUT => [], # filled while parsing
27
+ "EGYPTIAN HIEROGLYPH-" => [[0x13460, 0x143FA]],
28
+ "KHITAN SMALL SCRIPT CHARACTER-" => [[0x18B00, 0x18CFF]],
29
+ "NUSHU CHARACTER-" => [[0x1B170, 0x1B2FB]],
30
+ "CJK COMPATIBILITY IDEOGRAPH-" => [[0x2F800, 0x2FA1D]],
31
+ },
17
32
  # see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_Syllables_Area
18
33
  JAMO: {
19
34
  INITIAL: [],
@@ -21,6 +36,7 @@ module Unicoder
21
36
  FINAL: [""],
22
37
  },
23
38
  }
39
+ @words = []
24
40
  @range_start = nil
25
41
  end
26
42
 
@@ -36,22 +52,32 @@ module Unicoder
36
52
  if line["name"] =~ /First/
37
53
  @range_start = line["codepoint"].to_i(16)
38
54
  elsif line["name"] =~ /Last/ && @range_start
39
- if line["name"] =~ /Hangul/
40
- @index[:HANGUL] << [@range_start, line["codepoint"].to_i(16)]
41
- elsif line["name"] =~ /CJK/
42
- @index[:CJK] << [@range_start, line["codepoint"].to_i(16)]
55
+ case line["name"]
56
+ when /Hangul/
57
+ # currently not necessary
58
+ # @index[:HANGUL] << [@range_start, line["codepoint"].to_i(16)]
59
+ when /CJK/
60
+ @index[:CP_RANGES][CJK] << [@range_start, line["codepoint"].to_i(16)]
61
+ when /Tangut/
62
+ @index[:CP_RANGES][TANGUT] << [@range_start, line["codepoint"].to_i(16)]
43
63
  else
44
64
  # no name
65
+ warn "ignoring range: #{line["name"]}"
45
66
  end
46
67
  @range_start = nil
47
68
  elsif line["name"] != "<control>"
48
69
  raise ArgumentError, "inconsistent range found in data, don't know what to do"
49
70
  end
71
+ elsif line["name"] =~ Regexp.union(@index[:CP_RANGES].keys.map{/^#{_1}/})
72
+ # ignore
50
73
  else
51
74
  assign :NAMES, line["codepoint"].to_i(16), line["name"]
75
+ @words += line["name"].split
52
76
  end
53
77
  end
54
78
 
79
+ replace_common_words! :NAMES, @words, REPLACE_COUNT, REPLACE_BASE
80
+
55
81
  parse_file :name_aliases, :line, regex: /^(?<codepoint>.+?);(?<alias>.+?);(?<type>.*)$/ do |line|
56
82
  @index[:ALIASES][get_key[line["codepoint"].to_i(16)]] ||= {}
57
83
  @index[:ALIASES][get_key[line["codepoint"].to_i(16)]][line["type"].to_sym] ||= []
@@ -10,6 +10,12 @@ module Unicoder
10
10
  SCRIPT_EXTENSIONS: {},
11
11
  SCRIPT_ALIASES: {},
12
12
  SCRIPT_NAMES: [],
13
+ OFFSETS: [
14
+ 0x10000,
15
+ 0x1000,
16
+ 0x100,
17
+ 0x10
18
+ ],
13
19
  }
14
20
  @reverse_script_names = {}
15
21
  @reverse_script_extension_names = {}
@@ -21,6 +27,17 @@ module Unicoder
21
27
  }
22
28
  end
23
29
 
30
+ # TODO refactor how multiple indexes are organized
31
+ def assign_classic(sub_index_name, codepoint, value)
32
+ idx = @index[sub_index_name]
33
+
34
+ if option =~ /charkeys/
35
+ idx[[codepoint].pack("U*")] = value
36
+ else
37
+ idx[codepoint] = value
38
+ end
39
+ end
40
+
24
41
  def parse!
25
42
  parse_file :property_value_aliases, :line, regex: /^sc ; (?<short>\S+?)\s*; (?<long>\S+?)(?:\s*; (?<short2>\S+))?$/ do |line|
26
43
  @index[:SCRIPT_NAMES] << line["long"]
@@ -47,10 +64,10 @@ module Unicoder
47
64
  parse_file :script_extensions, :line, regex: /^(?<from>\S+?)(\.\.(?<to>\S+))?\s+; (?<scripts>.+?) #.*$/ do |line|
48
65
  if line["to"]
49
66
  (line["from"].to_i(16)..line["to"].to_i(16)).each{ |codepoint|
50
- @index[:SCRIPT_EXTENSIONS][codepoint] = lookup_extension_names(line["scripts"])
67
+ assign_classic :SCRIPT_EXTENSIONS, codepoint, lookup_extension_names(line["scripts"])
51
68
  }
52
69
  else
53
- @index[:SCRIPT_EXTENSIONS][line["from"].to_i(16)] = lookup_extension_names(line["scripts"])
70
+ assign_classic :SCRIPT_EXTENSIONS, line["from"].to_i(16), lookup_extension_names(line["scripts"])
54
71
  end
55
72
  end
56
73
  end
@@ -2,11 +2,18 @@ module Unicoder
2
2
  module Builder
3
3
  class SequenceName
4
4
  include Builder
5
+ include ReplaceCommonWords
6
+
7
+ REPLACE_COUNT = 100
8
+ REPLACE_BASE = ?{.ord
9
+ REPLACE_MIN_WORD_LENGTH = 3
5
10
 
6
11
  def initialize_index
7
12
  @index = {
8
13
  SEQUENCES: {},
14
+ SEQUENCES_NOT_QUALIFIED: {},
9
15
  }
16
+ @words = []
10
17
  end
11
18
 
12
19
  def assign_codepoint(codepoints, value, idx = @index[:SEQUENCES], combine: false)
@@ -25,6 +32,8 @@ module Unicoder
25
32
  else
26
33
  idx[key] = value
27
34
  end
35
+
36
+ @words += value.split
28
37
  end
29
38
 
30
39
  def parse!
@@ -61,10 +70,28 @@ module Unicoder
61
70
  assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, name
62
71
  end
63
72
 
64
- parse_file :emoji_zwj_sequences, :line, regex: /^(?<codepoints>.+?)\s*;.*?; (?<name>.+?)\s*#/ do |line|
73
+ parse_file :emoji_zwj_sequences, :line, regex: /^(?!#)(?<codepoints>.+?)\s*;.*?; (?<name>.+?)\s*#/ do |line|
65
74
  name = line["name"].gsub(/\\x{(\h+)}/){ [$1.to_i(16)].pack("U") }.upcase
66
- assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, name
75
+ codepoints = line["codepoints"].split.map{|cp| cp.to_i(16) }
76
+ assign_codepoint codepoints, name
77
+ if codepoints.include?(0xFE0F)
78
+ # Build all combinations of VS16 present and missing
79
+ codepoints.slice_after(0xFE0F).reduce([[]]){|acc,cur|
80
+ if cur.include? 0xFE0F
81
+ acc.flat_map{|prev| [prev + (cur - [0xFE0F]), prev + cur] }
82
+ else
83
+ acc.map{|prev| prev + cur}
84
+ end
85
+ }.
86
+ select {|sub_codepoints| sub_codepoints != codepoints }.
87
+ each { |sub_codepoints|
88
+ assign_codepoint (sub_codepoints), name, @index[:SEQUENCES_NOT_QUALIFIED]
89
+ }
90
+ end
67
91
  end
92
+
93
+ replace_common_words! :SEQUENCES, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
94
+ replace_common_words! :SEQUENCES_NOT_QUALIFIED, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
68
95
  end
69
96
  end
70
97
  end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Unicoder
4
- VERSION = "1.0.0"
4
+ VERSION = "1.1.1"
5
5
 
6
6
  UNICODE_VERSIONS = %w[
7
7
  16.0.0
@@ -0,0 +1,21 @@
1
+ require "json"
2
+
3
+ module Unicoder
4
+ module ReplaceCommonWords
5
+ def replace_common_words!(which_index, words, count = 500, _ = ?[.ord, min_word_length = 4)
6
+ base = @words.join.chars.max.ord + 1
7
+ puts "Starting to replace the #{count} most common words (replace base: #{base})"
8
+ @index[:REPLACE_BASE] = base
9
+ @index[:COMMON_WORDS] = words.
10
+ select{_1.size >= min_word_length}.
11
+ tally.
12
+ max_by(count){_2}.
13
+ map(&:first)
14
+ @index[which_index].each{|_, name|
15
+ @index[:COMMON_WORDS].each_with_index{|word, index|
16
+ name.gsub! word + " ", [base + index].pack("U")
17
+ }
18
+ }
19
+ end
20
+ end
21
+ end
data/lib/unicoder.rb CHANGED
@@ -2,6 +2,7 @@ require_relative "unicoder/constants"
2
2
  require_relative "unicoder/downloader"
3
3
  require_relative "unicoder/builder"
4
4
  require_relative "unicoder/multi_dimensional_array_builder"
5
+ require_relative "unicoder/replace_common_words"
5
6
 
6
7
  if defined?(Rake)
7
8
  Rake.add_rakelib(File.expand_path('../unicoder', __FILE__))
data/unicoder.gemspec CHANGED
@@ -17,7 +17,7 @@ Gem::Specification.new do |gem|
17
17
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
18
18
  gem.require_paths = ["lib"]
19
19
 
20
- gem.required_ruby_version = ">= 2.0", "< 4.0"
20
+ gem.required_ruby_version = ">= 3.0", "< 4.0"
21
21
  gem.add_dependency "rationalist", "~> 2.0"
22
22
  gem.add_dependency "rubyzip", "~> 1.2"
23
23
  gem.add_dependency "oga", "~> 2.9"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: unicoder
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jan Lelis
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-10-04 00:00:00.000000000 Z
11
+ date: 2024-10-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rationalist
@@ -85,6 +85,7 @@ files:
85
85
  - lib/unicoder/constants.rb
86
86
  - lib/unicoder/downloader.rb
87
87
  - lib/unicoder/multi_dimensional_array_builder.rb
88
+ - lib/unicoder/replace_common_words.rb
88
89
  - lib/unicoder/tasks.rake
89
90
  - unicoder.gemspec
90
91
  homepage: https://github.com/janlelis/unicoder
@@ -99,7 +100,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
99
100
  requirements:
100
101
  - - ">="
101
102
  - !ruby/object:Gem::Version
102
- version: '2.0'
103
+ version: '3.0'
103
104
  - - "<"
104
105
  - !ruby/object:Gem::Version
105
106
  version: '4.0'