unicoder 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 21168580053326da8f495794ab5133fc9fa3daa3fa66209cf5ddbb0ac68de923
4
- data.tar.gz: 3829094ce2cfb8322d79f4a11ef2015235f2b7e602097ab3defb7c8cc4f825de
3
+ metadata.gz: b9cec551ee0c7308313eada0859d3b2cbe1a8c3aaeb45072d3fb08a886b25e8a
4
+ data.tar.gz: 907185cfd8e98d4d8f291a33b64b0af349716b75757f72abc00de1e98e7bdd3a
5
5
  SHA512:
6
- metadata.gz: e5a9b3d54c062817485b0b66b34015d1d12536fd4e82a601d604302d73bf06c0963dd3e8ca0b99bc9989ec62799c025b702218dbde445c46f7aa42671ccb4ae0
7
- data.tar.gz: ac856c57cc3a9bd7fb8c0e65f282858b760ce42be93411fc5ae8cc2d3007cd9cc33b3e422cbdf8fa8f95316cf84994d328ed6679ec3ccd09d7515f49f30aa6a1
6
+ metadata.gz: ac60e2255690882f372023e66fff6bbfaf1b039f50896e7496ce6cdec2324d993a5fa644b7de539f3039a20bd8bdbaf18f6b81aa727ccf1a6d411608b3355a6e
7
+ data.tar.gz: 403549b0dfdc3fe4dc3f93f3a4c743fdef4e2e057364c49ee7d38d71c4722b90d50bf64ea9050d0ad2975c118ceb2c2bcd6c9464d4976e843655f69f0bffe2b1
data/CHANGELOG.md CHANGED
@@ -1,5 +1,10 @@
1
1
  ## CHANGELOG
2
2
 
3
+ ### 1.1.0
4
+
5
+ - Improve name index size: Support ranges
6
+ - Improve name index size: Replace common words
7
+
3
8
  ### 1.0.0
4
9
 
5
10
  With the first 1.0 release, unicoder supports 10 indexes:
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- unicoder (1.0.0)
4
+ unicoder (1.1.0)
5
5
  oga (~> 2.9)
6
6
  rationalist (~> 2.0)
7
7
  rubyzip (~> 1.2)
data/README.md CHANGED
@@ -37,8 +37,8 @@ types | [unicode-types](https://github.com/janlelis/unicode-types)
37
37
 
38
38
  Index Name | Module
39
39
  --------------|----
40
- numeric\_value| [unicode-numeric_value](https://github.com/janlelis/unicode-number.js)
41
- name, sequence\_name, type | [unicode-name](https://github.com/janlelis/unicode-name)
40
+ name, sequence\_name, type | [unicode-name.js](https://github.com/janlelis/unicode-name.js)
41
+ numeric\_value| [unicode-number.js](https://github.com/janlelis/unicode-number.js)
42
42
 
43
43
  ## MIT License
44
44
 
@@ -1,19 +1,34 @@
1
1
  module Unicoder
2
2
  module Builder
3
3
  class Name
4
+
4
5
  include Builder
6
+ include ReplaceCommonWords
5
7
 
6
8
  JAMO_INITIAL = 4352
7
9
  JAMO_MEDIAL = 4449
8
10
  JAMO_FINAL = 4520
9
11
  JAMO_END = 4697
10
12
 
13
+ CJK = "CJK UNIFIED IDEOGRAPH-"
14
+ TANGUT = "TANGUT IDEOGRAPH-"
15
+
16
+ REPLACE_COUNT = 500
17
+ REPLACE_BASE = ?[.ord
18
+
11
19
  def initialize_index
12
20
  @index = {
13
21
  NAMES: {},
14
22
  ALIASES: {},
15
- CJK: [],
16
- HANGUL: [],
23
+ # HANGUL: [],
24
+ CP_RANGES: {
25
+ CJK => [], # filled while parsing
26
+ TANGUT => [], # filled while parsing
27
+ "EGYPTIAN HIEROGLYPH-" => [[0x13460, 0x143FA]],
28
+ "KHITAN SMALL SCRIPT CHARACTER-" => [[0x18B00, 0x18CFF]],
29
+ "NUSHU CHARACTER-" => [[0x1B170, 0x1B2FB]],
30
+ "CJK COMPATIBILITY IDEOGRAPH-" => [[0x2F800, 0x2FA1D]],
31
+ },
17
32
  # see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_Syllables_Area
18
33
  JAMO: {
19
34
  INITIAL: [],
@@ -21,6 +36,7 @@ module Unicoder
21
36
  FINAL: [""],
22
37
  },
23
38
  }
39
+ @words = []
24
40
  @range_start = nil
25
41
  end
26
42
 
@@ -36,22 +52,32 @@ module Unicoder
36
52
  if line["name"] =~ /First/
37
53
  @range_start = line["codepoint"].to_i(16)
38
54
  elsif line["name"] =~ /Last/ && @range_start
39
- if line["name"] =~ /Hangul/
40
- @index[:HANGUL] << [@range_start, line["codepoint"].to_i(16)]
41
- elsif line["name"] =~ /CJK/
42
- @index[:CJK] << [@range_start, line["codepoint"].to_i(16)]
55
+ case line["name"]
56
+ when /Hangul/
57
+ # currently not necessary
58
+ # @index[:HANGUL] << [@range_start, line["codepoint"].to_i(16)]
59
+ when /CJK/
60
+ @index[:CP_RANGES][CJK] << [@range_start, line["codepoint"].to_i(16)]
61
+ when /Tangut/
62
+ @index[:CP_RANGES][TANGUT] << [@range_start, line["codepoint"].to_i(16)]
43
63
  else
44
64
  # no name
65
+ warn "ignoring range: #{line["name"]}"
45
66
  end
46
67
  @range_start = nil
47
68
  elsif line["name"] != "<control>"
48
69
  raise ArgumentError, "inconsistent range found in data, don't know what to do"
49
70
  end
71
+ elsif line["name"] =~ Regexp.union(@index[:CP_RANGES].keys)
72
+ # ignore
50
73
  else
51
74
  assign :NAMES, line["codepoint"].to_i(16), line["name"]
75
+ @words += line["name"].split
52
76
  end
53
77
  end
54
78
 
79
+ replace_common_words! :NAMES, @words, REPLACE_COUNT, REPLACE_BASE
80
+
55
81
  parse_file :name_aliases, :line, regex: /^(?<codepoint>.+?);(?<alias>.+?);(?<type>.*)$/ do |line|
56
82
  @index[:ALIASES][get_key[line["codepoint"].to_i(16)]] ||= {}
57
83
  @index[:ALIASES][get_key[line["codepoint"].to_i(16)]][line["type"].to_sym] ||= []
@@ -2,11 +2,18 @@ module Unicoder
2
2
  module Builder
3
3
  class SequenceName
4
4
  include Builder
5
+ include ReplaceCommonWords
6
+
7
+ REPLACE_COUNT = 100
8
+ REPLACE_BASE = ?{.ord
9
+ REPLACE_MIN_WORD_LENGTH = 3
5
10
 
6
11
  def initialize_index
7
12
  @index = {
8
13
  SEQUENCES: {},
14
+ SEQUENCES_NOT_QUALIFIED: {},
9
15
  }
16
+ @words = []
10
17
  end
11
18
 
12
19
  def assign_codepoint(codepoints, value, idx = @index[:SEQUENCES], combine: false)
@@ -25,6 +32,8 @@ module Unicoder
25
32
  else
26
33
  idx[key] = value
27
34
  end
35
+
36
+ @words += value.split
28
37
  end
29
38
 
30
39
  def parse!
@@ -61,10 +70,28 @@ module Unicoder
61
70
  assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, name
62
71
  end
63
72
 
64
- parse_file :emoji_zwj_sequences, :line, regex: /^(?<codepoints>.+?)\s*;.*?; (?<name>.+?)\s*#/ do |line|
73
+ parse_file :emoji_zwj_sequences, :line, regex: /^(?!#)(?<codepoints>.+?)\s*;.*?; (?<name>.+?)\s*#/ do |line|
65
74
  name = line["name"].gsub(/\\x{(\h+)}/){ [$1.to_i(16)].pack("U") }.upcase
66
- assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, name
75
+ codepoints = line["codepoints"].split.map{|cp| cp.to_i(16) }
76
+ assign_codepoint codepoints, name
77
+ if codepoints.include?(0xFE0F)
78
+ # Build all combinations of VS16 present and missing
79
+ codepoints.slice_after(0xFE0F).reduce([[]]){|acc,cur|
80
+ if cur.include? 0xFE0F
81
+ acc.flat_map{|prev| [prev + (cur - [0xFE0F]), prev + cur] }
82
+ else
83
+ acc.map{|prev| prev + cur}
84
+ end
85
+ }.
86
+ select {|sub_codepoints| sub_codepoints != codepoints }.
87
+ each { |sub_codepoints|
88
+ assign_codepoint (sub_codepoints), name, @index[:SEQUENCES_NOT_QUALIFIED]
89
+ }
90
+ end
67
91
  end
92
+
93
+ replace_common_words! :SEQUENCES, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
94
+ replace_common_words! :SEQUENCES_NOT_QUALIFIED, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
68
95
  end
69
96
  end
70
97
  end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Unicoder
4
- VERSION = "1.0.0"
4
+ VERSION = "1.1.0"
5
5
 
6
6
  UNICODE_VERSIONS = %w[
7
7
  16.0.0
@@ -0,0 +1,20 @@
1
+ require "json"
2
+
3
+ module Unicoder
4
+ module ReplaceCommonWords
5
+ def replace_common_words!(which_index, words, count = 500, base = ?[.ord, min_word_length = 4)
6
+ puts "Starting to replace the #{count} most common words"
7
+ @index[:REPLACE_BASE] = base
8
+ @index[:COMMON_WORDS] = words.
9
+ select{_1.size >= min_word_length}.
10
+ tally.
11
+ max_by(count){_2}.
12
+ map(&:first)
13
+ @index[which_index].each{|_, name|
14
+ @index[:COMMON_WORDS].each_with_index{|word, index|
15
+ name.gsub! word + " ", [base + index].pack("U")
16
+ }
17
+ }
18
+ end
19
+ end
20
+ end
data/lib/unicoder.rb CHANGED
@@ -2,6 +2,7 @@ require_relative "unicoder/constants"
2
2
  require_relative "unicoder/downloader"
3
3
  require_relative "unicoder/builder"
4
4
  require_relative "unicoder/multi_dimensional_array_builder"
5
+ require_relative "unicoder/replace_common_words"
5
6
 
6
7
  if defined?(Rake)
7
8
  Rake.add_rakelib(File.expand_path('../unicoder', __FILE__))
data/unicoder.gemspec CHANGED
@@ -17,7 +17,7 @@ Gem::Specification.new do |gem|
17
17
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
18
18
  gem.require_paths = ["lib"]
19
19
 
20
- gem.required_ruby_version = ">= 2.0", "< 4.0"
20
+ gem.required_ruby_version = ">= 3.0", "< 4.0"
21
21
  gem.add_dependency "rationalist", "~> 2.0"
22
22
  gem.add_dependency "rubyzip", "~> 1.2"
23
23
  gem.add_dependency "oga", "~> 2.9"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: unicoder
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jan Lelis
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-10-04 00:00:00.000000000 Z
11
+ date: 2024-10-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rationalist
@@ -85,6 +85,7 @@ files:
85
85
  - lib/unicoder/constants.rb
86
86
  - lib/unicoder/downloader.rb
87
87
  - lib/unicoder/multi_dimensional_array_builder.rb
88
+ - lib/unicoder/replace_common_words.rb
88
89
  - lib/unicoder/tasks.rake
89
90
  - unicoder.gemspec
90
91
  homepage: https://github.com/janlelis/unicoder
@@ -99,7 +100,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
99
100
  requirements:
100
101
  - - ">="
101
102
  - !ruby/object:Gem::Version
102
- version: '2.0'
103
+ version: '3.0'
103
104
  - - "<"
104
105
  - !ruby/object:Gem::Version
105
106
  version: '4.0'