unicoder 1.0.0 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 21168580053326da8f495794ab5133fc9fa3daa3fa66209cf5ddbb0ac68de923
4
- data.tar.gz: 3829094ce2cfb8322d79f4a11ef2015235f2b7e602097ab3defb7c8cc4f825de
3
+ metadata.gz: b9cec551ee0c7308313eada0859d3b2cbe1a8c3aaeb45072d3fb08a886b25e8a
4
+ data.tar.gz: 907185cfd8e98d4d8f291a33b64b0af349716b75757f72abc00de1e98e7bdd3a
5
5
  SHA512:
6
- metadata.gz: e5a9b3d54c062817485b0b66b34015d1d12536fd4e82a601d604302d73bf06c0963dd3e8ca0b99bc9989ec62799c025b702218dbde445c46f7aa42671ccb4ae0
7
- data.tar.gz: ac856c57cc3a9bd7fb8c0e65f282858b760ce42be93411fc5ae8cc2d3007cd9cc33b3e422cbdf8fa8f95316cf84994d328ed6679ec3ccd09d7515f49f30aa6a1
6
+ metadata.gz: ac60e2255690882f372023e66fff6bbfaf1b039f50896e7496ce6cdec2324d993a5fa644b7de539f3039a20bd8bdbaf18f6b81aa727ccf1a6d411608b3355a6e
7
+ data.tar.gz: 403549b0dfdc3fe4dc3f93f3a4c743fdef4e2e057364c49ee7d38d71c4722b90d50bf64ea9050d0ad2975c118ceb2c2bcd6c9464d4976e843655f69f0bffe2b1
data/CHANGELOG.md CHANGED
@@ -1,5 +1,10 @@
1
1
  ## CHANGELOG
2
2
 
3
+ ### 1.1.0
4
+
5
+ - Improve name index size: Support ranges
6
+ - Improve name index size: Replace common words
7
+
3
8
  ### 1.0.0
4
9
 
5
10
  With the first 1.0 release, unicoder supports 10 indexes:
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- unicoder (1.0.0)
4
+ unicoder (1.1.0)
5
5
  oga (~> 2.9)
6
6
  rationalist (~> 2.0)
7
7
  rubyzip (~> 1.2)
data/README.md CHANGED
@@ -37,8 +37,8 @@ types | [unicode-types](https://github.com/janlelis/unicode-types)
37
37
 
38
38
  Index Name | Module
39
39
  --------------|----
40
- numeric\_value| [unicode-numeric_value](https://github.com/janlelis/unicode-number.js)
41
- name, sequence\_name, type | [unicode-name](https://github.com/janlelis/unicode-name)
40
+ name, sequence\_name, type | [unicode-name.js](https://github.com/janlelis/unicode-name.js)
41
+ numeric\_value| [unicode-number.js](https://github.com/janlelis/unicode-number.js)
42
42
 
43
43
  ## MIT License
44
44
 
@@ -1,19 +1,34 @@
1
1
  module Unicoder
2
2
  module Builder
3
3
  class Name
4
+
4
5
  include Builder
6
+ include ReplaceCommonWords
5
7
 
6
8
  JAMO_INITIAL = 4352
7
9
  JAMO_MEDIAL = 4449
8
10
  JAMO_FINAL = 4520
9
11
  JAMO_END = 4697
10
12
 
13
+ CJK = "CJK UNIFIED IDEOGRAPH-"
14
+ TANGUT = "TANGUT IDEOGRAPH-"
15
+
16
+ REPLACE_COUNT = 500
17
+ REPLACE_BASE = ?[.ord
18
+
11
19
  def initialize_index
12
20
  @index = {
13
21
  NAMES: {},
14
22
  ALIASES: {},
15
- CJK: [],
16
- HANGUL: [],
23
+ # HANGUL: [],
24
+ CP_RANGES: {
25
+ CJK => [], # filled while parsing
26
+ TANGUT => [], # filled while parsing
27
+ "EGYPTIAN HIEROGLYPH-" => [[0x13460, 0x143FA]],
28
+ "KHITAN SMALL SCRIPT CHARACTER-" => [[0x18B00, 0x18CFF]],
29
+ "NUSHU CHARACTER-" => [[0x1B170, 0x1B2FB]],
30
+ "CJK COMPATIBILITY IDEOGRAPH-" => [[0x2F800, 0x2FA1D]],
31
+ },
17
32
  # see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_Syllables_Area
18
33
  JAMO: {
19
34
  INITIAL: [],
@@ -21,6 +36,7 @@ module Unicoder
21
36
  FINAL: [""],
22
37
  },
23
38
  }
39
+ @words = []
24
40
  @range_start = nil
25
41
  end
26
42
 
@@ -36,22 +52,32 @@ module Unicoder
36
52
  if line["name"] =~ /First/
37
53
  @range_start = line["codepoint"].to_i(16)
38
54
  elsif line["name"] =~ /Last/ && @range_start
39
- if line["name"] =~ /Hangul/
40
- @index[:HANGUL] << [@range_start, line["codepoint"].to_i(16)]
41
- elsif line["name"] =~ /CJK/
42
- @index[:CJK] << [@range_start, line["codepoint"].to_i(16)]
55
+ case line["name"]
56
+ when /Hangul/
57
+ # currently not necessary
58
+ # @index[:HANGUL] << [@range_start, line["codepoint"].to_i(16)]
59
+ when /CJK/
60
+ @index[:CP_RANGES][CJK] << [@range_start, line["codepoint"].to_i(16)]
61
+ when /Tangut/
62
+ @index[:CP_RANGES][TANGUT] << [@range_start, line["codepoint"].to_i(16)]
43
63
  else
44
64
  # no name
65
+ warn "ignoring range: #{line["name"]}"
45
66
  end
46
67
  @range_start = nil
47
68
  elsif line["name"] != "<control>"
48
69
  raise ArgumentError, "inconsistent range found in data, don't know what to do"
49
70
  end
71
+ elsif line["name"] =~ Regexp.union(@index[:CP_RANGES].keys)
72
+ # ignore
50
73
  else
51
74
  assign :NAMES, line["codepoint"].to_i(16), line["name"]
75
+ @words += line["name"].split
52
76
  end
53
77
  end
54
78
 
79
+ replace_common_words! :NAMES, @words, REPLACE_COUNT, REPLACE_BASE
80
+
55
81
  parse_file :name_aliases, :line, regex: /^(?<codepoint>.+?);(?<alias>.+?);(?<type>.*)$/ do |line|
56
82
  @index[:ALIASES][get_key[line["codepoint"].to_i(16)]] ||= {}
57
83
  @index[:ALIASES][get_key[line["codepoint"].to_i(16)]][line["type"].to_sym] ||= []
@@ -2,11 +2,18 @@ module Unicoder
2
2
  module Builder
3
3
  class SequenceName
4
4
  include Builder
5
+ include ReplaceCommonWords
6
+
7
+ REPLACE_COUNT = 100
8
+ REPLACE_BASE = ?{.ord
9
+ REPLACE_MIN_WORD_LENGTH = 3
5
10
 
6
11
  def initialize_index
7
12
  @index = {
8
13
  SEQUENCES: {},
14
+ SEQUENCES_NOT_QUALIFIED: {},
9
15
  }
16
+ @words = []
10
17
  end
11
18
 
12
19
  def assign_codepoint(codepoints, value, idx = @index[:SEQUENCES], combine: false)
@@ -25,6 +32,8 @@ module Unicoder
25
32
  else
26
33
  idx[key] = value
27
34
  end
35
+
36
+ @words += value.split
28
37
  end
29
38
 
30
39
  def parse!
@@ -61,10 +70,28 @@ module Unicoder
61
70
  assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, name
62
71
  end
63
72
 
64
- parse_file :emoji_zwj_sequences, :line, regex: /^(?<codepoints>.+?)\s*;.*?; (?<name>.+?)\s*#/ do |line|
73
+ parse_file :emoji_zwj_sequences, :line, regex: /^(?!#)(?<codepoints>.+?)\s*;.*?; (?<name>.+?)\s*#/ do |line|
65
74
  name = line["name"].gsub(/\\x{(\h+)}/){ [$1.to_i(16)].pack("U") }.upcase
66
- assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, name
75
+ codepoints = line["codepoints"].split.map{|cp| cp.to_i(16) }
76
+ assign_codepoint codepoints, name
77
+ if codepoints.include?(0xFE0F)
78
+ # Build all combinations of VS16 present and missing
79
+ codepoints.slice_after(0xFE0F).reduce([[]]){|acc,cur|
80
+ if cur.include? 0xFE0F
81
+ acc.flat_map{|prev| [prev + (cur - [0xFE0F]), prev + cur] }
82
+ else
83
+ acc.map{|prev| prev + cur}
84
+ end
85
+ }.
86
+ select {|sub_codepoints| sub_codepoints != codepoints }.
87
+ each { |sub_codepoints|
88
+ assign_codepoint (sub_codepoints), name, @index[:SEQUENCES_NOT_QUALIFIED]
89
+ }
90
+ end
67
91
  end
92
+
93
+ replace_common_words! :SEQUENCES, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
94
+ replace_common_words! :SEQUENCES_NOT_QUALIFIED, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
68
95
  end
69
96
  end
70
97
  end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Unicoder
4
- VERSION = "1.0.0"
4
+ VERSION = "1.1.0"
5
5
 
6
6
  UNICODE_VERSIONS = %w[
7
7
  16.0.0
@@ -0,0 +1,20 @@
require "json"

module Unicoder
  # Mixin for index builders that shrinks stored names by substituting the
  # most frequent words with single placeholder characters.
  #
  # The including object is expected to keep its data in an @index Hash.
  module ReplaceCommonWords
    # Replaces the most common words in every name stored under
    # @index[which_index] with one-character placeholders, mutating the
    # name strings in place.
    #
    # Side effects:
    # - prints a progress line to stdout
    # - stores +base+ in @index[:REPLACE_BASE]
    # - stores the selected words (most frequent first) in @index[:COMMON_WORDS]
    #
    # Only occurrences followed by a single space are replaced; the placeholder
    # stands for the word including that trailing space.
    #
    # @param which_index [Symbol] key inside @index whose values are the names
    # @param words [Array<String>] words used to compute the frequency ranking
    # @param count [Integer] number of most frequent words to replace
    # @param base [Integer] codepoint of the placeholder for the most frequent
    #   word; the word at rank i is replaced by codepoint base + i
    # @param min_word_length [Integer] shorter words are never replaced
    # @return [Object] the collection stored at @index[which_index]
    def replace_common_words!(which_index, words, count = 500, base = ?[.ord, min_word_length = 4)
      puts "Starting to replace the #{count} most common words"
      @index[:REPLACE_BASE] = base
      @index[:COMMON_WORDS] = words.
        select { |word| word.size >= min_word_length }.
        tally.
        max_by(count) { |_word, occurrences| occurrences }.
        map(&:first)

      # Build every search string and its placeholder once, instead of
      # re-creating both for each (name, word) combination.
      replacements = @index[:COMMON_WORDS].each_with_index.map { |word, rank|
        ["#{word} ", [base + rank].pack("U")]
      }

      # Replacements are applied in ranking order, one word after the other,
      # so a more frequent word always wins over a less frequent one.
      @index[which_index].each { |_, name|
        replacements.each { |search, placeholder| name.gsub!(search, placeholder) }
      }
    end
  end
end
data/lib/unicoder.rb CHANGED
@@ -2,6 +2,7 @@ require_relative "unicoder/constants"
2
2
  require_relative "unicoder/downloader"
3
3
  require_relative "unicoder/builder"
4
4
  require_relative "unicoder/multi_dimensional_array_builder"
5
+ require_relative "unicoder/replace_common_words"
5
6
 
6
7
  if defined?(Rake)
7
8
  Rake.add_rakelib(File.expand_path('../unicoder', __FILE__))
data/unicoder.gemspec CHANGED
@@ -17,7 +17,7 @@ Gem::Specification.new do |gem|
17
17
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
18
18
  gem.require_paths = ["lib"]
19
19
 
20
- gem.required_ruby_version = ">= 2.0", "< 4.0"
20
+ gem.required_ruby_version = ">= 3.0", "< 4.0"
21
21
  gem.add_dependency "rationalist", "~> 2.0"
22
22
  gem.add_dependency "rubyzip", "~> 1.2"
23
23
  gem.add_dependency "oga", "~> 2.9"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: unicoder
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jan Lelis
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-10-04 00:00:00.000000000 Z
11
+ date: 2024-10-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rationalist
@@ -85,6 +85,7 @@ files:
85
85
  - lib/unicoder/constants.rb
86
86
  - lib/unicoder/downloader.rb
87
87
  - lib/unicoder/multi_dimensional_array_builder.rb
88
+ - lib/unicoder/replace_common_words.rb
88
89
  - lib/unicoder/tasks.rake
89
90
  - unicoder.gemspec
90
91
  homepage: https://github.com/janlelis/unicoder
@@ -99,7 +100,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
99
100
  requirements:
100
101
  - - ">="
101
102
  - !ruby/object:Gem::Version
102
- version: '2.0'
103
+ version: '3.0'
103
104
  - - "<"
104
105
  - !ruby/object:Gem::Version
105
106
  version: '4.0'