unicoder 1.0.0 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/Gemfile.lock +1 -1
- data/README.md +2 -2
- data/lib/unicoder/builders/name.rb +32 -6
- data/lib/unicoder/builders/sequence_name.rb +29 -2
- data/lib/unicoder/constants.rb +1 -1
- data/lib/unicoder/replace_common_words.rb +20 -0
- data/lib/unicoder.rb +1 -0
- data/unicoder.gemspec +1 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b9cec551ee0c7308313eada0859d3b2cbe1a8c3aaeb45072d3fb08a886b25e8a
|
4
|
+
data.tar.gz: 907185cfd8e98d4d8f291a33b64b0af349716b75757f72abc00de1e98e7bdd3a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ac60e2255690882f372023e66fff6bbfaf1b039f50896e7496ce6cdec2324d993a5fa644b7de539f3039a20bd8bdbaf18f6b81aa727ccf1a6d411608b3355a6e
|
7
|
+
data.tar.gz: 403549b0dfdc3fe4dc3f93f3a4c743fdef4e2e057364c49ee7d38d71c4722b90d50bf64ea9050d0ad2975c118ceb2c2bcd6c9464d4976e843655f69f0bffe2b1
|
data/CHANGELOG.md
CHANGED
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -37,8 +37,8 @@ types | [unicode-types](https://github.com/janlelis/unicode-types)
|
|
37
37
|
|
38
38
|
Index Name | Module
|
39
39
|
--------------|----
|
40
|
-
|
41
|
-
|
40
|
+
name, sequence\_name, type | [unicode-name.js](https://github.com/janlelis/unicode-name.js)
|
41
|
+
numeric\_value| [unicode-number.js](https://github.com/janlelis/unicode-number.js)
|
42
42
|
|
43
43
|
## MIT License
|
44
44
|
|
data/lib/unicoder/builders/name.rb
CHANGED
@@ -1,19 +1,34 @@
|
|
1
1
|
module Unicoder
|
2
2
|
module Builder
|
3
3
|
class Name
|
4
|
+
|
4
5
|
include Builder
|
6
|
+
include ReplaceCommonWords
|
5
7
|
|
6
8
|
JAMO_INITIAL = 4352
|
7
9
|
JAMO_MEDIAL = 4449
|
8
10
|
JAMO_FINAL = 4520
|
9
11
|
JAMO_END = 4697
|
10
12
|
|
13
|
+
CJK = "CJK UNIFIED IDEOGRAPH-"
|
14
|
+
TANGUT = "TANGUT IDEOGRAPH-"
|
15
|
+
|
16
|
+
REPLACE_COUNT = 500
|
17
|
+
REPLACE_BASE = ?[.ord
|
18
|
+
|
11
19
|
def initialize_index
|
12
20
|
@index = {
|
13
21
|
NAMES: {},
|
14
22
|
ALIASES: {},
|
15
|
-
|
16
|
-
|
23
|
+
# HANGUL: [],
|
24
|
+
CP_RANGES: {
|
25
|
+
CJK => [], # filled while parsing
|
26
|
+
TANGUT => [], # filled while parsing
|
27
|
+
"EGYPTIAN HIEROGLYPH-" => [[0x13460, 0x143FA]],
|
28
|
+
"KHITAN SMALL SCRIPT CHARACTER-" => [[0x18B00, 0x18CFF]],
|
29
|
+
"NUSHU CHARACTER-" => [[0x1B170, 0x1B2FB]],
|
30
|
+
"CJK COMPATIBILITY IDEOGRAPH-" => [[0x2F800, 0x2FA1D]],
|
31
|
+
},
|
17
32
|
# see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_Syllables_Area
|
18
33
|
JAMO: {
|
19
34
|
INITIAL: [],
|
@@ -21,6 +36,7 @@ module Unicoder
|
|
21
36
|
FINAL: [""],
|
22
37
|
},
|
23
38
|
}
|
39
|
+
@words = []
|
24
40
|
@range_start = nil
|
25
41
|
end
|
26
42
|
|
@@ -36,22 +52,32 @@ module Unicoder
|
|
36
52
|
if line["name"] =~ /First/
|
37
53
|
@range_start = line["codepoint"].to_i(16)
|
38
54
|
elsif line["name"] =~ /Last/ && @range_start
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
@index[:
|
55
|
+
case line["name"]
|
56
|
+
when /Hangul/
|
57
|
+
# currently not necessary
|
58
|
+
# @index[:HANGUL] << [@range_start, line["codepoint"].to_i(16)]
|
59
|
+
when /CJK/
|
60
|
+
@index[:CP_RANGES][CJK] << [@range_start, line["codepoint"].to_i(16)]
|
61
|
+
when /Tangut/
|
62
|
+
@index[:CP_RANGES][TANGUT] << [@range_start, line["codepoint"].to_i(16)]
|
43
63
|
else
|
44
64
|
# no name
|
65
|
+
warn "ignoring range: #{line["name"]}"
|
45
66
|
end
|
46
67
|
@range_start = nil
|
47
68
|
elsif line["name"] != "<control>"
|
48
69
|
raise ArgumentError, "inconsistent range found in data, don't know what to do"
|
49
70
|
end
|
71
|
+
elsif line["name"] =~ Regexp.union(@index[:CP_RANGES].keys)
|
72
|
+
# ignore
|
50
73
|
else
|
51
74
|
assign :NAMES, line["codepoint"].to_i(16), line["name"]
|
75
|
+
@words += line["name"].split
|
52
76
|
end
|
53
77
|
end
|
54
78
|
|
79
|
+
replace_common_words! :NAMES, @words, REPLACE_COUNT, REPLACE_BASE
|
80
|
+
|
55
81
|
parse_file :name_aliases, :line, regex: /^(?<codepoint>.+?);(?<alias>.+?);(?<type>.*)$/ do |line|
|
56
82
|
@index[:ALIASES][get_key[line["codepoint"].to_i(16)]] ||= {}
|
57
83
|
@index[:ALIASES][get_key[line["codepoint"].to_i(16)]][line["type"].to_sym] ||= []
|
data/lib/unicoder/builders/sequence_name.rb
CHANGED
@@ -2,11 +2,18 @@ module Unicoder
|
|
2
2
|
module Builder
|
3
3
|
class SequenceName
|
4
4
|
include Builder
|
5
|
+
include ReplaceCommonWords
|
6
|
+
|
7
|
+
REPLACE_COUNT = 100
|
8
|
+
REPLACE_BASE = ?{.ord
|
9
|
+
REPLACE_MIN_WORD_LENGTH = 3
|
5
10
|
|
6
11
|
def initialize_index
|
7
12
|
@index = {
|
8
13
|
SEQUENCES: {},
|
14
|
+
SEQUENCES_NOT_QUALIFIED: {},
|
9
15
|
}
|
16
|
+
@words = []
|
10
17
|
end
|
11
18
|
|
12
19
|
def assign_codepoint(codepoints, value, idx = @index[:SEQUENCES], combine: false)
|
@@ -25,6 +32,8 @@ module Unicoder
|
|
25
32
|
else
|
26
33
|
idx[key] = value
|
27
34
|
end
|
35
|
+
|
36
|
+
@words += value.split
|
28
37
|
end
|
29
38
|
|
30
39
|
def parse!
|
@@ -61,10 +70,28 @@ module Unicoder
|
|
61
70
|
assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, name
|
62
71
|
end
|
63
72
|
|
64
|
-
parse_file :emoji_zwj_sequences, :line, regex: /^(?<codepoints>.+?)\s*;.*?; (?<name>.+?)\s*#/ do |line|
|
73
|
+
parse_file :emoji_zwj_sequences, :line, regex: /^(?!#)(?<codepoints>.+?)\s*;.*?; (?<name>.+?)\s*#/ do |line|
|
65
74
|
name = line["name"].gsub(/\\x{(\h+)}/){ [$1.to_i(16)].pack("U") }.upcase
|
66
|
-
|
75
|
+
codepoints = line["codepoints"].split.map{|cp| cp.to_i(16) }
|
76
|
+
assign_codepoint codepoints, name
|
77
|
+
if codepoints.include?(0xFE0F)
|
78
|
+
# Build all combinations of VS16 present and missing
|
79
|
+
codepoints.slice_after(0xFE0F).reduce([[]]){|acc,cur|
|
80
|
+
if cur.include? 0xFE0F
|
81
|
+
acc.flat_map{|prev| [prev + (cur - [0xFE0F]), prev + cur] }
|
82
|
+
else
|
83
|
+
acc.map{|prev| prev + cur}
|
84
|
+
end
|
85
|
+
}.
|
86
|
+
select {|sub_codepoints| sub_codepoints != codepoints }.
|
87
|
+
each { |sub_codepoints|
|
88
|
+
assign_codepoint (sub_codepoints), name, @index[:SEQUENCES_NOT_QUALIFIED]
|
89
|
+
}
|
90
|
+
end
|
67
91
|
end
|
92
|
+
|
93
|
+
replace_common_words! :SEQUENCES, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
|
94
|
+
replace_common_words! :SEQUENCES_NOT_QUALIFIED, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
|
68
95
|
end
|
69
96
|
end
|
70
97
|
end
|
data/lib/unicoder/constants.rb
CHANGED
data/lib/unicoder/replace_common_words.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
require "json"
|
2
|
+
|
3
|
+
module Unicoder
  # Mixin for index builders that shortens stored names by swapping the most
  # frequent words for single placeholder characters.
  #
  # The including object must keep its data in an @index Hash. This module
  # reads @index[which_index] and writes @index[:REPLACE_BASE] and
  # @index[:COMMON_WORDS].
  module ReplaceCommonWords
    # Replaces every occurrence of "<word> " (a common word plus one trailing
    # space) inside the names of @index[which_index] with one placeholder
    # character. The names are mutated in place via String#gsub!.
    #
    # The placeholder for the n-th most common word (0-based) is the character
    # with codepoint base + n. The ordered word list and the base are stored in
    # @index[:COMMON_WORDS] and @index[:REPLACE_BASE].
    #
    # Matching is plain substring matching: a common word that is the tail of
    # a longer word followed by a space is replaced as well, and a word at the
    # very end of a name (no trailing space) is left untouched.
    #
    # @param which_index [Symbol] key of the @index entry whose values are the
    #   (mutable) name Strings to rewrite
    # @param words [Array<String>] all word occurrences; frequencies are
    #   derived from this list with tally
    # @param count [Integer] maximum number of words to replace
    # @param base [Integer] codepoint of the placeholder for the most common word
    # @param min_word_length [Integer] shorter words are never replaced
    # @return [Object] whatever @index[which_index].each returns
    def replace_common_words!(which_index, words, count = 500, base = ?[.ord, min_word_length = 4)
      puts "Starting to replace the #{count} most common words"
      @index[:REPLACE_BASE] = base
      @index[:COMMON_WORDS] = words.
        select { |word| word.size >= min_word_length }.
        tally.
        max_by(count) { |_word, occurrences| occurrences }.
        map(&:first)

      # The search pattern and its placeholder depend only on the word list,
      # so build each pair once instead of once per name.
      replacements = @index[:COMMON_WORDS].each_with_index.map { |word, position|
        ["#{word} ", [base + position].pack("U")]
      }

      @index[which_index].each { |_, name|
        # Order matters: more common words are substituted first.
        replacements.each { |pattern, placeholder| name.gsub!(pattern, placeholder) }
      }
    end
  end
end
|
data/lib/unicoder.rb
CHANGED
@@ -2,6 +2,7 @@ require_relative "unicoder/constants"
|
|
2
2
|
require_relative "unicoder/downloader"
|
3
3
|
require_relative "unicoder/builder"
|
4
4
|
require_relative "unicoder/multi_dimensional_array_builder"
|
5
|
+
require_relative "unicoder/replace_common_words"
|
5
6
|
|
6
7
|
if defined?(Rake)
|
7
8
|
Rake.add_rakelib(File.expand_path('../unicoder', __FILE__))
|
data/unicoder.gemspec
CHANGED
@@ -17,7 +17,7 @@ Gem::Specification.new do |gem|
|
|
17
17
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
18
18
|
gem.require_paths = ["lib"]
|
19
19
|
|
20
|
-
gem.required_ruby_version = ">=
|
20
|
+
gem.required_ruby_version = ">= 3.0", "< 4.0"
|
21
21
|
gem.add_dependency "rationalist", "~> 2.0"
|
22
22
|
gem.add_dependency "rubyzip", "~> 1.2"
|
23
23
|
gem.add_dependency "oga", "~> 2.9"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: unicoder
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jan Lelis
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-10-
|
11
|
+
date: 2024-10-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rationalist
|
@@ -85,6 +85,7 @@ files:
|
|
85
85
|
- lib/unicoder/constants.rb
|
86
86
|
- lib/unicoder/downloader.rb
|
87
87
|
- lib/unicoder/multi_dimensional_array_builder.rb
|
88
|
+
- lib/unicoder/replace_common_words.rb
|
88
89
|
- lib/unicoder/tasks.rake
|
89
90
|
- unicoder.gemspec
|
90
91
|
homepage: https://github.com/janlelis/unicoder
|
@@ -99,7 +100,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
99
100
|
requirements:
|
100
101
|
- - ">="
|
101
102
|
- !ruby/object:Gem::Version
|
102
|
-
version: '
|
103
|
+
version: '3.0'
|
103
104
|
- - "<"
|
104
105
|
- !ruby/object:Gem::Version
|
105
106
|
version: '4.0'
|