unicoder 1.0.0 → 1.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/Gemfile.lock +1 -1
- data/README.md +3 -2
- data/lib/unicoder/builders/blocks.rb +4 -2
- data/lib/unicoder/builders/name.rb +32 -6
- data/lib/unicoder/builders/scripts.rb +19 -2
- data/lib/unicoder/builders/sequence_name.rb +29 -2
- data/lib/unicoder/constants.rb +1 -1
- data/lib/unicoder/replace_common_words.rb +21 -0
- data/lib/unicoder.rb +1 -0
- data/unicoder.gemspec +1 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f531e7ea0b5d27ea2bcb05bba6ddd93e0b7325d3d097abaf9ed815cc11d9a197
|
4
|
+
data.tar.gz: 8c3d133fb02f3b3d516c9a07390f768509eb3b22f2ae2850ad134819b0390462
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 05bd755d93de557ca8786c740675e88847da99c31c07a1517040f51fc0e9846537eeb9317087d955f4cc65d4c6d2434cd83a5216ad21dba8f583edd1166eb10a
|
7
|
+
data.tar.gz: 1644c4b6dee2db05f6d7ec91a5250798045e358e3580e141327f39d8c150cce5ad7d687cce8ffca796ca1de728151bb0a29b9d8084ae7fe6b87ec4e080fb95cb
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,16 @@
|
|
1
1
|
## CHANGELOG
|
2
2
|
|
3
|
+
### 1.1.1
|
4
|
+
|
5
|
+
- Fix bug related to unsafe characters
|
6
|
+
- Fix squared CJK
|
7
|
+
- Small adjustments for scripts and blocks index builders
|
8
|
+
|
9
|
+
### 1.1.0
|
10
|
+
|
11
|
+
- Improve name index size: Support ranges
|
12
|
+
- Improve name index size: Replace common words
|
13
|
+
|
3
14
|
### 1.0.0
|
4
15
|
|
5
16
|
With the first 1.0 release, unicoder supports 10 indexes:
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -37,8 +37,9 @@ types | [unicode-types](https://github.com/janlelis/unicode-types)
|
|
37
37
|
|
38
38
|
Index Name | Module
|
39
39
|
--------------|----
|
40
|
-
|
41
|
-
|
40
|
+
name, sequence\_name, type | [unicode-name.js](https://github.com/janlelis/unicode-name.js)
|
41
|
+
numeric\_value| [unicode-number.js](https://github.com/janlelis/unicode-number.js)
|
42
|
+
scripts | [unicode-scripts.js](https://github.com/janlelis/unicode-scripts.js)
|
42
43
|
|
43
44
|
## MIT License
|
44
45
|
|
data/lib/unicoder/builders/blocks.rb
CHANGED
@@ -4,12 +4,14 @@ module Unicoder
|
|
4
4
|
include Builder
|
5
5
|
|
6
6
|
def initialize_index
|
7
|
-
@index = []
|
7
|
+
@index = {
|
8
|
+
BLOCKS: []
|
9
|
+
}
|
8
10
|
end
|
9
11
|
|
10
12
|
def parse!
|
11
13
|
parse_file :blocks, :line, regex: /^(?<from>\S+?)\.\.(?<to>\S+);\s(?<name>.+)$/ do |line|
|
12
|
-
@index << [line["from"].to_i(16), line["to"].to_i(16), line["name"]]
|
14
|
+
@index[:BLOCKS] << [line["from"].to_i(16), line["to"].to_i(16), line["name"]]
|
13
15
|
end
|
14
16
|
end
|
15
17
|
end
|
data/lib/unicoder/builders/name.rb
CHANGED
@@ -1,19 +1,34 @@
|
|
1
1
|
module Unicoder
|
2
2
|
module Builder
|
3
3
|
class Name
|
4
|
+
|
4
5
|
include Builder
|
6
|
+
include ReplaceCommonWords
|
5
7
|
|
6
8
|
JAMO_INITIAL = 4352
|
7
9
|
JAMO_MEDIAL = 4449
|
8
10
|
JAMO_FINAL = 4520
|
9
11
|
JAMO_END = 4697
|
10
12
|
|
13
|
+
CJK = "CJK UNIFIED IDEOGRAPH-"
|
14
|
+
TANGUT = "TANGUT IDEOGRAPH-"
|
15
|
+
|
16
|
+
REPLACE_COUNT = 500
|
17
|
+
REPLACE_BASE = ?[.ord
|
18
|
+
|
11
19
|
def initialize_index
|
12
20
|
@index = {
|
13
21
|
NAMES: {},
|
14
22
|
ALIASES: {},
|
15
|
-
|
16
|
-
|
23
|
+
# HANGUL: [],
|
24
|
+
CP_RANGES: {
|
25
|
+
CJK => [], # filled while parsing
|
26
|
+
TANGUT => [], # filled while parsing
|
27
|
+
"EGYPTIAN HIEROGLYPH-" => [[0x13460, 0x143FA]],
|
28
|
+
"KHITAN SMALL SCRIPT CHARACTER-" => [[0x18B00, 0x18CFF]],
|
29
|
+
"NUSHU CHARACTER-" => [[0x1B170, 0x1B2FB]],
|
30
|
+
"CJK COMPATIBILITY IDEOGRAPH-" => [[0x2F800, 0x2FA1D]],
|
31
|
+
},
|
17
32
|
# see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_Syllables_Area
|
18
33
|
JAMO: {
|
19
34
|
INITIAL: [],
|
@@ -21,6 +36,7 @@ module Unicoder
|
|
21
36
|
FINAL: [""],
|
22
37
|
},
|
23
38
|
}
|
39
|
+
@words = []
|
24
40
|
@range_start = nil
|
25
41
|
end
|
26
42
|
|
@@ -36,22 +52,32 @@ module Unicoder
|
|
36
52
|
if line["name"] =~ /First/
|
37
53
|
@range_start = line["codepoint"].to_i(16)
|
38
54
|
elsif line["name"] =~ /Last/ && @range_start
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
@index[:
|
55
|
+
case line["name"]
|
56
|
+
when /Hangul/
|
57
|
+
# currently not necessary
|
58
|
+
# @index[:HANGUL] << [@range_start, line["codepoint"].to_i(16)]
|
59
|
+
when /CJK/
|
60
|
+
@index[:CP_RANGES][CJK] << [@range_start, line["codepoint"].to_i(16)]
|
61
|
+
when /Tangut/
|
62
|
+
@index[:CP_RANGES][TANGUT] << [@range_start, line["codepoint"].to_i(16)]
|
43
63
|
else
|
44
64
|
# no name
|
65
|
+
warn "ignoring range: #{line["name"]}"
|
45
66
|
end
|
46
67
|
@range_start = nil
|
47
68
|
elsif line["name"] != "<control>"
|
48
69
|
raise ArgumentError, "inconsistent range found in data, don't know what to do"
|
49
70
|
end
|
71
|
+
elsif line["name"] =~ Regexp.union(@index[:CP_RANGES].keys.map{/^#{_1}/})
|
72
|
+
# ignore
|
50
73
|
else
|
51
74
|
assign :NAMES, line["codepoint"].to_i(16), line["name"]
|
75
|
+
@words += line["name"].split
|
52
76
|
end
|
53
77
|
end
|
54
78
|
|
79
|
+
replace_common_words! :NAMES, @words, REPLACE_COUNT, REPLACE_BASE
|
80
|
+
|
55
81
|
parse_file :name_aliases, :line, regex: /^(?<codepoint>.+?);(?<alias>.+?);(?<type>.*)$/ do |line|
|
56
82
|
@index[:ALIASES][get_key[line["codepoint"].to_i(16)]] ||= {}
|
57
83
|
@index[:ALIASES][get_key[line["codepoint"].to_i(16)]][line["type"].to_sym] ||= []
|
data/lib/unicoder/builders/scripts.rb
CHANGED
@@ -10,6 +10,12 @@ module Unicoder
|
|
10
10
|
SCRIPT_EXTENSIONS: {},
|
11
11
|
SCRIPT_ALIASES: {},
|
12
12
|
SCRIPT_NAMES: [],
|
13
|
+
OFFSETS: [
|
14
|
+
0x10000,
|
15
|
+
0x1000,
|
16
|
+
0x100,
|
17
|
+
0x10
|
18
|
+
],
|
13
19
|
}
|
14
20
|
@reverse_script_names = {}
|
15
21
|
@reverse_script_extension_names = {}
|
@@ -21,6 +27,17 @@ module Unicoder
|
|
21
27
|
}
|
22
28
|
end
|
23
29
|
|
30
|
+
# TODO refactor how multiple indexes are organized
|
31
|
+
def assign_classic(sub_index_name, codepoint, value)
|
32
|
+
idx = @index[sub_index_name]
|
33
|
+
|
34
|
+
if option =~ /charkeys/
|
35
|
+
idx[[codepoint].pack("U*")] = value
|
36
|
+
else
|
37
|
+
idx[codepoint] = value
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
24
41
|
def parse!
|
25
42
|
parse_file :property_value_aliases, :line, regex: /^sc ; (?<short>\S+?)\s*; (?<long>\S+?)(?:\s*; (?<short2>\S+))?$/ do |line|
|
26
43
|
@index[:SCRIPT_NAMES] << line["long"]
|
@@ -47,10 +64,10 @@ module Unicoder
|
|
47
64
|
parse_file :script_extensions, :line, regex: /^(?<from>\S+?)(\.\.(?<to>\S+))?\s+; (?<scripts>.+?) #.*$/ do |line|
|
48
65
|
if line["to"]
|
49
66
|
(line["from"].to_i(16)..line["to"].to_i(16)).each{ |codepoint|
|
50
|
-
|
67
|
+
assign_classic :SCRIPT_EXTENSIONS, codepoint, lookup_extension_names(line["scripts"])
|
51
68
|
}
|
52
69
|
else
|
53
|
-
|
70
|
+
assign_classic :SCRIPT_EXTENSIONS, line["from"].to_i(16), lookup_extension_names(line["scripts"])
|
54
71
|
end
|
55
72
|
end
|
56
73
|
end
|
data/lib/unicoder/builders/sequence_name.rb
CHANGED
@@ -2,11 +2,18 @@ module Unicoder
|
|
2
2
|
module Builder
|
3
3
|
class SequenceName
|
4
4
|
include Builder
|
5
|
+
include ReplaceCommonWords
|
6
|
+
|
7
|
+
REPLACE_COUNT = 100
|
8
|
+
REPLACE_BASE = ?{.ord
|
9
|
+
REPLACE_MIN_WORD_LENGTH = 3
|
5
10
|
|
6
11
|
def initialize_index
|
7
12
|
@index = {
|
8
13
|
SEQUENCES: {},
|
14
|
+
SEQUENCES_NOT_QUALIFIED: {},
|
9
15
|
}
|
16
|
+
@words = []
|
10
17
|
end
|
11
18
|
|
12
19
|
def assign_codepoint(codepoints, value, idx = @index[:SEQUENCES], combine: false)
|
@@ -25,6 +32,8 @@ module Unicoder
|
|
25
32
|
else
|
26
33
|
idx[key] = value
|
27
34
|
end
|
35
|
+
|
36
|
+
@words += value.split
|
28
37
|
end
|
29
38
|
|
30
39
|
def parse!
|
@@ -61,10 +70,28 @@ module Unicoder
|
|
61
70
|
assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, name
|
62
71
|
end
|
63
72
|
|
64
|
-
parse_file :emoji_zwj_sequences, :line, regex: /^(?<codepoints>.+?)\s*;.*?; (?<name>.+?)\s*#/ do |line|
|
73
|
+
parse_file :emoji_zwj_sequences, :line, regex: /^(?!#)(?<codepoints>.+?)\s*;.*?; (?<name>.+?)\s*#/ do |line|
|
65
74
|
name = line["name"].gsub(/\\x{(\h+)}/){ [$1.to_i(16)].pack("U") }.upcase
|
66
|
-
|
75
|
+
codepoints = line["codepoints"].split.map{|cp| cp.to_i(16) }
|
76
|
+
assign_codepoint codepoints, name
|
77
|
+
if codepoints.include?(0xFE0F)
|
78
|
+
# Build all combinations of VS16 present and missing
|
79
|
+
codepoints.slice_after(0xFE0F).reduce([[]]){|acc,cur|
|
80
|
+
if cur.include? 0xFE0F
|
81
|
+
acc.flat_map{|prev| [prev + (cur - [0xFE0F]), prev + cur] }
|
82
|
+
else
|
83
|
+
acc.map{|prev| prev + cur}
|
84
|
+
end
|
85
|
+
}.
|
86
|
+
select {|sub_codepoints| sub_codepoints != codepoints }.
|
87
|
+
each { |sub_codepoints|
|
88
|
+
assign_codepoint (sub_codepoints), name, @index[:SEQUENCES_NOT_QUALIFIED]
|
89
|
+
}
|
90
|
+
end
|
67
91
|
end
|
92
|
+
|
93
|
+
replace_common_words! :SEQUENCES, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
|
94
|
+
replace_common_words! :SEQUENCES_NOT_QUALIFIED, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
|
68
95
|
end
|
69
96
|
end
|
70
97
|
end
|
data/lib/unicoder/constants.rb
CHANGED
data/lib/unicoder/replace_common_words.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
require "json"
|
2
|
+
|
3
|
+
module Unicoder
  # Mixin for index builders that shrinks the names stored in a sub-index by
  # swapping frequently used words for single placeholder characters.
  #
  # The including object must provide an `@index` Hash that contains the
  # sub-index named by `which_index` (a Hash whose values are mutable Strings).
  module ReplaceCommonWords
    # Replaces the most common words of `words` inside every name stored in
    # `@index[which_index]`, mutating those name strings in place.
    #
    # Side effects on `@index`:
    # * `:REPLACE_BASE` - codepoint of the first placeholder character
    # * `:COMMON_WORDS` - the replaced words, most frequent first; the word at
    #   position i is represented by the character `REPLACE_BASE + i`
    #
    # Only occurrences that are directly followed by a space are replaced (the
    # space is swallowed by the placeholder), so a word at the very end of a
    # name is kept verbatim. Matching is plain substring matching.
    #
    # @param which_index [Symbol] key of the sub-index in `@index` to rewrite
    # @param words [Array<String>] word list used for the frequency count and
    #   for choosing the placeholder range
    # @param count [Integer] maximum number of words to replace
    # @param fallback_base [Integer] base codepoint used only when `words`
    #   contains no characters at all
    # @param min_word_length [Integer] words shorter than this are never replaced
    # @return [Hash] the rewritten sub-index `@index[which_index]`
    def replace_common_words!(which_index, words, count = 500, fallback_base = ?[.ord, min_word_length = 4)
      # Start one past the highest codepoint occurring in the given words, so
      # every placeholder lies above any character used by those words.
      highest_codepoint = words.join.codepoints.max
      base = highest_codepoint ? highest_codepoint + 1 : fallback_base

      puts "Starting to replace the #{count} most common words (replace base: #{base})"
      @index[:REPLACE_BASE] = base
      @index[:COMMON_WORDS] = words.
        select{ _1.size >= min_word_length }.
        tally.
        max_by(count){ _2 }.
        map(&:first)

      # Build the [search string, placeholder] pairs once instead of once per name.
      replacements = @index[:COMMON_WORDS].each_with_index.map{ |word, offset|
        ["#{word} ", [base + offset].pack("U")]
      }

      @index[which_index].each{ |_, name|
        replacements.each{ |search, placeholder| name.gsub!(search, placeholder) }
      }
    end
  end
end
|
data/lib/unicoder.rb
CHANGED
@@ -2,6 +2,7 @@ require_relative "unicoder/constants"
|
|
2
2
|
require_relative "unicoder/downloader"
|
3
3
|
require_relative "unicoder/builder"
|
4
4
|
require_relative "unicoder/multi_dimensional_array_builder"
|
5
|
+
require_relative "unicoder/replace_common_words"
|
5
6
|
|
6
7
|
if defined?(Rake)
|
7
8
|
Rake.add_rakelib(File.expand_path('../unicoder', __FILE__))
|
data/unicoder.gemspec
CHANGED
@@ -17,7 +17,7 @@ Gem::Specification.new do |gem|
|
|
17
17
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
18
18
|
gem.require_paths = ["lib"]
|
19
19
|
|
20
|
-
gem.required_ruby_version = ">=
|
20
|
+
gem.required_ruby_version = ">= 3.0", "< 4.0"
|
21
21
|
gem.add_dependency "rationalist", "~> 2.0"
|
22
22
|
gem.add_dependency "rubyzip", "~> 1.2"
|
23
23
|
gem.add_dependency "oga", "~> 2.9"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: unicoder
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jan Lelis
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-10-
|
11
|
+
date: 2024-10-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rationalist
|
@@ -85,6 +85,7 @@ files:
|
|
85
85
|
- lib/unicoder/constants.rb
|
86
86
|
- lib/unicoder/downloader.rb
|
87
87
|
- lib/unicoder/multi_dimensional_array_builder.rb
|
88
|
+
- lib/unicoder/replace_common_words.rb
|
88
89
|
- lib/unicoder/tasks.rake
|
89
90
|
- unicoder.gemspec
|
90
91
|
homepage: https://github.com/janlelis/unicoder
|
@@ -99,7 +100,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
99
100
|
requirements:
|
100
101
|
- - ">="
|
101
102
|
- !ruby/object:Gem::Version
|
102
|
-
version: '
|
103
|
+
version: '3.0'
|
103
104
|
- - "<"
|
104
105
|
- !ruby/object:Gem::Version
|
105
106
|
version: '4.0'
|