unicoder 1.0.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/Gemfile.lock +1 -1
- data/README.md +3 -2
- data/lib/unicoder/builders/blocks.rb +4 -2
- data/lib/unicoder/builders/name.rb +32 -6
- data/lib/unicoder/builders/scripts.rb +19 -2
- data/lib/unicoder/builders/sequence_name.rb +29 -2
- data/lib/unicoder/constants.rb +1 -1
- data/lib/unicoder/replace_common_words.rb +21 -0
- data/lib/unicoder.rb +1 -0
- data/unicoder.gemspec +1 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f531e7ea0b5d27ea2bcb05bba6ddd93e0b7325d3d097abaf9ed815cc11d9a197
|
4
|
+
data.tar.gz: 8c3d133fb02f3b3d516c9a07390f768509eb3b22f2ae2850ad134819b0390462
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 05bd755d93de557ca8786c740675e88847da99c31c07a1517040f51fc0e9846537eeb9317087d955f4cc65d4c6d2434cd83a5216ad21dba8f583edd1166eb10a
|
7
|
+
data.tar.gz: 1644c4b6dee2db05f6d7ec91a5250798045e358e3580e141327f39d8c150cce5ad7d687cce8ffca796ca1de728151bb0a29b9d8084ae7fe6b87ec4e080fb95cb
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,16 @@
|
|
1
1
|
## CHANGELOG
|
2
2
|
|
3
|
+
### 1.1.1
|
4
|
+
|
5
|
+
- Fix bug related to unsafe characters
|
6
|
+
- Fix squared CJK
|
7
|
+
- Small adjustments for scripts and blocks index builders
|
8
|
+
|
9
|
+
### 1.1.0
|
10
|
+
|
11
|
+
- Improve name index size: Support ranges
|
12
|
+
- Improve name index size: Replace common words
|
13
|
+
|
3
14
|
### 1.0.0
|
4
15
|
|
5
16
|
With the first 1.0 release, unicoder supports 10 indexes:
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -37,8 +37,9 @@ types | [unicode-types](https://github.com/janlelis/unicode-types)
|
|
37
37
|
|
38
38
|
Index Name | Module
|
39
39
|
--------------|----
|
40
|
-
|
41
|
-
|
40
|
+
name, sequence\_name, type | [unicode-name.js](https://github.com/janlelis/unicode-name.js)
|
41
|
+
numeric\_value| [unicode-number.js](https://github.com/janlelis/unicode-number.js)
|
42
|
+
scripts | [unicode-scripts.js](https://github.com/janlelis/unicode-scripts.js)
|
42
43
|
|
43
44
|
## MIT License
|
44
45
|
|
data/lib/unicoder/builders/blocks.rb
CHANGED
@@ -4,12 +4,14 @@ module Unicoder
|
|
4
4
|
include Builder
|
5
5
|
|
6
6
|
def initialize_index
|
7
|
-
@index = []
|
7
|
+
@index = {
|
8
|
+
BLOCKS: []
|
9
|
+
}
|
8
10
|
end
|
9
11
|
|
10
12
|
def parse!
|
11
13
|
parse_file :blocks, :line, regex: /^(?<from>\S+?)\.\.(?<to>\S+);\s(?<name>.+)$/ do |line|
|
12
|
-
@index << [line["from"].to_i(16), line["to"].to_i(16), line["name"]]
|
14
|
+
@index[:BLOCKS] << [line["from"].to_i(16), line["to"].to_i(16), line["name"]]
|
13
15
|
end
|
14
16
|
end
|
15
17
|
end
|
data/lib/unicoder/builders/name.rb
CHANGED
@@ -1,19 +1,34 @@
|
|
1
1
|
module Unicoder
|
2
2
|
module Builder
|
3
3
|
class Name
|
4
|
+
|
4
5
|
include Builder
|
6
|
+
include ReplaceCommonWords
|
5
7
|
|
6
8
|
JAMO_INITIAL = 4352
|
7
9
|
JAMO_MEDIAL = 4449
|
8
10
|
JAMO_FINAL = 4520
|
9
11
|
JAMO_END = 4697
|
10
12
|
|
13
|
+
CJK = "CJK UNIFIED IDEOGRAPH-"
|
14
|
+
TANGUT = "TANGUT IDEOGRAPH-"
|
15
|
+
|
16
|
+
REPLACE_COUNT = 500
|
17
|
+
REPLACE_BASE = ?[.ord
|
18
|
+
|
11
19
|
def initialize_index
|
12
20
|
@index = {
|
13
21
|
NAMES: {},
|
14
22
|
ALIASES: {},
|
15
|
-
|
16
|
-
|
23
|
+
# HANGUL: [],
|
24
|
+
CP_RANGES: {
|
25
|
+
CJK => [], # filled while parsing
|
26
|
+
TANGUT => [], # filled while parsing
|
27
|
+
"EGYPTIAN HIEROGLYPH-" => [[0x13460, 0x143FA]],
|
28
|
+
"KHITAN SMALL SCRIPT CHARACTER-" => [[0x18B00, 0x18CFF]],
|
29
|
+
"NUSHU CHARACTER-" => [[0x1B170, 0x1B2FB]],
|
30
|
+
"CJK COMPATIBILITY IDEOGRAPH-" => [[0x2F800, 0x2FA1D]],
|
31
|
+
},
|
17
32
|
# see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_Syllables_Area
|
18
33
|
JAMO: {
|
19
34
|
INITIAL: [],
|
@@ -21,6 +36,7 @@ module Unicoder
|
|
21
36
|
FINAL: [""],
|
22
37
|
},
|
23
38
|
}
|
39
|
+
@words = []
|
24
40
|
@range_start = nil
|
25
41
|
end
|
26
42
|
|
@@ -36,22 +52,32 @@ module Unicoder
|
|
36
52
|
if line["name"] =~ /First/
|
37
53
|
@range_start = line["codepoint"].to_i(16)
|
38
54
|
elsif line["name"] =~ /Last/ && @range_start
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
@index[:
|
55
|
+
case line["name"]
|
56
|
+
when /Hangul/
|
57
|
+
# currently not necessary
|
58
|
+
# @index[:HANGUL] << [@range_start, line["codepoint"].to_i(16)]
|
59
|
+
when /CJK/
|
60
|
+
@index[:CP_RANGES][CJK] << [@range_start, line["codepoint"].to_i(16)]
|
61
|
+
when /Tangut/
|
62
|
+
@index[:CP_RANGES][TANGUT] << [@range_start, line["codepoint"].to_i(16)]
|
43
63
|
else
|
44
64
|
# no name
|
65
|
+
warn "ignoring range: #{line["name"]}"
|
45
66
|
end
|
46
67
|
@range_start = nil
|
47
68
|
elsif line["name"] != "<control>"
|
48
69
|
raise ArgumentError, "inconsistent range found in data, don't know what to do"
|
49
70
|
end
|
71
|
+
elsif line["name"] =~ Regexp.union(@index[:CP_RANGES].keys.map{/^#{_1}/})
|
72
|
+
# ignore
|
50
73
|
else
|
51
74
|
assign :NAMES, line["codepoint"].to_i(16), line["name"]
|
75
|
+
@words += line["name"].split
|
52
76
|
end
|
53
77
|
end
|
54
78
|
|
79
|
+
replace_common_words! :NAMES, @words, REPLACE_COUNT, REPLACE_BASE
|
80
|
+
|
55
81
|
parse_file :name_aliases, :line, regex: /^(?<codepoint>.+?);(?<alias>.+?);(?<type>.*)$/ do |line|
|
56
82
|
@index[:ALIASES][get_key[line["codepoint"].to_i(16)]] ||= {}
|
57
83
|
@index[:ALIASES][get_key[line["codepoint"].to_i(16)]][line["type"].to_sym] ||= []
|
data/lib/unicoder/builders/scripts.rb
CHANGED
@@ -10,6 +10,12 @@ module Unicoder
|
|
10
10
|
SCRIPT_EXTENSIONS: {},
|
11
11
|
SCRIPT_ALIASES: {},
|
12
12
|
SCRIPT_NAMES: [],
|
13
|
+
OFFSETS: [
|
14
|
+
0x10000,
|
15
|
+
0x1000,
|
16
|
+
0x100,
|
17
|
+
0x10
|
18
|
+
],
|
13
19
|
}
|
14
20
|
@reverse_script_names = {}
|
15
21
|
@reverse_script_extension_names = {}
|
@@ -21,6 +27,17 @@ module Unicoder
|
|
21
27
|
}
|
22
28
|
end
|
23
29
|
|
30
|
+
# TODO refactor how multiple indexes are organized
|
31
|
+
def assign_classic(sub_index_name, codepoint, value)
|
32
|
+
idx = @index[sub_index_name]
|
33
|
+
|
34
|
+
if option =~ /charkeys/
|
35
|
+
idx[[codepoint].pack("U*")] = value
|
36
|
+
else
|
37
|
+
idx[codepoint] = value
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
24
41
|
def parse!
|
25
42
|
parse_file :property_value_aliases, :line, regex: /^sc ; (?<short>\S+?)\s*; (?<long>\S+?)(?:\s*; (?<short2>\S+))?$/ do |line|
|
26
43
|
@index[:SCRIPT_NAMES] << line["long"]
|
@@ -47,10 +64,10 @@ module Unicoder
|
|
47
64
|
parse_file :script_extensions, :line, regex: /^(?<from>\S+?)(\.\.(?<to>\S+))?\s+; (?<scripts>.+?) #.*$/ do |line|
|
48
65
|
if line["to"]
|
49
66
|
(line["from"].to_i(16)..line["to"].to_i(16)).each{ |codepoint|
|
50
|
-
|
67
|
+
assign_classic :SCRIPT_EXTENSIONS, codepoint, lookup_extension_names(line["scripts"])
|
51
68
|
}
|
52
69
|
else
|
53
|
-
|
70
|
+
assign_classic :SCRIPT_EXTENSIONS, line["from"].to_i(16), lookup_extension_names(line["scripts"])
|
54
71
|
end
|
55
72
|
end
|
56
73
|
end
|
data/lib/unicoder/builders/sequence_name.rb
CHANGED
@@ -2,11 +2,18 @@ module Unicoder
|
|
2
2
|
module Builder
|
3
3
|
class SequenceName
|
4
4
|
include Builder
|
5
|
+
include ReplaceCommonWords
|
6
|
+
|
7
|
+
REPLACE_COUNT = 100
|
8
|
+
REPLACE_BASE = ?{.ord
|
9
|
+
REPLACE_MIN_WORD_LENGTH = 3
|
5
10
|
|
6
11
|
def initialize_index
|
7
12
|
@index = {
|
8
13
|
SEQUENCES: {},
|
14
|
+
SEQUENCES_NOT_QUALIFIED: {},
|
9
15
|
}
|
16
|
+
@words = []
|
10
17
|
end
|
11
18
|
|
12
19
|
def assign_codepoint(codepoints, value, idx = @index[:SEQUENCES], combine: false)
|
@@ -25,6 +32,8 @@ module Unicoder
|
|
25
32
|
else
|
26
33
|
idx[key] = value
|
27
34
|
end
|
35
|
+
|
36
|
+
@words += value.split
|
28
37
|
end
|
29
38
|
|
30
39
|
def parse!
|
@@ -61,10 +70,28 @@ module Unicoder
|
|
61
70
|
assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, name
|
62
71
|
end
|
63
72
|
|
64
|
-
parse_file :emoji_zwj_sequences, :line, regex: /^(?<codepoints>.+?)\s*;.*?; (?<name>.+?)\s*#/ do |line|
|
73
|
+
parse_file :emoji_zwj_sequences, :line, regex: /^(?!#)(?<codepoints>.+?)\s*;.*?; (?<name>.+?)\s*#/ do |line|
|
65
74
|
name = line["name"].gsub(/\\x{(\h+)}/){ [$1.to_i(16)].pack("U") }.upcase
|
66
|
-
|
75
|
+
codepoints = line["codepoints"].split.map{|cp| cp.to_i(16) }
|
76
|
+
assign_codepoint codepoints, name
|
77
|
+
if codepoints.include?(0xFE0F)
|
78
|
+
# Build all combinations of VS16 present and missing
|
79
|
+
codepoints.slice_after(0xFE0F).reduce([[]]){|acc,cur|
|
80
|
+
if cur.include? 0xFE0F
|
81
|
+
acc.flat_map{|prev| [prev + (cur - [0xFE0F]), prev + cur] }
|
82
|
+
else
|
83
|
+
acc.map{|prev| prev + cur}
|
84
|
+
end
|
85
|
+
}.
|
86
|
+
select {|sub_codepoints| sub_codepoints != codepoints }.
|
87
|
+
each { |sub_codepoints|
|
88
|
+
assign_codepoint (sub_codepoints), name, @index[:SEQUENCES_NOT_QUALIFIED]
|
89
|
+
}
|
90
|
+
end
|
67
91
|
end
|
92
|
+
|
93
|
+
replace_common_words! :SEQUENCES, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
|
94
|
+
replace_common_words! :SEQUENCES_NOT_QUALIFIED, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
|
68
95
|
end
|
69
96
|
end
|
70
97
|
end
|
data/lib/unicoder/constants.rb
CHANGED
data/lib/unicoder/replace_common_words.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
require "json"
|
2
|
+
|
3
|
+
module Unicoder
|
4
|
+
module ReplaceCommonWords
|
5
|
+
def replace_common_words!(which_index, words, count = 500, _ = ?[.ord, min_word_length = 4)
|
6
|
+
base = @words.join.chars.max.ord + 1
|
7
|
+
puts "Starting to replace the #{count} most common words (replace base: #{base})"
|
8
|
+
@index[:REPLACE_BASE] = base
|
9
|
+
@index[:COMMON_WORDS] = words.
|
10
|
+
select{_1.size >= min_word_length}.
|
11
|
+
tally.
|
12
|
+
max_by(count){_2}.
|
13
|
+
map(&:first)
|
14
|
+
@index[which_index].each{|_, name|
|
15
|
+
@index[:COMMON_WORDS].each_with_index{|word, index|
|
16
|
+
name.gsub! word + " ", [base + index].pack("U")
|
17
|
+
}
|
18
|
+
}
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
data/lib/unicoder.rb
CHANGED
@@ -2,6 +2,7 @@ require_relative "unicoder/constants"
|
|
2
2
|
require_relative "unicoder/downloader"
|
3
3
|
require_relative "unicoder/builder"
|
4
4
|
require_relative "unicoder/multi_dimensional_array_builder"
|
5
|
+
require_relative "unicoder/replace_common_words"
|
5
6
|
|
6
7
|
if defined?(Rake)
|
7
8
|
Rake.add_rakelib(File.expand_path('../unicoder', __FILE__))
|
data/unicoder.gemspec
CHANGED
@@ -17,7 +17,7 @@ Gem::Specification.new do |gem|
|
|
17
17
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
18
18
|
gem.require_paths = ["lib"]
|
19
19
|
|
20
|
-
gem.required_ruby_version = ">=
|
20
|
+
gem.required_ruby_version = ">= 3.0", "< 4.0"
|
21
21
|
gem.add_dependency "rationalist", "~> 2.0"
|
22
22
|
gem.add_dependency "rubyzip", "~> 1.2"
|
23
23
|
gem.add_dependency "oga", "~> 2.9"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: unicoder
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jan Lelis
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-10-
|
11
|
+
date: 2024-10-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rationalist
|
@@ -85,6 +85,7 @@ files:
|
|
85
85
|
- lib/unicoder/constants.rb
|
86
86
|
- lib/unicoder/downloader.rb
|
87
87
|
- lib/unicoder/multi_dimensional_array_builder.rb
|
88
|
+
- lib/unicoder/replace_common_words.rb
|
88
89
|
- lib/unicoder/tasks.rake
|
89
90
|
- unicoder.gemspec
|
90
91
|
homepage: https://github.com/janlelis/unicoder
|
@@ -99,7 +100,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
99
100
|
requirements:
|
100
101
|
- - ">="
|
101
102
|
- !ruby/object:Gem::Version
|
102
|
-
version: '
|
103
|
+
version: '3.0'
|
103
104
|
- - "<"
|
104
105
|
- !ruby/object:Gem::Version
|
105
106
|
version: '4.0'
|