unicoder 0.1.0 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.gitignore +6 -1
- data/.travis.yml +13 -13
- data/CHANGELOG.md +24 -1
- data/Gemfile +2 -0
- data/Gemfile.lock +99 -0
- data/MIT-LICENSE.txt +1 -1
- data/README.md +35 -5
- data/bin/unicoder +1 -1
- data/lib/unicoder/builder.rb +77 -15
- data/lib/unicoder/builders/categories.rb +7 -12
- data/lib/unicoder/builders/display_width.rb +28 -7
- data/lib/unicoder/builders/emoji.rb +97 -0
- data/lib/unicoder/builders/name.rb +101 -0
- data/lib/unicoder/builders/numeric_value.rb +30 -0
- data/lib/unicoder/builders/sequence_name.rb +99 -0
- data/lib/unicoder/builders/types.rb +83 -0
- data/lib/unicoder/constants.rb +81 -16
- data/lib/unicoder/downloader.rb +54 -8
- data/lib/unicoder/multi_dimensional_array_builder.rb +24 -2
- data/lib/unicoder/replace_common_words.rb +20 -0
- data/lib/unicoder.rb +1 -0
- data/unicoder.gemspec +7 -5
- metadata +50 -26
- data/data/.keep +0 -0
- data/data/unicode/8.0.0/ucd/Blocks.txt +0 -298
- data/data/unicode/8.0.0/ucd/EastAsianWidth.txt +0 -2174
- data/data/unicode/8.0.0/ucd/NameAliases.txt +0 -554
- data/data/unicode/8.0.0/ucd/PropertyValueAliases.txt +0 -1420
- data/data/unicode/8.0.0/ucd/ScriptExtensions.txt +0 -454
- data/data/unicode/8.0.0/ucd/Scripts.txt +0 -2539
- data/data/unicode/8.0.0/ucd/UnicodeData.txt +0 -29215
- data/data/unicode/8.0.0/ucd/extracted/DerivedGeneralCategory.txt +0 -3789
- data/data/unicode/security/8.0.0/confusables.txt +0 -9274
- data/spec/unicoder_spec.rb +0 -9
@@ -0,0 +1,101 @@
|
|
1
|
+
module Unicoder
|
2
|
+
module Builder
|
3
|
+
class Name
|
4
|
+
|
5
|
+
include Builder
|
6
|
+
include ReplaceCommonWords
|
7
|
+
|
8
|
+
JAMO_INITIAL = 4352
|
9
|
+
JAMO_MEDIAL = 4449
|
10
|
+
JAMO_FINAL = 4520
|
11
|
+
JAMO_END = 4697
|
12
|
+
|
13
|
+
CJK = "CJK UNIFIED IDEOGRAPH-"
|
14
|
+
TANGUT = "TANGUT IDEOGRAPH-"
|
15
|
+
|
16
|
+
REPLACE_COUNT = 500
|
17
|
+
REPLACE_BASE = ?[.ord
|
18
|
+
|
19
|
+
# Resets the builder state for the name index.
#
# @index gets four sections:
#   NAMES     - filled by parse! with one entry per named codepoint
#   ALIASES   - filled by parse! from the name aliases file
#   CP_RANGES - name prefix => list of [first, last] codepoint pairs; the
#               CJK and TANGUT lists start empty and are filled while parsing
#   JAMO      - short names used for Hangul syllables, see
#               https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_Syllables_Area
# @words collects name words and @range_start tracks an open "First" range.
def initialize_index
  prefixed_ranges = {
    CJK    => [], # filled while parsing
    TANGUT => [], # filled while parsing
    "EGYPTIAN HIEROGLYPH-"           => [[0x13460, 0x143FA]],
    "KHITAN SMALL SCRIPT CHARACTER-" => [[0x18B00, 0x18CFF]],
    "NUSHU CHARACTER-"               => [[0x1B170, 0x1B2FB]],
    "CJK COMPATIBILITY IDEOGRAPH-"   => [[0x2F800, 0x2FA1D]],
  }

  # FINAL starts with an empty string entry, INITIAL and MEDIAL start empty
  jamo_short_names = { INITIAL: [], MEDIAL: [], FINAL: [""] }

  @index = {
    NAMES: {},
    ALIASES: {},
    CP_RANGES: prefixed_ranges,
    JAMO: jamo_short_names,
  }
  @words = []
  @range_start = nil
end
|
42
|
+
|
43
|
+
# Builds the name index from three data files.
#
# 1. :unicode_data - every line with a regular name is stored under NAMES
#    and its words are collected in @words. Bracketed pseudo names
#    ("<..., First>" / "<..., Last>") describe ranges: CJK and Tangut ranges
#    are appended to CP_RANGES, Hangul ranges are skipped, other ranges are
#    reported with a warning. Names starting with one of the CP_RANGES
#    prefixes are not stored individually.
# 2. :name_aliases - aliases are grouped per codepoint and alias type.
# 3. :jamo         - short names are sorted into INITIAL / MEDIAL / FINAL
#    by codepoint.
#
# Between step 1 and 2 the most common words in NAMES are replaced via
# replace_common_words!.
#
# NOTE(review): parse_file, assign and option are defined outside this
# block; this code only relies on parse_file yielding an object that
# supports line["capture_name"].
#
# Raises ArgumentError when a bracketed name is neither a range marker nor
# "<control>".
def parse!
  # ALIASES is keyed by character string or by integer codepoint,
  # depending on the builder option
  get_key =
    if option =~ /charkeys/
      ->(codepoint) { [codepoint].pack("U*") }
    else
      ->(codepoint) { codepoint }
    end

  # The CP_RANGES keys never change while parsing (only their range lists
  # grow), so the prefix matcher is built once instead of once per line
  range_prefixes = Regexp.union(@index[:CP_RANGES].keys)

  parse_file :unicode_data, :line, regex: /^(?<codepoint>.+?);(?<name>.+?);.*$/ do |line|
    name = line["name"]
    codepoint = line["codepoint"].to_i(16)

    if name[0] == "<" && name[-1] == ">"
      if name =~ /First/
        @range_start = codepoint
      elsif name =~ /Last/ && @range_start
        case name
        when /Hangul/
          # currently not necessary
          # @index[:HANGUL] << [@range_start, codepoint]
        when /CJK/
          @index[:CP_RANGES][CJK] << [@range_start, codepoint]
        when /Tangut/
          @index[:CP_RANGES][TANGUT] << [@range_start, codepoint]
        else
          # no name
          warn "ignoring range: #{name}"
        end
        @range_start = nil
      elsif name != "<control>"
        raise ArgumentError, "inconsistent range found in data, don't know what to do"
      end
    elsif name =~ range_prefixes
      # ignore, these names are derived from CP_RANGES
    else
      assign :NAMES, codepoint, name
      # concat appends in place; `+=` would copy the whole list per line
      @words.concat(name.split)
    end
  end

  replace_common_words! :NAMES, @words, REPLACE_COUNT, REPLACE_BASE

  parse_file :name_aliases, :line, regex: /^(?<codepoint>.+?);(?<alias>.+?);(?<type>.*)$/ do |line|
    key = get_key[line["codepoint"].to_i(16)]
    aliases_by_type = (@index[:ALIASES][key] ||= {})
    (aliases_by_type[line["type"].to_sym] ||= []) << line["alias"]
  end

  parse_file :jamo, :line, regex: /^(?<codepoint>.+?); (?<short_name>.*?) +#.*$/ do |line|
    case line["codepoint"].to_i(16)
    when JAMO_INITIAL...JAMO_MEDIAL
      @index[:JAMO][:INITIAL] << line["short_name"]
    when JAMO_MEDIAL...JAMO_FINAL
      @index[:JAMO][:MEDIAL] << line["short_name"]
    when JAMO_FINAL..JAMO_END
      @index[:JAMO][:FINAL] << line["short_name"]
    end
  end
end
|
98
|
+
end
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Unicoder
|
2
|
+
module Builder
|
3
|
+
class NumericValue
|
4
|
+
include Builder
|
5
|
+
|
6
|
+
# Starts with an empty NUMBERS section, which parse! fills with one numeric
# value per codepoint.
def initialize_index
  numbers = {}
  @index = { NUMBERS: numbers }
end
|
11
|
+
|
12
|
+
# Fills NUMBERS from two files.
#
# :unicode_data lines contribute the field captured as "value" (skipped
# when empty): values containing a slash are stored as Rational, or kept as
# a string when the builder option matches /stringfractions/; all other
# values are stored as Integer.
# :unihan_numeric_values lines contribute their last column as Integer.
def parse!
  parse_file :unicode_data, :line, regex: /^(?<codepoint>.+?);(.*?;){7}(?<value>.*?);.*$/ do |line|
    raw = line["value"]
    next if raw.empty?

    codepoint = line["codepoint"].to_i(16)
    number =
      if raw =~ %r</>
        # fractions such as "1/2"
        option =~ /stringfractions/ ? raw.to_s : raw.to_r
      else
        raw.to_i
      end

    assign :NUMBERS, codepoint, number
  end

  parse_file :unihan_numeric_values, :line, regex: /^U\+(?<codepoint>\S+)\s+\S+\s+(?<value>\S+)$/ do |line|
    codepoint = line["codepoint"].to_i(16)
    assign :NUMBERS, codepoint, line["value"].to_i
  end
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -0,0 +1,99 @@
|
|
1
|
+
module Unicoder
|
2
|
+
module Builder
|
3
|
+
class SequenceName
|
4
|
+
include Builder
|
5
|
+
include ReplaceCommonWords
|
6
|
+
|
7
|
+
REPLACE_COUNT = 100
|
8
|
+
REPLACE_BASE = ?{.ord
|
9
|
+
REPLACE_MIN_WORD_LENGTH = 3
|
10
|
+
|
11
|
+
# Resets the builder state: two empty name tables (fully qualified
# sequences and not-qualified variants) plus the list of collected words.
def initialize_index
  @index = %i[SEQUENCES SEQUENCES_NOT_QUALIFIED].to_h { |section| [section, {}] }
  @words = []
end
|
18
|
+
|
19
|
+
# Records +value+ as the name of the sequence +codepoints+ in +idx+.
#
# codepoints - Array of Integer codepoints forming the sequence
# value      - name String for the sequence
# idx        - Hash to store into (defaults to the SEQUENCES section)
# combine:   - when the sequence already has a name, append the new name
#              after " / " instead of dropping it
#
# The hash key is the sequence packed into a string when the builder
# option matches /charkeys/, otherwise the codepoint array itself.
# The words of +value+ are always added to @words, even when the name
# itself is dropped.
def assign_codepoint(codepoints, value, idx = @index[:SEQUENCES], combine: false)
  key = option =~ /charkeys/ ? codepoints.pack("U*") : codepoints

  # The lookup has to use the same key that is stored. Checking the raw
  # codepoint array never matched in charkeys mode, so names were
  # overwritten instead of combined or kept.
  if idx.key?(key)
    idx[key] << " / #{value}" if combine
    # without combine: the first name wins, the new one is ignored
  else
    idx[key] = value
  end

  @words.concat(value.split)
end
|
38
|
+
|
39
|
+
# Collects sequence names from the named sequences, standardized variants,
# IVD and emoji sequence files, then replaces the most common words in both
# name tables.
#
# For emoji ZWJ sequences containing U+FE0F, every variant of the sequence
# with some or all U+FE0F removed is additionally stored in
# SEQUENCES_NOT_QUALIFIED under the same name.
def parse!
  # "0041 FE00" => [0x41, 0xFE00]
  to_codepoints = ->(field) { field.split.map { |cp| cp.to_i(16) } }
  # turns \x{23} style escapes into the character itself, then upcases the name
  emoji_name = ->(text) { text.gsub(/\\x{(\h+)}/) { [$1.to_i(16)].pack("U") }.upcase }

  %i[named_sequences named_sequences_prov].each do |file|
    parse_file file, :line, regex: /^(?!#)(?<name>.+?);(?<codepoints>.+?)$/ do |line|
      assign_codepoint to_codepoints[line["codepoints"]], line["name"]
    end
  end

  parse_file :standardized_variants, :line, regex: /^(?<codepoints>.+?);\s*(?<variant>.+?)\s*;\s*(?<context>.*?)\s*# (?<name>.+)$/ do |line|
    context = line["context"]
    name = "#{line["name"].strip} (#{line["variant"]})"
    name << " [#{context}]" if context && !context.empty?
    assign_codepoint to_codepoints[line["codepoints"]], name, combine: true
  end

  parse_file :standardized_variants, :line, regex: /^(?<codepoints>.+?); (?<name>.+?)\s*;$/ do |line|
    assign_codepoint to_codepoints[line["codepoints"]], line["name"]
  end

  parse_file :ivd_sequences, :line, regex: /^(?<codepoints>.+?);.*?; (?<name>.+?)$/ do |line|
    assign_codepoint to_codepoints[line["codepoints"]], line["name"], combine: true
  end

  parse_file :emoji_variation_sequences, :line, regex: /^(?<codepoints>.+?)\s*;\s*(?<variant>.+?)\s*;\s*# \(.*\)\s*(?<name>.+?)\s*$/ do |line|
    assign_codepoint to_codepoints[line["codepoints"]], "#{line["name"].strip} (#{line["variant"]})"
  end

  parse_file :emoji_sequences, :line, regex: /^(?<codepoints>.+?)\s*;\s*(?<type>.+?)\s*; (?<name>.+?)\s*#/ do |line|
    next if line["type"] == "Basic_Emoji"

    assign_codepoint to_codepoints[line["codepoints"]], emoji_name[line["name"]]
  end

  parse_file :emoji_zwj_sequences, :line, regex: /^(?!#)(?<codepoints>.+?)\s*;.*?; (?<name>.+?)\s*#/ do |line|
    name = emoji_name[line["name"]]
    codepoints = to_codepoints[line["codepoints"]]
    assign_codepoint codepoints, name
    next unless codepoints.include?(0xFE0F)

    # Build all combinations of VS16 present and missing: each chunk ends
    # right after a VS16, so every chunk doubles the list of partial
    # sequences (once without, once with its VS16)
    variants = codepoints.slice_after(0xFE0F).reduce([[]]) do |partials, chunk|
      if chunk.include?(0xFE0F)
        partials.flat_map { |partial| [partial + (chunk - [0xFE0F]), partial + chunk] }
      else
        partials.map { |partial| partial + chunk }
      end
    end

    variants.each do |variant|
      next if variant == codepoints # the fully qualified form is stored above

      assign_codepoint(variant, name, @index[:SEQUENCES_NOT_QUALIFIED])
    end
  end

  %i[SEQUENCES SEQUENCES_NOT_QUALIFIED].each do |section|
    replace_common_words! section, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
  end
end
|
96
|
+
end
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
@@ -0,0 +1,83 @@
|
|
1
|
+
module Unicoder
|
2
|
+
module Builder
|
3
|
+
class Types
|
4
|
+
include Builder
|
5
|
+
include MultiDimensionalArrayBuilder
|
6
|
+
|
7
|
+
NONCHARACTERS = [
|
8
|
+
*0xFDD0..0xFDEF,
|
9
|
+
0xFFFE, 0xFFFF,
|
10
|
+
0x1FFFE, 0x1FFFF,
|
11
|
+
0x2FFFE, 0x2FFFF,
|
12
|
+
0x3FFFE, 0x3FFFF,
|
13
|
+
0x4FFFE, 0x4FFFF,
|
14
|
+
0x5FFFE, 0x5FFFF,
|
15
|
+
0x6FFFE, 0x6FFFF,
|
16
|
+
0x7FFFE, 0x7FFFF,
|
17
|
+
0x8FFFE, 0x8FFFF,
|
18
|
+
0x9FFFE, 0x9FFFF,
|
19
|
+
0xAFFFE, 0xAFFFF,
|
20
|
+
0xBFFFE, 0xBFFFF,
|
21
|
+
0xCFFFE, 0xCFFFF,
|
22
|
+
0xDFFFE, 0xDFFFF,
|
23
|
+
0xEFFFE, 0xEFFFF,
|
24
|
+
0xFFFFE, 0xFFFFF,
|
25
|
+
0x10FFFE, 0x10FFFF,
|
26
|
+
]
|
27
|
+
|
28
|
+
# Resets the index: an empty TYPES table, the type names (the numeric types
# assigned in parse! are positions in this list) and the OFFSETS list.
def initialize_index
  type_names = [
    "Graphic",
    "Format",
    "Control",
    "Private-use",
    "Surrogate",
    "Noncharacter",
    "Reserved",
  ]
  offsets = [0x10000, 0x1000, 0x100, 0x10]

  @index = { TYPES: [], TYPE_NAMES: type_names, OFFSETS: offsets }
end
|
48
|
+
|
49
|
+
# Assigns a numeric type to every codepoint listed in the general
# categories file and compresses the TYPES table afterwards.
#
# Category to type: Cf/Zl/Zp => 1, Cc => 2, Co => 3, Cs => 4,
# Cn => 5 for listed noncharacters and 6 otherwise. Every other category is
# assigned nil.
def parse!
  fixed_types = {
    "Cf" => 1, "Zl" => 1, "Zp" => 1,
    "Cc" => 2,
    "Co" => 3,
    "Cs" => 4,
  }

  parse_file :general_categories, :line, regex: /^(?<from>[^. ]+)(?:..(?<to>\S+))?\s*; (?<category>\S+).*$/ do |line|
    first = line["from"].to_i(16)
    # a line covers either a "from..to" range or a single codepoint
    codepoints = line["to"] ? Range.new(first, line["to"].to_i(16)) : [first]
    category = line["category"]

    codepoints.each do |codepoint|
      type =
        if category == "Cn"
          NONCHARACTERS.include?(codepoint) ? 5 : 6
        else
          fixed_types[category]
        end

      assign :TYPES, codepoint, type
    end
  end

  4.times { compress! @index[:TYPES] }
end
|
81
|
+
end
|
82
|
+
end
|
83
|
+
end
|
data/lib/unicoder/constants.rb
CHANGED
@@ -1,29 +1,94 @@
|
|
1
|
-
|
2
|
-
VERSION = "0.1.0".freeze
|
1
|
+
# frozen_string_literal: true
|
3
2
|
|
4
|
-
|
3
|
+
module Unicoder
|
4
|
+
VERSION = "1.1.0"
|
5
5
|
|
6
6
|
UNICODE_VERSIONS = %w[
|
7
|
-
|
8
|
-
|
9
|
-
|
7
|
+
16.0.0
|
8
|
+
15.1.0
|
9
|
+
15.0.0
|
10
|
+
14.0.0
|
11
|
+
13.0.0
|
12
|
+
12.1.0
|
13
|
+
12.0.0
|
14
|
+
11.0.0
|
15
|
+
10.0.0
|
10
16
|
9.0.0
|
17
|
+
8.0.0
|
18
|
+
7.0.0
|
19
|
+
6.3.0
|
11
20
|
].freeze
|
12
21
|
|
13
|
-
|
22
|
+
CURRENT_UNICODE_VERSION = UNICODE_VERSIONS.first
|
23
|
+
|
24
|
+
EMOJI_VERSIONS = %w[
|
25
|
+
16.0
|
26
|
+
15.1
|
27
|
+
15.0
|
28
|
+
14.0
|
29
|
+
13.1
|
30
|
+
13.0
|
31
|
+
12.1
|
32
|
+
12.0
|
33
|
+
11.0
|
34
|
+
5.0
|
35
|
+
4.0
|
36
|
+
3.0
|
37
|
+
2.0
|
38
|
+
].freeze
|
39
|
+
|
40
|
+
EMOJI_RELATED_UNICODE_VERSIONS = {
|
41
|
+
"16.0" => "16.0.0",
|
42
|
+
"15.1" => "15.1.0",
|
43
|
+
"15.0" => "15.0.0",
|
44
|
+
"14.0" => "14.0.0",
|
45
|
+
"13.1" => "13.0.0",
|
46
|
+
"13.0" => "13.0.0",
|
47
|
+
"12.1" => "12.1.0",
|
48
|
+
"12.0" => "12.0.0",
|
49
|
+
"11.0" => "11.0.0",
|
50
|
+
"5.0" => "10.0.0",
|
51
|
+
"4.0" => "9.0.0",
|
52
|
+
"3.0" => "9.0.0",
|
53
|
+
"2.0" => "8.0.0",
|
54
|
+
}.freeze
|
55
|
+
|
56
|
+
CURRENT_EMOJI_VERSION = EMOJI_VERSIONS.first
|
57
|
+
|
58
|
+
IVD_VERSION = "2022-09-13"
|
59
|
+
|
60
|
+
CLDR_VERSION = "45"
|
61
|
+
|
62
|
+
UNICODE_DATA_ENDPOINT = "ftp://ftp.unicode.org/Public"
|
14
63
|
|
15
64
|
LOCAL_DATA_DIRECTORY = File.expand_path(File.dirname(__FILE__) + "/../../data/unicode").freeze
|
16
65
|
|
17
66
|
UNICODE_FILES = {
|
18
|
-
east_asian_width:
|
19
|
-
unicode_data:
|
20
|
-
name_aliases:
|
21
|
-
confusables:
|
22
|
-
blocks:
|
23
|
-
scripts:
|
24
|
-
script_extensions:
|
25
|
-
property_value_aliases:
|
26
|
-
general_categories:
|
67
|
+
east_asian_width: "/UNICODE_VERSION/ucd/EastAsianWidth.txt",
|
68
|
+
unicode_data: "/UNICODE_VERSION/ucd/UnicodeData.txt",
|
69
|
+
name_aliases: "/UNICODE_VERSION/ucd/NameAliases.txt",
|
70
|
+
confusables: "/security/UNICODE_VERSION/confusables.txt",
|
71
|
+
blocks: "/UNICODE_VERSION/ucd/Blocks.txt",
|
72
|
+
scripts: "/UNICODE_VERSION/ucd/Scripts.txt",
|
73
|
+
script_extensions: "/UNICODE_VERSION/ucd/ScriptExtensions.txt",
|
74
|
+
property_value_aliases: "/UNICODE_VERSION/ucd/PropertyValueAliases.txt",
|
75
|
+
general_categories: "/UNICODE_VERSION/ucd/extracted/DerivedGeneralCategory.txt",
|
76
|
+
unihan_numeric_values: "/UNICODE_VERSION/ucd/Unihan.zip/Unihan_NumericValues.txt",
|
77
|
+
jamo: "/UNICODE_VERSION/ucd/Jamo.txt",
|
78
|
+
named_sequences: "/UNICODE_VERSION/ucd/NamedSequences.txt",
|
79
|
+
named_sequences_prov: "/UNICODE_VERSION/ucd/NamedSequencesProv.txt",
|
80
|
+
standardized_variants: "/UNICODE_VERSION/ucd/StandardizedVariants.txt",
|
81
|
+
ivd_sequences: "https://www.unicode.org/ivd/data/#{IVD_VERSION}/IVD_Sequences.txt",
|
82
|
+
# emoji_data: "/EMOJI_VERSION/ucd/emoji/",
|
83
|
+
emoji_data: "/EMOJI_RELATED_VERSION/ucd/emoji/emoji-data.txt",
|
84
|
+
emoji_sequences: "/emoji/EMOJI_VERSION/emoji-sequences.txt",
|
85
|
+
# emoji_variation_sequences: "/emoji/EMOJI_VERSION/emoji-variation-sequences.txt",
|
86
|
+
emoji_variation_sequences: "/EMOJI_RELATED_VERSION/ucd/emoji/emoji-variation-sequences.txt",
|
87
|
+
emoji_zwj_sequences: "/emoji/EMOJI_VERSION/emoji-zwj-sequences.txt",
|
88
|
+
emoji_test: "/emoji/EMOJI_VERSION/emoji-test.txt",
|
89
|
+
# valid_subdivisions: "https://www.unicode.org/repos/cldr/tags/release-#{CLDR_VERSION}/common/validity/subdivision.xml",
|
90
|
+
valid_subdivisions: "https://raw.githubusercontent.com/unicode-org/cldr/release-#{CLDR_VERSION}/common/validity/subdivision.xml",
|
91
|
+
# ""
|
27
92
|
}
|
28
93
|
end
|
29
94
|
|
data/lib/unicoder/downloader.rb
CHANGED
@@ -1,28 +1,74 @@
|
|
1
1
|
require "open-uri"
|
2
2
|
require "fileutils"
|
3
|
+
require "zip"
|
3
4
|
|
4
5
|
module Unicoder
|
5
6
|
module Downloader
|
6
7
|
# Downloads one data file unless it is already present locally.
#
# identifier            - key into UNICODE_FILES; when unknown, +filename+
#                         is used instead
# unicode_version:      - replaces the UNICODE_VERSION placeholder
# emoji_version:        - replaces the EMOJI_VERSION placeholder and selects
#                         the value for EMOJI_RELATED_VERSION
# destination_directory - local root directory for downloaded files
# destination:          - explicit local path (derived from the file name
#                         when nil)
# filename:             - path template or full URL, used only when
#                         +identifier+ is not in UNICODE_FILES
#
# Templates starting with http(s):// or ftp:// are fetched as is, all
# others are appended to UNICODE_DATA_ENDPOINT. A path of the form
# "archive.zip/member" downloads archive.zip and extracts just that member
# next to it.
#
# Every StandardError (including the ArgumentError raised here) is caught
# and reported on stderr instead of being propagated.
def self.fetch(identifier,
  unicode_version: CURRENT_UNICODE_VERSION,
  emoji_version: CURRENT_EMOJI_VERSION,
  destination_directory: LOCAL_DATA_DIRECTORY,
  destination: nil,
  filename: nil
)
  filename = UNICODE_FILES[identifier.to_sym] || filename
  raise ArgumentError, "No valid file identifier or filename given" if !filename

  # work on a copy, the template must stay untouched
  filename = filename.dup
  filename.sub! 'UNICODE_VERSION', unicode_version
  filename.sub! 'EMOJI_VERSION', emoji_version
  # Only look up the related Unicode version when the template needs it:
  # sub! with a nil replacement raises TypeError, which made every download
  # fail for emoji versions missing from the mapping.
  if filename.include?('EMOJI_RELATED_VERSION')
    related_version = EMOJI_RELATED_UNICODE_VERSIONS[emoji_version]
    unless related_version
      raise ArgumentError, "No Unicode version known for Emoji version #{emoji_version}"
    end
    filename.sub! 'EMOJI_RELATED_VERSION', related_version
  end

  if filename =~ /\A(https?|ftp):\/\//
    source = filename
    # strip "scheme:/" so the host becomes the first local directory
    destination ||= destination_directory + filename.sub(/\A(https?|ftp):\//, "")
  else
    source = UNICODE_DATA_ENDPOINT + filename
    destination ||= destination_directory + filename
  end

  puts "GET #{source} => #{destination}"

  # "outer.zip/inner": download the zip file, then unzip the inner file.
  # The dot is escaped so only a real ".zip" path segment qualifies.
  zip_member = %r[^(?<outer_path>.*)\.zip/(?<inner_path>.*)$]
  zip = false
  if (source_parts = source.match(zip_member))
    destination_parts = destination.match(zip_member)
    raise "unicoder bug" unless destination_parts

    zip = true
    source = source_parts[:outer_path] + ".zip"
    inner_zip_filename = source_parts[:inner_path]
    destination = destination_parts[:outer_path] + ".zip"
    destination_files = destination_parts[:outer_path]
  end

  if File.exist?(destination)
    puts "Skipping download of #{source} (already exists)"
  else
    URI.open(source){ |f|
      FileUtils.mkdir_p(File.dirname(destination))
      File.write(destination, f.read)
    }
  end

  if zip
    unzip(destination, [inner_zip_filename], destination_files)
  end
rescue => e
  $stderr.puts "#{e.class}: #{e.message}"
end
|
60
|
+
|
61
|
+
# Extracts the entries of +archive+ whose names are listed in +files+ into
# +destination_dir+ (created on demand), printing each extracted name.
#
# archive         - path of the zip file
# files           - Array of entry names to extract
# destination_dir - directory the entries are written to
def self.unzip(archive, files, destination_dir)
  Zip::File.open(archive) do |zip|
    zip.each do |entry|
      next unless files.include?(entry.name)

      FileUtils.mkdir_p(destination_dir)
      puts "Extract #{entry.name}"
      entry.extract(destination_dir + "/#{entry.name}")
    end
  end
end
|
27
73
|
end
|
28
74
|
end
|
@@ -59,6 +59,28 @@ module Unicoder
|
|
59
59
|
end
|
60
60
|
}
|
61
61
|
end
|
62
|
-
|
62
|
+
|
63
|
+
# Removes trailing nil entries from the nested arrays of +index+, in place.
#
# Each element of +index+ and up to three further levels of arrays nested
# inside it are trimmed; non-array elements are left alone, and +index+
# itself is not trimmed. Returns +index+.
def remove_trailing_nils!(index = @index)
  trim = lambda do |node, levels_below|
    return unless node.is_a?(Array)

    # The emptiness check is required: [][-1] is nil and [].pop does
    # nothing, so an empty or all-nil array would otherwise loop forever.
    node.pop while !node.empty? && node.last.nil?
    node.each { |child| trim.call(child, levels_below - 1) } if levels_below > 0
  end

  index.each { |plane| trim.call(plane, 3) }
end
|
63
85
|
end
|
64
|
-
end
|
86
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require "json"
|
2
|
+
|
3
|
+
module Unicoder
|
4
|
+
module ReplaceCommonWords
|
5
|
+
# Replaces the most frequent words in the names of one index section by
# single characters, in place.
#
# which_index     - key of the @index section whose names are rewritten
# words           - list of all words, used to count frequencies
# count           - how many of the most common words get replaced
# base            - codepoint of the character standing in for the most
#                   common word; the next words use the following codepoints
# min_word_length - shorter words are never replaced
#
# The chosen words are stored under COMMON_WORDS (most common first) and
# +base+ under REPLACE_BASE. Each occurrence of a word followed by a space
# is replaced, the space included.
def replace_common_words!(which_index, words, count = 500, base = ?[.ord, min_word_length = 4)
  puts "Starting to replace the #{count} most common words"
  @index[:REPLACE_BASE] = base

  frequencies = words.select { |word| word.size >= min_word_length }.tally
  most_common = frequencies.max_by(count) { |_word, occurrences| occurrences }
  @index[:COMMON_WORDS] = most_common.map(&:first)

  @index[which_index].each do |_key, name|
    @index[:COMMON_WORDS].each_with_index do |word, position|
      placeholder = [base + position].pack("U")
      name.gsub! word + " ", placeholder
    end
  end
end
|
19
|
+
end
|
20
|
+
end
|
data/lib/unicoder.rb
CHANGED
@@ -2,6 +2,7 @@ require_relative "unicoder/constants"
|
|
2
2
|
require_relative "unicoder/downloader"
|
3
3
|
require_relative "unicoder/builder"
|
4
4
|
require_relative "unicoder/multi_dimensional_array_builder"
|
5
|
+
require_relative "unicoder/replace_common_words"
|
5
6
|
|
6
7
|
if defined?(Rake)
|
7
8
|
Rake.add_rakelib(File.expand_path('../unicoder', __FILE__))
|
data/unicoder.gemspec
CHANGED
@@ -5,18 +5,20 @@ require File.dirname(__FILE__) + "/lib/unicoder/constants"
|
|
5
5
|
Gem::Specification.new do |gem|
|
6
6
|
gem.name = "unicoder"
|
7
7
|
gem.version = Unicoder::VERSION
|
8
|
-
gem.summary = "
|
9
|
-
gem.description = "
|
8
|
+
gem.summary = "Creates specialized indexes for Unicode data lookup"
|
9
|
+
gem.description = "Generates specialized indexes for Unicode data lookup"
|
10
10
|
gem.authors = ["Jan Lelis"]
|
11
|
-
gem.email = ["
|
11
|
+
gem.email = ["hi@ruby.consulting"]
|
12
12
|
gem.homepage = "https://github.com/janlelis/unicoder"
|
13
13
|
gem.license = "MIT"
|
14
14
|
|
15
|
-
gem.files = Dir["{**/}{.*,*}"].select{ |path| File.file?(path) && path !~ /^pkg/ }
|
15
|
+
gem.files = Dir["{**/}{.*,*}"].select{ |path| File.file?(path) && path !~ /^(pkg|data)/ && path !~ /(marshal|mjs|json)(.gz)?$/ }
|
16
16
|
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
17
17
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
18
18
|
gem.require_paths = ["lib"]
|
19
19
|
|
20
|
-
gem.required_ruby_version = "
|
20
|
+
gem.required_ruby_version = ">= 3.0", "< 4.0"
|
21
21
|
gem.add_dependency "rationalist", "~> 2.0"
|
22
|
+
gem.add_dependency "rubyzip", "~> 1.2"
|
23
|
+
gem.add_dependency "oga", "~> 2.9"
|
22
24
|
end
|