unicoder 0.1.0 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.gitignore +6 -1
- data/.travis.yml +13 -13
- data/CHANGELOG.md +24 -1
- data/Gemfile +2 -0
- data/Gemfile.lock +99 -0
- data/MIT-LICENSE.txt +1 -1
- data/README.md +35 -5
- data/bin/unicoder +1 -1
- data/lib/unicoder/builder.rb +77 -15
- data/lib/unicoder/builders/categories.rb +7 -12
- data/lib/unicoder/builders/display_width.rb +28 -7
- data/lib/unicoder/builders/emoji.rb +97 -0
- data/lib/unicoder/builders/name.rb +101 -0
- data/lib/unicoder/builders/numeric_value.rb +30 -0
- data/lib/unicoder/builders/sequence_name.rb +99 -0
- data/lib/unicoder/builders/types.rb +83 -0
- data/lib/unicoder/constants.rb +81 -16
- data/lib/unicoder/downloader.rb +54 -8
- data/lib/unicoder/multi_dimensional_array_builder.rb +24 -2
- data/lib/unicoder/replace_common_words.rb +20 -0
- data/lib/unicoder.rb +1 -0
- data/unicoder.gemspec +7 -5
- metadata +50 -26
- data/data/.keep +0 -0
- data/data/unicode/8.0.0/ucd/Blocks.txt +0 -298
- data/data/unicode/8.0.0/ucd/EastAsianWidth.txt +0 -2174
- data/data/unicode/8.0.0/ucd/NameAliases.txt +0 -554
- data/data/unicode/8.0.0/ucd/PropertyValueAliases.txt +0 -1420
- data/data/unicode/8.0.0/ucd/ScriptExtensions.txt +0 -454
- data/data/unicode/8.0.0/ucd/Scripts.txt +0 -2539
- data/data/unicode/8.0.0/ucd/UnicodeData.txt +0 -29215
- data/data/unicode/8.0.0/ucd/extracted/DerivedGeneralCategory.txt +0 -3789
- data/data/unicode/security/8.0.0/confusables.txt +0 -9274
- data/spec/unicoder_spec.rb +0 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: b9cec551ee0c7308313eada0859d3b2cbe1a8c3aaeb45072d3fb08a886b25e8a
|
4
|
+
data.tar.gz: 907185cfd8e98d4d8f291a33b64b0af349716b75757f72abc00de1e98e7bdd3a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ac60e2255690882f372023e66fff6bbfaf1b039f50896e7496ce6cdec2324d993a5fa644b7de539f3039a20bd8bdbaf18f6b81aa727ccf1a6d411608b3355a6e
|
7
|
+
data.tar.gz: 403549b0dfdc3fe4dc3f93f3a4c743fdef4e2e057364c49ee7d38d71c4722b90d50bf64ea9050d0ad2975c118ceb2c2bcd6c9464d4976e843655f69f0bffe2b1
|
data/.gitignore
CHANGED
data/.travis.yml
CHANGED
@@ -1,20 +1,20 @@
|
|
1
1
|
sudo: false
|
2
2
|
language: ruby
|
3
3
|
|
4
|
-
script: bundle exec ruby spec/unicoder_spec.rb
|
5
|
-
|
6
4
|
rvm:
|
7
|
-
- 2.
|
8
|
-
- 2.
|
9
|
-
- 2.
|
10
|
-
- 2.
|
5
|
+
- 2.7
|
6
|
+
- 2.6
|
7
|
+
- 2.5
|
8
|
+
- 2.4
|
9
|
+
- 2.3
|
11
10
|
- ruby-head
|
12
|
-
-
|
13
|
-
-
|
14
|
-
- jruby-9000
|
15
|
-
|
16
|
-
cache:
|
17
|
-
- bundler
|
11
|
+
- jruby-9.2.9.0
|
12
|
+
- truffleruby
|
18
13
|
|
19
|
-
|
14
|
+
matrix:
|
15
|
+
allow_failures:
|
16
|
+
- rvm: 2.3
|
17
|
+
- rvm: ruby-head
|
18
|
+
- rvm: jruby-9.2.9.0
|
19
|
+
- rvm: truffleruby
|
20
20
|
# fast_finish: true
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,28 @@
|
|
1
1
|
## CHANGELOG
|
2
2
|
|
3
|
+
### 1.1.0
|
4
|
+
|
5
|
+
- Improve name index size: Support ranges
|
6
|
+
- Improve name index size: Replace common words
|
7
|
+
|
8
|
+
### 1.0.0
|
9
|
+
|
10
|
+
With the first 1.0 release, unicoder supports 10 indexes:
|
11
|
+
|
12
|
+
- blocks
|
13
|
+
- categories
|
14
|
+
- confusable
|
15
|
+
- display_width
|
16
|
+
- emoji
|
17
|
+
- name
|
18
|
+
- numeric_value
|
19
|
+
- scripts
|
20
|
+
- sequence_name
|
21
|
+
- types
|
22
|
+
|
23
|
+
All indexes can be built in `marshal` format (Ruby's internal
|
24
|
+
serialization format) and some now support `esm` (JavaScript module).
|
25
|
+
|
3
26
|
### 0.1.0
|
4
27
|
|
5
|
-
*
|
28
|
+
* Initial release
|
data/Gemfile
CHANGED
data/Gemfile.lock
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
unicoder (1.1.0)
|
5
|
+
oga (~> 2.9)
|
6
|
+
rationalist (~> 2.0)
|
7
|
+
rubyzip (~> 1.2)
|
8
|
+
|
9
|
+
GEM
|
10
|
+
remote: https://rubygems.org/
|
11
|
+
specs:
|
12
|
+
ansi (1.5.0)
|
13
|
+
ast (2.4.2)
|
14
|
+
cd (1.0.2)
|
15
|
+
clipboard (2.0.0)
|
16
|
+
code (0.9.4)
|
17
|
+
coderay (~> 1.1)
|
18
|
+
method_source (>= 0.9, < 2.0)
|
19
|
+
coderay (1.1.3)
|
20
|
+
core_docs (0.9.11)
|
21
|
+
yard (~> 0.9.11)
|
22
|
+
debugging (2.1.0)
|
23
|
+
paint (>= 0.9, < 3.0)
|
24
|
+
every_day_irb (2.2.0)
|
25
|
+
cd (~> 1.0)
|
26
|
+
fancy_irb (2.1.2)
|
27
|
+
irb (>= 1.7, < 2.0)
|
28
|
+
paint (>= 0.9, < 3.0)
|
29
|
+
unicode-display_width (>= 2.5)
|
30
|
+
ffi (1.17.0)
|
31
|
+
hirb (0.7.3)
|
32
|
+
interactive_editor (0.0.12)
|
33
|
+
spoon (~> 0.0.6)
|
34
|
+
io-console (0.7.2)
|
35
|
+
irb (1.14.1)
|
36
|
+
rdoc (>= 4.0.0)
|
37
|
+
reline (>= 0.4.2)
|
38
|
+
irbtools (4.1.0)
|
39
|
+
clipboard (>= 1.4, < 3.0)
|
40
|
+
code (>= 0.9.4, < 2.0)
|
41
|
+
coderay (~> 1.1)
|
42
|
+
core_docs (~> 0.9.11)
|
43
|
+
debugging (~> 2.1)
|
44
|
+
every_day_irb (~> 2.2)
|
45
|
+
fancy_irb (~> 2.1)
|
46
|
+
hirb (~> 0.7, >= 0.7.3)
|
47
|
+
interactive_editor (~> 0.0, >= 0.0.12)
|
48
|
+
irb (>= 1.13.0, < 1.15)
|
49
|
+
looksee (~> 5.0)
|
50
|
+
methodfinder (~> 2.2, >= 2.2.5)
|
51
|
+
object_shadow (~> 1.1)
|
52
|
+
os (~> 1.1, >= 1.1.4)
|
53
|
+
paint (>= 0.9, < 3.0)
|
54
|
+
ruby_engine (~> 2.0)
|
55
|
+
ruby_version (~> 1.0)
|
56
|
+
wirb (~> 2.0, >= 2.2.1)
|
57
|
+
looksee (5.0.0)
|
58
|
+
method_source (1.1.0)
|
59
|
+
methodfinder (2.2.5)
|
60
|
+
minitest (5.25.1)
|
61
|
+
object_shadow (1.1.1)
|
62
|
+
oga (2.15)
|
63
|
+
ast
|
64
|
+
ruby-ll (~> 2.1)
|
65
|
+
os (1.1.4)
|
66
|
+
paint (2.3.0)
|
67
|
+
psych (5.1.2)
|
68
|
+
stringio
|
69
|
+
rake (13.2.1)
|
70
|
+
rationalist (2.0.1)
|
71
|
+
rdoc (6.7.0)
|
72
|
+
psych (>= 4.0.0)
|
73
|
+
reline (0.5.10)
|
74
|
+
io-console (~> 0.5)
|
75
|
+
ruby-ll (2.1.3)
|
76
|
+
ansi
|
77
|
+
ast
|
78
|
+
ruby_engine (2.0.3)
|
79
|
+
ruby_version (1.0.3)
|
80
|
+
rubyzip (1.3.0)
|
81
|
+
spoon (0.0.6)
|
82
|
+
ffi
|
83
|
+
stringio (3.1.1)
|
84
|
+
unicode-display_width (2.6.0)
|
85
|
+
wirb (2.2.2)
|
86
|
+
paint (>= 0.9, < 3.0)
|
87
|
+
yard (0.9.37)
|
88
|
+
|
89
|
+
PLATFORMS
|
90
|
+
ruby
|
91
|
+
|
92
|
+
DEPENDENCIES
|
93
|
+
irbtools
|
94
|
+
minitest
|
95
|
+
rake
|
96
|
+
unicoder!
|
97
|
+
|
98
|
+
BUNDLED WITH
|
99
|
+
2.5.21
|
data/MIT-LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -1,15 +1,45 @@
|
|
1
|
-
# unicoder [![[version]](https://badge.fury.io/rb/unicoder.svg)](
|
2
|
-
|
3
|
-
WIP
|
1
|
+
# unicoder [![[version]](https://badge.fury.io/rb/unicoder.svg)](https://badge.fury.io/rb/unicoder)
|
4
2
|
|
3
|
+
unicoder turns Unicode data into bundles for programming libraries.
|
5
4
|
|
6
5
|
## Usage
|
7
6
|
|
8
7
|
```
|
9
|
-
$ unicoder build index_name
|
8
|
+
$ unicoder build <index_name> [--gzip]
|
9
|
+
```
|
10
|
+
|
11
|
+
Examples:
|
12
|
+
|
13
|
+
```
|
14
|
+
$ unicoder build emoji --format marshal --gzip
|
15
|
+
$ unicoder build numeric_value --format esm
|
10
16
|
```
|
11
17
|
|
12
18
|
|
19
|
+
## Libraries With unicoder-based Indexes
|
20
|
+
|
21
|
+
### Ruby
|
22
|
+
|
23
|
+
Index Name | Gem
|
24
|
+
--------------|----
|
25
|
+
blocks | [unicode-blocks](https://github.com/janlelis/unicode-blocks)
|
26
|
+
categories | [unicode-categories](https://github.com/janlelis/unicode-categories)
|
27
|
+
confusable | [unicode-confusable](https://github.com/janlelis/unicode-confusable)
|
28
|
+
emoji | [unicode-emoji](https://github.com/janlelis/unicode-emoji)
|
29
|
+
display\_width| [unicode-display_width](https://github.com/janlelis/unicode-display_width)
|
30
|
+
name | [unicode-name](https://github.com/janlelis/unicode-name)
|
31
|
+
numeric\_value| [unicode-numeric_value](https://github.com/janlelis/unicode-numeric_value)
|
32
|
+
scripts | [unicode-scripts](https://github.com/janlelis/unicode-scripts)
|
33
|
+
sequence\_name| [unicode-sequence_name](https://github.com/janlelis/unicode-sequence_name)
|
34
|
+
types | [unicode-types](https://github.com/janlelis/unicode-types)
|
35
|
+
|
36
|
+
### JavaScript (ESM)
|
37
|
+
|
38
|
+
Index Name | Module
|
39
|
+
--------------|----
|
40
|
+
name, sequence\_name, types | [unicode-name.js](https://github.com/janlelis/unicode-name.js)
|
41
|
+
numeric\_value| [unicode-number.js](https://github.com/janlelis/unicode-number.js)
|
42
|
+
|
13
43
|
## MIT License
|
14
44
|
|
15
|
-
Copyright (C) 2016 Jan Lelis <
|
45
|
+
Copyright (C) 2016-2024 Jan Lelis <https://janlelis.com>. Released under the MIT license.
|
data/bin/unicoder
CHANGED
@@ -6,7 +6,7 @@ require "rationalist"
|
|
6
6
|
args = Rationalist.parse
|
7
7
|
command = args[:_][0]
|
8
8
|
identifier = args[:_][1]
|
9
|
-
KNOWN_OPTIONS = [:version, :help, :verbose, :format, :gzip]
|
9
|
+
KNOWN_OPTIONS = [:version, :help, :verbose, :format, :gzip, :option, :meta]
|
10
10
|
options = args.select { |option,| KNOWN_OPTIONS.include? option }
|
11
11
|
|
12
12
|
if options.has_key?(:version)
|
data/lib/unicoder/builder.rb
CHANGED
@@ -1,13 +1,42 @@
|
|
1
1
|
require "json"
|
2
|
+
require "rubygems/util"
|
2
3
|
|
3
4
|
module Unicoder
|
4
5
|
# A builder defines a parse function which translates one (or more) unicode data
|
5
6
|
# files into an index hash
|
6
7
|
module Builder
|
7
|
-
attr_reader :index
|
8
|
+
attr_reader :index, :formats, :option
|
9
|
+
attr_writer :option
|
8
10
|
|
9
|
-
def
|
10
|
-
|
11
|
+
def formats
|
12
|
+
{
|
13
|
+
marshal: {
|
14
|
+
ext: ".marshal",
|
15
|
+
},
|
16
|
+
json: {
|
17
|
+
ext: ".json",
|
18
|
+
option: "charkeys+stringfractions"
|
19
|
+
},
|
20
|
+
esm: {
|
21
|
+
ext: ".mjs",
|
22
|
+
option: "charkeys+stringfractions"
|
23
|
+
}
|
24
|
+
}
|
25
|
+
end
|
26
|
+
|
27
|
+
def meta
|
28
|
+
{
|
29
|
+
META: {
|
30
|
+
generator: "unicoder v#{Unicoder::VERSION}",
|
31
|
+
unicodeVersion: @unicode_version,
|
32
|
+
},
|
33
|
+
}
|
34
|
+
end
|
35
|
+
|
36
|
+
def initialize(unicode_version = nil, emoji_version = nil, format = nil)
|
37
|
+
@unicode_version = unicode_version || CURRENT_UNICODE_VERSION
|
38
|
+
@emoji_version = emoji_version || CURRENT_EMOJI_VERSION
|
39
|
+
@option = formats[format.to_sym] ? formats[format.to_sym][:option] || "" : ""
|
11
40
|
initialize_index
|
12
41
|
end
|
13
42
|
|
@@ -15,8 +44,16 @@ module Unicoder
|
|
15
44
|
@index = {}
|
16
45
|
end
|
17
46
|
|
18
|
-
def assign_codepoint(codepoint, value,
|
19
|
-
|
47
|
+
def assign_codepoint(codepoint, value, idx = @index)
|
48
|
+
if option =~ /charkeys/
|
49
|
+
idx[[codepoint].pack("U*")] = value
|
50
|
+
else
|
51
|
+
idx[codepoint] = value
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
def assign(sub_index_name, codepoint, value)
|
56
|
+
assign_codepoint(codepoint, value, index[sub_index_name])
|
20
57
|
end
|
21
58
|
|
22
59
|
def parse!
|
@@ -26,47 +63,72 @@ module Unicoder
|
|
26
63
|
def parse_file(identifier, parse_mode, **parse_options)
|
27
64
|
filename = UNICODE_FILES[identifier.to_sym] || filename
|
28
65
|
raise ArgumentError, "No valid file identifier or filename given" if !filename
|
29
|
-
filename
|
30
|
-
|
66
|
+
filename = filename.dup
|
67
|
+
filename.sub! 'UNICODE_VERSION', @unicode_version
|
68
|
+
filename.sub! 'EMOJI_VERSION', @emoji_version
|
69
|
+
filename.sub! 'EMOJI_RELATED_VERSION', EMOJI_RELATED_UNICODE_VERSIONS[@emoji_version]
|
70
|
+
filename.sub! '.zip', ''
|
71
|
+
filename.sub! /\A(https?|ftp):\//, ""
|
72
|
+
Downloader.fetch(identifier) unless File.exist?(LOCAL_DATA_DIRECTORY + filename)
|
31
73
|
file = File.read(LOCAL_DATA_DIRECTORY + filename)
|
32
74
|
|
33
75
|
if parse_mode == :line
|
34
76
|
file.each_line{ |line|
|
35
77
|
yield Hash[ $~.names.zip( $~.captures ) ] if line =~ parse_options[:regex]
|
36
78
|
}
|
79
|
+
elsif parse_mode == :xml
|
80
|
+
require "oga"
|
81
|
+
yield Oga.parse_xml(file)
|
82
|
+
else
|
83
|
+
yield file
|
37
84
|
end
|
38
85
|
end
|
39
86
|
|
40
87
|
def export(format: :marshal, **options)
|
41
88
|
p index if options[:verbose]
|
42
89
|
|
90
|
+
if options[:meta]
|
91
|
+
idx = meta.merge(index)
|
92
|
+
else
|
93
|
+
idx = index
|
94
|
+
end
|
95
|
+
|
96
|
+
|
43
97
|
case format.to_sym
|
44
98
|
when :marshal
|
45
|
-
index_file = Marshal.dump(
|
99
|
+
index_file = Marshal.dump(idx)
|
46
100
|
when :json
|
47
|
-
index_file = JSON.dump(
|
101
|
+
index_file = JSON.dump(idx)
|
102
|
+
when :esm
|
103
|
+
index_file = "export default " + JSON.dump(idx)
|
48
104
|
end
|
49
105
|
|
50
|
-
# if false# || options[:gzip]
|
51
106
|
if options[:gzip]
|
52
|
-
Gem.gzip(index_file)
|
107
|
+
Gem::Util.gzip(index_file)
|
53
108
|
else
|
54
109
|
index_file
|
55
110
|
end
|
56
111
|
end
|
57
|
-
|
112
|
+
|
58
113
|
def self.build(identifier, **options)
|
59
114
|
format = options[:format] || :marshal
|
60
115
|
require_relative "builders/#{identifier}"
|
61
116
|
# require "unicoder/builders/#{identifier}"
|
62
117
|
builder_class = self.const_get(identifier.to_s.gsub(/(?:^|_)([a-z])/){ $1.upcase })
|
63
|
-
builder = builder_class.new(
|
118
|
+
builder = builder_class.new(
|
119
|
+
options[:unicode_version],
|
120
|
+
options[:emoji_version],
|
121
|
+
format
|
122
|
+
)
|
64
123
|
puts "Building index for #{identifier}…"
|
124
|
+
if options[:option]
|
125
|
+
builder.option = options[:option]
|
126
|
+
end
|
65
127
|
builder.parse!
|
66
|
-
index_file = builder.export(options)
|
128
|
+
index_file = builder.export(**options)
|
67
129
|
|
68
130
|
destination ||= options[:destination] || identifier.to_s
|
69
|
-
destination += "
|
131
|
+
destination += "#{builder.formats.dig(format.to_sym, :ext)}"
|
70
132
|
destination += ".gz" if options[:gzip]
|
71
133
|
bytes = File.write destination, index_file
|
72
134
|
|
@@ -14,23 +14,18 @@ module Unicoder
|
|
14
14
|
end
|
15
15
|
|
16
16
|
def parse!
|
17
|
-
parse_file :
|
18
|
-
if line["
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
(@range_start..line["codepoint"].to_i(16)).each{ |codepoint|
|
23
|
-
assign_codepoint(codepoint, line["category"], @index[:CATEGORIES])
|
24
|
-
}
|
25
|
-
else
|
26
|
-
raise ArgumentError, "inconsistent range found in data, don't know what to do"
|
27
|
-
end
|
17
|
+
parse_file :general_categories, :line, regex: /^(?<from>[^. ]+)(?:..(?<to>\S+))?\s*; (?<category>\S+).*$/ do |line|
|
18
|
+
if line["to"]
|
19
|
+
(line["from"].to_i(16)..line["to"].to_i(16)).each{ |codepoint|
|
20
|
+
assign_codepoint(codepoint, line["category"] == "Cn" ? nil : line["category"], @index[:CATEGORIES])
|
21
|
+
}
|
28
22
|
else
|
29
|
-
assign_codepoint(line["
|
23
|
+
assign_codepoint(line["from"].to_i(16), line["category"] == "Cn" ? nil : line["category"], @index[:CATEGORIES])
|
30
24
|
end
|
31
25
|
end
|
32
26
|
|
33
27
|
4.times{ compress! @index[:CATEGORIES] }
|
28
|
+
remove_trailing_nils! @index[:CATEGORIES]
|
34
29
|
|
35
30
|
parse_file :property_value_aliases, :line, regex: /^gc ; (?<short>\S{2}?) *; (?<long>\S+).*$/ do |line|
|
36
31
|
@index[:CATEGORY_NAMES][line["short"]] = line["long"]
|
@@ -6,7 +6,23 @@ module Unicoder
|
|
6
6
|
|
7
7
|
IGNORE_CATEGORIES = %w[Cs Co Cn].freeze
|
8
8
|
ZERO_WIDTH_CATEGORIES = %w[Mn Me Cf].freeze
|
9
|
-
|
9
|
+
|
10
|
+
ZERO_WIDTH_RANGES = [
|
11
|
+
*0x1160..0x11FF, # HANGUL JUNGSEONG
|
12
|
+
*0xD7B0..0xD7FF, # HANGUL JUNGSEONG
|
13
|
+
*0x2060..0x206F, # Ignorables
|
14
|
+
*0xFFF0..0xFFF8, # Ignorables
|
15
|
+
*0xE0000..0xE0FFF, # Ignorables
|
16
|
+
].freeze
|
17
|
+
|
18
|
+
WIDE_RANGES = [
|
19
|
+
*0x3400..0x4DBF,
|
20
|
+
*0x4E00..0x9FFF,
|
21
|
+
*0xF900..0xFAFF,
|
22
|
+
*0x20000..0x2FFFD,
|
23
|
+
*0x30000..0x3FFFD,
|
24
|
+
].freeze
|
25
|
+
|
10
26
|
SPECIAL_WIDTHS = {
|
11
27
|
0x0 => 0, # \0 NULL
|
12
28
|
0x5 => 0, # ENQUIRY
|
@@ -18,7 +34,7 @@ module Unicoder
|
|
18
34
|
0xD => 0, # \r CARRIAGE RETURN
|
19
35
|
0xE => 0, # SHIFT OUT
|
20
36
|
0xF => 0, # SHIFT IN
|
21
|
-
0x00AD =>
|
37
|
+
0x00AD => nil, # SOFT HYPHEN
|
22
38
|
0x2E3A => 2, # TWO-EM DASH
|
23
39
|
0x2E3B => 3, # THREE-EM DASH
|
24
40
|
}.freeze
|
@@ -28,7 +44,7 @@ module Unicoder
|
|
28
44
|
end
|
29
45
|
|
30
46
|
def parse!
|
31
|
-
parse_file :east_asian_width, :line, regex: /^(?<codepoints>\S+?)
|
47
|
+
parse_file :east_asian_width, :line, regex: /^(?<codepoints>\S+?)\s*;\s*(?<width>\S+)\s+#\s(?<category>\S+).*$/ do |line|
|
32
48
|
next if IGNORE_CATEGORIES.include?(line["category"])
|
33
49
|
|
34
50
|
if line["codepoints"]['..']
|
@@ -44,19 +60,24 @@ module Unicoder
|
|
44
60
|
}
|
45
61
|
end
|
46
62
|
|
63
|
+
ZERO_WIDTH_RANGES.each{ |codepoint|
|
64
|
+
assign_codepoint codepoint, 0
|
65
|
+
}
|
66
|
+
|
67
|
+
WIDE_RANGES.each{ |codepoint|
|
68
|
+
assign_codepoint codepoint, 2
|
69
|
+
}
|
70
|
+
|
47
71
|
SPECIAL_WIDTHS.each{ |codepoint, value|
|
48
72
|
assign_codepoint codepoint, value
|
49
73
|
}
|
50
74
|
|
51
75
|
4.times{ compress! }
|
52
|
-
|
53
|
-
p @index
|
54
76
|
end
|
55
77
|
|
56
78
|
def determine_width(codepoint, category, east_asian_width)
|
57
79
|
if ( ZERO_WIDTH_CATEGORIES.include?(category) &&
|
58
|
-
[codepoint].pack('U') !~ /\p{Cf}(?<=\p{Arabic})/ )
|
59
|
-
ZERO_WIDTH_CODEPOINTS.include?(codepoint)
|
80
|
+
[codepoint].pack('U') !~ /\p{Cf}(?<=\p{Arabic})/ )
|
60
81
|
0
|
61
82
|
elsif east_asian_width == "F" || east_asian_width == "W"
|
62
83
|
2
|
@@ -0,0 +1,97 @@
|
|
1
|
+
module Unicoder
|
2
|
+
module Builder
|
3
|
+
class Emoji
|
4
|
+
include Builder
|
5
|
+
|
6
|
+
REVERSE_PROPERTY_NAMES = {
|
7
|
+
"Emoji" => :E,
|
8
|
+
"Emoji_Modifier_Base" => :B,
|
9
|
+
"Emoji_Modifier" => :M,
|
10
|
+
"Emoji_Component" => :C,
|
11
|
+
"Emoji_Presentation" => :P,
|
12
|
+
"Extended_Pictographic" => :X,
|
13
|
+
}
|
14
|
+
|
15
|
+
def initialize_index
|
16
|
+
@index = {
|
17
|
+
PROPERTIES: {},
|
18
|
+
FLAGS: [],
|
19
|
+
TAGS: [],
|
20
|
+
KEYCAPS: [],
|
21
|
+
ZWJ: [],
|
22
|
+
SD: [],
|
23
|
+
LIST: {},
|
24
|
+
}
|
25
|
+
end
|
26
|
+
|
27
|
+
def parse!
|
28
|
+
parse_file :emoji_data, :line, regex: /^(?<codepoints>\S+?) +; (?<property>\S+) *#/ do |line|
|
29
|
+
if line["codepoints"]['..']
|
30
|
+
codepoints = Range.new(*line["codepoints"].split('..').map{ |codepoint|
|
31
|
+
codepoint.to_i(16)
|
32
|
+
})
|
33
|
+
else
|
34
|
+
codepoints = [line["codepoints"].to_i(16)]
|
35
|
+
end
|
36
|
+
|
37
|
+
codepoints.each{ |codepoint|
|
38
|
+
@index[:PROPERTIES][codepoint] ||= []
|
39
|
+
@index[:PROPERTIES][codepoint] << (REVERSE_PROPERTY_NAMES[line["property"]] || line["property"])
|
40
|
+
}
|
41
|
+
end
|
42
|
+
|
43
|
+
parse_file :emoji_sequences, :line, regex: /^(?<codepoints>.+?)\s*; RGI_Emoji_Flag_Sequence/ do |line|
|
44
|
+
codepoints = line["codepoints"].split
|
45
|
+
@index[:FLAGS] << codepoints.map{|e| e.to_i(16)}
|
46
|
+
end
|
47
|
+
|
48
|
+
parse_file :emoji_sequences, :line, regex: /^(?<codepoints>.+?)\s*; RGI_Emoji_Tag_Sequence/ do |line|
|
49
|
+
codepoints = line["codepoints"].split
|
50
|
+
@index[:TAGS] << codepoints.map{|e| e.to_i(16)}
|
51
|
+
end
|
52
|
+
|
53
|
+
parse_file :emoji_sequences, :line, regex: /^(?<codepoints>.+?)\s*; Emoji_Keycap_Sequence/ do |line|
|
54
|
+
@index[:KEYCAPS] << line["codepoints"].split[0].to_i(16)
|
55
|
+
end
|
56
|
+
|
57
|
+
parse_file :emoji_zwj_sequences, :line, regex: /^(?!#)(?<codepoints>.+?)\s*;/ do |line|
|
58
|
+
codepoints = line["codepoints"].split
|
59
|
+
@index[:ZWJ] << codepoints.map{|e| e.to_i(16)}
|
60
|
+
end
|
61
|
+
|
62
|
+
parse_file :valid_subdivisions, :xml do |xml|
|
63
|
+
subdivisions = []
|
64
|
+
xml.css('[idStatus="regular"], [idStatus="deprecated"]').each{ |id|
|
65
|
+
subdivisions += id.text.split
|
66
|
+
}
|
67
|
+
@index[:SD] = subdivisions.uniq
|
68
|
+
end
|
69
|
+
|
70
|
+
parse_file :emoji_test, :line, regex: /^(?:# (?<sub>sub)?group: (?<group_name>.*)$)|(?:(?<codepoints>.+?)\s*; fully-qualified )/ do |line|
|
71
|
+
if line["group_name"]
|
72
|
+
if !line["sub"]
|
73
|
+
@current_group_name = line["group_name"]
|
74
|
+
@index[:LIST][@current_group_name] = {}
|
75
|
+
else
|
76
|
+
@current_subgroup_name = line["group_name"]
|
77
|
+
@index[:LIST][@current_group_name][@current_subgroup_name] = []
|
78
|
+
end
|
79
|
+
else
|
80
|
+
codepoints = line["codepoints"].split
|
81
|
+
@index[:LIST][@current_group_name][@current_subgroup_name] << codepoints.map{|e| e.to_i(16)}.pack("U*")
|
82
|
+
end
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
=begin alternative
|
90
|
+
current_index_level = @index[:SEQUENCES]
|
91
|
+
codepoints.each{ |cp|
|
92
|
+
ord = cp.to_i(16)
|
93
|
+
current_index_level[ord] ||= {}
|
94
|
+
current_index_level = current_index_level[ord]
|
95
|
+
}
|
96
|
+
current_index_level[true] = true # end mark
|
97
|
+
=end
|