unicoder 0.1.0 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: f79eb48ad06b13b61fc4ceb7fc5e176ee4e9e984
4
- data.tar.gz: 94a62eb108e01e1d7da774b58352ab0585235bc7
2
+ SHA256:
3
+ metadata.gz: 21168580053326da8f495794ab5133fc9fa3daa3fa66209cf5ddbb0ac68de923
4
+ data.tar.gz: 3829094ce2cfb8322d79f4a11ef2015235f2b7e602097ab3defb7c8cc4f825de
5
5
  SHA512:
6
- metadata.gz: 01714742c72568ab92a9c3df0b700f3918e32482b7f658da8f099e2cfb54359e098e90fa1caa72a343cbdf2ede36081a9c01a6d65ee76cee841e65b87c9083ad
7
- data.tar.gz: dd5b55100962d9408a503b338ebf25062c3dee7dc1ff9ceaccd97e30d57f97d131191ef96ce266e267cc729d70e6a0860702f22fbb1c9a6e4b512547ff1b5805
6
+ metadata.gz: e5a9b3d54c062817485b0b66b34015d1d12536fd4e82a601d604302d73bf06c0963dd3e8ca0b99bc9989ec62799c025b702218dbde445c46f7aa42671ccb4ae0
7
+ data.tar.gz: ac856c57cc3a9bd7fb8c0e65f282858b760ce42be93411fc5ae8cc2d3007cd9cc33b3e422cbdf8fa8f95316cf84994d328ed6679ec3ccd09d7515f49f30aa6a1
data/.gitignore CHANGED
@@ -1,3 +1,8 @@
1
1
  Gemfile.lock
2
2
  /pkg
3
- /data
3
+ /data*
4
+ *.marshal
5
+ *.marshal.gz
6
+ *.json
7
+ *.mjs
8
+ /old-data
data/.travis.yml CHANGED
@@ -1,20 +1,20 @@
1
1
  sudo: false
2
2
  language: ruby
3
3
 
4
- script: bundle exec ruby spec/unicoder_spec.rb
5
-
6
4
  rvm:
7
- - 2.3.0
8
- - 2.2
9
- - 2.1
10
- - 2.0
5
+ - 2.7
6
+ - 2.6
7
+ - 2.5
8
+ - 2.4
9
+ - 2.3
11
10
  - ruby-head
12
- - rbx-2
13
- - jruby-head
14
- - jruby-9000
15
-
16
- cache:
17
- - bundler
11
+ - jruby-9.2.9.0
12
+ - truffleruby
18
13
 
19
- # matrix:
14
+ matrix:
15
+ allow_failures:
16
+ - rvm: 2.3
17
+ - rvm: ruby-head
18
+ - rvm: jruby-9.2.9.0
19
+ - rvm: truffleruby
20
20
  # fast_finish: true
data/CHANGELOG.md CHANGED
@@ -1,5 +1,23 @@
1
1
  ## CHANGELOG
2
2
 
3
+ ### 1.0.0
4
+
5
+ With the first 1.0 release, unicoder supports 10 indexes:
6
+
7
+ - blocks
8
+ - categories
9
+ - confusable
10
+ - display_width
11
+ - emoji
12
+ - name
13
+ - numeric_value
14
+ - scripts
15
+ - sequence_name
16
+ - types
17
+
18
+ All indexes can be built in `marshal` format (Ruby's internal
19
+ serialization format) and some now support `esm` (JavaScript module).
20
+
3
21
  ### 0.1.0
4
22
 
5
- * WIP
23
+ * Initial release
data/Gemfile CHANGED
@@ -3,3 +3,5 @@ source 'https://rubygems.org'
3
3
  gemspec
4
4
 
5
5
  gem 'minitest'
6
+ gem 'rake'
7
+ gem 'irbtools', require: "irbtools/binding"
data/Gemfile.lock ADDED
@@ -0,0 +1,99 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ unicoder (1.0.0)
5
+ oga (~> 2.9)
6
+ rationalist (~> 2.0)
7
+ rubyzip (~> 1.2)
8
+
9
+ GEM
10
+ remote: https://rubygems.org/
11
+ specs:
12
+ ansi (1.5.0)
13
+ ast (2.4.2)
14
+ cd (1.0.2)
15
+ clipboard (2.0.0)
16
+ code (0.9.4)
17
+ coderay (~> 1.1)
18
+ method_source (>= 0.9, < 2.0)
19
+ coderay (1.1.3)
20
+ core_docs (0.9.11)
21
+ yard (~> 0.9.11)
22
+ debugging (2.1.0)
23
+ paint (>= 0.9, < 3.0)
24
+ every_day_irb (2.2.0)
25
+ cd (~> 1.0)
26
+ fancy_irb (2.1.2)
27
+ irb (>= 1.7, < 2.0)
28
+ paint (>= 0.9, < 3.0)
29
+ unicode-display_width (>= 2.5)
30
+ ffi (1.17.0)
31
+ hirb (0.7.3)
32
+ interactive_editor (0.0.12)
33
+ spoon (~> 0.0.6)
34
+ io-console (0.7.2)
35
+ irb (1.14.1)
36
+ rdoc (>= 4.0.0)
37
+ reline (>= 0.4.2)
38
+ irbtools (4.1.0)
39
+ clipboard (>= 1.4, < 3.0)
40
+ code (>= 0.9.4, < 2.0)
41
+ coderay (~> 1.1)
42
+ core_docs (~> 0.9.11)
43
+ debugging (~> 2.1)
44
+ every_day_irb (~> 2.2)
45
+ fancy_irb (~> 2.1)
46
+ hirb (~> 0.7, >= 0.7.3)
47
+ interactive_editor (~> 0.0, >= 0.0.12)
48
+ irb (>= 1.13.0, < 1.15)
49
+ looksee (~> 5.0)
50
+ methodfinder (~> 2.2, >= 2.2.5)
51
+ object_shadow (~> 1.1)
52
+ os (~> 1.1, >= 1.1.4)
53
+ paint (>= 0.9, < 3.0)
54
+ ruby_engine (~> 2.0)
55
+ ruby_version (~> 1.0)
56
+ wirb (~> 2.0, >= 2.2.1)
57
+ looksee (5.0.0)
58
+ method_source (1.1.0)
59
+ methodfinder (2.2.5)
60
+ minitest (5.25.1)
61
+ object_shadow (1.1.1)
62
+ oga (2.15)
63
+ ast
64
+ ruby-ll (~> 2.1)
65
+ os (1.1.4)
66
+ paint (2.3.0)
67
+ psych (5.1.2)
68
+ stringio
69
+ rake (13.2.1)
70
+ rationalist (2.0.1)
71
+ rdoc (6.7.0)
72
+ psych (>= 4.0.0)
73
+ reline (0.5.10)
74
+ io-console (~> 0.5)
75
+ ruby-ll (2.1.3)
76
+ ansi
77
+ ast
78
+ ruby_engine (2.0.3)
79
+ ruby_version (1.0.3)
80
+ rubyzip (1.3.0)
81
+ spoon (0.0.6)
82
+ ffi
83
+ stringio (3.1.1)
84
+ unicode-display_width (2.6.0)
85
+ wirb (2.2.2)
86
+ paint (>= 0.9, < 3.0)
87
+ yard (0.9.37)
88
+
89
+ PLATFORMS
90
+ ruby
91
+
92
+ DEPENDENCIES
93
+ irbtools
94
+ minitest
95
+ rake
96
+ unicoder!
97
+
98
+ BUNDLED WITH
99
+ 2.5.21
data/MIT-LICENSE.txt CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2016 Jan Lelis, mail@janlelis.de
1
+ Copyright (c) 2016-2020 Jan Lelis, https://janlelis.com
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person obtaining
4
4
  a copy of this software and associated documentation files (the
data/README.md CHANGED
@@ -1,15 +1,45 @@
1
- # unicoder [![[version]](https://badge.fury.io/rb/unicoder.svg)](http://badge.fury.io/rb/unicoder)
2
-
3
- WIP
1
+ # unicoder [![[version]](https://badge.fury.io/rb/unicoder.svg)](https://badge.fury.io/rb/unicoder)
4
2
 
3
+ unicoder turns Unicode data into bundles for programming libraries.
5
4
 
6
5
  ## Usage
7
6
 
8
7
  ```
9
- $ unicoder build index_name
8
+ $ unicoder build <index_name> [--gzip]
9
+ ```
10
+
11
+ Examples:
12
+
13
+ ```
14
+ $ unicoder build emoji --format marshal --gzip
15
+ $ unicoder build numeric_value --format esm
10
16
  ```
11
17
 
12
18
 
19
+ ## Libraries With unicoder-based Indexes
20
+
21
+ ### Ruby
22
+
23
+ Index Name | Gem
24
+ --------------|----
25
+ blocks | [unicode-blocks](https://github.com/janlelis/unicode-blocks)
26
+ categories | [unicode-categories](https://github.com/janlelis/unicode-categories)
27
+ confusable | [unicode-confusable](https://github.com/janlelis/unicode-confusable)
28
+ emoji | [unicode-emoji](https://github.com/janlelis/unicode-emoji)
29
+ display\_width| [unicode-display_width](https://github.com/janlelis/unicode-display_width)
30
+ name | [unicode-name](https://github.com/janlelis/unicode-name)
31
+ numeric\_value| [unicode-numeric_value](https://github.com/janlelis/unicode-numeric_value)
32
+ scripts | [unicode-scripts](https://github.com/janlelis/unicode-scripts)
33
+ sequence\_name| [unicode-sequence_name](https://github.com/janlelis/unicode-sequence_name)
34
+ types | [unicode-types](https://github.com/janlelis/unicode-types)
35
+
36
+ ### JavaScript (ESM)
37
+
38
+ Index Name | Module
39
+ --------------|----
40
+ numeric\_value| [unicode-number.js](https://github.com/janlelis/unicode-number.js)
41
+ name, sequence\_name, types | [unicode-name](https://github.com/janlelis/unicode-name)
42
+
13
43
  ## MIT License
14
44
 
15
- Copyright (C) 2016 Jan Lelis <http://janlelis.com>. Released under the MIT license.
45
+ Copyright (C) 2016-2024 Jan Lelis <https://janlelis.com>. Released under the MIT license.
data/bin/unicoder CHANGED
@@ -6,7 +6,7 @@ require "rationalist"
6
6
  args = Rationalist.parse
7
7
  command = args[:_][0]
8
8
  identifier = args[:_][1]
9
- KNOWN_OPTIONS = [:version, :help, :verbose, :format, :gzip]
9
+ KNOWN_OPTIONS = [:version, :help, :verbose, :format, :gzip, :option, :meta]
10
10
  options = args.select { |option,| KNOWN_OPTIONS.include? option }
11
11
 
12
12
  if options.has_key?(:version)
@@ -1,13 +1,42 @@
1
1
  require "json"
2
+ require "rubygems/util"
2
3
 
3
4
  module Unicoder
4
5
  # A builder defines a parse function which translates one (or more) unicode data
5
6
  # files into an index hash
6
7
  module Builder
7
- attr_reader :index
8
+ attr_reader :index, :formats, :option
9
+ attr_writer :option
8
10
 
9
- def initialize(unicode_version = nil)
10
- @unicode_version = unicode_version
11
+ def formats
12
+ {
13
+ marshal: {
14
+ ext: ".marshal",
15
+ },
16
+ json: {
17
+ ext: ".json",
18
+ option: "charkeys+stringfractions"
19
+ },
20
+ esm: {
21
+ ext: ".mjs",
22
+ option: "charkeys+stringfractions"
23
+ }
24
+ }
25
+ end
26
+
27
+ def meta
28
+ {
29
+ META: {
30
+ generator: "unicoder v#{Unicoder::VERSION}",
31
+ unicodeVersion: @unicode_version,
32
+ },
33
+ }
34
+ end
35
+
36
+ def initialize(unicode_version = nil, emoji_version = nil, format = nil)
37
+ @unicode_version = unicode_version || CURRENT_UNICODE_VERSION
38
+ @emoji_version = emoji_version || CURRENT_EMOJI_VERSION
39
+ @option = formats[format.to_sym] ? formats[format.to_sym][:option] || "" : ""
11
40
  initialize_index
12
41
  end
13
42
 
@@ -15,8 +44,16 @@ module Unicoder
15
44
  @index = {}
16
45
  end
17
46
 
18
- def assign_codepoint(codepoint, value, index = @index)
19
- index[codepoint] = value
47
+ def assign_codepoint(codepoint, value, idx = @index)
48
+ if option =~ /charkeys/
49
+ idx[[codepoint].pack("U*")] = value
50
+ else
51
+ idx[codepoint] = value
52
+ end
53
+ end
54
+
55
+ def assign(sub_index_name, codepoint, value)
56
+ assign_codepoint(codepoint, value, index[sub_index_name])
20
57
  end
21
58
 
22
59
  def parse!
@@ -26,47 +63,72 @@ module Unicoder
26
63
  def parse_file(identifier, parse_mode, **parse_options)
27
64
  filename = UNICODE_FILES[identifier.to_sym] || filename
28
65
  raise ArgumentError, "No valid file identifier or filename given" if !filename
29
- filename.sub! 'VERSION', @unicode_version
30
- Downloader.fetch(identifier) unless File.exists?(filename)
66
+ filename = filename.dup
67
+ filename.sub! 'UNICODE_VERSION', @unicode_version
68
+ filename.sub! 'EMOJI_VERSION', @emoji_version
69
+ filename.sub! 'EMOJI_RELATED_VERSION', EMOJI_RELATED_UNICODE_VERSIONS[@emoji_version]
70
+ filename.sub! '.zip', ''
71
+ filename.sub! /\A(https?|ftp):\//, ""
72
+ Downloader.fetch(identifier) unless File.exist?(LOCAL_DATA_DIRECTORY + filename)
31
73
  file = File.read(LOCAL_DATA_DIRECTORY + filename)
32
74
 
33
75
  if parse_mode == :line
34
76
  file.each_line{ |line|
35
77
  yield Hash[ $~.names.zip( $~.captures ) ] if line =~ parse_options[:regex]
36
78
  }
79
+ elsif parse_mode == :xml
80
+ require "oga"
81
+ yield Oga.parse_xml(file)
82
+ else
83
+ yield file
37
84
  end
38
85
  end
39
86
 
40
87
  def export(format: :marshal, **options)
41
88
  p index if options[:verbose]
42
89
 
90
+ if options[:meta]
91
+ idx = meta.merge(index)
92
+ else
93
+ idx = index
94
+ end
95
+
96
+
43
97
  case format.to_sym
44
98
  when :marshal
45
- index_file = Marshal.dump(index)
99
+ index_file = Marshal.dump(idx)
46
100
  when :json
47
- index_file = JSON.dump(index)
101
+ index_file = JSON.dump(idx)
102
+ when :esm
103
+ index_file = "export default " + JSON.dump(idx)
48
104
  end
49
105
 
50
- # if false# || options[:gzip]
51
106
  if options[:gzip]
52
- Gem.gzip(index_file)
107
+ Gem::Util.gzip(index_file)
53
108
  else
54
109
  index_file
55
110
  end
56
111
  end
57
-
112
+
58
113
  def self.build(identifier, **options)
59
114
  format = options[:format] || :marshal
60
115
  require_relative "builders/#{identifier}"
61
116
  # require "unicoder/builders/#{identifier}"
62
117
  builder_class = self.const_get(identifier.to_s.gsub(/(?:^|_)([a-z])/){ $1.upcase })
63
- builder = builder_class.new(options[:unicode_version] || CURRENT_UNICODE_VERSION)
118
+ builder = builder_class.new(
119
+ options[:unicode_version],
120
+ options[:emoji_version],
121
+ format
122
+ )
64
123
  puts "Building index for #{identifier}…"
124
+ if options[:option]
125
+ builder.option = options[:option]
126
+ end
65
127
  builder.parse!
66
- index_file = builder.export(options)
128
+ index_file = builder.export(**options)
67
129
 
68
130
  destination ||= options[:destination] || identifier.to_s
69
- destination += ".#{format}"
131
+ destination += "#{builder.formats.dig(format.to_sym, :ext)}"
70
132
  destination += ".gz" if options[:gzip]
71
133
  bytes = File.write destination, index_file
72
134
 
@@ -14,23 +14,18 @@ module Unicoder
14
14
  end
15
15
 
16
16
  def parse!
17
- parse_file :unicode_data, :line, regex: /^(?<codepoint>.+?);(?<range><(?!control).+>)?.*?;(?<category>.+?);.*$/ do |line|
18
- if line["range"]
19
- if line["range"] =~ /First/
20
- @range_start = line["codepoint"].to_i(16)
21
- elsif line["range"] =~ /Last/ && @range_start
22
- (@range_start..line["codepoint"].to_i(16)).each{ |codepoint|
23
- assign_codepoint(codepoint, line["category"], @index[:CATEGORIES])
24
- }
25
- else
26
- raise ArgumentError, "inconsistent range found in data, don't know what to do"
27
- end
17
+ parse_file :general_categories, :line, regex: /^(?<from>[^. ]+)(?:..(?<to>\S+))?\s*; (?<category>\S+).*$/ do |line|
18
+ if line["to"]
19
+ (line["from"].to_i(16)..line["to"].to_i(16)).each{ |codepoint|
20
+ assign_codepoint(codepoint, line["category"] == "Cn" ? nil : line["category"], @index[:CATEGORIES])
21
+ }
28
22
  else
29
- assign_codepoint(line["codepoint"].to_i(16), line["category"], @index[:CATEGORIES])
23
+ assign_codepoint(line["from"].to_i(16), line["category"] == "Cn" ? nil : line["category"], @index[:CATEGORIES])
30
24
  end
31
25
  end
32
26
 
33
27
  4.times{ compress! @index[:CATEGORIES] }
28
+ remove_trailing_nils! @index[:CATEGORIES]
34
29
 
35
30
  parse_file :property_value_aliases, :line, regex: /^gc ; (?<short>\S{2}?) *; (?<long>\S+).*$/ do |line|
36
31
  @index[:CATEGORY_NAMES][line["short"]] = line["long"]
@@ -6,7 +6,23 @@ module Unicoder
6
6
 
7
7
  IGNORE_CATEGORIES = %w[Cs Co Cn].freeze
8
8
  ZERO_WIDTH_CATEGORIES = %w[Mn Me Cf].freeze
9
- ZERO_WIDTH_CODEPOINTS = [*0x1160..0x11FF].freeze
9
+
10
+ ZERO_WIDTH_RANGES = [
11
+ *0x1160..0x11FF, # HANGUL JUNGSEONG
12
+ *0xD7B0..0xD7FF, # HANGUL JUNGSEONG
13
+ *0x2060..0x206F, # Ignorables
14
+ *0xFFF0..0xFFF8, # Ignorables
15
+ *0xE0000..0xE0FFF, # Ignorables
16
+ ].freeze
17
+
18
+ WIDE_RANGES = [
19
+ *0x3400..0x4DBF,
20
+ *0x4E00..0x9FFF,
21
+ *0xF900..0xFAFF,
22
+ *0x20000..0x2FFFD,
23
+ *0x30000..0x3FFFD,
24
+ ].freeze
25
+
10
26
  SPECIAL_WIDTHS = {
11
27
  0x0 => 0, # \0 NULL
12
28
  0x5 => 0, # ENQUIRY
@@ -18,7 +34,7 @@ module Unicoder
18
34
  0xD => 0, # \r CARRIAGE RETURN
19
35
  0xE => 0, # SHIFT OUT
20
36
  0xF => 0, # SHIFT IN
21
- 0x00AD => 1, # SOFT HYPHEN
37
+ 0x00AD => nil, # SOFT HYPHEN
22
38
  0x2E3A => 2, # TWO-EM DASH
23
39
  0x2E3B => 3, # THREE-EM DASH
24
40
  }.freeze
@@ -28,7 +44,7 @@ module Unicoder
28
44
  end
29
45
 
30
46
  def parse!
31
- parse_file :east_asian_width, :line, regex: /^(?<codepoints>\S+?);(?<width>\S+)\s+#\s(?<category>\S+).*$/ do |line|
47
+ parse_file :east_asian_width, :line, regex: /^(?<codepoints>\S+?)\s*;\s*(?<width>\S+)\s+#\s(?<category>\S+).*$/ do |line|
32
48
  next if IGNORE_CATEGORIES.include?(line["category"])
33
49
 
34
50
  if line["codepoints"]['..']
@@ -44,19 +60,24 @@ module Unicoder
44
60
  }
45
61
  end
46
62
 
63
+ ZERO_WIDTH_RANGES.each{ |codepoint|
64
+ assign_codepoint codepoint, 0
65
+ }
66
+
67
+ WIDE_RANGES.each{ |codepoint|
68
+ assign_codepoint codepoint, 2
69
+ }
70
+
47
71
  SPECIAL_WIDTHS.each{ |codepoint, value|
48
72
  assign_codepoint codepoint, value
49
73
  }
50
74
 
51
75
  4.times{ compress! }
52
-
53
- p @index
54
76
  end
55
77
 
56
78
  def determine_width(codepoint, category, east_asian_width)
57
79
  if ( ZERO_WIDTH_CATEGORIES.include?(category) &&
58
- [codepoint].pack('U') !~ /\p{Cf}(?<=\p{Arabic})/ ) ||
59
- ZERO_WIDTH_CODEPOINTS.include?(codepoint)
80
+ [codepoint].pack('U') !~ /\p{Cf}(?<=\p{Arabic})/ )
60
81
  0
61
82
  elsif east_asian_width == "F" || east_asian_width == "W"
62
83
  2
@@ -0,0 +1,97 @@
1
+ module Unicoder
2
+ module Builder
3
+ class Emoji
4
+ include Builder
5
+
6
+ REVERSE_PROPERTY_NAMES = {
7
+ "Emoji" => :E,
8
+ "Emoji_Modifier_Base" => :B,
9
+ "Emoji_Modifier" => :M,
10
+ "Emoji_Component" => :C,
11
+ "Emoji_Presentation" => :P,
12
+ "Extended_Pictographic" => :X,
13
+ }
14
+
15
+ def initialize_index
16
+ @index = {
17
+ PROPERTIES: {},
18
+ FLAGS: [],
19
+ TAGS: [],
20
+ KEYCAPS: [],
21
+ ZWJ: [],
22
+ SD: [],
23
+ LIST: {},
24
+ }
25
+ end
26
+
27
+ def parse!
28
+ parse_file :emoji_data, :line, regex: /^(?<codepoints>\S+?) +; (?<property>\S+) *#/ do |line|
29
+ if line["codepoints"]['..']
30
+ codepoints = Range.new(*line["codepoints"].split('..').map{ |codepoint|
31
+ codepoint.to_i(16)
32
+ })
33
+ else
34
+ codepoints = [line["codepoints"].to_i(16)]
35
+ end
36
+
37
+ codepoints.each{ |codepoint|
38
+ @index[:PROPERTIES][codepoint] ||= []
39
+ @index[:PROPERTIES][codepoint] << (REVERSE_PROPERTY_NAMES[line["property"]] || line["property"])
40
+ }
41
+ end
42
+
43
+ parse_file :emoji_sequences, :line, regex: /^(?<codepoints>.+?)\s*; RGI_Emoji_Flag_Sequence/ do |line|
44
+ codepoints = line["codepoints"].split
45
+ @index[:FLAGS] << codepoints.map{|e| e.to_i(16)}
46
+ end
47
+
48
+ parse_file :emoji_sequences, :line, regex: /^(?<codepoints>.+?)\s*; RGI_Emoji_Tag_Sequence/ do |line|
49
+ codepoints = line["codepoints"].split
50
+ @index[:TAGS] << codepoints.map{|e| e.to_i(16)}
51
+ end
52
+
53
+ parse_file :emoji_sequences, :line, regex: /^(?<codepoints>.+?)\s*; Emoji_Keycap_Sequence/ do |line|
54
+ @index[:KEYCAPS] << line["codepoints"].split[0].to_i(16)
55
+ end
56
+
57
+ parse_file :emoji_zwj_sequences, :line, regex: /^(?!#)(?<codepoints>.+?)\s*;/ do |line|
58
+ codepoints = line["codepoints"].split
59
+ @index[:ZWJ] << codepoints.map{|e| e.to_i(16)}
60
+ end
61
+
62
+ parse_file :valid_subdivisions, :xml do |xml|
63
+ subdivisions = []
64
+ xml.css('[idStatus="regular"], [idStatus="deprecated"]').each{ |id|
65
+ subdivisions += id.text.split
66
+ }
67
+ @index[:SD] = subdivisions.uniq
68
+ end
69
+
70
+ parse_file :emoji_test, :line, regex: /^(?:# (?<sub>sub)?group: (?<group_name>.*)$)|(?:(?<codepoints>.+?)\s*; fully-qualified )/ do |line|
71
+ if line["group_name"]
72
+ if !line["sub"]
73
+ @current_group_name = line["group_name"]
74
+ @index[:LIST][@current_group_name] = {}
75
+ else
76
+ @current_subgroup_name = line["group_name"]
77
+ @index[:LIST][@current_group_name][@current_subgroup_name] = []
78
+ end
79
+ else
80
+ codepoints = line["codepoints"].split
81
+ @index[:LIST][@current_group_name][@current_subgroup_name] << codepoints.map{|e| e.to_i(16)}.pack("U*")
82
+ end
83
+ end
84
+ end
85
+ end
86
+ end
87
+ end
88
+
89
+ =begin alternative
90
+ current_index_level = @index[:SEQUENCES]
91
+ codepoints.each{ |cp|
92
+ ord = cp.to_i(16)
93
+ current_index_level[ord] ||= {}
94
+ current_index_level = current_index_level[ord]
95
+ }
96
+ current_index_level[true] = true # end mark
97
+ =end
@@ -0,0 +1,75 @@
1
+ module Unicoder
2
+ module Builder
3
+ class Name
4
+ include Builder
5
+
6
+ JAMO_INITIAL = 4352
7
+ JAMO_MEDIAL = 4449
8
+ JAMO_FINAL = 4520
9
+ JAMO_END = 4697
10
+
11
+ def initialize_index
12
+ @index = {
13
+ NAMES: {},
14
+ ALIASES: {},
15
+ CJK: [],
16
+ HANGUL: [],
17
+ # see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_Syllables_Area
18
+ JAMO: {
19
+ INITIAL: [],
20
+ MEDIAL: [],
21
+ FINAL: [""],
22
+ },
23
+ }
24
+ @range_start = nil
25
+ end
26
+
27
+ def parse!
28
+ if option =~ /charkeys/
29
+ get_key = ->(codepoint){ [codepoint].pack("U*") }
30
+ else
31
+ get_key = -> (codepoint){ codepoint }
32
+ end
33
+
34
+ parse_file :unicode_data, :line, regex: /^(?<codepoint>.+?);(?<name>.+?);.*$/ do |line|
35
+ if line["name"][0] == "<" && line["name"][-1] == ">"
36
+ if line["name"] =~ /First/
37
+ @range_start = line["codepoint"].to_i(16)
38
+ elsif line["name"] =~ /Last/ && @range_start
39
+ if line["name"] =~ /Hangul/
40
+ @index[:HANGUL] << [@range_start, line["codepoint"].to_i(16)]
41
+ elsif line["name"] =~ /CJK/
42
+ @index[:CJK] << [@range_start, line["codepoint"].to_i(16)]
43
+ else
44
+ # no name
45
+ end
46
+ @range_start = nil
47
+ elsif line["name"] != "<control>"
48
+ raise ArgumentError, "inconsistent range found in data, don't know what to do"
49
+ end
50
+ else
51
+ assign :NAMES, line["codepoint"].to_i(16), line["name"]
52
+ end
53
+ end
54
+
55
+ parse_file :name_aliases, :line, regex: /^(?<codepoint>.+?);(?<alias>.+?);(?<type>.*)$/ do |line|
56
+ @index[:ALIASES][get_key[line["codepoint"].to_i(16)]] ||= {}
57
+ @index[:ALIASES][get_key[line["codepoint"].to_i(16)]][line["type"].to_sym] ||= []
58
+ @index[:ALIASES][get_key[line["codepoint"].to_i(16)]][line["type"].to_sym] << line["alias"]
59
+ end
60
+
61
+ parse_file :jamo, :line, regex: /^(?<codepoint>.+?); (?<short_name>.*?) +#.*$/ do |line|
62
+ case line["codepoint"].to_i(16)
63
+ when JAMO_INITIAL...JAMO_MEDIAL
64
+ @index[:JAMO][:INITIAL] << line["short_name"]
65
+ when JAMO_MEDIAL...JAMO_FINAL
66
+ @index[:JAMO][:MEDIAL] << line["short_name"]
67
+ when JAMO_FINAL..JAMO_END
68
+ @index[:JAMO][:FINAL] << line["short_name"]
69
+ end
70
+ end
71
+ end
72
+ end
73
+ end
74
+ end
75
+