unicoder 0.1.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: f79eb48ad06b13b61fc4ceb7fc5e176ee4e9e984
4
- data.tar.gz: 94a62eb108e01e1d7da774b58352ab0585235bc7
2
+ SHA256:
3
+ metadata.gz: 21168580053326da8f495794ab5133fc9fa3daa3fa66209cf5ddbb0ac68de923
4
+ data.tar.gz: 3829094ce2cfb8322d79f4a11ef2015235f2b7e602097ab3defb7c8cc4f825de
5
5
  SHA512:
6
- metadata.gz: 01714742c72568ab92a9c3df0b700f3918e32482b7f658da8f099e2cfb54359e098e90fa1caa72a343cbdf2ede36081a9c01a6d65ee76cee841e65b87c9083ad
7
- data.tar.gz: dd5b55100962d9408a503b338ebf25062c3dee7dc1ff9ceaccd97e30d57f97d131191ef96ce266e267cc729d70e6a0860702f22fbb1c9a6e4b512547ff1b5805
6
+ metadata.gz: e5a9b3d54c062817485b0b66b34015d1d12536fd4e82a601d604302d73bf06c0963dd3e8ca0b99bc9989ec62799c025b702218dbde445c46f7aa42671ccb4ae0
7
+ data.tar.gz: ac856c57cc3a9bd7fb8c0e65f282858b760ce42be93411fc5ae8cc2d3007cd9cc33b3e422cbdf8fa8f95316cf84994d328ed6679ec3ccd09d7515f49f30aa6a1
data/.gitignore CHANGED
@@ -1,3 +1,8 @@
1
1
  Gemfile.lock
2
2
  /pkg
3
- /data
3
+ /data*
4
+ *.marshal
5
+ *.marshal.gz
6
+ *.json
7
+ *.mjs
8
+ /old-data
data/.travis.yml CHANGED
@@ -1,20 +1,20 @@
1
1
  sudo: false
2
2
  language: ruby
3
3
 
4
- script: bundle exec ruby spec/unicoder_spec.rb
5
-
6
4
  rvm:
7
- - 2.3.0
8
- - 2.2
9
- - 2.1
10
- - 2.0
5
+ - 2.7
6
+ - 2.6
7
+ - 2.5
8
+ - 2.4
9
+ - 2.3
11
10
  - ruby-head
12
- - rbx-2
13
- - jruby-head
14
- - jruby-9000
15
-
16
- cache:
17
- - bundler
11
+ - jruby-9.2.9.0
12
+ - truffleruby
18
13
 
19
- # matrix:
14
+ matrix:
15
+ allow_failures:
16
+ - rvm: 2.3
17
+ - rvm: ruby-head
18
+ - rvm: jruby-9.2.9.0
19
+ - rvm: truffleruby
20
20
  # fast_finish: true
data/CHANGELOG.md CHANGED
@@ -1,5 +1,23 @@
1
1
  ## CHANGELOG
2
2
 
3
+ ### 1.0.0
4
+
5
+ With the first 1.0 release, unicoder supports 10 indexes:
6
+
7
+ - blocks
8
+ - categories
9
+ - confusable
10
+ - display_width
11
+ - emoji
12
+ - name
13
+ - numeric_value
14
+ - scripts
15
+ - sequence_name
16
+ - types
17
+
18
+ All indexes can be built in `marshal` format (Ruby's internal
19
+ serialization format) and some now support `esm` (JavaScript module).
20
+
3
21
  ### 0.1.0
4
22
 
5
- * WIP
23
+ * Initial release
data/Gemfile CHANGED
@@ -3,3 +3,5 @@ source 'https://rubygems.org'
3
3
  gemspec
4
4
 
5
5
  gem 'minitest'
6
+ gem 'rake'
7
+ gem 'irbtools', require: "irbtools/binding"
data/Gemfile.lock ADDED
@@ -0,0 +1,99 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ unicoder (1.0.0)
5
+ oga (~> 2.9)
6
+ rationalist (~> 2.0)
7
+ rubyzip (~> 1.2)
8
+
9
+ GEM
10
+ remote: https://rubygems.org/
11
+ specs:
12
+ ansi (1.5.0)
13
+ ast (2.4.2)
14
+ cd (1.0.2)
15
+ clipboard (2.0.0)
16
+ code (0.9.4)
17
+ coderay (~> 1.1)
18
+ method_source (>= 0.9, < 2.0)
19
+ coderay (1.1.3)
20
+ core_docs (0.9.11)
21
+ yard (~> 0.9.11)
22
+ debugging (2.1.0)
23
+ paint (>= 0.9, < 3.0)
24
+ every_day_irb (2.2.0)
25
+ cd (~> 1.0)
26
+ fancy_irb (2.1.2)
27
+ irb (>= 1.7, < 2.0)
28
+ paint (>= 0.9, < 3.0)
29
+ unicode-display_width (>= 2.5)
30
+ ffi (1.17.0)
31
+ hirb (0.7.3)
32
+ interactive_editor (0.0.12)
33
+ spoon (~> 0.0.6)
34
+ io-console (0.7.2)
35
+ irb (1.14.1)
36
+ rdoc (>= 4.0.0)
37
+ reline (>= 0.4.2)
38
+ irbtools (4.1.0)
39
+ clipboard (>= 1.4, < 3.0)
40
+ code (>= 0.9.4, < 2.0)
41
+ coderay (~> 1.1)
42
+ core_docs (~> 0.9.11)
43
+ debugging (~> 2.1)
44
+ every_day_irb (~> 2.2)
45
+ fancy_irb (~> 2.1)
46
+ hirb (~> 0.7, >= 0.7.3)
47
+ interactive_editor (~> 0.0, >= 0.0.12)
48
+ irb (>= 1.13.0, < 1.15)
49
+ looksee (~> 5.0)
50
+ methodfinder (~> 2.2, >= 2.2.5)
51
+ object_shadow (~> 1.1)
52
+ os (~> 1.1, >= 1.1.4)
53
+ paint (>= 0.9, < 3.0)
54
+ ruby_engine (~> 2.0)
55
+ ruby_version (~> 1.0)
56
+ wirb (~> 2.0, >= 2.2.1)
57
+ looksee (5.0.0)
58
+ method_source (1.1.0)
59
+ methodfinder (2.2.5)
60
+ minitest (5.25.1)
61
+ object_shadow (1.1.1)
62
+ oga (2.15)
63
+ ast
64
+ ruby-ll (~> 2.1)
65
+ os (1.1.4)
66
+ paint (2.3.0)
67
+ psych (5.1.2)
68
+ stringio
69
+ rake (13.2.1)
70
+ rationalist (2.0.1)
71
+ rdoc (6.7.0)
72
+ psych (>= 4.0.0)
73
+ reline (0.5.10)
74
+ io-console (~> 0.5)
75
+ ruby-ll (2.1.3)
76
+ ansi
77
+ ast
78
+ ruby_engine (2.0.3)
79
+ ruby_version (1.0.3)
80
+ rubyzip (1.3.0)
81
+ spoon (0.0.6)
82
+ ffi
83
+ stringio (3.1.1)
84
+ unicode-display_width (2.6.0)
85
+ wirb (2.2.2)
86
+ paint (>= 0.9, < 3.0)
87
+ yard (0.9.37)
88
+
89
+ PLATFORMS
90
+ ruby
91
+
92
+ DEPENDENCIES
93
+ irbtools
94
+ minitest
95
+ rake
96
+ unicoder!
97
+
98
+ BUNDLED WITH
99
+ 2.5.21
data/MIT-LICENSE.txt CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2016 Jan Lelis, mail@janlelis.de
1
+ Copyright (c) 2016-2020 Jan Lelis, https://janlelis.com
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person obtaining
4
4
  a copy of this software and associated documentation files (the
data/README.md CHANGED
@@ -1,15 +1,45 @@
1
- # unicoder [![[version]](https://badge.fury.io/rb/unicoder.svg)](http://badge.fury.io/rb/unicoder)
2
-
3
- WIP
1
+ # unicoder [![[version]](https://badge.fury.io/rb/unicoder.svg)](https://badge.fury.io/rb/unicoder)
4
2
 
3
+ unicoder turns Unicode data into bundles for programming libraries.
5
4
 
6
5
  ## Usage
7
6
 
8
7
  ```
9
- $ unicoder build index_name
8
+ $ unicoder build <index_name> [--gzip]
9
+ ```
10
+
11
+ Examples:
12
+
13
+ ```
14
+ $ unicoder build emoji --format marshal --gzip
15
+ $ unicoder build numeric_value --format esm
10
16
  ```
11
17
 
12
18
 
19
+ ## Libraries With unicoder-based Indexes
20
+
21
+ ### Ruby
22
+
23
+ Index Name | Gem
24
+ --------------|----
25
+ blocks | [unicode-blocks](https://github.com/janlelis/unicode-blocks)
26
+ categories | [unicode-categories](https://github.com/janlelis/unicode-categories)
27
+ confusable | [unicode-confusable](https://github.com/janlelis/unicode-confusable)
28
+ emoji | [unicode-emoji](https://github.com/janlelis/unicode-emoji)
29
+ display\_width| [unicode-display_width](https://github.com/janlelis/unicode-display_width)
30
+ name | [unicode-name](https://github.com/janlelis/unicode-name)
31
+ numeric\_value| [unicode-numeric_value](https://github.com/janlelis/unicode-numeric_value)
32
+ scripts | [unicode-scripts](https://github.com/janlelis/unicode-scripts)
33
+ sequence\_name| [unicode-sequence_name](https://github.com/janlelis/unicode-sequence_name)
34
+ types | [unicode-types](https://github.com/janlelis/unicode-types)
35
+
36
+ ### JavaScript (ESM)
37
+
38
+ Index Name | Module
39
+ --------------|----
40
+ numeric\_value| [unicode-number](https://github.com/janlelis/unicode-number.js)
41
+ name, sequence\_name, types | [unicode-name](https://github.com/janlelis/unicode-name)
42
+
13
43
  ## MIT License
14
44
 
15
- Copyright (C) 2016 Jan Lelis <http://janlelis.com>. Released under the MIT license.
45
+ Copyright (C) 2016-2024 Jan Lelis <https://janlelis.com>. Released under the MIT license.
data/bin/unicoder CHANGED
@@ -6,7 +6,7 @@ require "rationalist"
6
6
  args = Rationalist.parse
7
7
  command = args[:_][0]
8
8
  identifier = args[:_][1]
9
- KNOWN_OPTIONS = [:version, :help, :verbose, :format, :gzip]
9
+ KNOWN_OPTIONS = [:version, :help, :verbose, :format, :gzip, :option, :meta]
10
10
  options = args.select { |option,| KNOWN_OPTIONS.include? option }
11
11
 
12
12
  if options.has_key?(:version)
@@ -1,13 +1,42 @@
1
1
  require "json"
2
+ require "rubygems/util"
2
3
 
3
4
  module Unicoder
4
5
  # A builder defines a parse function which translates one (or more) unicode data
5
6
  # files into an index hash
6
7
  module Builder
7
- attr_reader :index
8
+ attr_reader :index, :formats, :option
9
+ attr_writer :option
8
10
 
9
- def initialize(unicode_version = nil)
10
- @unicode_version = unicode_version
11
+ def formats
12
+ {
13
+ marshal: {
14
+ ext: ".marshal",
15
+ },
16
+ json: {
17
+ ext: ".json",
18
+ option: "charkeys+stringfractions"
19
+ },
20
+ esm: {
21
+ ext: ".mjs",
22
+ option: "charkeys+stringfractions"
23
+ }
24
+ }
25
+ end
26
+
27
+ def meta
28
+ {
29
+ META: {
30
+ generator: "unicoder v#{Unicoder::VERSION}",
31
+ unicodeVersion: @unicode_version,
32
+ },
33
+ }
34
+ end
35
+
36
+ def initialize(unicode_version = nil, emoji_version = nil, format = nil)
37
+ @unicode_version = unicode_version || CURRENT_UNICODE_VERSION
38
+ @emoji_version = emoji_version || CURRENT_EMOJI_VERSION
39
+ @option = formats[format.to_sym] ? formats[format.to_sym][:option] || "" : ""
11
40
  initialize_index
12
41
  end
13
42
 
@@ -15,8 +44,16 @@ module Unicoder
15
44
  @index = {}
16
45
  end
17
46
 
18
- def assign_codepoint(codepoint, value, index = @index)
19
- index[codepoint] = value
47
+ def assign_codepoint(codepoint, value, idx = @index)
48
+ if option =~ /charkeys/
49
+ idx[[codepoint].pack("U*")] = value
50
+ else
51
+ idx[codepoint] = value
52
+ end
53
+ end
54
+
55
+ def assign(sub_index_name, codepoint, value)
56
+ assign_codepoint(codepoint, value, index[sub_index_name])
20
57
  end
21
58
 
22
59
  def parse!
@@ -26,47 +63,72 @@ module Unicoder
26
63
  def parse_file(identifier, parse_mode, **parse_options)
27
64
  filename = UNICODE_FILES[identifier.to_sym] || filename
28
65
  raise ArgumentError, "No valid file identifier or filename given" if !filename
29
- filename.sub! 'VERSION', @unicode_version
30
- Downloader.fetch(identifier) unless File.exists?(filename)
66
+ filename = filename.dup
67
+ filename.sub! 'UNICODE_VERSION', @unicode_version
68
+ filename.sub! 'EMOJI_VERSION', @emoji_version
69
+ filename.sub! 'EMOJI_RELATED_VERSION', EMOJI_RELATED_UNICODE_VERSIONS[@emoji_version]
70
+ filename.sub! '.zip', ''
71
+ filename.sub! /\A(https?|ftp):\//, ""
72
+ Downloader.fetch(identifier) unless File.exist?(LOCAL_DATA_DIRECTORY + filename)
31
73
  file = File.read(LOCAL_DATA_DIRECTORY + filename)
32
74
 
33
75
  if parse_mode == :line
34
76
  file.each_line{ |line|
35
77
  yield Hash[ $~.names.zip( $~.captures ) ] if line =~ parse_options[:regex]
36
78
  }
79
+ elsif parse_mode == :xml
80
+ require "oga"
81
+ yield Oga.parse_xml(file)
82
+ else
83
+ yield file
37
84
  end
38
85
  end
39
86
 
40
87
  def export(format: :marshal, **options)
41
88
  p index if options[:verbose]
42
89
 
90
+ if options[:meta]
91
+ idx = meta.merge(index)
92
+ else
93
+ idx = index
94
+ end
95
+
96
+
43
97
  case format.to_sym
44
98
  when :marshal
45
- index_file = Marshal.dump(index)
99
+ index_file = Marshal.dump(idx)
46
100
  when :json
47
- index_file = JSON.dump(index)
101
+ index_file = JSON.dump(idx)
102
+ when :esm
103
+ index_file = "export default " + JSON.dump(idx)
48
104
  end
49
105
 
50
- # if false# || options[:gzip]
51
106
  if options[:gzip]
52
- Gem.gzip(index_file)
107
+ Gem::Util.gzip(index_file)
53
108
  else
54
109
  index_file
55
110
  end
56
111
  end
57
-
112
+
58
113
  def self.build(identifier, **options)
59
114
  format = options[:format] || :marshal
60
115
  require_relative "builders/#{identifier}"
61
116
  # require "unicoder/builders/#{identifier}"
62
117
  builder_class = self.const_get(identifier.to_s.gsub(/(?:^|_)([a-z])/){ $1.upcase })
63
- builder = builder_class.new(options[:unicode_version] || CURRENT_UNICODE_VERSION)
118
+ builder = builder_class.new(
119
+ options[:unicode_version],
120
+ options[:emoji_version],
121
+ format
122
+ )
64
123
  puts "Building index for #{identifier}…"
124
+ if options[:option]
125
+ builder.option = options[:option]
126
+ end
65
127
  builder.parse!
66
- index_file = builder.export(options)
128
+ index_file = builder.export(**options)
67
129
 
68
130
  destination ||= options[:destination] || identifier.to_s
69
- destination += ".#{format}"
131
+ destination += "#{builder.formats.dig(format.to_sym, :ext)}"
70
132
  destination += ".gz" if options[:gzip]
71
133
  bytes = File.write destination, index_file
72
134
 
@@ -14,23 +14,18 @@ module Unicoder
14
14
  end
15
15
 
16
16
  def parse!
17
- parse_file :unicode_data, :line, regex: /^(?<codepoint>.+?);(?<range><(?!control).+>)?.*?;(?<category>.+?);.*$/ do |line|
18
- if line["range"]
19
- if line["range"] =~ /First/
20
- @range_start = line["codepoint"].to_i(16)
21
- elsif line["range"] =~ /Last/ && @range_start
22
- (@range_start..line["codepoint"].to_i(16)).each{ |codepoint|
23
- assign_codepoint(codepoint, line["category"], @index[:CATEGORIES])
24
- }
25
- else
26
- raise ArgumentError, "inconsistent range found in data, don't know what to do"
27
- end
17
+ parse_file :general_categories, :line, regex: /^(?<from>[^. ]+)(?:..(?<to>\S+))?\s*; (?<category>\S+).*$/ do |line|
18
+ if line["to"]
19
+ (line["from"].to_i(16)..line["to"].to_i(16)).each{ |codepoint|
20
+ assign_codepoint(codepoint, line["category"] == "Cn" ? nil : line["category"], @index[:CATEGORIES])
21
+ }
28
22
  else
29
- assign_codepoint(line["codepoint"].to_i(16), line["category"], @index[:CATEGORIES])
23
+ assign_codepoint(line["from"].to_i(16), line["category"] == "Cn" ? nil : line["category"], @index[:CATEGORIES])
30
24
  end
31
25
  end
32
26
 
33
27
  4.times{ compress! @index[:CATEGORIES] }
28
+ remove_trailing_nils! @index[:CATEGORIES]
34
29
 
35
30
  parse_file :property_value_aliases, :line, regex: /^gc ; (?<short>\S{2}?) *; (?<long>\S+).*$/ do |line|
36
31
  @index[:CATEGORY_NAMES][line["short"]] = line["long"]
@@ -6,7 +6,23 @@ module Unicoder
6
6
 
7
7
  IGNORE_CATEGORIES = %w[Cs Co Cn].freeze
8
8
  ZERO_WIDTH_CATEGORIES = %w[Mn Me Cf].freeze
9
- ZERO_WIDTH_CODEPOINTS = [*0x1160..0x11FF].freeze
9
+
10
+ ZERO_WIDTH_RANGES = [
11
+ *0x1160..0x11FF, # HANGUL JUNGSEONG
12
+ *0xD7B0..0xD7FF, # HANGUL JUNGSEONG
13
+ *0x2060..0x206F, # Ignorables
14
+ *0xFFF0..0xFFF8, # Ignorables
15
+ *0xE0000..0xE0FFF, # Ignorables
16
+ ].freeze
17
+
18
+ WIDE_RANGES = [
19
+ *0x3400..0x4DBF,
20
+ *0x4E00..0x9FFF,
21
+ *0xF900..0xFAFF,
22
+ *0x20000..0x2FFFD,
23
+ *0x30000..0x3FFFD,
24
+ ].freeze
25
+
10
26
  SPECIAL_WIDTHS = {
11
27
  0x0 => 0, # \0 NULL
12
28
  0x5 => 0, # ENQUIRY
@@ -18,7 +34,7 @@ module Unicoder
18
34
  0xD => 0, # \r CARRIAGE RETURN
19
35
  0xE => 0, # SHIFT OUT
20
36
  0xF => 0, # SHIFT IN
21
- 0x00AD => 1, # SOFT HYPHEN
37
+ 0x00AD => nil, # SOFT HYPHEN
22
38
  0x2E3A => 2, # TWO-EM DASH
23
39
  0x2E3B => 3, # THREE-EM DASH
24
40
  }.freeze
@@ -28,7 +44,7 @@ module Unicoder
28
44
  end
29
45
 
30
46
  def parse!
31
- parse_file :east_asian_width, :line, regex: /^(?<codepoints>\S+?);(?<width>\S+)\s+#\s(?<category>\S+).*$/ do |line|
47
+ parse_file :east_asian_width, :line, regex: /^(?<codepoints>\S+?)\s*;\s*(?<width>\S+)\s+#\s(?<category>\S+).*$/ do |line|
32
48
  next if IGNORE_CATEGORIES.include?(line["category"])
33
49
 
34
50
  if line["codepoints"]['..']
@@ -44,19 +60,24 @@ module Unicoder
44
60
  }
45
61
  end
46
62
 
63
+ ZERO_WIDTH_RANGES.each{ |codepoint|
64
+ assign_codepoint codepoint, 0
65
+ }
66
+
67
+ WIDE_RANGES.each{ |codepoint|
68
+ assign_codepoint codepoint, 2
69
+ }
70
+
47
71
  SPECIAL_WIDTHS.each{ |codepoint, value|
48
72
  assign_codepoint codepoint, value
49
73
  }
50
74
 
51
75
  4.times{ compress! }
52
-
53
- p @index
54
76
  end
55
77
 
56
78
  def determine_width(codepoint, category, east_asian_width)
57
79
  if ( ZERO_WIDTH_CATEGORIES.include?(category) &&
58
- [codepoint].pack('U') !~ /\p{Cf}(?<=\p{Arabic})/ ) ||
59
- ZERO_WIDTH_CODEPOINTS.include?(codepoint)
80
+ [codepoint].pack('U') !~ /\p{Cf}(?<=\p{Arabic})/ )
60
81
  0
61
82
  elsif east_asian_width == "F" || east_asian_width == "W"
62
83
  2
@@ -0,0 +1,97 @@
1
+ module Unicoder
2
+ module Builder
3
+ class Emoji
4
+ include Builder
5
+
6
+ REVERSE_PROPERTY_NAMES = {
7
+ "Emoji" => :E,
8
+ "Emoji_Modifier_Base" => :B,
9
+ "Emoji_Modifier" => :M,
10
+ "Emoji_Component" => :C,
11
+ "Emoji_Presentation" => :P,
12
+ "Extended_Pictographic" => :X,
13
+ }
14
+
15
+ def initialize_index
16
+ @index = {
17
+ PROPERTIES: {},
18
+ FLAGS: [],
19
+ TAGS: [],
20
+ KEYCAPS: [],
21
+ ZWJ: [],
22
+ SD: [],
23
+ LIST: {},
24
+ }
25
+ end
26
+
27
+ def parse!
28
+ parse_file :emoji_data, :line, regex: /^(?<codepoints>\S+?) +; (?<property>\S+) *#/ do |line|
29
+ if line["codepoints"]['..']
30
+ codepoints = Range.new(*line["codepoints"].split('..').map{ |codepoint|
31
+ codepoint.to_i(16)
32
+ })
33
+ else
34
+ codepoints = [line["codepoints"].to_i(16)]
35
+ end
36
+
37
+ codepoints.each{ |codepoint|
38
+ @index[:PROPERTIES][codepoint] ||= []
39
+ @index[:PROPERTIES][codepoint] << (REVERSE_PROPERTY_NAMES[line["property"]] || line["property"])
40
+ }
41
+ end
42
+
43
+ parse_file :emoji_sequences, :line, regex: /^(?<codepoints>.+?)\s*; RGI_Emoji_Flag_Sequence/ do |line|
44
+ codepoints = line["codepoints"].split
45
+ @index[:FLAGS] << codepoints.map{|e| e.to_i(16)}
46
+ end
47
+
48
+ parse_file :emoji_sequences, :line, regex: /^(?<codepoints>.+?)\s*; RGI_Emoji_Tag_Sequence/ do |line|
49
+ codepoints = line["codepoints"].split
50
+ @index[:TAGS] << codepoints.map{|e| e.to_i(16)}
51
+ end
52
+
53
+ parse_file :emoji_sequences, :line, regex: /^(?<codepoints>.+?)\s*; Emoji_Keycap_Sequence/ do |line|
54
+ @index[:KEYCAPS] << line["codepoints"].split[0].to_i(16)
55
+ end
56
+
57
+ parse_file :emoji_zwj_sequences, :line, regex: /^(?!#)(?<codepoints>.+?)\s*;/ do |line|
58
+ codepoints = line["codepoints"].split
59
+ @index[:ZWJ] << codepoints.map{|e| e.to_i(16)}
60
+ end
61
+
62
+ parse_file :valid_subdivisions, :xml do |xml|
63
+ subdivisions = []
64
+ xml.css('[idStatus="regular"], [idStatus="deprecated"]').each{ |id|
65
+ subdivisions += id.text.split
66
+ }
67
+ @index[:SD] = subdivisions.uniq
68
+ end
69
+
70
+ parse_file :emoji_test, :line, regex: /^(?:# (?<sub>sub)?group: (?<group_name>.*)$)|(?:(?<codepoints>.+?)\s*; fully-qualified )/ do |line|
71
+ if line["group_name"]
72
+ if !line["sub"]
73
+ @current_group_name = line["group_name"]
74
+ @index[:LIST][@current_group_name] = {}
75
+ else
76
+ @current_subgroup_name = line["group_name"]
77
+ @index[:LIST][@current_group_name][@current_subgroup_name] = []
78
+ end
79
+ else
80
+ codepoints = line["codepoints"].split
81
+ @index[:LIST][@current_group_name][@current_subgroup_name] << codepoints.map{|e| e.to_i(16)}.pack("U*")
82
+ end
83
+ end
84
+ end
85
+ end
86
+ end
87
+ end
88
+
89
+ =begin alternative
90
+ current_index_level = @index[:SEQUENCES]
91
+ codepoints.each{ |cp|
92
+ ord = cp.to_i(16)
93
+ current_index_level[ord] ||= {}
94
+ current_index_level = current_index_level[ord]
95
+ }
96
+ current_index_level[true] = true # end mark
97
+ =end
@@ -0,0 +1,75 @@
1
+ module Unicoder
2
+ module Builder
3
+ class Name
4
+ include Builder
5
+
6
+ JAMO_INITIAL = 4352
7
+ JAMO_MEDIAL = 4449
8
+ JAMO_FINAL = 4520
9
+ JAMO_END = 4697
10
+
11
+ def initialize_index
12
+ @index = {
13
+ NAMES: {},
14
+ ALIASES: {},
15
+ CJK: [],
16
+ HANGUL: [],
17
+ # see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_Syllables_Area
18
+ JAMO: {
19
+ INITIAL: [],
20
+ MEDIAL: [],
21
+ FINAL: [""],
22
+ },
23
+ }
24
+ @range_start = nil
25
+ end
26
+
27
+ def parse!
28
+ if option =~ /charkeys/
29
+ get_key = ->(codepoint){ [codepoint].pack("U*") }
30
+ else
31
+ get_key = -> (codepoint){ codepoint }
32
+ end
33
+
34
+ parse_file :unicode_data, :line, regex: /^(?<codepoint>.+?);(?<name>.+?);.*$/ do |line|
35
+ if line["name"][0] == "<" && line["name"][-1] == ">"
36
+ if line["name"] =~ /First/
37
+ @range_start = line["codepoint"].to_i(16)
38
+ elsif line["name"] =~ /Last/ && @range_start
39
+ if line["name"] =~ /Hangul/
40
+ @index[:HANGUL] << [@range_start, line["codepoint"].to_i(16)]
41
+ elsif line["name"] =~ /CJK/
42
+ @index[:CJK] << [@range_start, line["codepoint"].to_i(16)]
43
+ else
44
+ # no name
45
+ end
46
+ @range_start = nil
47
+ elsif line["name"] != "<control>"
48
+ raise ArgumentError, "inconsistent range found in data, don't know what to do"
49
+ end
50
+ else
51
+ assign :NAMES, line["codepoint"].to_i(16), line["name"]
52
+ end
53
+ end
54
+
55
+ parse_file :name_aliases, :line, regex: /^(?<codepoint>.+?);(?<alias>.+?);(?<type>.*)$/ do |line|
56
+ @index[:ALIASES][get_key[line["codepoint"].to_i(16)]] ||= {}
57
+ @index[:ALIASES][get_key[line["codepoint"].to_i(16)]][line["type"].to_sym] ||= []
58
+ @index[:ALIASES][get_key[line["codepoint"].to_i(16)]][line["type"].to_sym] << line["alias"]
59
+ end
60
+
61
+ parse_file :jamo, :line, regex: /^(?<codepoint>.+?); (?<short_name>.*?) +#.*$/ do |line|
62
+ case line["codepoint"].to_i(16)
63
+ when JAMO_INITIAL...JAMO_MEDIAL
64
+ @index[:JAMO][:INITIAL] << line["short_name"]
65
+ when JAMO_MEDIAL...JAMO_FINAL
66
+ @index[:JAMO][:MEDIAL] << line["short_name"]
67
+ when JAMO_FINAL..JAMO_END
68
+ @index[:JAMO][:FINAL] << line["short_name"]
69
+ end
70
+ end
71
+ end
72
+ end
73
+ end
74
+ end
75
+