unicoder 0.1.0 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,101 @@
1
module Unicoder
  module Builder
    # Builds the index used for codepoint name lookup.
    #
    # Index sections written by this builder:
    #   NAMES     - codepoint => character name, read from :unicode_data
    #   ALIASES   - codepoint (or character) => { alias type => [aliases] },
    #               read from :name_aliases
    #   CP_RANGES - name prefix => [[first, last], ...] for names that are
    #               not stored individually
    #   JAMO      - jamo short names read from :jamo, grouped by position
    class Name

      include Builder
      include ReplaceCommonWords

      # Codepoint boundaries used to sort Jamo.txt entries into groups
      # (0x1100, 0x1161, 0x11A8 and 0x1259 in hex).
      JAMO_INITIAL = 4352
      JAMO_MEDIAL = 4449
      JAMO_FINAL = 4520
      JAMO_END = 4697

      CJK = "CJK UNIFIED IDEOGRAPH-"
      TANGUT = "TANGUT IDEOGRAPH-"

      # Arguments handed to replace_common_words! for the NAMES section
      REPLACE_COUNT = 500
      REPLACE_BASE = ?[.ord

      # Sets up the empty index, the word list fed to the common word
      # replacement and the range parsing state.
      def initialize_index
        @index = {
          NAMES: {},
          ALIASES: {},
          # HANGUL: [],
          CP_RANGES: {
            CJK => [], # filled while parsing
            TANGUT => [], # filled while parsing
            "EGYPTIAN HIEROGLYPH-" => [[0x13460, 0x143FA]],
            "KHITAN SMALL SCRIPT CHARACTER-" => [[0x18B00, 0x18CFF]],
            "NUSHU CHARACTER-" => [[0x1B170, 0x1B2FB]],
            "CJK COMPATIBILITY IDEOGRAPH-" => [[0x2F800, 0x2FA1D]],
          },
          # see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_Syllables_Area
          JAMO: {
            INITIAL: [],
            MEDIAL: [],
            FINAL: [""],
          },
        }
        @words = []
        @range_start = nil
      end

      # Reads the :unicode_data, :name_aliases and :jamo files and fills
      # the index.
      #
      # Raises ArgumentError when UnicodeData contains an angle-bracketed
      # name that is neither a range start, a matching range end nor
      # "<control>".
      def parse!
        # ALIASES is keyed by character instead of codepoint when the
        # builder option contains "charkeys"
        get_key =
          if option =~ /charkeys/
            ->(codepoint) { [codepoint].pack("U*") }
          else
            ->(codepoint) { codepoint }
          end

        # The set of prefixes does not change while parsing (only the range
        # lists behind them grow), so the pattern is built once instead of
        # once per UnicodeData line.
        range_name = Regexp.union(@index[:CP_RANGES].keys)

        parse_file :unicode_data, :line, regex: /^(?<codepoint>.+?);(?<name>.+?);.*$/ do |line|
          name = line["name"]
          codepoint = line["codepoint"].to_i(16)

          if name[0] == "<" && name[-1] == ">"
            if name =~ /First/
              @range_start = codepoint
            elsif name =~ /Last/ && @range_start
              case name
              when /Hangul/
                # currently not necessary
                # @index[:HANGUL] << [@range_start, codepoint]
              when /CJK/
                @index[:CP_RANGES][CJK] << [@range_start, codepoint]
              when /Tangut/
                @index[:CP_RANGES][TANGUT] << [@range_start, codepoint]
              else
                # no name
                warn "ignoring range: #{name}"
              end
              @range_start = nil
            elsif name != "<control>"
              raise ArgumentError, "inconsistent range found in data, don't know what to do"
            end
          elsif name =~ range_name
            # ignore: covered by one of the CP_RANGES prefixes
          else
            assign :NAMES, codepoint, name
            # concat appends in place; `+=` would copy the whole word list
            # for every line
            @words.concat(name.split)
          end
        end

        replace_common_words! :NAMES, @words, REPLACE_COUNT, REPLACE_BASE

        parse_file :name_aliases, :line, regex: /^(?<codepoint>.+?);(?<alias>.+?);(?<type>.*)$/ do |line|
          aliases_by_type = (@index[:ALIASES][get_key[line["codepoint"].to_i(16)]] ||= {})
          (aliases_by_type[line["type"].to_sym] ||= []) << line["alias"]
        end

        parse_file :jamo, :line, regex: /^(?<codepoint>.+?); (?<short_name>.*?) +#.*$/ do |line|
          case line["codepoint"].to_i(16)
          when JAMO_INITIAL...JAMO_MEDIAL
            @index[:JAMO][:INITIAL] << line["short_name"]
          when JAMO_MEDIAL...JAMO_FINAL
            @index[:JAMO][:MEDIAL] << line["short_name"]
          when JAMO_FINAL..JAMO_END
            @index[:JAMO][:FINAL] << line["short_name"]
          end
        end
      end
    end
  end
end
101
+
@@ -0,0 +1,30 @@
1
module Unicoder
  module Builder
    # Builds the NUMBERS index, which maps codepoints to their numeric value.
    class NumericValue
      include Builder

      # Starts with an empty NUMBERS section.
      def initialize_index
        @index = { NUMBERS: {} }
      end

      # Reads numeric values from :unicode_data and :unihan_numeric_values.
      #
      # Values containing a "/" are stored as Rational, or as a string
      # when the builder option contains "stringfractions". All other values
      # are stored as Integer.
      def parse!
        parse_file :unicode_data, :line, regex: /^(?<codepoint>.+?);(.*?;){7}(?<value>.*?);.*$/ do |line|
          raw_value = line["value"]
          next if raw_value.empty?

          number =
            if raw_value =~ %r</>
              option =~ /stringfractions/ ? "#{raw_value}" : raw_value.to_r
            else
              raw_value.to_i
            end

          assign :NUMBERS, line["codepoint"].to_i(16), number
        end

        parse_file :unihan_numeric_values, :line, regex: /^U\+(?<codepoint>\S+)\s+\S+\s+(?<value>\S+)$/ do |line|
          assign :NUMBERS, line["codepoint"].to_i(16), line["value"].to_i
        end
      end
    end
  end
end
@@ -0,0 +1,99 @@
1
+ module Unicoder
2
+ module Builder
3
+ class SequenceName
4
+ include Builder
5
+ include ReplaceCommonWords
6
+
7
+ REPLACE_COUNT = 100
8
+ REPLACE_BASE = ?{.ord
9
+ REPLACE_MIN_WORD_LENGTH = 3
10
+
11
# Resets the builder state: one empty hash per index section and an empty
# list of words collected for the common word replacement.
def initialize_index
  @index = %i[SEQUENCES SEQUENCES_NOT_QUALIFIED].to_h { |section| [section, {}] }
  @words = []
end
18
+
19
# Stores +value+ as the name of the sequence +codepoints+ in +idx+.
#
# codepoints - Array of Integer codepoints forming the sequence
# value      - String name of the sequence
# idx        - Hash to write to (defaults to the SEQUENCES section)
# combine    - when the sequence already has a name: append the new one
#              separated by " / " (true) or keep only the first (false)
#
# The key is the codepoint array itself, or the corresponding string when
# the builder option contains "charkeys".
#
# The words of +value+ are always added to the word list, even when the
# value itself is ignored as a duplicate.
def assign_codepoint(codepoints, value, idx = @index[:SEQUENCES], combine: false)
  key = option =~ /charkeys/ ? codepoints.pack("U*") : codepoints

  # The lookup has to use the same key the entry is stored under. Checking
  # the raw codepoint array never matched in charkeys mode, so duplicates
  # silently replaced earlier names instead of being combined or ignored.
  if !idx.key?(key)
    idx[key] = value
  elsif combine
    idx[key] << " / #{value}"
  end

  @words.concat(value.split)
end
38
+
39
# Reads all sequence data files and fills the SEQUENCES and
# SEQUENCES_NOT_QUALIFIED sections, then replaces common words in both.
#
# Sources, in order: :named_sequences, :named_sequences_prov,
# :standardized_variants (two passes with different line formats),
# :ivd_sequences, :emoji_variation_sequences, :emoji_sequences and
# :emoji_zwj_sequences.
def parse!
  parse_file :named_sequences, :line, regex: /^(?!#)(?<name>.+?);(?<codepoints>.+?)$/ do |line|
    assign_codepoint sequence_codepoints(line["codepoints"]), line["name"]
  end

  parse_file :named_sequences_prov, :line, regex: /^(?!#)(?<name>.+?);(?<codepoints>.+?)$/ do |line|
    assign_codepoint sequence_codepoints(line["codepoints"]), line["name"]
  end

  parse_file :standardized_variants, :line, regex: /^(?<codepoints>.+?);\s*(?<variant>.+?)\s*;\s*(?<context>.*?)\s*# (?<name>.+)$/ do |line|
    name = "#{line["name"].strip} (#{line["variant"]})"
    name << " [#{line["context"]}]" if line["context"] && !line["context"].empty?
    assign_codepoint sequence_codepoints(line["codepoints"]), name, combine: true
  end

  parse_file :standardized_variants, :line, regex: /^(?<codepoints>.+?); (?<name>.+?)\s*;$/ do |line|
    assign_codepoint sequence_codepoints(line["codepoints"]), line["name"]
  end

  parse_file :ivd_sequences, :line, regex: /^(?<codepoints>.+?);.*?; (?<name>.+?)$/ do |line|
    assign_codepoint sequence_codepoints(line["codepoints"]), line["name"], combine: true
  end

  parse_file :emoji_variation_sequences, :line, regex: /^(?<codepoints>.+?)\s*;\s*(?<variant>.+?)\s*;\s*# \(.*\)\s*(?<name>.+?)\s*$/ do |line|
    name = "#{line["name"].strip} (#{line["variant"]})"
    assign_codepoint sequence_codepoints(line["codepoints"]), name
  end

  # (?!#) skips comment lines, same as for the ZWJ file below. Without it
  # the header line describing the format
  # ("#   code_point(s) ; type_field ; description # comments")
  # is parsed as if it were a sequence and ends up in the index.
  parse_file :emoji_sequences, :line, regex: /^(?!#)(?<codepoints>.+?)\s*;\s*(?<type>.+?)\s*; (?<name>.+?)\s*#/ do |line|
    next if line["type"] == "Basic_Emoji"
    assign_codepoint sequence_codepoints(line["codepoints"]), emoji_sequence_name(line["name"])
  end

  parse_file :emoji_zwj_sequences, :line, regex: /^(?!#)(?<codepoints>.+?)\s*;.*?; (?<name>.+?)\s*#/ do |line|
    name = emoji_sequence_name(line["name"])
    codepoints = sequence_codepoints(line["codepoints"])
    assign_codepoint codepoints, name
    next unless codepoints.include?(0xFE0F)

    # The same name is also registered for every spelling of the sequence
    # that leaves out some or all of its U+FE0F codepoints
    sequences_with_optional_vs16(codepoints).each do |sub_codepoints|
      assign_codepoint sub_codepoints, name, @index[:SEQUENCES_NOT_QUALIFIED]
    end
  end

  replace_common_words! :SEQUENCES, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
  replace_common_words! :SEQUENCES_NOT_QUALIFIED, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
end

# Converts a whitespace separated list of hex codepoints to Integers.
private def sequence_codepoints(field)
  field.split.map { |cp| cp.to_i(16) }
end

# Resolves \x{HEX} escapes in an emoji sequence name and upcases the result.
private def emoji_sequence_name(raw_name)
  raw_name.gsub(/\\x{(\h+)}/) { [Regexp.last_match(1).to_i(16)].pack("U") }.upcase
end

# Builds all combinations of VS16 (U+FE0F) present and missing, excluding
# the given sequence itself.
private def sequences_with_optional_vs16(codepoints)
  codepoints.slice_after(0xFE0F).reduce([[]]) { |acc, cur|
    if cur.include?(0xFE0F)
      acc.flat_map { |prev| [prev + (cur - [0xFE0F]), prev + cur] }
    else
      acc.map { |prev| prev + cur }
    end
  }.reject { |sub_codepoints| sub_codepoints == codepoints }
end
96
+ end
97
+ end
98
+ end
99
+
@@ -0,0 +1,83 @@
1
+ module Unicoder
2
+ module Builder
3
+ class Types
4
+ include Builder
5
+ include MultiDimensionalArrayBuilder
6
+
7
+ NONCHARACTERS = [
8
+ *0xFDD0..0xFDEF,
9
+ 0xFFFE, 0xFFFF,
10
+ 0x1FFFE, 0x1FFFF,
11
+ 0x2FFFE, 0x2FFFF,
12
+ 0x3FFFE, 0x3FFFF,
13
+ 0x4FFFE, 0x4FFFF,
14
+ 0x5FFFE, 0x5FFFF,
15
+ 0x6FFFE, 0x6FFFF,
16
+ 0x7FFFE, 0x7FFFF,
17
+ 0x8FFFE, 0x8FFFF,
18
+ 0x9FFFE, 0x9FFFF,
19
+ 0xAFFFE, 0xAFFFF,
20
+ 0xBFFFE, 0xBFFFF,
21
+ 0xCFFFE, 0xCFFFF,
22
+ 0xDFFFE, 0xDFFFF,
23
+ 0xEFFFE, 0xEFFFF,
24
+ 0xFFFFE, 0xFFFFF,
25
+ 0x10FFFE, 0x10FFFF,
26
+ ]
27
+
28
+ def initialize_index
29
+ @index = {
30
+ TYPES: [],
31
+ TYPE_NAMES: %w[
32
+ Graphic
33
+ Format
34
+ Control
35
+ Private-use
36
+ Surrogate
37
+ Noncharacter
38
+ Reserved
39
+ ],
40
+ OFFSETS: [
41
+ 0x10000,
42
+ 0x1000,
43
+ 0x100,
44
+ 0x10
45
+ ],
46
+ }
47
+ end
48
+
49
+ def parse!
50
+ parse_file :general_categories, :line, regex: /^(?<from>[^. ]+)(?:..(?<to>\S+))?\s*; (?<category>\S+).*$/ do |line|
51
+ if line["to"]
52
+ codepoints = Range.new(line["from"].to_i(16), line["to"].to_i(16))
53
+ else
54
+ codepoints = [line["from"].to_i(16)]
55
+ end
56
+
57
+ codepoints.each{ |codepoint|
58
+ case line["category"]
59
+ when "Cf", "Zl", "Zp"
60
+ type = 1
61
+ when "Cc"
62
+ type = 2
63
+ when "Co"
64
+ type = 3
65
+ when "Cs"
66
+ type = 4
67
+ when "Cn"
68
+ if NONCHARACTERS.include?(codepoint)
69
+ type = 5
70
+ else
71
+ type = 6
72
+ end
73
+ end
74
+
75
+ assign :TYPES, codepoint, type
76
+ }
77
+ end
78
+
79
+ 4.times{ compress! @index[:TYPES] }
80
+ end
81
+ end
82
+ end
83
+ end
@@ -1,29 +1,94 @@
1
# frozen_string_literal: true

module Unicoder
  VERSION = "1.1.0"

  # Supported Unicode versions, newest first
  UNICODE_VERSIONS = %w[
    16.0.0
    15.1.0
    15.0.0
    14.0.0
    13.0.0
    12.1.0
    12.0.0
    11.0.0
    10.0.0
    9.0.0
    8.0.0
    7.0.0
    6.3.0
  ].freeze

  CURRENT_UNICODE_VERSION = UNICODE_VERSIONS.first

  # Supported Emoji versions, newest first
  EMOJI_VERSIONS = %w[
    16.0
    15.1
    15.0
    14.0
    13.1
    13.0
    12.1
    12.0
    11.0
    5.0
    4.0
    3.0
    2.0
  ].freeze

  # Emoji version => Unicode version that replaces the
  # EMOJI_RELATED_VERSION placeholder in UNICODE_FILES
  EMOJI_RELATED_UNICODE_VERSIONS = {
    "16.0" => "16.0.0",
    "15.1" => "15.1.0",
    "15.0" => "15.0.0",
    "14.0" => "14.0.0",
    "13.1" => "13.0.0",
    "13.0" => "13.0.0",
    "12.1" => "12.1.0",
    "12.0" => "12.0.0",
    "11.0" => "11.0.0",
    "5.0" => "10.0.0",
    "4.0" => "9.0.0",
    "3.0" => "9.0.0",
    "2.0" => "8.0.0",
  }.freeze

  CURRENT_EMOJI_VERSION = EMOJI_VERSIONS.first

  IVD_VERSION = "2022-09-13"

  CLDR_VERSION = "45"

  UNICODE_DATA_ENDPOINT = "ftp://ftp.unicode.org/Public"

  LOCAL_DATA_DIRECTORY = File.expand_path(File.dirname(__FILE__) + "/../../data/unicode").freeze

  # File identifier => location of the data file.
  #
  # Entries starting with "/" are relative to UNICODE_DATA_ENDPOINT, all
  # others are complete URLs. UNICODE_VERSION, EMOJI_VERSION and
  # EMOJI_RELATED_VERSION are placeholders that get replaced when a file is
  # fetched. A ".zip/" path segment addresses a file inside a zip archive.
  #
  # The hash and all its values are frozen like the other constants here
  # (the magic comment does not freeze the interpolated strings), so users
  # have to dup a value before substituting placeholders.
  UNICODE_FILES = {
    east_asian_width: "/UNICODE_VERSION/ucd/EastAsianWidth.txt",
    unicode_data: "/UNICODE_VERSION/ucd/UnicodeData.txt",
    name_aliases: "/UNICODE_VERSION/ucd/NameAliases.txt",
    confusables: "/security/UNICODE_VERSION/confusables.txt",
    blocks: "/UNICODE_VERSION/ucd/Blocks.txt",
    scripts: "/UNICODE_VERSION/ucd/Scripts.txt",
    script_extensions: "/UNICODE_VERSION/ucd/ScriptExtensions.txt",
    property_value_aliases: "/UNICODE_VERSION/ucd/PropertyValueAliases.txt",
    general_categories: "/UNICODE_VERSION/ucd/extracted/DerivedGeneralCategory.txt",
    unihan_numeric_values: "/UNICODE_VERSION/ucd/Unihan.zip/Unihan_NumericValues.txt",
    jamo: "/UNICODE_VERSION/ucd/Jamo.txt",
    named_sequences: "/UNICODE_VERSION/ucd/NamedSequences.txt",
    named_sequences_prov: "/UNICODE_VERSION/ucd/NamedSequencesProv.txt",
    standardized_variants: "/UNICODE_VERSION/ucd/StandardizedVariants.txt",
    ivd_sequences: "https://www.unicode.org/ivd/data/#{IVD_VERSION}/IVD_Sequences.txt",
    # emoji_data: "/EMOJI_VERSION/ucd/emoji/",
    emoji_data: "/EMOJI_RELATED_VERSION/ucd/emoji/emoji-data.txt",
    emoji_sequences: "/emoji/EMOJI_VERSION/emoji-sequences.txt",
    # emoji_variation_sequences: "/emoji/EMOJI_VERSION/emoji-variation-sequences.txt",
    emoji_variation_sequences: "/EMOJI_RELATED_VERSION/ucd/emoji/emoji-variation-sequences.txt",
    emoji_zwj_sequences: "/emoji/EMOJI_VERSION/emoji-zwj-sequences.txt",
    emoji_test: "/emoji/EMOJI_VERSION/emoji-test.txt",
    # valid_subdivisions: "https://www.unicode.org/repos/cldr/tags/release-#{CLDR_VERSION}/common/validity/subdivision.xml",
    valid_subdivisions: "https://raw.githubusercontent.com/unicode-org/cldr/release-#{CLDR_VERSION}/common/validity/subdivision.xml",
  }.each_value(&:freeze).freeze
end
29
94
 
@@ -1,28 +1,74 @@
1
1
  require "open-uri"
2
2
  require "fileutils"
3
+ require "zip"
3
4
 
4
5
module Unicoder
  # Downloads Unicode data files into the local data directory.
  module Downloader
    # Matches locations that address a file inside a zip archive
    # ("<archive>.zip/<file inside archive>")
    ZIP_MEMBER = %r[^(?<outer_path>.*)\.zip/(?<inner_path>.*)$]

    # Downloads the file registered as +identifier+ in UNICODE_FILES (or the
    # explicitly given +filename+) unless it is already present locally.
    #
    # identifier            - key of UNICODE_FILES
    # unicode_version       - replaces the UNICODE_VERSION placeholder
    # emoji_version         - replaces the EMOJI_VERSION placeholder and
    #                         selects the EMOJI_RELATED_VERSION replacement
    # destination_directory - directory the remote path is mirrored into
    # destination           - complete local path (overrides the above)
    # filename              - location to use if identifier is unknown
    #
    # Locations starting with http(s):// or ftp:// are fetched as they are,
    # everything else is appended to UNICODE_DATA_ENDPOINT. For files inside
    # a zip archive the archive is downloaded and the file gets extracted
    # into a directory named after the archive (without ".zip").
    #
    # Errors are not raised, but reported on standard error.
    def self.fetch(identifier,
      unicode_version: CURRENT_UNICODE_VERSION,
      emoji_version: CURRENT_EMOJI_VERSION,
      destination_directory: LOCAL_DATA_DIRECTORY,
      destination: nil,
      filename: nil
    )
      filename = UNICODE_FILES[identifier.to_sym] || filename
      raise ArgumentError, "No valid file identifier or filename given" if !filename
      filename = filename.dup
      filename.sub! 'UNICODE_VERSION', unicode_version
      filename.sub! 'EMOJI_VERSION', emoji_version
      # Only look up the related Unicode version when it is needed: sub!
      # raises a TypeError for a nil replacement even if nothing matches,
      # which broke every download for emoji versions without a mapping
      if filename.include?('EMOJI_RELATED_VERSION')
        related_version = EMOJI_RELATED_UNICODE_VERSIONS[emoji_version]
        if !related_version
          raise ArgumentError, "No related Unicode version known for Emoji version #{emoji_version.inspect}"
        end
        filename.sub! 'EMOJI_RELATED_VERSION', related_version
      end

      if filename =~ /\A(https?|ftp):\/\//
        source = filename
        destination ||= destination_directory + filename.sub(/\A(https?|ftp):\//, "")
      else
        source = UNICODE_DATA_ENDPOINT + filename
        destination ||= destination_directory + filename
      end

      puts "GET #{source} => #{destination}"

      zip = false
      if (remote = ZIP_MEMBER.match(source))
        # Download the whole archive, then extract the requested file
        local = ZIP_MEMBER.match(destination)
        raise "unicoder bug" if !local

        zip = true
        source = remote[:outer_path] + ".zip"
        inner_zip_filename = remote[:inner_path]
        destination = local[:outer_path] + ".zip"
        destination_files = local[:outer_path]
      end

      if File.exist?(destination)
        puts "Skipping download of #{source} (already exists)"
      else
        URI.open(source){ |f|
          FileUtils.mkdir_p(File.dirname(destination))
          # binary write, the download may be a zip archive
          File.binwrite(destination, f.read)
        }
      end

      if zip
        unzip(destination, [inner_zip_filename], destination_files)
      end
    rescue => e
      $stderr.puts "#{e.class}: #{e.message}"
    end

    # Extracts the entries named in +files+ from the zip +archive+ into
    # +destination_dir+. Entries that have been extracted before are left
    # untouched, since extracting over an existing file raises an error.
    def self.unzip(archive, files, destination_dir)
      Zip::File.open(archive) do |zip|
        zip.each do |file_in_zip|
          next unless files.include?(file_in_zip.name)

          target = destination_dir + "/#{file_in_zip.name}"
          if File.exist?(target)
            puts "Skipping extraction of #{file_in_zip.name} (already exists)"
            next
          end

          FileUtils.mkdir_p(File.dirname(target))
          puts "Extract #{file_in_zip.name}"
          file_in_zip.extract(target)
        end
      end
    end
  end
end
@@ -59,6 +59,28 @@ module Unicoder
59
59
  end
60
60
  }
61
61
  end
62
-
62
+
63
# Removes trailing nil entries from the nested arrays in +index+, in place.
#
# Every element of +index+ that is an array gets trimmed, as do the arrays
# nested inside it, four array levels deep in total. +index+ itself is not
# trimmed. Elements that are not arrays are left alone.
#
# Returns +index+.
def remove_trailing_nils!(index = @index)
  index.each{ |plane| trim_trailing_nils(plane, 4) }
end

# Trims +node+ and, while +levels+ allows it, the arrays it contains.
private def trim_trailing_nils(node, levels)
  return unless node.is_a?(Array)

  # Stop at an empty array: its last element also reads as nil, so popping
  # "while the last element is nil" would never finish for arrays that are
  # empty or contain nothing but nils
  node.pop while !node.empty? && node.last.nil?
  return if levels <= 1

  node.each{ |child| trim_trailing_nils(child, levels - 1) }
end
63
85
  end
64
- end
86
+ end
@@ -0,0 +1,20 @@
1
+ require "json"
2
+
3
module Unicoder
  # Shrinks an index of names by replacing the most common words with
  # single characters.
  module ReplaceCommonWords
    # Replaces common words in all values of the index section
    # +which_index+, modifying the strings in place.
    #
    # which_index     - key of the @index section to process
    # words           - list of all words, used to find the most common ones
    # count           - number of words to replace
    # base            - codepoint of the character standing for the most
    #                   common word; word number i is replaced by the
    #                   character with codepoint base + i
    # min_word_length - shorter words are never replaced
    #
    # Only occurrences followed by a space are replaced, and the space is
    # removed along with the word. The word list and +base+ are stored in
    # the index as COMMON_WORDS and REPLACE_BASE.
    #
    # Returns the processed index section.
    def replace_common_words!(which_index, words, count = 500, base = ?[.ord, min_word_length = 4)
      puts "Starting to replace the #{count} most common words"
      @index[:REPLACE_BASE] = base
      @index[:COMMON_WORDS] = words.
        select{_1.size >= min_word_length}.
        tally.
        max_by(count){_2}.
        map(&:first)

      # Search string and replacement character are the same for every
      # name, so they are built once up front instead of once per name
      replacements = @index[:COMMON_WORDS].each_with_index.map{ |word, index|
        ["#{word} ", [base + index].pack("U")]
      }

      @index[which_index].each{ |_, name|
        replacements.each{ |search, replacement|
          name.gsub! search, replacement
        }
      }
    end
  end
end
data/lib/unicoder.rb CHANGED
@@ -2,6 +2,7 @@ require_relative "unicoder/constants"
2
2
  require_relative "unicoder/downloader"
3
3
  require_relative "unicoder/builder"
4
4
  require_relative "unicoder/multi_dimensional_array_builder"
5
+ require_relative "unicoder/replace_common_words"
5
6
 
6
7
  if defined?(Rake)
7
8
  Rake.add_rakelib(File.expand_path('../unicoder', __FILE__))
data/unicoder.gemspec CHANGED
@@ -5,18 +5,20 @@ require File.dirname(__FILE__) + "/lib/unicoder/constants"
5
5
Gem::Specification.new do |gem|
  gem.name          = "unicoder"
  gem.version       = Unicoder::VERSION
  gem.summary       = "Creates specialized indexes for Unicode data lookup"
  gem.description   = "Generates specialized indexes for Unicode data lookup"
  gem.authors       = ["Jan Lelis"]
  gem.email         = ["hi@ruby.consulting"]
  gem.homepage      = "https://github.com/janlelis/unicoder"
  gem.license       = "MIT"

  # Package everything except build output (pkg/), downloaded Unicode data
  # (data/) and generated index files. The patterns are anchored to the
  # directory name and the literal ".gz" suffix, so that files which merely
  # start with "pkg" or "data" are not dropped by accident.
  gem.files         = Dir["{**/}{.*,*}"].select{ |path|
    File.file?(path) && path !~ %r{\A(pkg|data)/} && path !~ /(marshal|mjs|json)(\.gz)?\z/
  }
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.required_ruby_version = ">= 3.0", "< 4.0"
  gem.add_dependency "rationalist", "~> 2.0"
  gem.add_dependency "rubyzip", "~> 1.2"
  gem.add_dependency "oga", "~> 2.9"
end