unicoder 0.1.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,101 @@
1
module Unicoder
  module Builder
    # Builds the character-name index from UnicodeData.txt, NameAliases.txt
    # and Jamo.txt.
    #
    # The resulting @index holds:
    #   NAMES     - codepoint => name (common words are replaced afterwards)
    #   ALIASES   - key => { alias type => [aliases] }
    #   CP_RANGES - name prefix => list of [first, last] codepoint ranges
    #   JAMO      - short names grouped as INITIAL / MEDIAL / FINAL
    class Name
      include Builder
      include ReplaceCommonWords

      # Codepoint boundaries used to sort Jamo.txt entries into buckets.
      JAMO_INITIAL = 4352 # 0x1100
      JAMO_MEDIAL = 4449  # 0x1161
      JAMO_FINAL = 4520   # 0x11A8
      JAMO_END = 4697     # 0x1259

      # Name prefixes whose ranges are collected while parsing UnicodeData.txt.
      CJK = "CJK UNIFIED IDEOGRAPH-"
      TANGUT = "TANGUT IDEOGRAPH-"

      # Arguments handed to replace_common_words! for the NAMES index.
      REPLACE_COUNT = 500
      REPLACE_BASE = ?[.ord

      # Resets @index, the collected word list and the pending range start.
      def initialize_index
        @index = {
          NAMES: {},
          ALIASES: {},
          # HANGUL: [],
          CP_RANGES: {
            CJK => [], # filled while parsing
            TANGUT => [], # filled while parsing
            "EGYPTIAN HIEROGLYPH-" => [[0x13460, 0x143FA]],
            "KHITAN SMALL SCRIPT CHARACTER-" => [[0x18B00, 0x18CFF]],
            "NUSHU CHARACTER-" => [[0x1B170, 0x1B2FB]],
            "CJK COMPATIBILITY IDEOGRAPH-" => [[0x2F800, 0x2FA1D]],
          },
          # see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_Syllables_Area
          JAMO: {
            INITIAL: [],
            MEDIAL: [],
            FINAL: [""],
          },
        }
        @words = []
        @range_start = nil
      end

      # Parses all source files and fills @index.
      #
      # Raises ArgumentError when UnicodeData.txt contains a bracketed
      # pseudo-name that is neither a First/Last range marker nor "<control>".
      def parse!
        # With the "charkeys" option, alias keys are the character itself
        # instead of the integer codepoint.
        get_key =
          if option =~ /charkeys/
            ->(codepoint) { [codepoint].pack("U*") }
          else
            ->(codepoint) { codepoint }
          end

        # The set of range prefixes is fixed in initialize_index, so the
        # matcher is built once instead of once per input line.
        range_prefix = Regexp.union(@index[:CP_RANGES].keys)

        parse_file :unicode_data, :line, regex: /^(?<codepoint>.+?);(?<name>.+?);.*$/ do |line|
          name = line["name"]

          if name[0] == "<" && name[-1] == ">"
            if name =~ /First/
              @range_start = line["codepoint"].to_i(16)
            elsif name =~ /Last/ && @range_start
              case name
              when /Hangul/
                # currently not necessary
                # @index[:HANGUL] << [@range_start, line["codepoint"].to_i(16)]
              when /CJK/
                @index[:CP_RANGES][CJK] << [@range_start, line["codepoint"].to_i(16)]
              when /Tangut/
                @index[:CP_RANGES][TANGUT] << [@range_start, line["codepoint"].to_i(16)]
              else
                # no name
                warn "ignoring range: #{name}"
              end
              @range_start = nil
            elsif name != "<control>"
              raise ArgumentError, "inconsistent range found in data, don't know what to do"
            end
          elsif name =~ range_prefix
            # ignore: these names are covered by CP_RANGES
          else
            assign :NAMES, line["codepoint"].to_i(16), name
            # concat appends in place; `+=` would copy the whole list per line
            @words.concat(name.split)
          end
        end

        replace_common_words! :NAMES, @words, REPLACE_COUNT, REPLACE_BASE

        parse_file :name_aliases, :line, regex: /^(?<codepoint>.+?);(?<alias>.+?);(?<type>.*)$/ do |line|
          key = get_key[line["codepoint"].to_i(16)]
          aliases_by_type = (@index[:ALIASES][key] ||= {})
          (aliases_by_type[line["type"].to_sym] ||= []) << line["alias"]
        end

        parse_file :jamo, :line, regex: /^(?<codepoint>.+?); (?<short_name>.*?) +#.*$/ do |line|
          case line["codepoint"].to_i(16)
          when JAMO_INITIAL...JAMO_MEDIAL
            @index[:JAMO][:INITIAL] << line["short_name"]
          when JAMO_MEDIAL...JAMO_FINAL
            @index[:JAMO][:MEDIAL] << line["short_name"]
          when JAMO_FINAL..JAMO_END
            @index[:JAMO][:FINAL] << line["short_name"]
          end
        end
      end
    end
  end
end
101
+
@@ -0,0 +1,30 @@
1
module Unicoder
  module Builder
    # Builds the NUMBERS index (codepoint => numeric value) from
    # UnicodeData.txt and Unihan_NumericValues.txt.
    class NumericValue
      include Builder

      # Starts with an empty NUMBERS table.
      def initialize_index
        @index = { NUMBERS: {} }
      end

      # Reads both source files and assigns one value per codepoint.
      #
      # Values containing "/" are stored as Rational, or kept as the raw
      # string when the builder option matches /stringfractions/. Every
      # other value is stored via String#to_i.
      def parse!
        parse_file :unicode_data, :line, regex: /^(?<codepoint>.+?);(.*?;){7}(?<value>.*?);.*$/ do |line|
          raw = line["value"]
          next if raw.empty?

          number =
            if raw.include?("/")
              option =~ /stringfractions/ ? raw : raw.to_r
            else
              raw.to_i
            end

          assign :NUMBERS, line["codepoint"].to_i(16), number
        end

        parse_file :unihan_numeric_values, :line, regex: /^U\+(?<codepoint>\S+)\s+\S+\s+(?<value>\S+)$/ do |line|
          codepoint = line["codepoint"].to_i(16)
          assign :NUMBERS, codepoint, line["value"].to_i
        end
      end
    end
  end
end
@@ -0,0 +1,99 @@
1
module Unicoder
  module Builder
    # Builds the index of named codepoint sequences.
    #
    # The resulting @index holds:
    #   SEQUENCES               - sequence key => name
    #   SEQUENCES_NOT_QUALIFIED - emoji ZWJ sequences with one or more
    #                             U+FE0F removed => name
    #
    # The sequence key is the array of codepoints, or the packed string when
    # the builder option matches /charkeys/.
    class SequenceName
      include Builder
      include ReplaceCommonWords

      # Arguments handed to replace_common_words! for both indexes.
      REPLACE_COUNT = 100
      REPLACE_BASE = ?{.ord
      REPLACE_MIN_WORD_LENGTH = 3

      # Resets both sequence tables and the collected word list.
      def initialize_index
        @index = {
          SEQUENCES: {},
          SEQUENCES_NOT_QUALIFIED: {},
        }
        @words = []
      end

      # Stores +value+ as the name of the sequence +codepoints+ in +idx+.
      #
      # codepoints - Array of Integer codepoints
      # value      - name String
      # idx        - target Hash (defaults to @index[:SEQUENCES])
      # combine    - when the sequence already has a name: true appends
      #              " / value" to it, false keeps the existing name
      #
      # The words of +value+ are always added to @words, even when the new
      # name is ignored.
      def assign_codepoint(codepoints, value, idx = @index[:SEQUENCES], combine: false)
        key = option =~ /charkeys/ ? codepoints.pack("U*") : codepoints

        # Look up the key that is actually stored. Checking the raw codepoint
        # array never matches in charkeys mode, which silently overwrote
        # existing names.
        if idx.key?(key)
          idx[key] << " / #{value}" if combine
          # otherwise: ignore new one
        else
          idx[key] = value
        end

        @words.concat(value.split)
      end

      # Parses every sequence source file, then replaces common words in
      # both indexes.
      def parse!
        # Turns a whitespace separated list of hex values into Integers.
        to_codepoints = ->(field) { field.split.map { |cp| cp.to_i(16) } }
        # Expands \x{HEX} escapes inside emoji names into the character itself.
        unescape = ->(text) { text.gsub(/\\x{(\h+)}/) { [$1.to_i(16)].pack("U") } }

        parse_file :named_sequences, :line, regex: /^(?!#)(?<name>.+?);(?<codepoints>.+?)$/ do |line|
          assign_codepoint to_codepoints[line["codepoints"]], line["name"]
        end

        parse_file :named_sequences_prov, :line, regex: /^(?!#)(?<name>.+?);(?<codepoints>.+?)$/ do |line|
          assign_codepoint to_codepoints[line["codepoints"]], line["name"]
        end

        parse_file :standardized_variants, :line, regex: /^(?<codepoints>.+?);\s*(?<variant>.+?)\s*;\s*(?<context>.*?)\s*# (?<name>.+)$/ do |line|
          name = "#{line["name"].strip} (#{line["variant"]})"
          name << " [#{line["context"]}]" if line["context"] && !line["context"].empty?
          assign_codepoint to_codepoints[line["codepoints"]], name, combine: true
        end

        parse_file :standardized_variants, :line, regex: /^(?<codepoints>.+?); (?<name>.+?)\s*;$/ do |line|
          assign_codepoint to_codepoints[line["codepoints"]], line["name"]
        end

        parse_file :ivd_sequences, :line, regex: /^(?<codepoints>.+?);.*?; (?<name>.+?)$/ do |line|
          assign_codepoint to_codepoints[line["codepoints"]], line["name"], combine: true
        end

        parse_file :emoji_variation_sequences, :line, regex: /^(?<codepoints>.+?)\s*;\s*(?<variant>.+?)\s*;\s*# \(.*\)\s*(?<name>.+?)\s*$/ do |line|
          name = "#{line["name"].strip} (#{line["variant"]})"
          assign_codepoint to_codepoints[line["codepoints"]], name
        end

        parse_file :emoji_sequences, :line, regex: /^(?<codepoints>.+?)\s*;\s*(?<type>.+?)\s*; (?<name>.+?)\s*#/ do |line|
          next if line["type"] == "Basic_Emoji"
          assign_codepoint to_codepoints[line["codepoints"]], unescape[line["name"]].upcase
        end

        parse_file :emoji_zwj_sequences, :line, regex: /^(?!#)(?<codepoints>.+?)\s*;.*?; (?<name>.+?)\s*#/ do |line|
          name = unescape[line["name"]].upcase
          codepoints = to_codepoints[line["codepoints"]]
          assign_codepoint codepoints, name

          if codepoints.include?(0xFE0F)
            # Build all combinations of VS16 present and missing
            variants = codepoints.slice_after(0xFE0F).reduce([[]]) do |acc, cur|
              if cur.include?(0xFE0F)
                acc.flat_map { |prev| [prev + (cur - [0xFE0F]), prev + cur] }
              else
                acc.map { |prev| prev + cur }
              end
            end

            # The fully qualified form is already stored in SEQUENCES.
            variants.each do |sub_codepoints|
              next if sub_codepoints == codepoints
              assign_codepoint sub_codepoints, name, @index[:SEQUENCES_NOT_QUALIFIED]
            end
          end
        end

        replace_common_words! :SEQUENCES, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
        replace_common_words! :SEQUENCES_NOT_QUALIFIED, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
      end
    end
  end
end
99
+
@@ -0,0 +1,83 @@
1
+ module Unicoder
2
+ module Builder
3
+ class Types
4
+ include Builder
5
+ include MultiDimensionalArrayBuilder
6
+
7
+ NONCHARACTERS = [
8
+ *0xFDD0..0xFDEF,
9
+ 0xFFFE, 0xFFFF,
10
+ 0x1FFFE, 0x1FFFF,
11
+ 0x2FFFE, 0x2FFFF,
12
+ 0x3FFFE, 0x3FFFF,
13
+ 0x4FFFE, 0x4FFFF,
14
+ 0x5FFFE, 0x5FFFF,
15
+ 0x6FFFE, 0x6FFFF,
16
+ 0x7FFFE, 0x7FFFF,
17
+ 0x8FFFE, 0x8FFFF,
18
+ 0x9FFFE, 0x9FFFF,
19
+ 0xAFFFE, 0xAFFFF,
20
+ 0xBFFFE, 0xBFFFF,
21
+ 0xCFFFE, 0xCFFFF,
22
+ 0xDFFFE, 0xDFFFF,
23
+ 0xEFFFE, 0xEFFFF,
24
+ 0xFFFFE, 0xFFFFF,
25
+ 0x10FFFE, 0x10FFFF,
26
+ ]
27
+
28
+ def initialize_index
29
+ @index = {
30
+ TYPES: [],
31
+ TYPE_NAMES: %w[
32
+ Graphic
33
+ Format
34
+ Control
35
+ Private-use
36
+ Surrogate
37
+ Noncharacter
38
+ Reserved
39
+ ],
40
+ OFFSETS: [
41
+ 0x10000,
42
+ 0x1000,
43
+ 0x100,
44
+ 0x10
45
+ ],
46
+ }
47
+ end
48
+
49
+ def parse!
50
+ parse_file :general_categories, :line, regex: /^(?<from>[^. ]+)(?:..(?<to>\S+))?\s*; (?<category>\S+).*$/ do |line|
51
+ if line["to"]
52
+ codepoints = Range.new(line["from"].to_i(16), line["to"].to_i(16))
53
+ else
54
+ codepoints = [line["from"].to_i(16)]
55
+ end
56
+
57
+ codepoints.each{ |codepoint|
58
+ case line["category"]
59
+ when "Cf", "Zl", "Zp"
60
+ type = 1
61
+ when "Cc"
62
+ type = 2
63
+ when "Co"
64
+ type = 3
65
+ when "Cs"
66
+ type = 4
67
+ when "Cn"
68
+ if NONCHARACTERS.include?(codepoint)
69
+ type = 5
70
+ else
71
+ type = 6
72
+ end
73
+ end
74
+
75
+ assign :TYPES, codepoint, type
76
+ }
77
+ end
78
+
79
+ 4.times{ compress! @index[:TYPES] }
80
+ end
81
+ end
82
+ end
83
+ end
@@ -1,29 +1,94 @@
1
- module Unicoder
2
- VERSION = "0.1.0".freeze
1
+ # frozen_string_literal: true
3
2
 
4
- CURRENT_UNICODE_VERSION = "8.0.0".freeze
3
# Version constants and download locations shared by the builders and the
# downloader.
module Unicoder
  VERSION = "1.1.0"

  # Supported Unicode versions, newest first.
  UNICODE_VERSIONS = %w[
    16.0.0
    15.1.0
    15.0.0
    14.0.0
    13.0.0
    12.1.0
    12.0.0
    11.0.0
    10.0.0
    9.0.0
    8.0.0
    7.0.0
    6.3.0
  ].freeze

  # The first (newest) entry of UNICODE_VERSIONS.
  CURRENT_UNICODE_VERSION = UNICODE_VERSIONS.first

  # Supported Emoji versions, newest first.
  EMOJI_VERSIONS = %w[
    16.0
    15.1
    15.0
    14.0
    13.1
    13.0
    12.1
    12.0
    11.0
    5.0
    4.0
    3.0
    2.0
  ].freeze

  # Emoji version => Unicode version that replaces the
  # EMOJI_RELATED_VERSION placeholder in UNICODE_FILES paths.
  EMOJI_RELATED_UNICODE_VERSIONS = {
    "16.0" => "16.0.0",
    "15.1" => "15.1.0",
    "15.0" => "15.0.0",
    "14.0" => "14.0.0",
    "13.1" => "13.0.0",
    "13.0" => "13.0.0",
    "12.1" => "12.1.0",
    "12.0" => "12.0.0",
    "11.0" => "11.0.0",
    "5.0" => "10.0.0",
    "4.0" => "9.0.0",
    "3.0" => "9.0.0",
    "2.0" => "8.0.0",
  }.freeze

  # The first (newest) entry of EMOJI_VERSIONS.
  CURRENT_EMOJI_VERSION = EMOJI_VERSIONS.first

  # Interpolated into the ivd_sequences URL below.
  IVD_VERSION = "2022-09-13"

  # Interpolated into the valid_subdivisions URL below.
  CLDR_VERSION = "45"

  # Prefix for every UNICODE_FILES entry that is not an absolute URL.
  UNICODE_DATA_ENDPOINT = "ftp://ftp.unicode.org/Public"

  # Directory (relative to this file) where downloaded data is stored.
  LOCAL_DATA_DIRECTORY = File.expand_path("../../data/unicode", File.dirname(__FILE__)).freeze

  # File identifier => path below UNICODE_DATA_ENDPOINT, or an absolute URL.
  #
  # Paths may contain the placeholders UNICODE_VERSION, EMOJI_VERSION and
  # EMOJI_RELATED_VERSION. A ".zip/" segment separates an archive from the
  # member file inside it.
  UNICODE_FILES = {
    east_asian_width: "/UNICODE_VERSION/ucd/EastAsianWidth.txt",
    unicode_data: "/UNICODE_VERSION/ucd/UnicodeData.txt",
    name_aliases: "/UNICODE_VERSION/ucd/NameAliases.txt",
    confusables: "/security/UNICODE_VERSION/confusables.txt",
    blocks: "/UNICODE_VERSION/ucd/Blocks.txt",
    scripts: "/UNICODE_VERSION/ucd/Scripts.txt",
    script_extensions: "/UNICODE_VERSION/ucd/ScriptExtensions.txt",
    property_value_aliases: "/UNICODE_VERSION/ucd/PropertyValueAliases.txt",
    general_categories: "/UNICODE_VERSION/ucd/extracted/DerivedGeneralCategory.txt",
    unihan_numeric_values: "/UNICODE_VERSION/ucd/Unihan.zip/Unihan_NumericValues.txt",
    jamo: "/UNICODE_VERSION/ucd/Jamo.txt",
    named_sequences: "/UNICODE_VERSION/ucd/NamedSequences.txt",
    named_sequences_prov: "/UNICODE_VERSION/ucd/NamedSequencesProv.txt",
    standardized_variants: "/UNICODE_VERSION/ucd/StandardizedVariants.txt",
    ivd_sequences: "https://www.unicode.org/ivd/data/#{IVD_VERSION}/IVD_Sequences.txt",
    # Alternative location, currently unused:
    # emoji_data: "/EMOJI_VERSION/ucd/emoji/",
    emoji_data: "/EMOJI_RELATED_VERSION/ucd/emoji/emoji-data.txt",
    emoji_sequences: "/emoji/EMOJI_VERSION/emoji-sequences.txt",
    # Alternative location, currently unused:
    # emoji_variation_sequences: "/emoji/EMOJI_VERSION/emoji-variation-sequences.txt",
    emoji_variation_sequences: "/EMOJI_RELATED_VERSION/ucd/emoji/emoji-variation-sequences.txt",
    emoji_zwj_sequences: "/emoji/EMOJI_VERSION/emoji-zwj-sequences.txt",
    emoji_test: "/emoji/EMOJI_VERSION/emoji-test.txt",
    # Alternative location, currently unused:
    # valid_subdivisions: "https://www.unicode.org/repos/cldr/tags/release-#{CLDR_VERSION}/common/validity/subdivision.xml",
    valid_subdivisions: "https://raw.githubusercontent.com/unicode-org/cldr/release-#{CLDR_VERSION}/common/validity/subdivision.xml",
  }
end
29
94
 
@@ -1,28 +1,74 @@
1
1
  require "open-uri"
2
2
  require "fileutils"
3
+ require "zip"
3
4
 
4
5
module Unicoder
  # Downloads Unicode data files into the local data directory.
  module Downloader
    # Fetches one data file and stores it locally.
    #
    # identifier            - key of UNICODE_FILES (anything responding to to_sym)
    # unicode_version       - replaces the UNICODE_VERSION placeholder
    # emoji_version         - replaces the EMOJI_VERSION placeholder and selects
    #                         the value for EMOJI_RELATED_VERSION
    # destination_directory - base directory for the default destination
    # destination           - explicit target path (optional)
    # filename              - path or URL used when identifier is unknown
    #
    # A path of the form "<archive>.zip/<member>" downloads the archive and
    # extracts only <member> into the directory "<archive>".
    #
    # An existing destination file is not downloaded again. Any StandardError
    # (including the ArgumentError raised for invalid input) is reported on
    # $stderr and not re-raised.
    def self.fetch(identifier,
      unicode_version: CURRENT_UNICODE_VERSION,
      emoji_version: CURRENT_EMOJI_VERSION,
      destination_directory: LOCAL_DATA_DIRECTORY,
      destination: nil,
      filename: nil
    )
      filename = UNICODE_FILES[identifier.to_sym] || filename
      raise ArgumentError, "No valid file identifier or filename given" if !filename

      # Work on a copy so the templates in UNICODE_FILES stay untouched.
      filename = filename.dup
      filename.sub! 'UNICODE_VERSION', unicode_version
      filename.sub! 'EMOJI_VERSION', emoji_version
      # Only resolve the related version when the path needs it. An unknown
      # emoji version would otherwise make sub! fail on nil for every file.
      if filename.include?('EMOJI_RELATED_VERSION')
        related_version = EMOJI_RELATED_UNICODE_VERSIONS[emoji_version]
        raise ArgumentError, "No Unicode version known for Emoji version #{emoji_version}" unless related_version
        filename.sub! 'EMOJI_RELATED_VERSION', related_version
      end

      if filename =~ /\A(https?|ftp):\/\//
        source = filename
        # Drops "scheme:/" and keeps one slash, so the host becomes a directory.
        destination ||= destination_directory + filename.sub(/\A(https?|ftp):\//, "")
      else
        source = UNICODE_DATA_ENDPOINT + filename
        destination ||= destination_directory + filename
      end

      puts "GET #{source} => #{destination}"

      # "<outer>.zip/<inner>": download <outer>.zip, then extract <inner>
      zip_member = %r[\A(?<outer_path>.*)\.zip/(?<inner_path>.*)\z]
      source_parts = zip_member.match(source)
      zip = !source_parts.nil?

      if zip
        destination_parts = zip_member.match(destination)
        raise "unicoder bug: destination #{destination} does not point into a zip archive" unless destination_parts

        source = source_parts[:outer_path] + ".zip"
        inner_zip_filename = source_parts[:inner_path]
        destination = destination_parts[:outer_path] + ".zip"
        destination_files = destination_parts[:outer_path]
      end

      if File.exist?(destination)
        puts "Skipping download of #{source} (already exists)"
      else
        URI.open(source){ |f|
          FileUtils.mkdir_p(File.dirname(destination))
          # binary write: archives must not go through newline translation
          File.binwrite(destination, f.read)
        }
      end

      if zip
        unzip(destination, [inner_zip_filename], destination_files)
      end
    rescue => e
      $stderr.puts "#{e.class}: #{e.message}"
    end

    # Extracts the entries named in +files+ from +archive+ into
    # +destination_dir+.
    #
    # Entries whose target file already exists are skipped, so running a
    # fetch again does not fail on previously extracted files.
    def self.unzip(archive, files, destination_dir)
      Zip::File.open(archive) do |zip|
        zip.each do |file_in_zip|
          next unless files.include?(file_in_zip.name)

          target = destination_dir + "/#{file_in_zip.name}"
          if File.exist?(target)
            puts "Skipping extraction of #{file_in_zip.name} (already exists)"
            next
          end

          FileUtils.mkdir_p(destination_dir)
          puts "Extract #{file_in_zip.name}"
          file_in_zip.extract(target)
        end
      end
    end
  end
end
@@ -59,6 +59,28 @@ module Unicoder
59
59
  end
60
60
  }
61
61
  end
62
-
62
+
63
+ def remove_trailing_nils!(index = @index)
64
+ index.each{ |plane|
65
+ if plane.is_a?(Array)
66
+ plane.pop while plane[-1] == nil
67
+ plane.each{ |row|
68
+ if row.is_a?(Array)
69
+ row.pop while row[-1] == nil
70
+ row.each{ |byte|
71
+ if byte.is_a?(Array)
72
+ byte.pop while byte[-1] == nil
73
+ byte.each{ |nibble|
74
+ if nibble.is_a?(Array)
75
+ nibble.pop while nibble[-1] == nil
76
+ end
77
+ }
78
+ end
79
+ }
80
+ end
81
+ }
82
+ end
83
+ }
84
+ end
63
85
  end
64
- end
86
+ end
@@ -0,0 +1,20 @@
1
+ require "json"
2
+
3
module Unicoder
  # Mixin that shortens the strings of an index by replacing frequent words
  # with single placeholder characters.
  module ReplaceCommonWords
    # Replaces the most common words in every value of @index[which_index].
    #
    # which_index     - key of the @index table whose values are rewritten
    # words           - list of words used to determine the frequencies
    # count           - number of most frequent words to replace
    # base            - codepoint of the placeholder for the most frequent
    #                   word; the next word gets base + 1, and so on
    # min_word_length - shorter words are never replaced
    #
    # Stores base in @index[:REPLACE_BASE] and the chosen words, most
    # frequent first, in @index[:COMMON_WORDS]. Each occurrence of a word
    # followed by a space is replaced in place, one word after another.
    #
    # Returns @index[which_index].
    def replace_common_words!(which_index, words, count = 500, base = ?[.ord, min_word_length = 4)
      puts "Starting to replace the #{count} most common words"
      @index[:REPLACE_BASE] = base

      frequencies = words.select { |word| word.size >= min_word_length }.tally
      most_common = frequencies.max_by(count) { |_word, occurrences| occurrences }
      @index[:COMMON_WORDS] = most_common.map(&:first)

      # [text to find, placeholder character] for each common word, in rank order
      substitutions = @index[:COMMON_WORDS].each_with_index.map do |word, position|
        [word + " ", [base + position].pack("U")]
      end

      @index[which_index].each do |_key, name|
        substitutions.each do |needle, placeholder|
          name.gsub! needle, placeholder
        end
      end
    end
  end
end
data/lib/unicoder.rb CHANGED
@@ -2,6 +2,7 @@ require_relative "unicoder/constants"
2
2
  require_relative "unicoder/downloader"
3
3
  require_relative "unicoder/builder"
4
4
  require_relative "unicoder/multi_dimensional_array_builder"
5
+ require_relative "unicoder/replace_common_words"
5
6
 
6
7
  if defined?(Rake)
7
8
  Rake.add_rakelib(File.expand_path('../unicoder', __FILE__))
data/unicoder.gemspec CHANGED
@@ -5,18 +5,20 @@ require File.dirname(__FILE__) + "/lib/unicoder/constants"
5
5
Gem::Specification.new do |gem|
  gem.name          = "unicoder"
  gem.version       = Unicoder::VERSION
  gem.summary       = "Creates specialized indexes for Unicode data lookup"
  gem.description   = "Generates specialized indexes for Unicode data lookup"
  gem.authors       = ["Jan Lelis"]
  gem.email         = ["hi@ruby.consulting"]
  gem.homepage      = "https://github.com/janlelis/unicoder"
  gem.license       = "MIT"

  # Package every regular file except those whose path starts with "pkg" or
  # "data", and except marshal/mjs/json files (also when followed by a
  # ".gz"-like suffix).
  excluded_prefix = /^(pkg|data)/
  excluded_suffix = /(marshal|mjs|json)(.gz)?$/
  gem.files = Dir["{**/}{.*,*}"].select do |path|
    File.file?(path) && path !~ excluded_prefix && path !~ excluded_suffix
  end
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.required_ruby_version = ">= 3.0", "< 4.0"
  gem.add_dependency "rationalist", "~> 2.0"
  gem.add_dependency "rubyzip", "~> 1.2"
  gem.add_dependency "oga", "~> 2.9"
end