red-datasets 0.0.7 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,154 @@
1
+ require "csv"
2
+ require "zip"
3
+
4
+ require_relative "dataset"
5
+
6
+ module Datasets
7
# Postal code data downloaded from the Japan Post site.
#
# The dataset comes in three variants selected by +reading+:
# +:lowercase+ and +:uppercase+ (fetched from the "kogaki" and "oogaki"
# download paths respectively) and +:romaji+ (fetched from the "roman"
# path). Each CSV row is yielded as a Record.
class PostalCodeJapan < Dataset
  # One postal code entry. The +have_*+, +postal_code_is_shared+ and
  # +changed+ members hold booleans and are also exposed as +?+ predicates.
  class Record < Struct.new(:organization_code,
                            :old_postal_code,
                            :postal_code,
                            :prefecture_reading,
                            :city_reading,
                            :address_reading,
                            :prefecture,
                            :city,
                            :address,
                            :have_multiple_postal_codes,
                            :have_address_number_per_koaza,
                            :have_chome,
                            :postal_code_is_shared,
                            :changed,
                            :change_reason)
    alias_method :have_multiple_postal_codes?,
                 :have_multiple_postal_codes
    alias_method :have_address_number_per_koaza?,
                 :have_address_number_per_koaza
    alias_method :have_chome?,
                 :have_chome
    alias_method :postal_code_is_shared?,
                 :postal_code_is_shared
    alias_method :changed?,
                 :changed
  end

  # Accepted values for the +reading:+ keyword of #initialize.
  VALID_READINGS = [
    :lowercase,
    :uppercase,
    :romaji,
  ].freeze

  # @param reading [Symbol] one of VALID_READINGS
  # @raise [ArgumentError] when +reading+ is not in VALID_READINGS
  def initialize(reading: :lowercase)
    super()
    @reading = reading
    unless VALID_READINGS.include?(@reading)
      # Built by interpolation instead of appending to a string literal so
      # the code keeps working when string literals are frozen.
      candidates = VALID_READINGS.collect(&:inspect).join(", ")
      raise ArgumentError,
            ":reading must be one of [#{candidates}]: #{@reading.inspect}"
    end
    @metadata.id = "postal-code-japan-#{@reading}"
    @metadata.name = "Postal code in Japan (#{@reading})"
    @metadata.url = "https://www.post.japanpost.jp/zipcode/download.html"
    @metadata.licenses = [
      "CC0-1.0",
    ]
    @metadata.description = "Postal code in Japan (reading: #{@reading})"
  end

  # Yields one Record per CSV row of the downloaded data.
  #
  # @return [Enumerator] when no block is given
  def each(&block)
    return to_enum(__method__) unless block_given?

    open_data do |input|
      # The downloaded data is transcoded from CP932 to UTF-8.
      utf8_data = input.read.encode(Encoding::UTF_8, Encoding::CP932)
      # Double quotes are not parsed as CSV quoting; they are stripped
      # from the edges of each field instead.
      options = {
        quote_char: nil,
        strip: %Q["],
      }
      CSV.parse(utf8_data, **options) do |row|
        if @reading == :romaji
          yield(romaji_record(row))
        else
          yield(kana_record(row))
        end
      end
    end
  end

  private
  # Builds a Record from a row of the romaji data, which only carries the
  # postal code, the address parts and their readings; every other member
  # is filled with nil/false.
  def romaji_record(row)
    Record.new(nil,
               nil,
               row[0],
               row[4],
               row[5],
               row[6],
               row[1],
               row[2],
               row[3],
               false,
               false,
               false,
               false,
               false,
               nil)
  end

  # Builds a Record from a row of the :lowercase/:uppercase data, turning
  # the "0"/"1" flag columns into booleans and the change reason code into
  # a symbol.
  def kana_record(row)
    Record.new(row[0],
               row[1].rstrip,
               row[2],
               row[3],
               row[4],
               row[5],
               row[6],
               row[7],
               row[8],
               (row[9] == "1"),
               (row[10] == "1"),
               (row[11] == "1"),
               (row[12] == "1"),
               (row[13] != "0"),
               convert_change_reason(row[14]))
  end

  # Downloads the zip archive for the current reading into the cache
  # directory (unless it is already there) and yields an input stream for
  # every file entry in the archive.
  def open_data
    base_url = "https://www.post.japanpost.jp/zipcode/dl"
    archive_path =
      case @reading
      when :lowercase
        "/kogaki/zip/ken_all.zip"
      when :uppercase
        "/oogaki/zip/ken_all.zip"
      when :romaji
        "/roman/ken_all_rome.zip"
      end
    # Interpolation (rather than <<) keeps the URL literal unmodified.
    data_url = "#{base_url}#{archive_path}"
    data_path = cache_dir_path + "#{@reading}-ken-all.zip"
    unless data_path.exist?
      download(data_path, data_url)
    end

    Zip::File.open(data_path.to_s) do |zip_file|
      zip_file.each do |entry|
        next unless entry.file?
        entry.get_input_stream do |input|
          yield(input)
        end
      end
    end
  end

  # Maps the change reason code ("0".."6") to nil or a symbol; any other
  # value becomes :unknown.
  def convert_change_reason(reason)
    case reason
    when "0"
      nil
    when "1"
      :new
    when "2"
      :japanese_addressing_system
    when "3"
      :land_readjustment
    when "4"
      :postal_district_adjustment
    when "5"
      :correction
    when "6"
      :deletion
    else
      :unknown
    end
  end
end
154
+ end
@@ -0,0 +1,95 @@
1
+ require_relative "dataset"
2
+ require_relative "tar_gz_readable"
3
+
4
+ module Datasets
5
# Index of the datasets listed in the Rdatasets "datasets.csv" file.
# Each row of that CSV is exposed as a Record.
class RdatasetsList < Dataset
  Record = Struct.new(
    :package, :dataset, :title, :rows, :cols,
    :n_binary, :n_character, :n_factor, :n_logical, :n_numeric,
    :csv, :doc
  )

  def initialize
    super
    @metadata.id = "rdatasets"
    @metadata.name = "Rdatasets"
    @metadata.url = "https://vincentarelbundock.github.io/Rdatasets/"
    @metadata.licenses = ["GPL-3"]
    @data_url = "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/datasets.csv"
    @data_path = cache_dir_path + "datasets.csv"
  end

  # Yields a Record for every row whose "Package" and "Item" columns equal
  # the given values. A nil (or false) argument places no constraint on
  # that column. Returns an enumerator when called without a block.
  def filter(package: nil, dataset: nil)
    unless block_given?
      return to_enum(__method__, package: package, dataset: dataset)
    end

    # Only the constraints that were actually supplied are kept; with none
    # left, every row matches.
    wanted = {"Package" => package, "Item" => dataset}.select do |_header, value|
      value
    end
    each_row do |row|
      next unless wanted.all? {|header, value| row[header] == value }
      yield Record.new(*row.fields)
    end
  end

  # Enumerates every listed dataset (an unconstrained #filter).
  def each(&block)
    filter(&block)
  end

  # Downloads the list on first use and passes each parsed CSV row to the
  # block.
  private def each_row(&block)
    unless @data_path.exist?
      download(@data_path, @data_url)
    end
    CSV.open(@data_path, headers: :first_row, converters: :all) do |rows|
      rows.each(&block)
    end
  end
end
57
+
58
# A single dataset from Rdatasets, looked up by package and dataset name
# through RdatasetsList.
class Rdatasets < Dataset
  # @raise [ArgumentError] when the list has no entry for the given
  #   package/dataset pair
  def initialize(package_name, dataset_name)
    # The lookup happens before super() so an unknown dataset fails early.
    entry = RdatasetsList.new
                         .filter(package: package_name, dataset: dataset_name)
                         .first
    raise ArgumentError, "Unable to locate dataset #{package_name}/#{dataset_name}" unless entry

    super()
    @metadata.id = "rdatasets-#{package_name}-#{dataset_name}"
    @metadata.name = "Rdatasets: #{package_name}: #{dataset_name}"
    @metadata.url = entry.csv
    @metadata.licenses = ["GPL-3"]
    @metadata.description = entry.title

    # Follow the original directory structure in the cache directory
    csv_file_name = dataset_name + ".csv"
    @data_path = cache_dir_path + csv_file_name

    @package_name = package_name
    @dataset_name = dataset_name
  end

  # Yields each CSV row as a Hash with symbol keys; the column whose header
  # is the empty string is dropped. Returns an enumerator without a block.
  def each(&block)
    return to_enum(__method__) unless block_given?

    unless @data_path.exist?
      download(@data_path, @metadata.url)
    end
    CSV.open(@data_path, headers: :first_row, converters: :all) do |rows|
      rows.each do |row|
        attributes = row.to_h
        attributes.delete("")
        yield attributes.transform_keys!(&:to_sym)
      end
    end
  end
end
95
+ end
@@ -2,19 +2,99 @@ require "datasets/dictionary"
2
2
 
3
3
  module Datasets
4
4
  class Table
5
# A view of one row of a table: it stores the table and a row index and
# reads the values out of the table's columns on demand.
class Record
  include Enumerable

  def initialize(table, index)
    @table = table
    @index = index
  end

  # Value of this row in the column selected by name or position.
  def [](column_name_or_column_index)
    column = @table[column_name_or_column_index]
    column[@index]
  end

  # Yields (column name, value) for every column of the table.
  def each
    return to_enum(__method__) unless block_given?
    @table.each_column.each do |name, column|
      yield(name, column[@index])
    end
  end

  # This row's values, in column order.
  def values
    @table.each_column.map {|_name, column| column[@index] }
  end

  # Hash from column name to this row's value.
  def to_h
    each_with_object({}) do |(name, value), row|
      row[name] = value
    end
  end

  def inspect
    class_name = self.class.name
    dataset_name = @table.dataset.metadata.name
    "#<#{class_name} #{dataset_name}[#{@index}] #{to_h.inspect}>"
  end
end
42
+
5
43
  include Enumerable
6
44
 
45
+ attr_reader :dataset
7
46
  def initialize(dataset)
8
47
  @dataset = dataset
9
48
  @dictionaries = {}
10
49
  end
11
50
 
12
- def each(&block)
51
# Number of columns in the table.
def n_columns
  columner_data.length
end
54
+ alias_method :size, :n_columns
55
+ alias_method :length, :n_columns
56
+
57
# Number of rows, taken from the size of the first column's values;
# 0 when the table has no columns.
def n_rows
  first_pair = columner_data.first
  first_pair ? first_pair[1].size : 0
end
62
+
63
# Names of the columns, i.e. the keys of the columnar data.
def column_names
  columner_data.each_key.to_a
end
66
+
67
# Passes the block on to the columnar data's #each, so the block sees
# each column name together with its values.
def each_column(&block)
  columns = columner_data
  columns.each(&block)
end
70
+ alias_method :each, :each_column
15
71
 
16
- def [](name)
17
- columner_data[normalize_name(name)]
72
# Yields a Record for every row index from 0 up to n_rows - 1.
# Returns an enumerator when no block is given.
def each_record
  unless block_given?
    return to_enum(__method__)
  end
  n_rows.times {|row_index| yield(Record.new(self, row_index)) }
end
78
+
79
# Returns the Record at +row+, or nil when it is out of range.
# A negative +row+ counts back from the last row.
def find_record(row)
  total = n_rows
  index = row < 0 ? row + total : row
  return nil unless index >= 0 && index < total
  Record.new(self, index)
end
85
+
86
# Looks up a column's values.
#
# An Integer selects the column by position (counting from 0); a negative
# or too-large position gives nil. Anything else is treated as a column
# name and normalized before the lookup.
def [](name_or_index)
  unless name_or_index.is_a?(Integer)
    return columner_data[normalize_name(name_or_index)]
  end

  position = name_or_index
  return nil if position < 0
  columner_data.values[position]
end
19
99
 
20
100
  def dictionary_encode(name)
@@ -0,0 +1,14 @@
1
+ require "rubygems/package"
2
+ require "zlib"
3
+
4
+ module Datasets
5
# Mixin for reading gzip-compressed tar archives.
module TarGzReadable
  # Opens the gzip file at +data_path+, wraps the decompressed stream in a
  # Gem::Package::TarReader and yields that reader to the caller's block.
  def open_tar_gz(data_path)
    Zlib::GzipReader.open(data_path) do |gzip_stream|
      Gem::Package::TarReader.new(gzip_stream) {|reader| yield(reader) }
    end
  end
end
14
+ end
@@ -1,3 +1,3 @@
1
1
  module Datasets
2
- VERSION = "0.0.7"
2
+ VERSION = "0.1.2"
3
3
  end
@@ -52,7 +52,7 @@ module Datasets
52
52
  end
53
53
 
54
54
  private
55
- def open_data
55
+ def open_data(&block)
56
56
  base_name = "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
57
57
  data_path = cache_dir_path + base_name
58
58
  unless data_path.exist?
@@ -60,15 +60,7 @@ module Datasets
60
60
  download(data_path, data_url)
61
61
  end
62
62
 
63
- input, output = IO.pipe
64
- pid = spawn("bzcat", data_path.to_s, {:out => output})
65
- begin
66
- output.close
67
- yield(input)
68
- ensure
69
- input.close
70
- Process.waitpid(pid)
71
- end
63
+ extract_bz2(data_path, &block)
72
64
  end
73
65
 
74
66
  def type_in_path
data/red-datasets.gemspec CHANGED
@@ -34,6 +34,10 @@ Gem::Specification.new do |spec|
34
34
  spec.files += Dir.glob("doc/text/*")
35
35
  spec.test_files += Dir.glob("test/**/*")
36
36
 
37
+ spec.add_runtime_dependency("csv", ">= 3.0.5")
38
+ spec.add_runtime_dependency("rexml")
39
+ spec.add_runtime_dependency("rubyzip")
40
+
37
41
  spec.add_development_dependency("bundler")
38
42
  spec.add_development_dependency("rake")
39
43
  spec.add_development_dependency("test-unit")
data/test/run-test.rb CHANGED
@@ -13,4 +13,6 @@ $LOAD_PATH.unshift(lib_dir.to_s)
13
13
 
14
14
  require_relative "helper"
15
15
 
16
+ ARGV.unshift("--max-diff-target-string-size=#{10 * 1024}")
17
+
16
18
  exit(Test::Unit::AutoRunner.run(true, test_dir.to_s))
@@ -0,0 +1,180 @@
1
# Tests for Datasets::CLDRPlurals: checks the number of locales plus the
# full rule sets of two sample locales ("bm" and "kw"), and the start of
# the metadata description.
class CLDRPluralsTest < Test::Unit::TestCase
  def setup
    @dataset = Datasets::CLDRPlurals.new
  end

  def locale(*args)
    Datasets::CLDRPlurals::Locale.new(*args)
  end

  def rule(*args)
    Datasets::CLDRPlurals::Rule.new(*args)
  end

  # Expected value for the first locale ("bm"), which has only an "other"
  # rule.
  # NOTE(review): the :elipsis spelling presumably matches the symbol the
  # dataset emits -- confirm before "correcting" it.
  def expected_bm_locale
    locale("bm",
           [
             rule("other",
                  nil,
                  [0..15, 100, 1000, 10000, 100000, 1000000, :elipsis],
                  [
                    0.0..1.5,
                    10.0,
                    100.0,
                    1000.0,
                    10000.0,
                    100000.0,
                    1000000.0,
                    :elipsis,
                  ]),
           ])
  end

  # Expected value for the fourth locale from the end ("kw").
  def expected_kw_locale
    locale("kw",
           [
             rule("zero",
                  [:equal, "n", [0]],
                  [0],
                  [0.0, 0.00, 0.000, 0.0000]),
             rule("one",
                  [:equal, "n", [1]],
                  [1],
                  [1.0, 1.00, 1.000, 1.0000]),
             rule("two",
                  [:or,
                   [:equal, [:mod, "n", 100], [2, 22, 42, 62, 82]],
                   [:and,
                    [:equal, [:mod, "n", 1000], [0]],
                    [:equal,
                     [:mod, "n", 100000],
                     [1000..20000, 40000, 60000, 80000]]],
                   [:and,
                    [:not_equal, "n", [0]],
                    [:equal, [:mod, "n", 1000000], [100000]]]],
                  [2, 22, 42, 62, 82, 102, 122, 142,
                   1000, 10000, 100000, :elipsis],
                  [2.0, 22.0, 42.0, 62.0, 82.0, 102.0, 122.0, 142.0,
                   1000.0, 10000.0, 100000.0, :elipsis]),
             rule("few",
                  [:equal, [:mod, "n", 100], [3, 23, 43, 63, 83]],
                  [3, 23, 43, 63, 83, 103, 123, 143, 1003, :elipsis],
                  [3.0, 23.0, 43.0, 63.0, 83.0, 103.0, 123.0, 143.0,
                   1003.0, :elipsis]),
             rule("many",
                  [:and,
                   [:not_equal, "n", [1]],
                   [:equal, [:mod, "n", 100], [1, 21, 41, 61, 81]]],
                  [21, 41, 61, 81, 101, 121, 141, 161, 1001, :elipsis],
                  [21.0, 41.0, 61.0, 81.0, 101.0, 121.0, 141.0, 161.0,
                   1001.0, :elipsis]),
             rule("other",
                  nil,
                  [4..19, 100, 1004, 1000000, :elipsis],
                  [0.1..0.9, 1.1..1.7, 10.0, 100.0, 1000.1, 1000000.0,
                   :elipsis]),
           ])
  end

  test("#each") do
    locales = @dataset.each.to_a
    expected = [215, expected_bm_locale, expected_kw_locale]
    actual = [locales.size, locales[0], locales[-4]]
    assert_equal(expected, actual)
  end

  sub_test_case("#metadata") do
    test("#description") do
      description = @dataset.metadata.description
      assert do
        description.start_with?("Language plural rules in Unicode Common Locale Data Repository.")
      end
    end
  end
end