red-datasets 0.0.7 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,154 @@
1
+ require "csv"
2
+ require "zip"
3
+
4
+ require_relative "dataset"
5
+
6
+ module Datasets
7
+ class PostalCodeJapan < Dataset
8
# One row of the postal code data, as yielded by PostalCodeJapan#each.
#
# The struct is assigned directly instead of subclassing an anonymous
# Struct.new(...) so that no throwaway intermediate class ends up in the
# ancestor chain.
#
# The flag members (+have_multiple_postal_codes+,
# +have_address_number_per_koaza+, +have_chome+, +postal_code_is_shared+
# and +changed+) are also readable through predicate aliases ending in "?".
Record = Struct.new(:organization_code,
                    :old_postal_code,
                    :postal_code,
                    :prefecture_reading,
                    :city_reading,
                    :address_reading,
                    :prefecture,
                    :city,
                    :address,
                    :have_multiple_postal_codes,
                    :have_address_number_per_koaza,
                    :have_chome,
                    :postal_code_is_shared,
                    :changed,
                    :change_reason) do
  alias_method :have_multiple_postal_codes?,
               :have_multiple_postal_codes
  alias_method :have_address_number_per_koaza?,
               :have_address_number_per_koaza
  alias_method :have_chome?,
               :have_chome
  alias_method :postal_code_is_shared?,
               :postal_code_is_shared
  alias_method :changed?,
               :changed
end
34
+
35
# Values accepted for the +reading:+ keyword of #initialize.
# Frozen so the shared list cannot be modified at runtime.
VALID_READINGS = [
  :lowercase,
  :uppercase,
  :romaji,
].freeze
40
# Sets up the dataset metadata for the requested reading variant.
#
# @param reading [Symbol] one of VALID_READINGS (default: +:lowercase+).
# @raise [ArgumentError] when +reading+ is not listed in VALID_READINGS.
def initialize(reading: :lowercase)
  super()
  @reading = reading
  unless VALID_READINGS.include?(@reading)
    # The message is produced by a single interpolation rather than by
    # appending to a string literal, so it keeps working when string
    # literals are frozen.
    candidates = VALID_READINGS.collect(&:inspect).join(", ")
    raise ArgumentError,
          ":reading must be one of [#{candidates}]: #{@reading.inspect}"
  end
  @metadata.id = "postal-code-japan-#{@reading}"
  @metadata.name = "Postal code in Japan (#{@reading})"
  @metadata.url = "https://www.post.japanpost.jp/zipcode/download.html"
  @metadata.licenses = [
    "CC0-1.0",
  ]
  @metadata.description = "Postal code in Japan (reading: #{@reading})"
end
57
+
58
# Yields one Record per CSV row of every input stream provided by
# #open_data. Returns an Enumerator when no block is given.
#
# For the +:romaji+ reading only the postal code, the three names and the
# three readings are taken from the row; the codes and change reason are
# nil and every flag is false. For the other readings all fifteen columns
# are mapped in order, with the flag columns turned into booleans.
def each(&block)
  return to_enum(__method__) unless block_given?

  csv_options = {
    quote_char: nil,
    strip: %Q["],
  }
  open_data do |input|
    # The raw bytes are transcoded from CP932 to UTF-8 before parsing.
    text = input.read.encode(Encoding::UTF_8, Encoding::CP932)
    CSV.parse(text, **csv_options) do |row|
      record =
        if @reading == :romaji
          Record.new(nil,
                     nil,
                     *row.values_at(0, 4, 5, 6, 1, 2, 3),
                     false, false, false, false, false,
                     nil)
        else
          flags = row.values_at(9, 10, 11, 12).map { |flag| flag == "1" }
          Record.new(row[0],
                     row[1].rstrip,
                     *row.values_at(2, 3, 4, 5, 6, 7, 8),
                     *flags,
                     (row[13] != "0"),
                     convert_change_reason(row[14]))
        end
      yield(record)
    end
  end
end
106
+
107
+ private
108
# Downloads the zip archive for the current reading into the cache
# directory unless it is already there, then yields an input stream for
# each regular file entry found in the archive.
def open_data
  # Pick the archive location for the reading. The URL is assembled by
  # interpolation instead of appending to a string literal, so it keeps
  # working when string literals are frozen.
  archive_location =
    case @reading
    when :lowercase
      "/kogaki/zip/ken_all.zip"
    when :uppercase
      "/oogaki/zip/ken_all.zip"
    when :romaji
      "/roman/ken_all_rome.zip"
    end
  data_url = "https://www.post.japanpost.jp/zipcode/dl#{archive_location}"
  data_path = cache_dir_path + "#{@reading}-ken-all.zip"
  download(data_path, data_url) unless data_path.exist?

  Zip::File.open(data_path.to_s) do |zip_file|
    zip_file.each do |entry|
      # Skip anything that is not a regular file entry.
      next unless entry.file?
      entry.get_input_stream do |input|
        yield(input)
      end
    end
  end
end
132
+
133
# Maps the change-reason code of a row ("0".."6") to nil or a symbol.
# Any other value, including nil, maps to +:unknown+.
def convert_change_reason(reason)
  reasons = {
    "0" => nil,
    "1" => :new,
    "2" => :japanese_addressing_system,
    "3" => :land_readjustment,
    "4" => :postal_district_adjustment,
    "5" => :correction,
    "6" => :deletion,
  }
  reasons.fetch(reason, :unknown)
end
153
+ end
154
+ end
@@ -0,0 +1,95 @@
1
+ require_relative "dataset"
2
+ require_relative "tar_gz_readable"
3
+
4
+ module Datasets
5
+ class RdatasetsList < Dataset
6
# One entry of the Rdatasets list. RdatasetsList#filter builds it from the
# fields of a CSV row, in this member order.
Record = Struct.new(
  *%i[
    package
    dataset
    title
    rows
    cols
    n_binary
    n_character
    n_factor
    n_logical
    n_numeric
    csv
    doc
  ]
)
18
+
19
# Fills in the metadata for the Rdatasets list and records where the list
# CSV is downloaded from and where it is cached.
def initialize
  super()
  @metadata.id = "rdatasets"
  @metadata.name = "Rdatasets"
  @metadata.url = "https://vincentarelbundock.github.io/Rdatasets/"
  @metadata.licenses = ["GPL-3"]

  list_file_name = "datasets.csv"
  @data_url = "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/datasets.csv"
  @data_path = cache_dir_path + list_file_name
end
28
+
29
# Yields a Record for each list row matching the given conditions.
#
# @param package [Object, nil] when truthy, only rows whose "Package"
#   column equals it are yielded.
# @param dataset [Object, nil] when truthy, only rows whose "Item" column
#   equals it are yielded.
# @return [Enumerator] when called without a block.
def filter(package: nil, dataset: nil)
  return to_enum(__method__, package: package, dataset: dataset) unless block_given?

  # Keep only the conditions that were actually given; with none left,
  # all? is true for every row, so every row is yielded.
  conditions = {"Package" => package, "Item" => dataset}.select do |_column, expected|
    expected
  end
  each_row do |row|
    next unless conditions.all? {|column, expected| row[column] == expected }
    yield Record.new(*row.fields)
  end
end
45
+
46
# Enumerates every entry of the list by delegating to #filter with no
# conditions.
def each(&block)
  filter(package: nil, dataset: nil, &block)
end
49
+
50
# Passes each row of the cached list CSV to the block, downloading the
# file first when the cache file does not exist. The first line is used as
# headers and all built-in CSV converters are applied to the fields.
#
# NOTE(review): CSV is used here, but the require lines visible for this
# file do not load "csv" -- confirm it is loaded elsewhere.
private def each_row(&block)
  download(@data_path, @data_url) unless @data_path.exist?
  csv_options = {headers: :first_row, converters: :all}
  CSV.open(@data_path, **csv_options) do |rows|
    rows.each(&block)
  end
end
56
+ end
57
+
58
+ class Rdatasets < Dataset
59
# Looks the dataset up in the Rdatasets list and configures metadata and
# the cache path from the matching entry.
#
# @param package_name [String] value matched against the list's package.
# @param dataset_name [String] value matched against the list's dataset.
# @raise [ArgumentError] when the list has no matching entry.
def initialize(package_name, dataset_name)
  info = RdatasetsList.new.filter(package: package_name,
                                  dataset: dataset_name).first
  if info.nil?
    raise ArgumentError, "Unable to locate dataset #{package_name}/#{dataset_name}"
  end

  super()
  @package_name = package_name
  @dataset_name = dataset_name

  @metadata.id = "rdatasets-#{package_name}-#{dataset_name}"
  @metadata.name = "Rdatasets: #{package_name}: #{dataset_name}"
  @metadata.url = info.csv
  @metadata.licenses = ["GPL-3"]
  @metadata.description = info.title

  # The cache file is named after the dataset, inside the cache directory.
  @data_path = cache_dir_path + (dataset_name + ".csv")
end
80
+
81
# Yields each row of the dataset CSV as a Hash with symbol keys,
# downloading the file first when the cache file does not exist.
# The column whose header is the empty string is dropped from every row.
# Returns an Enumerator when no block is given.
def each(&block)
  return to_enum(__method__) unless block_given?

  download(@data_path, @metadata.url) unless @data_path.exist?
  CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
    csv.each do |row|
      named_columns = row.to_h.reject {|header, _value| header == "" }
      yield named_columns.transform_keys(&:to_sym)
    end
  end
end
94
+ end
95
+ end
@@ -2,19 +2,99 @@ require "datasets/dictionary"
2
2
 
3
3
  module Datasets
4
4
  class Table
5
# A view of one row of a table: it keeps the table and a row index and
# reads values from the table's columns on demand.
class Record
  include Enumerable

  # @param table the table to read from; it is asked for +[]+,
  #   +each_column+ and +dataset+.
  # @param index [Integer] position of the row inside each column.
  def initialize(table, index)
    @table = table
    @index = index
  end

  # Value of this row in the column selected by name or index.
  def [](column_name_or_column_index)
    column = @table[column_name_or_column_index]
    column[@index]
  end

  # Yields +column_name, value+ for every column of the table.
  # Returns an Enumerator when no block is given.
  def each
    return to_enum(__method__) unless block_given?
    @table.each_column do |name, column|
      yield(name, column[@index])
    end
  end

  # Values of this row, in column order.
  def values
    @table.each_column.map do |_name, column|
      column[@index]
    end
  end

  # Hash from column name to this row's value.
  def to_h
    each_with_object({}) do |(name, value), row_hash|
      row_hash[name] = value
    end
  end

  def inspect
    dataset_name = @table.dataset.metadata.name
    "#<#{self.class.name} #{dataset_name}[#{@index}] #{to_h.inspect}>"
  end
end
42
+
5
43
  include Enumerable
6
44
 
45
+ attr_reader :dataset
7
46
  def initialize(dataset)
8
47
  @dataset = dataset
9
48
  @dictionaries = {}
10
49
  end
11
50
 
12
- def each(&block)
51
# Number of columns in the table. Also available as #size and #length.
def n_columns
  columns = columner_data
  columns.size
end
alias size n_columns
alias length n_columns
56
+
57
# Number of rows, taken from the size of the first column's values.
# Returns 0 when there are no columns.
def n_rows
  first_column = columner_data.first
  first_column ? first_column[1].size : 0
end
62
+
63
# Names of all columns, in column order.
def column_names
  columner_data.map {|name, _values| name }
end
66
+
67
# Iterates over the columns by forwarding the block to columner_data#each.
# Also available as #each.
def each_column(&block)
  columns = columner_data
  columns.each(&block)
end
alias each each_column
15
71
 
16
- def [](name)
17
- columner_data[normalize_name(name)]
72
# Yields a Record for every row index from 0 up to n_rows - 1.
# Returns an Enumerator when no block is given.
def each_record
  return to_enum(__method__) unless block_given?
  row_count = n_rows
  row_count.times do |row_index|
    record = Record.new(self, row_index)
    yield(record)
  end
end
78
+
79
# Returns the Record at the given row index, or nil when the index is out
# of range. A negative index counts back from the end (-1 is the last row).
def find_record(row)
  row_count = n_rows
  index = row < 0 ? row + row_count : row
  return nil unless index >= 0 && index < row_count
  Record.new(self, index)
end
85
+
86
# Returns the values of one column.
#
# An Integer selects the column by zero-based position; negative or
# out-of-range positions return nil. Any other argument is passed through
# normalize_name and used as a key into columner_data.
def [](name_or_index)
  unless name_or_index.is_a?(Integer)
    return columner_data[normalize_name(name_or_index)]
  end

  match = columner_data.each_with_index.find do |_column, position|
    position == name_or_index
  end
  # match is [[name, values], position] when a column was found.
  match && match.first.last
end
19
99
 
20
100
  def dictionary_encode(name)
@@ -0,0 +1,14 @@
1
+ require "rubygems/package"
2
+ require "zlib"
3
+
4
+ module Datasets
5
# Mixin for reading gzip-compressed tar archives.
module TarGzReadable
  # Opens +data_path+ as a gzip stream, wraps it in a
  # Gem::Package::TarReader and yields that reader to the block.
  def open_tar_gz(data_path)
    Zlib::GzipReader.open(data_path) do |gzip_stream|
      Gem::Package::TarReader.new(gzip_stream) {|tar_reader| yield(tar_reader) }
    end
  end
end
14
+ end
@@ -1,3 +1,3 @@
1
1
module Datasets
  # Version of the red-datasets gem. Frozen so the shared string cannot be
  # modified in place.
  VERSION = "0.1.2".freeze
end
@@ -52,7 +52,7 @@ module Datasets
52
52
  end
53
53
 
54
54
  private
55
- def open_data
55
+ def open_data(&block)
56
56
  base_name = "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
57
57
  data_path = cache_dir_path + base_name
58
58
  unless data_path.exist?
@@ -60,15 +60,7 @@ module Datasets
60
60
  download(data_path, data_url)
61
61
  end
62
62
 
63
- input, output = IO.pipe
64
- pid = spawn("bzcat", data_path.to_s, {:out => output})
65
- begin
66
- output.close
67
- yield(input)
68
- ensure
69
- input.close
70
- Process.waitpid(pid)
71
- end
63
+ extract_bz2(data_path, &block)
72
64
  end
73
65
 
74
66
  def type_in_path
data/red-datasets.gemspec CHANGED
@@ -34,6 +34,10 @@ Gem::Specification.new do |spec|
34
34
  spec.files += Dir.glob("doc/text/*")
35
35
  spec.test_files += Dir.glob("test/**/*")
36
36
 
37
+ spec.add_runtime_dependency("csv", ">= 3.0.5")
38
+ spec.add_runtime_dependency("rexml")
39
+ spec.add_runtime_dependency("rubyzip")
40
+
37
41
  spec.add_development_dependency("bundler")
38
42
  spec.add_development_dependency("rake")
39
43
  spec.add_development_dependency("test-unit")
data/test/run-test.rb CHANGED
@@ -13,4 +13,6 @@ $LOAD_PATH.unshift(lib_dir.to_s)
13
13
 
14
14
  require_relative "helper"
15
15
 
16
+ ARGV.unshift("--max-diff-target-string-size=#{10 * 1024}")
17
+
16
18
  exit(Test::Unit::AutoRunner.run(true, test_dir.to_s))
@@ -0,0 +1,180 @@
1
# Tests for Datasets::CLDRPlurals: checks the number of locales, two
# sample locales with their rules, and the start of the description.
class CLDRPluralsTest < Test::Unit::TestCase
  def setup
    @dataset = Datasets::CLDRPlurals.new
  end

  # Shorthand for building an expected Locale.
  def locale(*args)
    Datasets::CLDRPlurals::Locale.new(*args)
  end

  # Shorthand for building an expected Rule.
  def rule(*args)
    Datasets::CLDRPlurals::Rule.new(*args)
  end

  test("#each") do
    locales = @dataset.each.to_a

    expected_bm = locale("bm",
                         [
                           rule("other",
                                nil,
                                [
                                  0..15,
                                  100,
                                  1000,
                                  10000,
                                  100000,
                                  1000000,
                                  :elipsis,
                                ],
                                [
                                  0.0..1.5,
                                  10.0,
                                  100.0,
                                  1000.0,
                                  10000.0,
                                  100000.0,
                                  1000000.0,
                                  :elipsis,
                                ])
                         ])

    kw_zero = rule("zero",
                   [:equal, "n", [0]],
                   [0],
                   [0.0, 0.00, 0.000, 0.0000])
    kw_one = rule("one",
                  [:equal, "n", [1]],
                  [1],
                  [1.0, 1.00, 1.000, 1.0000])
    kw_two = rule("two",
                  [:or,
                   [:equal,
                    [:mod, "n", 100],
                    [2, 22, 42, 62, 82]],
                   [:and,
                    [:equal, [:mod, "n", 1000], [0]],
                    [:equal,
                     [:mod, "n", 100000],
                     [1000..20000, 40000, 60000, 80000]]],
                   [:and,
                    [:not_equal, "n", [0]],
                    [:equal, [:mod, "n", 1000000], [100000]]]],
                  [
                    2,
                    22,
                    42,
                    62,
                    82,
                    102,
                    122,
                    142,
                    1000,
                    10000,
                    100000,
                    :elipsis,
                  ],
                  [
                    2.0,
                    22.0,
                    42.0,
                    62.0,
                    82.0,
                    102.0,
                    122.0,
                    142.0,
                    1000.0,
                    10000.0,
                    100000.0,
                    :elipsis,
                  ])
    kw_few = rule("few",
                  [:equal,
                   [:mod, "n", 100],
                   [3, 23, 43, 63, 83]],
                  [
                    3,
                    23,
                    43,
                    63,
                    83,
                    103,
                    123,
                    143,
                    1003,
                    :elipsis,
                  ],
                  [
                    3.0,
                    23.0,
                    43.0,
                    63.0,
                    83.0,
                    103.0,
                    123.0,
                    143.0,
                    1003.0,
                    :elipsis,
                  ])
    kw_many = rule("many",
                   [:and,
                    [:not_equal, "n", [1]],
                    [:equal,
                     [:mod, "n", 100],
                     [1, 21, 41, 61, 81]]],
                   [
                     21,
                     41,
                     61,
                     81,
                     101,
                     121,
                     141,
                     161,
                     1001,
                     :elipsis,
                   ],
                   [
                     21.0,
                     41.0,
                     61.0,
                     81.0,
                     101.0,
                     121.0,
                     141.0,
                     161.0,
                     1001.0,
                     :elipsis,
                   ])
    kw_other = rule("other",
                    nil,
                    [4..19, 100, 1004, 1000000, :elipsis],
                    [
                      0.1..0.9,
                      1.1..1.7,
                      10.0,
                      100.0,
                      1000.1,
                      1000000.0,
                      :elipsis,
                    ])
    expected_kw = locale("kw",
                         [
                           kw_zero,
                           kw_one,
                           kw_two,
                           kw_few,
                           kw_many,
                           kw_other,
                         ])

    # Compare the locale count, the first locale and the fourth from last.
    assert_equal([
                   215,
                   expected_bm,
                   expected_kw,
                 ],
                 [
                   locales.size,
                   locales[0],
                   locales[-4],
                 ])
  end

  sub_test_case("#metadata") do
    test("#description") do
      description = @dataset.metadata.description
      expected_prefix = "Language plural rules in Unicode Common Locale Data Repository."
      assert do
        description.start_with?(expected_prefix)
      end
    end
  end
end