red-datasets 0.0.7 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +20 -4
- data/doc/text/news.md +102 -0
- data/lib/datasets.rb +19 -9
- data/lib/datasets/adult.rb +4 -3
- data/lib/datasets/cifar.rb +4 -12
- data/lib/datasets/cldr-plurals.rb +385 -0
- data/lib/datasets/communities.rb +198 -0
- data/lib/datasets/dataset.rb +20 -1
- data/lib/datasets/downloader.rb +54 -26
- data/lib/datasets/e-stat-japan.rb +320 -0
- data/lib/datasets/error.rb +4 -0
- data/lib/datasets/hepatitis.rb +207 -0
- data/lib/datasets/libsvm-dataset-list.rb +277 -0
- data/lib/datasets/libsvm.rb +135 -0
- data/lib/datasets/mnist.rb +0 -2
- data/lib/datasets/mushroom.rb +256 -0
- data/lib/datasets/penguins.rb +146 -0
- data/lib/datasets/postal-code-japan.rb +154 -0
- data/lib/datasets/rdatasets.rb +95 -0
- data/lib/datasets/table.rb +83 -3
- data/lib/datasets/tar_gz_readable.rb +14 -0
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia.rb +2 -10
- data/red-datasets.gemspec +4 -0
- data/test/run-test.rb +2 -0
- data/test/test-cldr-plurals.rb +180 -0
- data/test/test-communities.rb +290 -0
- data/test/test-dataset.rb +27 -0
- data/test/test-downloader.rb +29 -0
- data/test/test-e-stat-japan.rb +383 -0
- data/test/test-hepatitis.rb +74 -0
- data/test/test-libsvm-dataset-list.rb +47 -0
- data/test/test-libsvm.rb +205 -0
- data/test/test-mushroom.rb +80 -0
- data/test/test-penguins.rb +251 -0
- data/test/test-postal-code-japan.rb +69 -0
- data/test/test-rdatasets.rb +136 -0
- data/test/test-table.rb +123 -18
- metadata +88 -11
@@ -0,0 +1,154 @@
|
|
1
|
+
require "csv"
|
2
|
+
require "zip"
|
3
|
+
|
4
|
+
require_relative "dataset"
|
5
|
+
|
6
|
+
module Datasets
|
7
|
+
# Postal code table for Japan, downloaded from the Japan Post site.
#
# The +reading+ keyword selects which upstream archive is used:
# +:lowercase+ (the "kogaki" archive), +:uppercase+ (the "oogaki"
# archive) or +:romaji+ (the "roman" archive).
class PostalCodeJapan < Dataset
  # One row of the postal code table.
  #
  # The +have_*+, +postal_code_is_shared+ and +changed+ members hold
  # booleans and are also available through "?"-suffixed aliases.
  class Record < Struct.new(:organization_code,
                            :old_postal_code,
                            :postal_code,
                            :prefecture_reading,
                            :city_reading,
                            :address_reading,
                            :prefecture,
                            :city,
                            :address,
                            :have_multiple_postal_codes,
                            :have_address_number_per_koaza,
                            :have_chome,
                            :postal_code_is_shared,
                            :changed,
                            :change_reason)
    alias_method :have_multiple_postal_codes?,
                 :have_multiple_postal_codes
    alias_method :have_address_number_per_koaza?,
                 :have_address_number_per_koaza
    alias_method :have_chome?,
                 :have_chome
    alias_method :postal_code_is_shared?,
                 :postal_code_is_shared
    alias_method :changed?,
                 :changed
  end

  # Values accepted for the +reading+ keyword of #initialize.
  VALID_READINGS = [
    :lowercase,
    :uppercase,
    :romaji,
  ]

  # @param reading [Symbol] one of VALID_READINGS.
  # @raise [ArgumentError] when +reading+ is not in VALID_READINGS.
  def initialize(reading: :lowercase)
    super()
    @reading = reading
    unless VALID_READINGS.include?(@reading)
      # Build the message with interpolation instead of appending to a
      # string literal, so this keeps working when literals are frozen.
      valid_readings = VALID_READINGS.collect(&:inspect).join(", ")
      message = ":reading must be one of [#{valid_readings}]: #{@reading.inspect}"
      raise ArgumentError, message
    end
    @metadata.id = "postal-code-japan-#{@reading}"
    @metadata.name = "Postal code in Japan (#{@reading})"
    @metadata.url = "https://www.post.japanpost.jp/zipcode/download.html"
    @metadata.licenses = [
      "CC0-1.0",
    ]
    @metadata.description = "Postal code in Japan (reading: #{@reading})"
  end

  # Yields one Record per CSV row of every file entry in the archive.
  # Returns an Enumerator when no block is given.
  def each(&block)
    return to_enum(__method__) unless block_given?

    open_data do |input|
      # The upstream CSV is CP932 encoded; convert it before parsing.
      utf8_data = input.read.encode(Encoding::UTF_8, Encoding::CP932)
      options = {
        # Quote handling is disabled and the double quotes are stripped
        # from each field instead.
        quote_char: nil,
        strip: %Q["],
      }
      CSV.parse(utf8_data, **options) do |row|
        if @reading == :romaji
          yield(build_romaji_record(row))
        else
          yield(build_record(row))
        end
      end
    end
  end

  private
  # Builds a Record from a row of the romaji CSV. That file only has
  # seven columns, so the remaining members are nil or false.
  def build_romaji_record(row)
    Record.new(nil,
               nil,
               row[0],
               row[4],
               row[5],
               row[6],
               row[1],
               row[2],
               row[3],
               false,
               false,
               false,
               false,
               false,
               nil)
  end

  # Builds a Record from a row of the 15-column CSV used by the
  # :lowercase and :uppercase readings.
  def build_record(row)
    Record.new(row[0],
               row[1].rstrip,
               row[2],
               row[3],
               row[4],
               row[5],
               row[6],
               row[7],
               row[8],
               (row[9] == "1"),
               (row[10] == "1"),
               (row[11] == "1"),
               (row[12] == "1"),
               (row[13] != "0"),
               convert_change_reason(row[14]))
  end

  # Downloads the archive for the selected reading into the cache
  # directory (unless it is already there) and yields an input stream
  # for every file entry in the zip.
  def open_data
    base_url = "https://www.post.japanpost.jp/zipcode/dl"
    archive_path =
      case @reading
      when :lowercase
        "/kogaki/zip/ken_all.zip"
      when :uppercase
        "/oogaki/zip/ken_all.zip"
      when :romaji
        "/roman/ken_all_rome.zip"
      end
    # Interpolate rather than appending to the literal with <<.
    data_url = "#{base_url}#{archive_path}"
    data_path = cache_dir_path + "#{@reading}-ken-all.zip"
    unless data_path.exist?
      download(data_path, data_url)
    end

    Zip::File.open(data_path.to_s) do |zip_file|
      zip_file.each do |entry|
        next unless entry.file?
        entry.get_input_stream do |input|
          yield(input)
        end
      end
    end
  end

  # Maps the change reason column ("0".."6") to nil or a Symbol.
  # Any other value becomes :unknown.
  def convert_change_reason(reason)
    case reason
    when "0"
      nil
    when "1"
      :new
    when "2"
      :japanese_addressing_system
    when "3"
      :land_readjustment
    when "4"
      :postal_district_adjustment
    when "5"
      :correction
    when "6"
      :deletion
    else
      :unknown
    end
  end
end
|
154
|
+
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
require_relative "dataset"
|
2
|
+
require_relative "tar_gz_readable"
|
3
|
+
|
4
|
+
module Datasets
|
5
|
+
# Index of the datasets listed in the Rdatasets "datasets.csv" file.
class RdatasetsList < Dataset
  # One row of the index; members follow the column order of the CSV.
  Record = Struct.new(:package, :dataset, :title,
                      :rows, :cols,
                      :n_binary, :n_character, :n_factor,
                      :n_logical, :n_numeric,
                      :csv, :doc)

  def initialize
    super
    @metadata.id = "rdatasets"
    @metadata.name = "Rdatasets"
    @metadata.url = "https://vincentarelbundock.github.io/Rdatasets/"
    @metadata.licenses = ["GPL-3"]
    @data_url = "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/datasets.csv"
    @data_path = cache_dir_path + "datasets.csv"
  end

  # Yields a Record for every index row whose "Package" and "Item"
  # columns equal the given values. A nil (or false) keyword places no
  # restriction on its column. Returns an Enumerator without a block.
  def filter(package: nil, dataset: nil)
    unless block_given?
      return to_enum(__method__, package: package, dataset: dataset)
    end

    requested = {"Package" => package, "Item" => dataset}
    conditions = requested.select {|_header, expected| expected }
    each_row do |row|
      # With no conditions, all? is true and every row is yielded.
      next unless conditions.all? {|header, expected| row[header] == expected }
      yield Record.new(*row.fields)
    end
  end

  # Yields every Record of the index.
  def each(&block)
    filter(&block)
  end

  # Downloads the index on first use and passes each CSV row to the block.
  def each_row(&block)
    download(@data_path, @data_url) unless @data_path.exist?
    CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
      csv.each(&block)
    end
  end
  private :each_row
end
|
57
|
+
|
58
|
+
# A single dataset from Rdatasets, looked up by package and dataset name.
class Rdatasets < Dataset
  # @param package_name [String] value of the "Package" column in the index.
  # @param dataset_name [String] value of the "Item" column in the index.
  # @raise [ArgumentError] when the index has no matching entry.
  def initialize(package_name, dataset_name)
    info = RdatasetsList.new.filter(package: package_name,
                                    dataset: dataset_name).first
    raise ArgumentError, "Unable to locate dataset #{package_name}/#{dataset_name}" unless info

    super()
    @metadata.id = "rdatasets-#{package_name}-#{dataset_name}"
    @metadata.name = "Rdatasets: #{package_name}: #{dataset_name}"
    @metadata.url = info.csv
    @metadata.licenses = ["GPL-3"]
    @metadata.description = info.title

    # The CSV is cached as "<dataset_name>.csv" under this dataset's
    # cache directory.
    @data_path = cache_dir_path + "#{dataset_name}.csv"

    @package_name = package_name
    @dataset_name = dataset_name
  end

  # Yields each CSV row as a Hash with Symbol keys, after dropping the
  # column whose header is the empty string. Returns an Enumerator
  # when no block is given.
  def each(&block)
    return to_enum(__method__) unless block_given?

    download(@data_path, @metadata.url) unless @data_path.exist?
    CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
      csv.each do |row|
        fields = row.to_h
        fields.delete("")
        yield fields.transform_keys(&:to_sym)
      end
    end
  end
end
|
95
|
+
end
|
data/lib/datasets/table.rb
CHANGED
@@ -2,19 +2,99 @@ require "datasets/dictionary"
|
|
2
2
|
|
3
3
|
module Datasets
|
4
4
|
class Table
|
5
|
+
# A view of one row of a table. It keeps the table and the row index
# and looks each value up through the table's columns on demand.
class Record
  include Enumerable

  # @param table the table this row belongs to; it is asked for #[],
  #   #each_column and #dataset.
  # @param index [Integer] position of the row inside each column.
  def initialize(table, index)
    @table = table
    @index = index
  end

  # Returns this row's value in the column selected by name or index.
  def [](column_name_or_column_index)
    column = @table[column_name_or_column_index]
    column[@index]
  end

  # Yields (column name, value) for every column of the table.
  # Returns an Enumerator when no block is given.
  def each
    return to_enum(__method__) unless block_given?
    @table.each_column.each {|name, column| yield(name, column[@index]) }
  end

  # Returns this row's values in column order.
  def values
    @table.each_column.map {|_name, column| column[@index] }
  end

  # Returns a Hash mapping each column name to this row's value.
  def to_h
    each_with_object({}) do |(name, value), row|
      row[name] = value
    end
  end

  def inspect
    dataset_name = @table.dataset.metadata.name
    "#<#{self.class.name} #{dataset_name}[#{@index}] #{to_h.inspect}>"
  end
end
|
42
|
+
|
5
43
|
include Enumerable
|
6
44
|
|
45
|
+
attr_reader :dataset
|
7
46
|
def initialize(dataset)
|
8
47
|
@dataset = dataset
|
9
48
|
@dictionaries = {}
|
10
49
|
end
|
11
50
|
|
12
|
-
def
|
51
|
+
def n_columns
|
52
|
+
columner_data.size
|
53
|
+
end
|
54
|
+
alias_method :size, :n_columns
|
55
|
+
alias_method :length, :n_columns
|
56
|
+
|
57
|
+
# Number of rows, taken from the size of the first column.
# Returns 0 when there are no columns.
def n_rows
  leading_column = columner_data.first
  if leading_column.nil?
    0
  else
    # leading_column is a [name, values] pair.
    leading_column[1].size
  end
end
|
62
|
+
|
63
|
+
def column_names
|
64
|
+
columner_data.keys
|
65
|
+
end
|
66
|
+
|
67
|
+
def each_column(&block)
|
13
68
|
columner_data.each(&block)
|
14
69
|
end
|
70
|
+
alias_method :each, :each_column
|
15
71
|
|
16
|
-
def
|
17
|
-
|
72
|
+
# Yields a Record for every row index from 0 up to n_rows - 1.
# Returns an Enumerator when no block is given.
def each_record
  unless block_given?
    return to_enum(__method__)
  end
  n_rows.times {|row_index| yield(Record.new(self, row_index)) }
end
|
78
|
+
|
79
|
+
# Returns the Record at +row+, or nil when +row+ is out of range.
# A negative +row+ counts back from the last row.
def find_record(row)
  total = n_rows
  index = row < 0 ? row + total : row
  return nil if index < 0 || index >= total
  Record.new(self, index)
end
|
85
|
+
|
86
|
+
# Returns the values of one column.
#
# An Integer selects the column by its position in iteration order;
# nil is returned when no column has that position (negative positions
# never match). Any other argument is normalized with normalize_name
# and used as the column key.
def [](name_or_index)
  unless name_or_index.is_a?(Integer)
    return columner_data[normalize_name(name_or_index)]
  end

  columner_data.each_with_index do |(_column_name, column_values), position|
    return column_values if position == name_or_index
  end
  nil
end
|
19
99
|
|
20
100
|
def dictionary_encode(name)
|
data/lib/datasets/version.rb
CHANGED
data/lib/datasets/wikipedia.rb
CHANGED
@@ -52,7 +52,7 @@ module Datasets
|
|
52
52
|
end
|
53
53
|
|
54
54
|
private
|
55
|
-
def open_data
|
55
|
+
def open_data(&block)
|
56
56
|
base_name = "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
|
57
57
|
data_path = cache_dir_path + base_name
|
58
58
|
unless data_path.exist?
|
@@ -60,15 +60,7 @@ module Datasets
|
|
60
60
|
download(data_path, data_url)
|
61
61
|
end
|
62
62
|
|
63
|
-
|
64
|
-
pid = spawn("bzcat", data_path.to_s, {:out => output})
|
65
|
-
begin
|
66
|
-
output.close
|
67
|
-
yield(input)
|
68
|
-
ensure
|
69
|
-
input.close
|
70
|
-
Process.waitpid(pid)
|
71
|
-
end
|
63
|
+
extract_bz2(data_path, &block)
|
72
64
|
end
|
73
65
|
|
74
66
|
def type_in_path
|
data/red-datasets.gemspec
CHANGED
@@ -34,6 +34,10 @@ Gem::Specification.new do |spec|
|
|
34
34
|
spec.files += Dir.glob("doc/text/*")
|
35
35
|
spec.test_files += Dir.glob("test/**/*")
|
36
36
|
|
37
|
+
spec.add_runtime_dependency("csv", ">= 3.0.5")
|
38
|
+
spec.add_runtime_dependency("rexml")
|
39
|
+
spec.add_runtime_dependency("rubyzip")
|
40
|
+
|
37
41
|
spec.add_development_dependency("bundler")
|
38
42
|
spec.add_development_dependency("rake")
|
39
43
|
spec.add_development_dependency("test-unit")
|
data/test/run-test.rb
CHANGED
@@ -0,0 +1,180 @@
|
|
1
|
+
# Tests for Datasets::CLDRPlurals: checks the number of locales and the
# parsed rules of the first and the fourth-from-last locale.
class CLDRPluralsTest < Test::Unit::TestCase
  def setup
    @dataset = Datasets::CLDRPlurals.new
  end

  # Shorthand for building an expected locale.
  def locale(*args)
    Datasets::CLDRPlurals::Locale.new(*args)
  end

  # Shorthand for building an expected rule.
  def rule(*args)
    Datasets::CLDRPlurals::Rule.new(*args)
  end

  test("#each") do
    locales = @dataset.each.to_a

    expected_bm =
      locale("bm",
             [
               rule("other",
                    nil,
                    [0..15, 100, 1000, 10000, 100000, 1000000, :elipsis],
                    [0.0..1.5, 10.0, 100.0, 1000.0, 10000.0, 100000.0, 1000000.0, :elipsis]),
             ])

    # Condition of the "two" rule of the "kw" locale.
    kw_two_condition =
      [:or,
       [:equal, [:mod, "n", 100], [2, 22, 42, 62, 82]],
       [:and,
        [:equal, [:mod, "n", 1000], [0]],
        [:equal, [:mod, "n", 100000], [1000..20000, 40000, 60000, 80000]]],
       [:and,
        [:not_equal, "n", [0]],
        [:equal, [:mod, "n", 1000000], [100000]]]]

    expected_kw =
      locale("kw",
             [
               rule("zero",
                    [:equal, "n", [0]],
                    [0],
                    [0.0, 0.00, 0.000, 0.0000]),
               rule("one",
                    [:equal, "n", [1]],
                    [1],
                    [1.0, 1.00, 1.000, 1.0000]),
               rule("two",
                    kw_two_condition,
                    [2, 22, 42, 62, 82, 102, 122, 142, 1000, 10000, 100000, :elipsis],
                    [2.0, 22.0, 42.0, 62.0, 82.0, 102.0, 122.0, 142.0, 1000.0, 10000.0, 100000.0, :elipsis]),
               rule("few",
                    [:equal, [:mod, "n", 100], [3, 23, 43, 63, 83]],
                    [3, 23, 43, 63, 83, 103, 123, 143, 1003, :elipsis],
                    [3.0, 23.0, 43.0, 63.0, 83.0, 103.0, 123.0, 143.0, 1003.0, :elipsis]),
               rule("many",
                    [:and,
                     [:not_equal, "n", [1]],
                     [:equal, [:mod, "n", 100], [1, 21, 41, 61, 81]]],
                    [21, 41, 61, 81, 101, 121, 141, 161, 1001, :elipsis],
                    [21.0, 41.0, 61.0, 81.0, 101.0, 121.0, 141.0, 161.0, 1001.0, :elipsis]),
               rule("other",
                    nil,
                    [4..19, 100, 1004, 1000000, :elipsis],
                    [0.1..0.9, 1.1..1.7, 10.0, 100.0, 1000.1, 1000000.0, :elipsis]),
             ])

    assert_equal([215, expected_bm, expected_kw],
                 [locales.size, locales[0], locales[-4]])
  end

  sub_test_case("#metadata") do
    test("#description") do
      description = @dataset.metadata.description
      assert do
        description.start_with?("Language plural rules in Unicode Common Locale Data Repository.")
      end
    end
  end
end
|