red-datasets 0.0.6 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +23 -7
- data/doc/text/news.md +124 -0
- data/lib/datasets.rb +18 -6
- data/lib/datasets/adult.rb +84 -0
- data/lib/datasets/cldr-plurals.rb +385 -0
- data/lib/datasets/communities.rb +198 -0
- data/lib/datasets/dataset.rb +13 -0
- data/lib/datasets/dictionary.rb +59 -0
- data/lib/datasets/downloader.rb +37 -62
- data/lib/datasets/e-stat-japan.rb +320 -0
- data/lib/datasets/error.rb +4 -0
- data/lib/datasets/fashion-mnist.rb +12 -0
- data/lib/datasets/hepatitis.rb +207 -0
- data/lib/datasets/iris.rb +1 -1
- data/lib/datasets/libsvm-dataset-list.rb +277 -0
- data/lib/datasets/libsvm.rb +135 -0
- data/lib/datasets/mnist.rb +11 -8
- data/lib/datasets/mushroom.rb +256 -0
- data/lib/datasets/penguins.rb +125 -0
- data/lib/datasets/penn-treebank.rb +2 -9
- data/lib/datasets/postal-code-japan.rb +154 -0
- data/lib/datasets/table.rb +99 -3
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia.rb +2 -10
- data/lib/datasets/wine.rb +64 -0
- data/red-datasets.gemspec +4 -0
- data/test/helper.rb +1 -0
- data/test/run-test.rb +2 -0
- data/test/test-adult.rb +126 -0
- data/test/test-cldr-plurals.rb +180 -0
- data/test/test-communities.rb +290 -0
- data/test/test-dictionary.rb +43 -0
- data/test/test-e-stat-japan.rb +383 -0
- data/test/test-fashion-mnist.rb +137 -0
- data/test/test-hepatitis.rb +74 -0
- data/test/test-libsvm-dataset-list.rb +47 -0
- data/test/test-libsvm.rb +205 -0
- data/test/test-mnist.rb +95 -70
- data/test/test-mushroom.rb +80 -0
- data/test/test-penguins.rb +239 -0
- data/test/test-penn-treebank.rb +6 -6
- data/test/test-postal-code-japan.rb +69 -0
- data/test/test-table.rb +144 -19
- data/test/test-wine.rb +58 -0
- metadata +89 -8
@@ -2,7 +2,7 @@ require_relative "dataset"
|
|
2
2
|
|
3
3
|
module Datasets
|
4
4
|
class PennTreebank < Dataset
|
5
|
-
Record = Struct.new(:word
|
5
|
+
Record = Struct.new(:word)
|
6
6
|
|
7
7
|
DESCRIPTION = <<~DESC
|
8
8
|
`Penn Tree Bank <https://www.cis.upenn.edu/~treebank/>`_ is originally a
|
@@ -46,17 +46,10 @@ module Datasets
|
|
46
46
|
|
47
47
|
private
|
48
48
|
def parse_data(data_path)
|
49
|
-
index = 0
|
50
|
-
vocabulary = {}
|
51
49
|
File.open(data_path) do |f|
|
52
50
|
f.each_line do |line|
|
53
51
|
line.split.each do |word|
|
54
|
-
word
|
55
|
-
unless vocabulary.key?(word)
|
56
|
-
vocabulary[word] = index
|
57
|
-
index += 1
|
58
|
-
end
|
59
|
-
yield(Record.new(word, vocabulary[word]))
|
52
|
+
yield(Record.new(word.strip))
|
60
53
|
end
|
61
54
|
end
|
62
55
|
end
|
@@ -0,0 +1,154 @@
|
|
1
|
+
require "csv"
|
2
|
+
require "zip"
|
3
|
+
|
4
|
+
require_relative "dataset"
|
5
|
+
|
6
|
+
module Datasets
|
7
|
+
class PostalCodeJapan < Dataset
|
8
|
+
# One postal code entry, stored as a Struct with fifteen members.
#
# Each of the five flag-like members listed below also gets a
# predicate-style alias with a trailing "?" (for example +changed?+
# is an alias of +changed+).
class Record < Struct.new(:organization_code,
                          :old_postal_code,
                          :postal_code,
                          :prefecture_reading,
                          :city_reading,
                          :address_reading,
                          :prefecture,
                          :city,
                          :address,
                          :have_multiple_postal_codes,
                          :have_address_number_per_koaza,
                          :have_chome,
                          :postal_code_is_shared,
                          :changed,
                          :change_reason)
  [
    :have_multiple_postal_codes,
    :have_address_number_per_koaza,
    :have_chome,
    :postal_code_is_shared,
    :changed,
  ].each do |flag_name|
    alias_method :"#{flag_name}?", flag_name
  end
end
|
34
|
+
|
35
|
+
VALID_READINGS = [
|
36
|
+
:lowercase,
|
37
|
+
:uppercase,
|
38
|
+
:romaji,
|
39
|
+
]
|
40
|
+
# Sets up the dataset for one of the readings in VALID_READINGS.
#
# @param reading [Symbol] which variant of the data to use
#   (defaults to :lowercase).
# @raise [ArgumentError] if +reading+ is not in VALID_READINGS.
def initialize(reading: :lowercase)
  super()
  @reading = reading
  unless VALID_READINGS.include?(@reading)
    # Build the message with interpolation instead of appending to a
    # string literal, so this keeps working when literals are frozen.
    valid_readings = VALID_READINGS.collect(&:inspect).join(", ")
    message =
      ":reading must be one of [#{valid_readings}]: #{@reading.inspect}"
    raise ArgumentError, message
  end
  @metadata.id = "postal-code-japan-#{@reading}"
  @metadata.name = "Postal code in Japan (#{@reading})"
  @metadata.url = "https://www.post.japanpost.jp/zipcode/download.html"
  @metadata.licenses = [
    "CC0-1.0",
  ]
  @metadata.description = "Postal code in Japan (reading: #{@reading})"
end
|
57
|
+
|
58
|
+
# Yields one Record per CSV row of every input handed over by
# +open_data+. Returns an Enumerator when called without a block.
#
# The raw bytes are transcoded from CP932 to UTF-8 before parsing.
# Rows of the :romaji reading are read as seven columns; rows of the
# other readings are read as fifteen columns.
def each(&block)
  return to_enum(__method__) unless block_given?

  # No quote character: surrounding double quotes are stripped from
  # each field instead of being interpreted as CSV quoting.
  csv_options = {
    quote_char: nil,
    strip: %Q["],
  }
  romaji = (@reading == :romaji)
  open_data do |input|
    utf8_data = input.read.encode(Encoding::UTF_8, Encoding::CP932)
    CSV.parse(utf8_data, **csv_options) do |row|
      record =
        if romaji
          postal_code, prefecture, city, address,
            prefecture_reading, city_reading, address_reading = row
          Record.new(nil,
                     nil,
                     postal_code,
                     prefecture_reading,
                     city_reading,
                     address_reading,
                     prefecture,
                     city,
                     address,
                     false,
                     false,
                     false,
                     false,
                     false,
                     nil)
        else
          organization_code = row[0]
          old_postal_code = row[1].rstrip
          flags = row.values_at(9, 10, 11, 12).map { |flag| flag == "1" }
          changed = (row[13] != "0")
          Record.new(organization_code,
                     old_postal_code,
                     *row.values_at(2, 3, 4, 5, 6, 7, 8),
                     *flags,
                     changed,
                     convert_change_reason(row[14]))
        end
      yield(record)
    end
  end
end
|
106
|
+
|
107
|
+
private
|
108
|
+
# Downloads the zip archive for the current reading into the cache
# directory unless the cached file already exists, then yields an
# input stream for every file entry found in the archive.
def open_data
  base_url = "https://www.post.japanpost.jp/zipcode/dl"
  # Compose the URL as a new string rather than appending to a string
  # literal, so this keeps working when literals are frozen.
  data_url =
    case @reading
    when :lowercase
      "#{base_url}/kogaki/zip/ken_all.zip"
    when :uppercase
      "#{base_url}/oogaki/zip/ken_all.zip"
    when :romaji
      "#{base_url}/roman/ken_all_rome.zip"
    else
      base_url
    end
  data_path = cache_dir_path + "#{@reading}-ken-all.zip"
  unless data_path.exist?
    download(data_path, data_url)
  end

  Zip::File.open(data_path.to_s) do |zip_file|
    zip_file.each do |entry|
      # Skip anything in the archive that is not a regular file.
      next unless entry.file?
      entry.get_input_stream do |input|
        yield(input)
      end
    end
  end
end
|
132
|
+
|
133
|
+
# Maps the change reason code found in the data to a Symbol.
#
# "0" maps to nil, "1" to "6" map to the symbols below, and any
# other value maps to :unknown.
def convert_change_reason(reason)
  case reason
  when "0" then nil
  when "1" then :new
  when "2" then :japanese_addressing_system
  when "3" then :land_readjustment
  when "4" then :postal_district_adjustment
  when "5" then :correction
  when "6" then :deletion
  else :unknown
  end
end
|
153
|
+
end
|
154
|
+
end
|
data/lib/datasets/table.rb
CHANGED
@@ -1,17 +1,109 @@
|
|
1
|
+
require "datasets/dictionary"
|
2
|
+
|
1
3
|
module Datasets
|
2
4
|
class Table
|
5
|
+
# A view of a single row of a table: it keeps a reference to the
# table plus a row index and reads values out of the table's columns
# on demand.
class Record
  include Enumerable

  def initialize(table, index)
    @table, @index = table, index
  end

  # Returns this row's value in the column selected by name or index.
  def [](column_name_or_column_index)
    column = @table[column_name_or_column_index]
    column[@index]
  end

  # Yields (column name, value) for every column of the table.
  # Returns an Enumerator when called without a block.
  def each
    return to_enum(__method__) unless block_given?
    @table.each_column.each do |name, column|
      yield(name, column[@index])
    end
  end

  # Returns this row's values in column order.
  def values
    @table.each_column.map { |_name, column| column[@index] }
  end

  # Returns a Hash of column name => value for this row.
  def to_h
    each_with_object({}) do |(name, value), row|
      row[name] = value
    end
  end

  def inspect
    dataset_name = @table.dataset.metadata.name
    "#<#{self.class.name} #{dataset_name}[#{@index}] #{to_h.inspect}>"
  end
end
|
42
|
+
|
3
43
|
include Enumerable
|
4
44
|
|
45
|
+
attr_reader :dataset
|
5
46
|
def initialize(dataset)
|
6
47
|
@dataset = dataset
|
48
|
+
@dictionaries = {}
|
7
49
|
end
|
8
50
|
|
9
|
-
def
|
51
|
+
def n_columns
|
52
|
+
columner_data.size
|
53
|
+
end
|
54
|
+
alias_method :size, :n_columns
|
55
|
+
alias_method :length, :n_columns
|
56
|
+
|
57
|
+
# Returns the number of values in the first column, or 0 when the
# table has no columns.
def n_rows
  first_column = columner_data.first
  first_column.nil? ? 0 : first_column[1].size
end
|
62
|
+
|
63
|
+
def column_names
|
64
|
+
columner_data.keys
|
65
|
+
end
|
66
|
+
|
67
|
+
def each_column(&block)
|
10
68
|
columner_data.each(&block)
|
11
69
|
end
|
70
|
+
alias_method :each, :each_column
|
71
|
+
|
72
|
+
def each_record
|
73
|
+
return to_enum(__method__) unless block_given?
|
74
|
+
n_rows.times do |i|
|
75
|
+
yield(Record.new(self, i))
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
# Returns the Record at position +row+, or nil when +row+ is out of
# range. A negative +row+ counts back from the last row.
def find_record(row)
  total = n_rows
  index = row < 0 ? row + total : row
  return nil if index < 0 || index >= total
  Record.new(self, index)
end
|
85
|
+
|
86
|
+
# Returns the values of one column.
#
# An Integer argument selects the column by zero-based position and
# yields nil when no column sits at that position (negative
# positions included). Any other argument is normalized with
# +normalize_name+ and looked up by name.
def [](name_or_index)
  unless name_or_index.is_a?(Integer)
    return columner_data[normalize_name(name_or_index)]
  end
  return nil if name_or_index.negative?
  columner_data.values[name_or_index]
end
|
12
99
|
|
13
|
-
def
|
14
|
-
|
100
|
+
def dictionary_encode(name)
|
101
|
+
@dictionaries[normalize_name(name)] ||= Dictionary.new(self[name])
|
102
|
+
end
|
103
|
+
|
104
|
+
def label_encode(name)
|
105
|
+
dictionary = dictionary_encode(name)
|
106
|
+
dictionary.encode(self[name])
|
15
107
|
end
|
16
108
|
|
17
109
|
def fetch_values(*keys)
|
@@ -55,5 +147,9 @@ module Datasets
|
|
55
147
|
def columner_data
|
56
148
|
@columns ||= to_h
|
57
149
|
end
|
150
|
+
|
151
|
+
def normalize_name(name)
|
152
|
+
name.to_sym
|
153
|
+
end
|
58
154
|
end
|
59
155
|
end
|
data/lib/datasets/version.rb
CHANGED
data/lib/datasets/wikipedia.rb
CHANGED
@@ -52,7 +52,7 @@ module Datasets
|
|
52
52
|
end
|
53
53
|
|
54
54
|
private
|
55
|
-
def open_data
|
55
|
+
def open_data(&block)
|
56
56
|
base_name = "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
|
57
57
|
data_path = cache_dir_path + base_name
|
58
58
|
unless data_path.exist?
|
@@ -60,15 +60,7 @@ module Datasets
|
|
60
60
|
download(data_path, data_url)
|
61
61
|
end
|
62
62
|
|
63
|
-
|
64
|
-
pid = spawn("bzcat", data_path.to_s, {:out => output})
|
65
|
-
begin
|
66
|
-
output.close
|
67
|
-
yield(input)
|
68
|
-
ensure
|
69
|
-
input.close
|
70
|
-
Process.waitpid(pid)
|
71
|
-
end
|
63
|
+
extract_bz2(data_path, &block)
|
72
64
|
end
|
73
65
|
|
74
66
|
def type_in_path
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
3
|
+
require_relative 'dataset'
|
4
|
+
|
5
|
+
module Datasets
|
6
|
+
class Wine < Dataset
|
7
|
+
Record = Struct.new(:label,
|
8
|
+
:alcohol,
|
9
|
+
:malic_acid,
|
10
|
+
:ash,
|
11
|
+
:alcalinity_of_ash,
|
12
|
+
:n_magnesiums,
|
13
|
+
:total_phenols,
|
14
|
+
:total_flavonoids,
|
15
|
+
:total_nonflavanoid_phenols,
|
16
|
+
:total_proanthocyanins,
|
17
|
+
:color_intensity,
|
18
|
+
:hue,
|
19
|
+
:optical_nucleic_acid_concentration,
|
20
|
+
:n_prolines)
|
21
|
+
|
22
|
+
def initialize
|
23
|
+
super
|
24
|
+
@metadata.id = 'wine'
|
25
|
+
@metadata.name = 'Wine'
|
26
|
+
@metadata.url = 'http://archive.ics.uci.edu/ml/datasets/wine'
|
27
|
+
@metadata.description = -> { read_names }
|
28
|
+
end
|
29
|
+
|
30
|
+
# Yields a Record for every row supplied by +open_data+, skipping
# rows whose first field is nil. Returns an Enumerator when called
# without a block.
def each
  return enum_for(__method__) unless block_given?

  open_data do |csv|
    csv.each do |fields|
      yield(Record.new(*fields)) unless fields[0].nil?
    end
  end
end
|
41
|
+
|
42
|
+
private
|
43
|
+
|
44
|
+
def read_names
|
45
|
+
names_path = cache_dir_path + 'wine.names'
|
46
|
+
unless names_path.exist?
|
47
|
+
names_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.names'
|
48
|
+
download(names_path, names_url)
|
49
|
+
end
|
50
|
+
names_path.read
|
51
|
+
end
|
52
|
+
|
53
|
+
def open_data
|
54
|
+
data_path = cache_dir_path + 'wine.data'
|
55
|
+
unless data_path.exist?
|
56
|
+
data_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
|
57
|
+
download(data_path, data_url)
|
58
|
+
end
|
59
|
+
CSV.open(data_path, converters: %i[numeric]) do |csv|
|
60
|
+
yield(csv)
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
data/red-datasets.gemspec
CHANGED
@@ -34,6 +34,10 @@ Gem::Specification.new do |spec|
|
|
34
34
|
spec.files += Dir.glob("doc/text/*")
|
35
35
|
spec.test_files += Dir.glob("test/**/*")
|
36
36
|
|
37
|
+
spec.add_runtime_dependency("csv", ">= 3.0.5")
|
38
|
+
spec.add_runtime_dependency("rexml")
|
39
|
+
spec.add_runtime_dependency("rubyzip")
|
40
|
+
|
37
41
|
spec.add_development_dependency("bundler")
|
38
42
|
spec.add_development_dependency("rake")
|
39
43
|
spec.add_development_dependency("test-unit")
|
data/test/helper.rb
CHANGED
data/test/run-test.rb
CHANGED
data/test/test-adult.rb
ADDED
@@ -0,0 +1,126 @@
|
|
1
|
+
class AdultTest < Test::Unit::TestCase
|
2
|
+
sub_test_case("train") do
|
3
|
+
def setup
|
4
|
+
@dataset = Datasets::Adult.new(type: :train)
|
5
|
+
end
|
6
|
+
|
7
|
+
def record(*args)
|
8
|
+
Datasets::Adult::Record.new(*args)
|
9
|
+
end
|
10
|
+
|
11
|
+
test("#each") do
|
12
|
+
records = @dataset.each.to_a
|
13
|
+
assert_equal([
|
14
|
+
32561,
|
15
|
+
{
|
16
|
+
:age => 39,
|
17
|
+
:work_class => "State-gov",
|
18
|
+
:final_weight => 77516,
|
19
|
+
:education => "Bachelors",
|
20
|
+
:n_education_years => 13,
|
21
|
+
:marital_status => "Never-married",
|
22
|
+
:occupation => "Adm-clerical",
|
23
|
+
:relationship => "Not-in-family",
|
24
|
+
:race => "White",
|
25
|
+
:sex => "Male",
|
26
|
+
:capital_gain => 2174,
|
27
|
+
:capital_loss => 0,
|
28
|
+
:hours_per_week => 40,
|
29
|
+
:native_country => "United-States",
|
30
|
+
:label => "<=50K"
|
31
|
+
},
|
32
|
+
{
|
33
|
+
:age => 52,
|
34
|
+
:work_class => "Self-emp-inc",
|
35
|
+
:final_weight => 287927,
|
36
|
+
:education => "HS-grad",
|
37
|
+
:n_education_years => 9,
|
38
|
+
:marital_status => "Married-civ-spouse",
|
39
|
+
:occupation => "Exec-managerial",
|
40
|
+
:relationship => "Wife",
|
41
|
+
:race => "White",
|
42
|
+
:sex => "Female",
|
43
|
+
:capital_gain => 15024,
|
44
|
+
:capital_loss => 0,
|
45
|
+
:hours_per_week => 40,
|
46
|
+
:native_country => "United-States",
|
47
|
+
:label => ">50K"
|
48
|
+
}
|
49
|
+
],
|
50
|
+
[
|
51
|
+
records.size,
|
52
|
+
records[0].to_h,
|
53
|
+
records[-1].to_h
|
54
|
+
])
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
sub_test_case("test") do
|
59
|
+
def setup
|
60
|
+
@dataset = Datasets::Adult.new(type: :test)
|
61
|
+
end
|
62
|
+
|
63
|
+
def record(*args)
|
64
|
+
Datasets::Adult::Record.new(*args)
|
65
|
+
end
|
66
|
+
|
67
|
+
test("#each") do
|
68
|
+
records = @dataset.each.to_a
|
69
|
+
assert_equal([
|
70
|
+
16281,
|
71
|
+
{
|
72
|
+
:age => 25,
|
73
|
+
:work_class => "Private",
|
74
|
+
:final_weight => 226802,
|
75
|
+
:education => "11th",
|
76
|
+
:n_education_years => 7,
|
77
|
+
:marital_status => "Never-married",
|
78
|
+
:occupation => "Machine-op-inspct",
|
79
|
+
:relationship => "Own-child",
|
80
|
+
:race => "Black",
|
81
|
+
:sex => "Male",
|
82
|
+
:capital_gain => 0,
|
83
|
+
:capital_loss => 0,
|
84
|
+
:hours_per_week => 40,
|
85
|
+
:native_country => "United-States",
|
86
|
+
:label => "<=50K."
|
87
|
+
},
|
88
|
+
{
|
89
|
+
:age => 35,
|
90
|
+
:work_class => "Self-emp-inc",
|
91
|
+
:final_weight => 182148,
|
92
|
+
:education => "Bachelors",
|
93
|
+
:n_education_years => 13,
|
94
|
+
:marital_status => "Married-civ-spouse",
|
95
|
+
:occupation => "Exec-managerial",
|
96
|
+
:relationship => "Husband",
|
97
|
+
:race => "White",
|
98
|
+
:sex => "Male",
|
99
|
+
:capital_gain => 0,
|
100
|
+
:capital_loss => 0,
|
101
|
+
:hours_per_week => 60,
|
102
|
+
:native_country => "United-States",
|
103
|
+
:label => ">50K."
|
104
|
+
}
|
105
|
+
],
|
106
|
+
[
|
107
|
+
records.size,
|
108
|
+
records[0].to_h,
|
109
|
+
records[-1].to_h
|
110
|
+
])
|
111
|
+
end
|
112
|
+
end
|
113
|
+
|
114
|
+
sub_test_case("#metadata") do
|
115
|
+
def setup
|
116
|
+
@dataset = Datasets::Adult.new(type: :train)
|
117
|
+
end
|
118
|
+
|
119
|
+
test("#description") do
|
120
|
+
description = @dataset.metadata.description
|
121
|
+
assert do
|
122
|
+
description.start_with?("| This data was extracted from the census bureau database found at")
|
123
|
+
end
|
124
|
+
end
|
125
|
+
end
|
126
|
+
end
|