red-datasets 0.0.7 → 0.0.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +15 -5
- data/doc/text/news.md +17 -0
- data/lib/datasets/libsvm-dataset-list.rb +137 -0
- data/lib/datasets/libsvm.rb +143 -0
- data/lib/datasets/postal-code-japan.rb +154 -0
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets.rb +12 -9
- data/red-datasets.gemspec +3 -0
- data/test/test-libsvm-dataset-list.rb +47 -0
- data/test/test-libsvm.rb +205 -0
- data/test/test-postal-code-japan.rb +69 -0
- metadata +47 -10
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c7a9199546e7a001c97e45c6fa28db15c0d96b748e527d9705dfee4e4b1db6fd
|
4
|
+
data.tar.gz: c659f6ae1e658ad91210e4427be063463124d89ef90388d34ebfb73ceb49068a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d8a23c4a165a596df22ce5bbe1f8f0cd5c0f002deecafbb26cd5e5f75abb3c0224c1013898162a67787159258d1b801395fc4d949c17939d95940664cffd5600
|
7
|
+
data.tar.gz: f2fd4eb733e6205f138c4005627e815e3787040a8a4b6cce7eca9fd5d4adaa12263e17e8f5bd9394a851e5210f28736ee3c682c81e110da304ae17fb3f0bedba
|
data/README.md
CHANGED
@@ -1,8 +1,4 @@
|
|
1
|
-
#
|
2
|
-
|
3
|
-
## Name
|
4
|
-
|
5
|
-
Red Datasets
|
1
|
+
# Red Datasets
|
6
2
|
|
7
3
|
## Description
|
8
4
|
|
@@ -16,6 +12,20 @@ You can use datasets easily because you can access each dataset with multiple wa
|
|
16
12
|
% gem install red-datasets
|
17
13
|
```
|
18
14
|
|
15
|
+
## Available datasets
|
16
|
+
|
17
|
+
TODO: Document them in source code to list in document: https://www.rubydoc.info/gems/red-datasets
|
18
|
+
|
19
|
+
* Adult Dataset
|
20
|
+
* CIFAR-10 Dataset
|
21
|
+
* CIFAR-100 Dataset
|
22
|
+
* Fashion-MNIST
|
23
|
+
* Iris Dataset
|
24
|
+
* MNIST database
|
25
|
+
* The Penn Treebank Project
|
26
|
+
* Wikipedia
|
27
|
+
* Wine Dataset
|
28
|
+
|
19
29
|
## Usage
|
20
30
|
|
21
31
|
Here is an example to access [Iris Data Set](https://archive.ics.uci.edu/ml/datasets/iris) by `#each` or `Table#to_h` or `Table#fetch_values`.
|
data/doc/text/news.md
CHANGED
@@ -1,5 +1,22 @@
|
|
1
1
|
# News
|
2
2
|
|
3
|
+
## 0.0.8 - 2019-03-24
|
4
|
+
|
5
|
+
### Improvements
|
6
|
+
|
7
|
+
* Improved README.
|
8
|
+
[GitHub#40][Patch by kojix2]
|
9
|
+
|
10
|
+
* `Datasets::PostalCodeJapan`: Added.
|
11
|
+
|
12
|
+
* `Datasets::LIBSVMDatasetList`: Added.
|
13
|
+
|
14
|
+
* `Datasets::LIBSVM`: Added.
|
15
|
+
|
16
|
+
### Thanks
|
17
|
+
|
18
|
+
* kojix2
|
19
|
+
|
3
20
|
## 0.0.7 - 2018-11-21
|
4
21
|
|
5
22
|
### Improvements
|
data/lib/datasets/libsvm-dataset-list.rb
ADDED
@@ -0,0 +1,137 @@
|
|
1
|
+
require "English"
|
2
|
+
require "rexml/document"
|
3
|
+
|
4
|
+
require_relative "dataset"
|
5
|
+
|
6
|
+
module Datasets
|
7
|
+
class LIBSVMDatasetList < Dataset
|
8
|
+
# Describes one downloadable file of a dataset: its name, its URL and
# an optional note (left nil when none is given).
File = Struct.new(:name, :url, :note)
|
11
|
+
# One dataset of the LIBSVM dataset list together with the details
# collected for it (source, preprocessing, sizes and files).
class Record < Struct.new(:name,
                          :source,
                          :preprocessing,
                          :n_classes,
                          :n_data,
                          :n_features,
                          :files)
  # Returns the record as a Hash. Every entry of +files+ is converted
  # with +to_h+ too, so the :files value is an array of hashes.
  def to_h
    super.merge(files: files.collect(&:to_h))
  end
end
|
24
|
+
|
25
|
+
# Fills in the metadata of the dataset list. The description is stored
# as a lambda that calls +extract_description+ when it is invoked.
def initialize
  super()
  @metadata.id = "libsvm-dataset-list"
  @metadata.name = "LIBSVM dataset list"
  @metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
  @metadata.description = -> { extract_description }
end
|
34
|
+
|
35
|
+
# Yields one Record per dataset row of the index page.
#
# Returns an Enumerator when no block is given. For every row the name
# is taken from the first link of the first cell and the remaining
# members are filled in by +parse_detail+ from the page the link
# points to.
def each
  return to_enum(__method__) unless block_given?

  open_data do |input|
    # TODO: Improve performance
    index = REXML::Document.new(input)
    # The first <tr> is the table header, so it is dropped.
    index.get_elements("//tr").drop(1).each do |row|
      link = row.elements.first.elements.first
      record = Record.new
      record.name = link.text
      record.files = []
      parse_detail(link.attributes["href"], record)
      yield(record)
    end
  end
end
|
58
|
+
|
59
|
+
private
|
60
|
+
# Yields the index page as an open file.
#
# The page is downloaded to "index.html" under the cache directory
# unless that file already exists.
def open_data
  index_path = cache_dir_path + "index.html"
  download(index_path, @metadata.url) unless index_path.exist?
  ::File.open(index_path) { |input| yield(input) }
end
|
69
|
+
|
70
|
+
# Builds the description from the index page.
#
# Collects the text of the children of <body> that come after the first
# <h1> and before the following <hr>, skips the ones without text and
# joins the rest with blank lines.
def extract_description
  open_data do |input|
    paragraphs = []
    started = false
    REXML::Document.new(input).each_element("//body/*") do |node|
      if started
        break if node.name == "hr"
        text = extract_text(node)
        paragraphs << text unless text.empty?
      else
        # The <h1> itself is not part of the description.
        started = (node.name == "h1")
      end
    end
    paragraphs.join("\n\n")
  end
end
|
87
|
+
|
88
|
+
# Returns all text found below +element+ as one line: the text nodes
# are concatenated, each run of spaces, tabs and newlines becomes a
# single space, and leading/trailing whitespace is removed.
def extract_text(element)
  text_nodes = REXML::XPath.match(element, ".//text()")
  joined = text_nodes.join("")
  joined.gsub(/[ \t\n]+/, " ").strip
end
|
92
|
+
|
93
|
+
# Yields the detail page +detail+ (a path relative to the list URL) as
# an open file.
#
# The page is downloaded to the same relative path under the cache
# directory unless that file already exists.
def open_detail(detail)
  detail_path = cache_dir_path + detail
  download(detail_path, @metadata.url + detail) unless detail_path.exist?
  ::File.open(detail_path) { |input| yield(input) }
end
|
102
|
+
|
103
|
+
# Fills +record+ from the detail page that +href+ points to.
#
# +href+ has the form "path#id". The node following the element whose
# name attribute is +id+ is read item by item; the label at the start
# of each item decides which member of +record+ is set.
def parse_detail(href, record)
  path, id = href.split("#")
  open_detail(path) do |detail|
    page = REXML::Document.new(detail)
    anchor = REXML::XPath.match(page, "//*[@name='#{id}']")[0]
    anchor.next_sibling.each_element do |li|
      case extract_text(li)
      when /\ASource: /
        record.source = Regexp.last_match.post_match
      when /\APreprocessing: /
        record.preprocessing = Regexp.last_match.post_match
      when /\A\# of classes: (\d+)/
        record.n_classes = Integer(Regexp.last_match(1), 10)
      when /\A\# of data: ([\d,]+)/
        # Thousands separators are removed before the conversion.
        record.n_data = Integer(Regexp.last_match(1).delete(","), 10)
      when /\A\# of features: ([\d,]+)/
        record.n_features = Integer(Regexp.last_match(1).delete(","), 10)
      when /\AFiles:/
        li.elements.first.each_element do |file_li|
          link = file_li.elements.first
          # The note is the item's own text with parentheses removed.
          note = file_li.text
          note = note.strip.gsub(/[()]/, "") if note
          record.files << File.new(link.text,
                                   @metadata.url + link.attributes["href"],
                                   note)
        end
      end
    end
  end
end
|
136
|
+
end
|
137
|
+
end
|
data/lib/datasets/libsvm.rb
ADDED
@@ -0,0 +1,143 @@
|
|
1
|
+
require "csv"
|
2
|
+
|
3
|
+
require_relative "dataset"
|
4
|
+
|
5
|
+
module Datasets
|
6
|
+
class LIBSVM < Dataset
|
7
|
+
# One sample of a LIBSVM dataset: a label and its feature values.
class Record
  attr_reader :label, :features

  def initialize(label, features)
    @label = label
    @features = features
  end

  # Returns the feature stored at +index+.
  def [](index)
    @features[index]
  end

  # Returns a Hash with the label under :label and every feature under
  # its integer position.
  def to_h
    @features.each_with_index.each_with_object({label: @label}) do |(feature, i), hash|
      hash[i] = feature
    end
  end

  # Returns a new Array with the label followed by all features.
  def values
    [@label].concat(@features)
  end
end
|
33
|
+
|
34
|
+
# Prepares the LIBSVM dataset called +name+.
#
# +note+:: passed to +choose_file+ to select which file of the dataset
#          is read.
# +default_feature_value+:: value used for features that a row does not
#                           list.
def initialize(name, note: nil, default_feature_value: 0)
  super()
  @default_feature_value = default_feature_value
  @libsvm_dataset_metadata = fetch_dataset_info(name)
  @file = choose_file(note)
  @metadata.id = "libsvm-#{normalize_name(name)}"
  @metadata.name = "LIBSVM dataset: #{name}"
  @metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
end
|
45
|
+
|
46
|
+
# Yields one Record per row of the chosen file.
#
# Returns an Enumerator when no block is given. Each row is a label
# followed by space separated "index:value" columns. Indexes in the
# file are 1-based; features that are not listed keep the default
# feature value.
def each
  return to_enum(__method__) unless block_given?

  open_data do |input|
    n_features = @libsvm_dataset_metadata.n_features
    CSV.new(input, col_sep: " ").each do |row|
      label = parse_label(row.shift)
      features = [@default_feature_value] * n_features
      # nil columns are skipped.
      row.compact.each do |column|
        index, value = column.split(":", 2)
        features[Integer(index, 10) - 1] = parse_value(value)
      end
      yield(Record.new(label, features))
    end
  end
end
|
64
|
+
|
65
|
+
private
|
66
|
+
# Returns the record of the LIBSVM dataset list whose name is +name+.
#
# Raises ArgumentError naming every listed dataset when there is no
# such record.
def fetch_dataset_info(name)
  listed_names = []
  LIBSVMDatasetList.new.each do |record|
    return record if record.name == name
    listed_names << record.name
  end

  listed = listed_names.collect(&:inspect).join(", ")
  raise ArgumentError,
        "unavailable LIBSVM dataset: #{name.inspect}: " +
        "available datasets: [#{listed}]"
end
|
81
|
+
|
82
|
+
# Selects the file of the dataset that will be read.
#
# +note+:: nil to take the first listed file, otherwise the note of the
#          wanted file.
#
# Returns the first file whose note equals +note+. Raises ArgumentError
# listing the notes of all files when no file has that note.
def choose_file(note)
  files = @libsvm_dataset_metadata.files
  return files.first if note.nil?

  # Every file must be examined: stopping at the first file that merely
  # has a note would hide later files carrying the requested note.
  matched = files.find { |file| file.note == note }
  return matched if matched

  available_notes = files.collect(&:note).compact
  name = @libsvm_dataset_metadata.name
  message = "unavailable note: #{name}: #{note.inspect}: "
  message << "available notes: ["
  message << available_notes.collect(&:inspect).join(", ")
  message << "]"
  raise ArgumentError, message
end
|
99
|
+
|
100
|
+
# Yields the chosen data file as a readable IO.
#
# The file is downloaded into the cache directory unless it is already
# there. A file whose extension is ".bz2" is decompressed by a spawned
# "bzcat" process and its output is yielded through a pipe; any other
# file is opened directly.
def open_data(&block)
  data_path = cache_dir_path + @file.name
  download(data_path, @file.url) unless data_path.exist?
  return File.open(data_path, &block) unless data_path.extname == ".bz2"

  input, output = IO.pipe
  pid = spawn("bzcat", data_path.to_s, {:out => output})
  begin
    # Close this process's copy of the write end so that the reader
    # sees the end of the stream once bzcat exits.
    output.close
    yield(input)
  ensure
    input.close
    Process.waitpid(pid)
  end
end
|
119
|
+
|
120
|
+
# Turns a dataset name into an identifier: parentheses are removed,
# each run of spaces, underscores and semicolons becomes one "-", and
# the result is downcased.
def normalize_name(name)
  without_parentheses = name.delete("()")
  without_parentheses.gsub(/[ _;]+/, "-").downcase
end
|
123
|
+
|
124
|
+
# Parses the label column of a row.
#
# The column is split on "," and every part is converted with
# +parse_value+. A single label is returned as is; otherwise the array
# of labels is returned.
def parse_label(label)
  labels = label.split(",").collect { |value| parse_value(value) }
  labels.size == 1 ? labels[0] : labels
end
|
134
|
+
|
135
|
+
# Converts the textual +value+ of a label or feature to a number.
#
# Returns a Float when +value+ contains a decimal point or an exponent
# marker ("e"/"E", e.g. "1e-05"), otherwise a base-10 Integer.
# Raises ArgumentError when +value+ is not a valid number.
def parse_value(value)
  # Exponent notation has no "." but is still a float; Integer() would
  # reject it.
  if value.include?(".") || value.include?("e") || value.include?("E")
    Float(value)
  else
    Integer(value, 10)
  end
end
|
142
|
+
end
|
143
|
+
end
|
data/lib/datasets/postal-code-japan.rb
ADDED
@@ -0,0 +1,154 @@
|
|
1
|
+
require "csv"
|
2
|
+
require "zip"
|
3
|
+
|
4
|
+
require_relative "dataset"
|
5
|
+
|
6
|
+
module Datasets
|
7
|
+
class PostalCodeJapan < Dataset
|
8
|
+
# One row of the postal code data. The boolean members are also
# available under a name ending in "?".
class Record < Struct.new(:organization_code,
                          :old_postal_code,
                          :postal_code,
                          :prefecture_reading,
                          :city_reading,
                          :address_reading,
                          :prefecture,
                          :city,
                          :address,
                          :have_multiple_postal_codes,
                          :have_address_number_per_koaza,
                          :have_chome,
                          :postal_code_is_shared,
                          :changed,
                          :change_reason)
  # Define the predicate alias ("name?") of each boolean member.
  [
    :have_multiple_postal_codes,
    :have_address_number_per_koaza,
    :have_chome,
    :postal_code_is_shared,
    :changed,
  ].each do |member|
    alias_method :"#{member}?", member
  end
end
|
34
|
+
|
35
|
+
# The values accepted for the :reading option. Frozen so that the list
# cannot be changed by accident at run time.
VALID_READINGS = [
  :lowercase,
  :uppercase,
  :romaji,
].freeze
|
40
|
+
# Prepares the postal code dataset.
#
# +reading+:: one of VALID_READINGS; it selects which variant of the
#             data is used and is part of the metadata id, name and
#             description.
#
# Raises ArgumentError when +reading+ is not in VALID_READINGS.
def initialize(reading: :lowercase)
  super()
  @reading = reading
  unless VALID_READINGS.include?(@reading)
    valid_readings = VALID_READINGS.collect(&:inspect).join(", ")
    raise ArgumentError,
          ":reading must be one of [#{valid_readings}]: #{@reading.inspect}"
  end
  @metadata.id = "postal-code-japan-#{@reading}"
  @metadata.name = "Postal code in Japan (#{@reading})"
  @metadata.url = "https://www.post.japanpost.jp/zipcode/download.html"
  @metadata.licenses = ["CC0-1.0"]
  @metadata.description = "Postal code in Japan (reading: #{@reading})"
end
|
57
|
+
|
58
|
+
# Yields one Record per row of the data.
#
# Returns an Enumerator when no block is given. The input is converted
# from CP932 to UTF-8 before it is parsed. Romaji rows provide only the
# postal code, the three readings and the three names; the remaining
# members are set to nil or false. Other rows provide all members.
def each(&block)
  return to_enum(__method__) unless block_given?

  romaji = (@reading == :romaji)
  open_data do |input|
    utf8_data = input.read.encode(Encoding::UTF_8, Encoding::CP932)
    # Quotes are not treated as CSV quoting; they are stripped from the
    # fields instead.
    CSV.parse(utf8_data, quote_char: nil, strip: %Q["]) do |row|
      record =
        if romaji
          Record.new(nil,
                     nil,
                     row[0],
                     row[4],
                     row[5],
                     row[6],
                     row[1],
                     row[2],
                     row[3],
                     false,
                     false,
                     false,
                     false,
                     false,
                     nil)
        else
          Record.new(row[0],
                     row[1].rstrip,
                     row[2],
                     row[3],
                     row[4],
                     row[5],
                     row[6],
                     row[7],
                     row[8],
                     (row[9] == "1"),
                     (row[10] == "1"),
                     (row[11] == "1"),
                     (row[12] == "1"),
                     (row[13] != "0"),
                     convert_change_reason(row[14]))
        end
      yield(record)
    end
  end
end
|
106
|
+
|
107
|
+
private
|
108
|
+
# Yields an input stream for every file entry of the zip archive that
# matches the configured reading.
#
# The archive is downloaded to "<reading>-ken-all.zip" in the cache
# directory unless it is already there.
def open_data
  archive =
    case @reading
    when :lowercase then "/kogaki/zip/ken_all.zip"
    when :uppercase then "/oogaki/zip/ken_all.zip"
    when :romaji then "/roman/ken_all_rome.zip"
    end
  data_url = "https://www.post.japanpost.jp/zipcode/dl#{archive}"
  data_path = cache_dir_path + "#{@reading}-ken-all.zip"
  download(data_path, data_url) unless data_path.exist?

  Zip::File.open(data_path.to_s) do |zip_file|
    zip_file.each do |entry|
      next unless entry.file?
      entry.get_input_stream { |input| yield(input) }
    end
  end
end
|
132
|
+
|
133
|
+
# Maps the change reason code of a row to a symbol.
#
# "0" gives nil, "1" to "6" give the symbols below and anything else
# gives :unknown.
def convert_change_reason(reason)
  case reason
  when "0" then nil
  when "1" then :new
  when "2" then :japanese_addressing_system
  when "3" then :land_readjustment
  when "4" then :postal_district_adjustment
  when "5" then :correction
  when "6" then :deletion
  else :unknown
  end
end
|
153
|
+
end
|
154
|
+
end
|
data/lib/datasets/version.rb
CHANGED
data/lib/datasets.rb
CHANGED
@@ -1,10 +1,13 @@
|
|
1
|
-
|
1
|
+
require_relative "datasets/version"
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
3
|
+
require_relative "datasets/adult"
|
4
|
+
require_relative "datasets/cifar"
|
5
|
+
require_relative "datasets/fashion-mnist"
|
6
|
+
require_relative "datasets/iris"
|
7
|
+
require_relative "datasets/libsvm"
|
8
|
+
require_relative "datasets/libsvm-dataset-list"
|
9
|
+
require_relative "datasets/mnist"
|
10
|
+
require_relative "datasets/penn-treebank"
|
11
|
+
require_relative "datasets/postal-code-japan"
|
12
|
+
require_relative "datasets/wikipedia"
|
13
|
+
require_relative "datasets/wine"
|
data/red-datasets.gemspec
CHANGED
@@ -34,6 +34,9 @@ Gem::Specification.new do |spec|
|
|
34
34
|
spec.files += Dir.glob("doc/text/*")
|
35
35
|
spec.test_files += Dir.glob("test/**/*")
|
36
36
|
|
37
|
+
spec.add_runtime_dependency("csv", ">= 3.0.5")
|
38
|
+
spec.add_runtime_dependency("rubyzip")
|
39
|
+
|
37
40
|
spec.add_development_dependency("bundler")
|
38
41
|
spec.add_development_dependency("rake")
|
39
42
|
spec.add_development_dependency("test-unit")
|
data/test/test-libsvm-dataset-list.rb
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
# Tests for Datasets::LIBSVMDatasetList.
class LIBSVMDatasetListTest < Test::Unit::TestCase
  def setup
    @dataset = Datasets::LIBSVMDatasetList.new
  end

  # The first record of the list is expected to be "a1a".
  test("#each") do
    preprocessing =
      "The original Adult data set has 14 features, " \
      "among which six are continuous and eight are " \
      "categorical. In this data set, continuous features " \
      "are discretized into quantiles, and each quantile is " \
      "represented by a binary feature. Also, a categorical " \
      "feature with m categories is converted to m binary " \
      "features. Details on how each feature is converted " \
      "can be found in the beginning of each file from this " \
      "page. [JP98a]"
    training_file = {
      name: "a1a",
      url: "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a1a",
      note: nil,
    }
    testing_file = {
      name: "a1a.t",
      url: "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a1a.t",
      note: "testing",
    }
    expected = {
      name: "a1a",
      source: "UCI / Adult",
      preprocessing: preprocessing,
      n_classes: 2,
      n_data: 1605,
      n_features: 123,
      files: [training_file, testing_file],
    }
    assert_equal(expected, @dataset.first.to_h)
  end

  sub_test_case("#metadata") do
    test("#description") do
      description = @dataset.metadata.description
      assert { description.start_with?("This page contains many classification, ") }
    end
  end
end
|
data/test/test-libsvm.rb
ADDED
@@ -0,0 +1,205 @@
|
|
1
|
+
# Tests for Datasets::LIBSVM. Every test compares the first record of a
# dataset, converted with to_h, against an expected hash.
class LIBSVMDatasetTest < Test::Unit::TestCase
  # Builds an expected hash: +label+ under :label, +default+ under the
  # keys 0...n_features, then each [index, value] pair of +features+
  # stored under index - 1 (the pairs use 1-based indexes).
  def expected_hash(label, n_features, features, default: 0)
    hash = {label: label}
    n_features.times { |i| hash[i] = default }
    features.each { |index, value| hash[index - 1] = value }
    hash
  end

  # Pairs every 1-based index of +indexes+ with the value 1.
  def ones(indexes)
    indexes.collect { |index| [index, 1] }
  end

  test(":note") do
    dataset = Datasets::LIBSVM.new("a1a", note: "testing")
    set_features = ones([5, 7, 14, 19, 39, 40, 51, 63, 67, 73, 74, 76, 78, 83])
    assert_equal(expected_hash(-1, 123, set_features),
                 dataset.first.to_h)
  end

  test(":default_feature_value") do
    dataset = Datasets::LIBSVM.new("a1a", default_feature_value: nil)
    set_features = ones([3, 11, 14, 19, 39, 42, 55, 64, 67, 73, 75, 76, 80, 83])
    assert_equal(expected_hash(-1, 123, set_features, default: nil),
                 dataset.first.to_h)
  end

  test("classification") do
    dataset = Datasets::LIBSVM.new("a1a")
    set_features = ones([3, 11, 14, 19, 39, 42, 55, 64, 67, 73, 75, 76, 80, 83])
    assert_equal(expected_hash(-1, 123, set_features),
                 dataset.first.to_h)
  end

  test("regression") do
    dataset = Datasets::LIBSVM.new("abalone")
    values = [1, 0.455, 0.365, 0.095, 0.514, 0.2245, 0.101, 0.15]
    features = values.each_with_index.collect { |value, i| [i + 1, value] }
    assert_equal(expected_hash(15, 8, features),
                 dataset.first.to_h)
  end

  test("multi-label") do
    dataset = Datasets::LIBSVM.new("mediamill (exp1)")
    # Values of the features 1..120, five per line.
    values = [
      0.380877, 0.494079, 0.540009, 0.422926, 0.158318,
      0.326975, 0.390861, 0.527121, 0.254052, 0.223731,
      0.040285, 0.141133, 0.112249, 0.263171, 0.147020,
      0.472414, 0.592614, 0.653138, 0.499867, 0.196520,
      0.403892, 0.482395, 0.619219, 0.320346, 0.281251,
      0.054750, 0.180459, 0.139964, 0.319925, 0.181216,
      0.364294, 0.407211, 0.368926, 0.427661, 0.211391,
      0.364345, 0.370710, 0.409107, 0.289299, 0.243053,
      0.063121, 0.193587, 0.158755, 0.316054, 0.197410,
      0.656168, 0.678760, 0.650831, 0.674636, 0.492428,
      0.623887, 0.610622, 0.678219, 0.574774, 0.523073,
      0.206804, 0.496294, 0.429221, 0.586611, 0.471550,
      0.284480, 0.432466, 0.498075, 0.408141, 0.102713,
      0.303028, 0.309501, 0.444855, 0.191727, 0.174895,
      0.034143, 0.153099, 0.068318, 0.217020, 0.099688,
      0.409862, 0.561918, 0.612031, 0.514471, 0.146015,
      0.398807, 0.383295, 0.548485, 0.282937, 0.252712,
      0.051008, 0.223110, 0.098112, 0.299672, 0.144873,
      0.308488, 0.358478, 0.352077, 0.394686, 0.157513,
      0.339370, 0.321558, 0.341373, 0.247969, 0.206070,
      0.061001, 0.216793, 0.112389, 0.273648, 0.152745,
      0.598081, 0.621687, 0.607213, 0.644025, 0.394948,
      0.593651, 0.551529, 0.574392, 0.511032, 0.463997,
      0.202034, 0.492341, 0.317983, 0.547807, 0.393778,
    ]
    features = values.each_with_index.collect { |value, i| [i + 1, value] }
    assert_equal(expected_hash([65, 67, 11, 31], 120, features),
                 dataset.first.to_h)
  end

  test("string") do
    # TODO
  end
end
|
data/test/test-postal-code-japan.rb
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
# Tests for Datasets::PostalCodeJapan. Every test compares the first
# record of one reading, converted with to_h, against an expected hash.
class PostalCodeJapanTest < Test::Unit::TestCase
  # Returns the hash expected for the :lowercase reading with the
  # entries of +overrides+ replacing the matching keys.
  def expected_first_record(overrides = {})
    {
      organization_code: "01101",
      old_postal_code: "060",
      postal_code: "0600000",
      prefecture_reading: "ホッカイドウ",
      city_reading: "サッポロシチュウオウク",
      address_reading: "イカニケイサイガナイバアイ",
      prefecture: "北海道",
      city: "札幌市中央区",
      address: "以下に掲載がない場合",
      have_multiple_postal_codes: false,
      have_address_number_per_koaza: false,
      have_chome: false,
      postal_code_is_shared: false,
      changed: false,
      change_reason: nil,
    }.merge(overrides)
  end

  sub_test_case(":reading") do
    test(":lowercase") do
      dataset = Datasets::PostalCodeJapan.new(reading: :lowercase)
      assert_equal(expected_first_record,
                   dataset.first.to_h)
    end

    test(":uppercase") do
      dataset = Datasets::PostalCodeJapan.new(reading: :uppercase)
      expected = expected_first_record(prefecture_reading: "ホツカイドウ",
                                       city_reading: "サツポロシチユウオウク")
      assert_equal(expected,
                   dataset.first.to_h)
    end

    test(":romaji") do
      dataset = Datasets::PostalCodeJapan.new(reading: :romaji)
      expected = expected_first_record(organization_code: nil,
                                       old_postal_code: nil,
                                       prefecture_reading: "HOKKAIDO",
                                       city_reading: "SAPPORO SHI CHUO KU",
                                       address_reading: "IKANIKEISAIGANAIBAAI",
                                       city: "札幌市 中央区")
      assert_equal(expected,
                   dataset.first.to_h)
    end
  end
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: red-datasets
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.7
|
4
|
+
version: 0.0.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- tomisuker
|
@@ -9,8 +9,36 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2019-03-24 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: csv
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
requirements:
|
18
|
+
- - ">="
|
19
|
+
- !ruby/object:Gem::Version
|
20
|
+
version: 3.0.5
|
21
|
+
type: :runtime
|
22
|
+
prerelease: false
|
23
|
+
version_requirements: !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - ">="
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
version: 3.0.5
|
28
|
+
- !ruby/object:Gem::Dependency
|
29
|
+
name: rubyzip
|
30
|
+
requirement: !ruby/object:Gem::Requirement
|
31
|
+
requirements:
|
32
|
+
- - ">="
|
33
|
+
- !ruby/object:Gem::Version
|
34
|
+
version: '0'
|
35
|
+
type: :runtime
|
36
|
+
prerelease: false
|
37
|
+
version_requirements: !ruby/object:Gem::Requirement
|
38
|
+
requirements:
|
39
|
+
- - ">="
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: '0'
|
14
42
|
- !ruby/object:Gem::Dependency
|
15
43
|
name: bundler
|
16
44
|
requirement: !ruby/object:Gem::Requirement
|
@@ -106,9 +134,12 @@ files:
|
|
106
134
|
- lib/datasets/downloader.rb
|
107
135
|
- lib/datasets/fashion-mnist.rb
|
108
136
|
- lib/datasets/iris.rb
|
137
|
+
- lib/datasets/libsvm-dataset-list.rb
|
138
|
+
- lib/datasets/libsvm.rb
|
109
139
|
- lib/datasets/metadata.rb
|
110
140
|
- lib/datasets/mnist.rb
|
111
141
|
- lib/datasets/penn-treebank.rb
|
142
|
+
- lib/datasets/postal-code-japan.rb
|
112
143
|
- lib/datasets/table.rb
|
113
144
|
- lib/datasets/version.rb
|
114
145
|
- lib/datasets/wikipedia.rb
|
@@ -121,8 +152,11 @@ files:
|
|
121
152
|
- test/test-dictionary.rb
|
122
153
|
- test/test-fashion-mnist.rb
|
123
154
|
- test/test-iris.rb
|
155
|
+
- test/test-libsvm-dataset-list.rb
|
156
|
+
- test/test-libsvm.rb
|
124
157
|
- test/test-mnist.rb
|
125
158
|
- test/test-penn-treebank.rb
|
159
|
+
- test/test-postal-code-japan.rb
|
126
160
|
- test/test-table.rb
|
127
161
|
- test/test-wikipedia.rb
|
128
162
|
- test/test-wine.rb
|
@@ -146,20 +180,23 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
146
180
|
version: '0'
|
147
181
|
requirements: []
|
148
182
|
rubyforge_project:
|
149
|
-
rubygems_version:
|
183
|
+
rubygems_version: 2.7.6
|
150
184
|
signing_key:
|
151
185
|
specification_version: 4
|
152
186
|
summary: Red Datasets provides classes that provide common datasets such as iris dataset.
|
153
187
|
test_files:
|
154
|
-
- test/test-wine.rb
|
155
|
-
- test/run-test.rb
|
156
|
-
- test/test-cifar.rb
|
157
|
-
- test/test-fashion-mnist.rb
|
158
|
-
- test/test-wikipedia.rb
|
159
188
|
- test/test-iris.rb
|
160
|
-
- test/
|
189
|
+
- test/test-wikipedia.rb
|
190
|
+
- test/test-fashion-mnist.rb
|
191
|
+
- test/test-wine.rb
|
192
|
+
- test/test-postal-code-japan.rb
|
161
193
|
- test/test-mnist.rb
|
162
|
-
- test/
|
194
|
+
- test/helper.rb
|
163
195
|
- test/test-adult.rb
|
196
|
+
- test/test-libsvm.rb
|
197
|
+
- test/run-test.rb
|
198
|
+
- test/test-table.rb
|
199
|
+
- test/test-cifar.rb
|
200
|
+
- test/test-libsvm-dataset-list.rb
|
164
201
|
- test/test-penn-treebank.rb
|
165
202
|
- test/test-dictionary.rb
|