red-datasets 0.0.7 → 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +15 -5
- data/doc/text/news.md +17 -0
- data/lib/datasets/libsvm-dataset-list.rb +137 -0
- data/lib/datasets/libsvm.rb +143 -0
- data/lib/datasets/postal-code-japan.rb +154 -0
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets.rb +12 -9
- data/red-datasets.gemspec +3 -0
- data/test/test-libsvm-dataset-list.rb +47 -0
- data/test/test-libsvm.rb +205 -0
- data/test/test-postal-code-japan.rb +69 -0
- metadata +47 -10
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c7a9199546e7a001c97e45c6fa28db15c0d96b748e527d9705dfee4e4b1db6fd
|
4
|
+
data.tar.gz: c659f6ae1e658ad91210e4427be063463124d89ef90388d34ebfb73ceb49068a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d8a23c4a165a596df22ce5bbe1f8f0cd5c0f002deecafbb26cd5e5f75abb3c0224c1013898162a67787159258d1b801395fc4d949c17939d95940664cffd5600
|
7
|
+
data.tar.gz: f2fd4eb733e6205f138c4005627e815e3787040a8a4b6cce7eca9fd5d4adaa12263e17e8f5bd9394a851e5210f28736ee3c682c81e110da304ae17fb3f0bedba
|
data/README.md
CHANGED
@@ -1,8 +1,4 @@
|
|
1
|
-
#
|
2
|
-
|
3
|
-
## Name
|
4
|
-
|
5
|
-
Red Datasets
|
1
|
+
# Red Datasets
|
6
2
|
|
7
3
|
## Description
|
8
4
|
|
@@ -16,6 +12,20 @@ You can use datasets easily because you can access each dataset with multiple wa
|
|
16
12
|
% gem install red-datasets
|
17
13
|
```
|
18
14
|
|
15
|
+
## Available datasets
|
16
|
+
|
17
|
+
TODO: Document them in source code to list in document: https://www.rubydoc.info/gems/red-datasets
|
18
|
+
|
19
|
+
* Adult Dataset
|
20
|
+
* CIFAR-10 Dataset
|
21
|
+
* CIFAR-100 Dataset
|
22
|
+
* Fashion-MNIST
|
23
|
+
* Iris Dataset
|
24
|
+
* MNIST database
|
25
|
+
* The Penn Treebank Project
|
26
|
+
* Wikipedia
|
27
|
+
* Wine Dataset
|
28
|
+
|
19
29
|
## Usage
|
20
30
|
|
21
31
|
Here is an example to access [Iris Data Set](https://archive.ics.uci.edu/ml/datasets/iris) by `#each` or `Table#to_h` or `Table#fetch_values`.
|
data/doc/text/news.md
CHANGED
@@ -1,5 +1,22 @@
|
|
1
1
|
# News
|
2
2
|
|
3
|
+
## 0.0.8 - 2019-03-24
|
4
|
+
|
5
|
+
### Improvements
|
6
|
+
|
7
|
+
* Improved README.
|
8
|
+
[GitHub#40][Patch by kojix2]
|
9
|
+
|
10
|
+
* `Datasets::PostalCodeJapan`: Added.
|
11
|
+
|
12
|
+
* `Datasets::LIBSVMDatasetList`: Added.
|
13
|
+
|
14
|
+
* `Datasets::LIBSVM`: Added.
|
15
|
+
|
16
|
+
### Thanks
|
17
|
+
|
18
|
+
* kojix2
|
19
|
+
|
3
20
|
## 0.0.7 - 2018-11-21
|
4
21
|
|
5
22
|
### Improvements
|
data/lib/datasets/libsvm-dataset-list.rb
ADDED
@@ -0,0 +1,137 @@
|
|
1
|
+
require "English"
|
2
|
+
require "rexml/document"
|
3
|
+
|
4
|
+
require_relative "dataset"
|
5
|
+
|
6
|
+
module Datasets
|
7
|
+
class LIBSVMDatasetList < Dataset
|
8
|
+
File = Struct.new(:name,
|
9
|
+
:url,
|
10
|
+
:note)
|
11
|
+
class Record < Struct.new(:name,
|
12
|
+
:source,
|
13
|
+
:preprocessing,
|
14
|
+
:n_classes,
|
15
|
+
:n_data,
|
16
|
+
:n_features,
|
17
|
+
:files)
|
18
|
+
def to_h
|
19
|
+
hash = super
|
20
|
+
hash[:files] = hash[:files].collect(&:to_h)
|
21
|
+
hash
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
def initialize
|
26
|
+
super()
|
27
|
+
@metadata.id = "libsvm-dataset-list"
|
28
|
+
@metadata.name = "LIBSVM dataset list"
|
29
|
+
@metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
|
30
|
+
@metadata.description = lambda do
|
31
|
+
extract_description
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
def each
|
36
|
+
return to_enum(__method__) unless block_given?
|
37
|
+
|
38
|
+
open_data do |input|
|
39
|
+
# TODO: Improve performance
|
40
|
+
document = REXML::Document.new(input)
|
41
|
+
is_header = true
|
42
|
+
document.each_element("//tr") do |tr|
|
43
|
+
if is_header
|
44
|
+
is_header = false
|
45
|
+
next
|
46
|
+
end
|
47
|
+
name = tr.elements.first
|
48
|
+
a = name.elements.first
|
49
|
+
href = a.attributes["href"]
|
50
|
+
record = Record.new
|
51
|
+
record.name = a.text
|
52
|
+
record.files = []
|
53
|
+
parse_detail(href, record)
|
54
|
+
yield(record)
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
private
|
60
|
+
def open_data
|
61
|
+
data_path = cache_dir_path + "index.html"
|
62
|
+
unless data_path.exist?
|
63
|
+
download(data_path, @metadata.url)
|
64
|
+
end
|
65
|
+
::File.open(data_path) do |input|
|
66
|
+
yield(input)
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
def extract_description
|
71
|
+
open_data do |input|
|
72
|
+
document = REXML::Document.new(input)
|
73
|
+
description = []
|
74
|
+
in_content = false
|
75
|
+
document.each_element("//body/*") do |element|
|
76
|
+
unless in_content
|
77
|
+
in_content = (element.name == "h1")
|
78
|
+
next
|
79
|
+
end
|
80
|
+
break if element.name == "hr"
|
81
|
+
content = extract_text(element)
|
82
|
+
description << content unless content.empty?
|
83
|
+
end
|
84
|
+
description.join("\n\n")
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
def extract_text(element)
|
89
|
+
texts = REXML::XPath.match(element, ".//text()")
|
90
|
+
texts.join("").gsub(/[ \t\n]+/, " ").strip
|
91
|
+
end
|
92
|
+
|
93
|
+
def open_detail(detail)
|
94
|
+
data_path = cache_dir_path + detail
|
95
|
+
unless data_path.exist?
|
96
|
+
download(data_path, @metadata.url + detail)
|
97
|
+
end
|
98
|
+
::File.open(data_path) do |input|
|
99
|
+
yield(input)
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
def parse_detail(href, record)
|
104
|
+
path, id = href.split("#")
|
105
|
+
open_detail(path) do |detail|
|
106
|
+
detail_document = REXML::Document.new(detail)
|
107
|
+
anchor = REXML::XPath.match(detail_document, "//*[@name='#{id}']")[0]
|
108
|
+
ul = anchor.next_sibling
|
109
|
+
ul.each_element do |li|
|
110
|
+
text = extract_text(li)
|
111
|
+
case text
|
112
|
+
when /\ASource: /
|
113
|
+
record.source = $POSTMATCH
|
114
|
+
when /\APreprocessing: /
|
115
|
+
record.preprocessing = $POSTMATCH
|
116
|
+
when /\A\# of classes: (\d+)/
|
117
|
+
record.n_classes = Integer($1, 10)
|
118
|
+
when /\A\# of data: ([\d,]+)/
|
119
|
+
record.n_data = Integer($1.gsub(/,/, ""), 10)
|
120
|
+
when /\A\# of features: ([\d,]+)/
|
121
|
+
record.n_features = Integer($1.gsub(/,/, ""), 10)
|
122
|
+
when /\AFiles:/
|
123
|
+
li.elements.first.each_element do |file_li|
|
124
|
+
file_a = file_li.elements.first
|
125
|
+
file = File.new
|
126
|
+
file.name = file_a.text
|
127
|
+
file.url = @metadata.url + file_a.attributes["href"]
|
128
|
+
file_note = file_li.text
|
129
|
+
file.note = file_note.strip.gsub(/[()]/, "") if file_note
|
130
|
+
record.files << file
|
131
|
+
end
|
132
|
+
end
|
133
|
+
end
|
134
|
+
end
|
135
|
+
end
|
136
|
+
end
|
137
|
+
end
|
data/lib/datasets/libsvm.rb
ADDED
@@ -0,0 +1,143 @@
|
|
1
|
+
require "csv"
|
2
|
+
|
3
|
+
require_relative "dataset"
|
4
|
+
|
5
|
+
module Datasets
|
6
|
+
class LIBSVM < Dataset
|
7
|
+
class Record
|
8
|
+
attr_reader :label
|
9
|
+
attr_reader :features
|
10
|
+
def initialize(label, features)
|
11
|
+
@label = label
|
12
|
+
@features = features
|
13
|
+
end
|
14
|
+
|
15
|
+
def [](index)
|
16
|
+
@features[index]
|
17
|
+
end
|
18
|
+
|
19
|
+
def to_h
|
20
|
+
hash = {
|
21
|
+
label: @label,
|
22
|
+
}
|
23
|
+
@features.each_with_index do |feature, i|
|
24
|
+
hash[i] = feature
|
25
|
+
end
|
26
|
+
hash
|
27
|
+
end
|
28
|
+
|
29
|
+
def values
|
30
|
+
[@label] + @features
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
def initialize(name,
|
35
|
+
note: nil,
|
36
|
+
default_feature_value: 0)
|
37
|
+
super()
|
38
|
+
@libsvm_dataset_metadata = fetch_dataset_info(name)
|
39
|
+
@file = choose_file(note)
|
40
|
+
@default_feature_value = default_feature_value
|
41
|
+
@metadata.id = "libsvm-#{normalize_name(name)}"
|
42
|
+
@metadata.name = "LIBSVM dataset: #{name}"
|
43
|
+
@metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
|
44
|
+
end
|
45
|
+
|
46
|
+
def each
|
47
|
+
return to_enum(__method__) unless block_given?
|
48
|
+
|
49
|
+
open_data do |input|
|
50
|
+
n_features = @libsvm_dataset_metadata.n_features
|
51
|
+
csv = CSV.new(input, col_sep: " ")
|
52
|
+
csv.each do |row|
|
53
|
+
label = parse_label(row.shift)
|
54
|
+
features = [@default_feature_value] * n_features
|
55
|
+
row.each do |column|
|
56
|
+
next if column.nil?
|
57
|
+
index, value = column.split(":", 2)
|
58
|
+
features[Integer(index, 10) - 1] = parse_value(value)
|
59
|
+
end
|
60
|
+
yield(Record.new(label, features))
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
private
|
66
|
+
def fetch_dataset_info(name)
|
67
|
+
list = LIBSVMDatasetList.new
|
68
|
+
available_datasets = []
|
69
|
+
list.each do |record|
|
70
|
+
available_datasets << record.name
|
71
|
+
if record.name == name
|
72
|
+
return record
|
73
|
+
end
|
74
|
+
end
|
75
|
+
message = "unavailable LIBSVM dataset: #{name.inspect}: "
|
76
|
+
message << "available datasets: ["
|
77
|
+
message << available_datasets.collect(&:inspect).join(", ")
|
78
|
+
message << "]"
|
79
|
+
raise ArgumentError, message
|
80
|
+
end
|
81
|
+
|
82
|
+
def choose_file(note)
|
83
|
+
files = @libsvm_dataset_metadata.files
|
84
|
+
return files.first if note.nil?
|
85
|
+
|
86
|
+
available_notes = []
|
87
|
+
@libsvm_dataset_metadata.files.find do |file|
|
88
|
+
return file if file.note == note
|
89
|
+
available_notes << file.note if file.note
|
90
|
+
end
|
91
|
+
|
92
|
+
name = @libsvm_dataset_metadata.name
|
93
|
+
message = "unavailable note: #{name}: #{note.inspect}: "
|
94
|
+
message << "available notes: ["
|
95
|
+
message << available_notes.collect(&:inspect).join(", ")
|
96
|
+
message << "]"
|
97
|
+
raise ArgumentError, message
|
98
|
+
end
|
99
|
+
|
100
|
+
def open_data(&block)
|
101
|
+
data_path = cache_dir_path + @file.name
|
102
|
+
unless data_path.exist?
|
103
|
+
download(data_path, @file.url)
|
104
|
+
end
|
105
|
+
if data_path.extname == ".bz2"
|
106
|
+
input, output = IO.pipe
|
107
|
+
pid = spawn("bzcat", data_path.to_s, {:out => output})
|
108
|
+
begin
|
109
|
+
output.close
|
110
|
+
yield(input)
|
111
|
+
ensure
|
112
|
+
input.close
|
113
|
+
Process.waitpid(pid)
|
114
|
+
end
|
115
|
+
else
|
116
|
+
File.open(data_path, &block)
|
117
|
+
end
|
118
|
+
end
|
119
|
+
|
120
|
+
def normalize_name(name)
|
121
|
+
name.gsub(/[()]/, "").gsub(/[ _;]+/, "-").downcase
|
122
|
+
end
|
123
|
+
|
124
|
+
def parse_label(label)
|
125
|
+
labels = label.split(",").collect do |value|
|
126
|
+
parse_value(value)
|
127
|
+
end
|
128
|
+
if labels.size == 1
|
129
|
+
labels[0]
|
130
|
+
else
|
131
|
+
labels
|
132
|
+
end
|
133
|
+
end
|
134
|
+
|
135
|
+
def parse_value(value)
|
136
|
+
if value.include?(".")
|
137
|
+
Float(value)
|
138
|
+
else
|
139
|
+
Integer(value, 10)
|
140
|
+
end
|
141
|
+
end
|
142
|
+
end
|
143
|
+
end
|
data/lib/datasets/postal-code-japan.rb
ADDED
@@ -0,0 +1,154 @@
|
|
1
|
+
require "csv"
|
2
|
+
require "zip"
|
3
|
+
|
4
|
+
require_relative "dataset"
|
5
|
+
|
6
|
+
module Datasets
|
7
|
+
class PostalCodeJapan < Dataset
|
8
|
+
class Record < Struct.new(:organization_code,
|
9
|
+
:old_postal_code,
|
10
|
+
:postal_code,
|
11
|
+
:prefecture_reading,
|
12
|
+
:city_reading,
|
13
|
+
:address_reading,
|
14
|
+
:prefecture,
|
15
|
+
:city,
|
16
|
+
:address,
|
17
|
+
:have_multiple_postal_codes,
|
18
|
+
:have_address_number_per_koaza,
|
19
|
+
:have_chome,
|
20
|
+
:postal_code_is_shared,
|
21
|
+
:changed,
|
22
|
+
:change_reason)
|
23
|
+
alias_method :have_multiple_postal_codes?,
|
24
|
+
:have_multiple_postal_codes
|
25
|
+
alias_method :have_address_number_per_koaza?,
|
26
|
+
:have_address_number_per_koaza
|
27
|
+
alias_method :have_chome?,
|
28
|
+
:have_chome
|
29
|
+
alias_method :postal_code_is_shared?,
|
30
|
+
:postal_code_is_shared
|
31
|
+
alias_method :changed?,
|
32
|
+
:changed
|
33
|
+
end
|
34
|
+
|
35
|
+
VALID_READINGS = [
|
36
|
+
:lowercase,
|
37
|
+
:uppercase,
|
38
|
+
:romaji,
|
39
|
+
]
|
40
|
+
def initialize(reading: :lowercase)
|
41
|
+
super()
|
42
|
+
@reading = reading
|
43
|
+
unless VALID_READINGS.include?(@reading)
|
44
|
+
message = ":reading must be one of ["
|
45
|
+
message << VALID_READINGS.collect(&:inspect).join(", ")
|
46
|
+
message << "]: #{@reading.inspect}"
|
47
|
+
raise ArgumentError, message
|
48
|
+
end
|
49
|
+
@metadata.id = "postal-code-japan-#{@reading}"
|
50
|
+
@metadata.name = "Postal code in Japan (#{@reading})"
|
51
|
+
@metadata.url = "https://www.post.japanpost.jp/zipcode/download.html"
|
52
|
+
@metadata.licenses = [
|
53
|
+
"CC0-1.0",
|
54
|
+
]
|
55
|
+
@metadata.description = "Postal code in Japan (reading: #{@reading})"
|
56
|
+
end
|
57
|
+
|
58
|
+
def each(&block)
|
59
|
+
return to_enum(__method__) unless block_given?
|
60
|
+
|
61
|
+
open_data do |input|
|
62
|
+
utf8_data = input.read.encode(Encoding::UTF_8, Encoding::CP932)
|
63
|
+
options = {
|
64
|
+
quote_char: nil,
|
65
|
+
strip: %Q["],
|
66
|
+
}
|
67
|
+
if @reading == :romaji
|
68
|
+
CSV.parse(utf8_data, **options) do |row|
|
69
|
+
yield(Record.new(nil,
|
70
|
+
nil,
|
71
|
+
row[0],
|
72
|
+
row[4],
|
73
|
+
row[5],
|
74
|
+
row[6],
|
75
|
+
row[1],
|
76
|
+
row[2],
|
77
|
+
row[3],
|
78
|
+
false,
|
79
|
+
false,
|
80
|
+
false,
|
81
|
+
false,
|
82
|
+
false,
|
83
|
+
nil))
|
84
|
+
end
|
85
|
+
else
|
86
|
+
CSV.parse(utf8_data, **options) do |row|
|
87
|
+
yield(Record.new(row[0],
|
88
|
+
row[1].rstrip,
|
89
|
+
row[2],
|
90
|
+
row[3],
|
91
|
+
row[4],
|
92
|
+
row[5],
|
93
|
+
row[6],
|
94
|
+
row[7],
|
95
|
+
row[8],
|
96
|
+
(row[9] == "1"),
|
97
|
+
(row[10] == "1"),
|
98
|
+
(row[11] == "1"),
|
99
|
+
(row[12] == "1"),
|
100
|
+
(row[13] != "0"),
|
101
|
+
convert_change_reason(row[14])))
|
102
|
+
end
|
103
|
+
end
|
104
|
+
end
|
105
|
+
end
|
106
|
+
|
107
|
+
private
|
108
|
+
def open_data
|
109
|
+
data_url = "https://www.post.japanpost.jp/zipcode/dl"
|
110
|
+
case @reading
|
111
|
+
when :lowercase
|
112
|
+
data_url << "/kogaki/zip/ken_all.zip"
|
113
|
+
when :uppercase
|
114
|
+
data_url << "/oogaki/zip/ken_all.zip"
|
115
|
+
when :romaji
|
116
|
+
data_url << "/roman/ken_all_rome.zip"
|
117
|
+
end
|
118
|
+
data_path = cache_dir_path + "#{@reading}-ken-all.zip"
|
119
|
+
unless data_path.exist?
|
120
|
+
download(data_path, data_url)
|
121
|
+
end
|
122
|
+
|
123
|
+
Zip::File.open(data_path.to_s) do |zip_file|
|
124
|
+
zip_file.each do |entry|
|
125
|
+
next unless entry.file?
|
126
|
+
entry.get_input_stream do |input|
|
127
|
+
yield(input)
|
128
|
+
end
|
129
|
+
end
|
130
|
+
end
|
131
|
+
end
|
132
|
+
|
133
|
+
def convert_change_reason(reason)
|
134
|
+
case reason
|
135
|
+
when "0"
|
136
|
+
nil
|
137
|
+
when "1"
|
138
|
+
:new
|
139
|
+
when "2"
|
140
|
+
:japanese_addressing_system
|
141
|
+
when "3"
|
142
|
+
:land_readjustment
|
143
|
+
when "4"
|
144
|
+
:postal_district_adjustment
|
145
|
+
when "5"
|
146
|
+
:correction
|
147
|
+
when "6"
|
148
|
+
:deletion
|
149
|
+
else
|
150
|
+
:unknown
|
151
|
+
end
|
152
|
+
end
|
153
|
+
end
|
154
|
+
end
|
data/lib/datasets/version.rb
CHANGED
data/lib/datasets.rb
CHANGED
@@ -1,10 +1,13 @@
|
|
1
|
-
|
1
|
+
require_relative "datasets/version"
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
3
|
+
require_relative "datasets/adult"
|
4
|
+
require_relative "datasets/cifar"
|
5
|
+
require_relative "datasets/fashion-mnist"
|
6
|
+
require_relative "datasets/iris"
|
7
|
+
require_relative "datasets/libsvm"
|
8
|
+
require_relative "datasets/libsvm-dataset-list"
|
9
|
+
require_relative "datasets/mnist"
|
10
|
+
require_relative "datasets/penn-treebank"
|
11
|
+
require_relative "datasets/postal-code-japan"
|
12
|
+
require_relative "datasets/wikipedia"
|
13
|
+
require_relative "datasets/wine"
|
data/red-datasets.gemspec
CHANGED
@@ -34,6 +34,9 @@ Gem::Specification.new do |spec|
|
|
34
34
|
spec.files += Dir.glob("doc/text/*")
|
35
35
|
spec.test_files += Dir.glob("test/**/*")
|
36
36
|
|
37
|
+
spec.add_runtime_dependency("csv", ">= 3.0.5")
|
38
|
+
spec.add_runtime_dependency("rubyzip")
|
39
|
+
|
37
40
|
spec.add_development_dependency("bundler")
|
38
41
|
spec.add_development_dependency("rake")
|
39
42
|
spec.add_development_dependency("test-unit")
|
data/test/test-libsvm-dataset-list.rb
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
class LIBSVMDatasetListTest < Test::Unit::TestCase
|
2
|
+
def setup
|
3
|
+
@dataset = Datasets::LIBSVMDatasetList.new
|
4
|
+
end
|
5
|
+
|
6
|
+
test("#each") do
|
7
|
+
assert_equal({
|
8
|
+
name: "a1a",
|
9
|
+
source: "UCI / Adult",
|
10
|
+
preprocessing:
|
11
|
+
"The original Adult data set has 14 features, " +
|
12
|
+
"among which six are continuous and eight are " +
|
13
|
+
"categorical. In this data set, continuous features " +
|
14
|
+
"are discretized into quantiles, and each quantile is " +
|
15
|
+
"represented by a binary feature. Also, a categorical " +
|
16
|
+
"feature with m categories is converted to m binary " +
|
17
|
+
"features. Details on how each feature is converted " +
|
18
|
+
"can be found in the beginning of each file from this " +
|
19
|
+
"page. [JP98a]",
|
20
|
+
n_classes: 2,
|
21
|
+
n_data: 1605,
|
22
|
+
n_features: 123,
|
23
|
+
files: [
|
24
|
+
{
|
25
|
+
name: "a1a",
|
26
|
+
url: "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a1a",
|
27
|
+
note: nil,
|
28
|
+
},
|
29
|
+
{
|
30
|
+
name: "a1a.t",
|
31
|
+
url: "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a1a.t",
|
32
|
+
note: "testing",
|
33
|
+
}
|
34
|
+
],
|
35
|
+
},
|
36
|
+
@dataset.first.to_h)
|
37
|
+
end
|
38
|
+
|
39
|
+
sub_test_case("#metadata") do
|
40
|
+
test("#description") do
|
41
|
+
description = @dataset.metadata.description
|
42
|
+
assert do
|
43
|
+
description.start_with?("This page contains many classification, ")
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
data/test/test-libsvm.rb
ADDED
@@ -0,0 +1,205 @@
|
|
1
|
+
class LIBSVMDatasetTest < Test::Unit::TestCase
|
2
|
+
test(":note") do
|
3
|
+
dataset = Datasets::LIBSVM.new("a1a", note: "testing")
|
4
|
+
hash = {label: -1}
|
5
|
+
n_features = 123
|
6
|
+
n_features.times do |i|
|
7
|
+
hash[i] = 0
|
8
|
+
end
|
9
|
+
[5, 7, 14, 19, 39, 40, 51, 63, 67, 73, 74, 76, 78, 83].each do |i|
|
10
|
+
hash[i - 1] = 1
|
11
|
+
end
|
12
|
+
assert_equal(hash,
|
13
|
+
dataset.first.to_h)
|
14
|
+
end
|
15
|
+
|
16
|
+
test(":default_feature_value") do
|
17
|
+
dataset = Datasets::LIBSVM.new("a1a", default_feature_value: nil)
|
18
|
+
hash = {label: -1}
|
19
|
+
n_features = 123
|
20
|
+
n_features.times do |i|
|
21
|
+
hash[i] = nil
|
22
|
+
end
|
23
|
+
[3, 11, 14, 19, 39, 42, 55, 64, 67, 73, 75, 76, 80, 83].each do |i|
|
24
|
+
hash[i - 1] = 1
|
25
|
+
end
|
26
|
+
assert_equal(hash,
|
27
|
+
dataset.first.to_h)
|
28
|
+
end
|
29
|
+
|
30
|
+
test("classification") do
|
31
|
+
dataset = Datasets::LIBSVM.new("a1a")
|
32
|
+
hash = {label: -1}
|
33
|
+
n_features = 123
|
34
|
+
n_features.times do |i|
|
35
|
+
hash[i] = 0
|
36
|
+
end
|
37
|
+
[3, 11, 14, 19, 39, 42, 55, 64, 67, 73, 75, 76, 80, 83].each do |i|
|
38
|
+
hash[i - 1] = 1
|
39
|
+
end
|
40
|
+
assert_equal(hash,
|
41
|
+
dataset.first.to_h)
|
42
|
+
end
|
43
|
+
|
44
|
+
test("regression") do
|
45
|
+
dataset = Datasets::LIBSVM.new("abalone")
|
46
|
+
hash = {label: 15}
|
47
|
+
n_features = 8
|
48
|
+
n_features.times do |i|
|
49
|
+
hash[i] = 0
|
50
|
+
end
|
51
|
+
[
|
52
|
+
[1, 1],
|
53
|
+
[2, 0.455],
|
54
|
+
[3, 0.365],
|
55
|
+
[4, 0.095],
|
56
|
+
[5, 0.514],
|
57
|
+
[6, 0.2245],
|
58
|
+
[7, 0.101],
|
59
|
+
[8, 0.15],
|
60
|
+
].each do |i, value|
|
61
|
+
hash[i - 1] = value
|
62
|
+
end
|
63
|
+
assert_equal(hash,
|
64
|
+
dataset.first.to_h)
|
65
|
+
end
|
66
|
+
|
67
|
+
test("multi-label") do
|
68
|
+
dataset = Datasets::LIBSVM.new("mediamill (exp1)")
|
69
|
+
hash = {label: [65, 67, 11, 31]}
|
70
|
+
n_features = 120
|
71
|
+
n_features.times do |i|
|
72
|
+
hash[i] = 0
|
73
|
+
end
|
74
|
+
[
|
75
|
+
[1, 0.380877],
|
76
|
+
[2, 0.494079],
|
77
|
+
[3, 0.540009],
|
78
|
+
[4, 0.422926],
|
79
|
+
[5, 0.158318],
|
80
|
+
[6, 0.326975],
|
81
|
+
[7, 0.390861],
|
82
|
+
[8, 0.527121],
|
83
|
+
[9, 0.254052],
|
84
|
+
[10, 0.223731],
|
85
|
+
[11, 0.040285],
|
86
|
+
[12, 0.141133],
|
87
|
+
[13, 0.112249],
|
88
|
+
[14, 0.263171],
|
89
|
+
[15, 0.147020],
|
90
|
+
[16, 0.472414],
|
91
|
+
[17, 0.592614],
|
92
|
+
[18, 0.653138],
|
93
|
+
[19, 0.499867],
|
94
|
+
[20, 0.196520],
|
95
|
+
[21, 0.403892],
|
96
|
+
[22, 0.482395],
|
97
|
+
[23, 0.619219],
|
98
|
+
[24, 0.320346],
|
99
|
+
[25, 0.281251],
|
100
|
+
[26, 0.054750],
|
101
|
+
[27, 0.180459],
|
102
|
+
[28, 0.139964],
|
103
|
+
[29, 0.319925],
|
104
|
+
[30, 0.181216],
|
105
|
+
[31, 0.364294],
|
106
|
+
[32, 0.407211],
|
107
|
+
[33, 0.368926],
|
108
|
+
[34, 0.427661],
|
109
|
+
[35, 0.211391],
|
110
|
+
[36, 0.364345],
|
111
|
+
[37, 0.370710],
|
112
|
+
[38, 0.409107],
|
113
|
+
[39, 0.289299],
|
114
|
+
[40, 0.243053],
|
115
|
+
[41, 0.063121],
|
116
|
+
[42, 0.193587],
|
117
|
+
[43, 0.158755],
|
118
|
+
[44, 0.316054],
|
119
|
+
[45, 0.197410],
|
120
|
+
[46, 0.656168],
|
121
|
+
[47, 0.678760],
|
122
|
+
[48, 0.650831],
|
123
|
+
[49, 0.674636],
|
124
|
+
[50, 0.492428],
|
125
|
+
[51, 0.623887],
|
126
|
+
[52, 0.610622],
|
127
|
+
[53, 0.678219],
|
128
|
+
[54, 0.574774],
|
129
|
+
[55, 0.523073],
|
130
|
+
[56, 0.206804],
|
131
|
+
[57, 0.496294],
|
132
|
+
[58, 0.429221],
|
133
|
+
[59, 0.586611],
|
134
|
+
[60, 0.471550],
|
135
|
+
[61, 0.284480],
|
136
|
+
[62, 0.432466],
|
137
|
+
[63, 0.498075],
|
138
|
+
[64, 0.408141],
|
139
|
+
[65, 0.102713],
|
140
|
+
[66, 0.303028],
|
141
|
+
[67, 0.309501],
|
142
|
+
[68, 0.444855],
|
143
|
+
[69, 0.191727],
|
144
|
+
[70, 0.174895],
|
145
|
+
[71, 0.034143],
|
146
|
+
[72, 0.153099],
|
147
|
+
[73, 0.068318],
|
148
|
+
[74, 0.217020],
|
149
|
+
[75, 0.099688],
|
150
|
+
[76, 0.409862],
|
151
|
+
[77, 0.561918],
|
152
|
+
[78, 0.612031],
|
153
|
+
[79, 0.514471],
|
154
|
+
[80, 0.146015],
|
155
|
+
[81, 0.398807],
|
156
|
+
[82, 0.383295],
|
157
|
+
[83, 0.548485],
|
158
|
+
[84, 0.282937],
|
159
|
+
[85, 0.252712],
|
160
|
+
[86, 0.051008],
|
161
|
+
[87, 0.223110],
|
162
|
+
[88, 0.098112],
|
163
|
+
[89, 0.299672],
|
164
|
+
[90, 0.144873],
|
165
|
+
[91, 0.308488],
|
166
|
+
[92, 0.358478],
|
167
|
+
[93, 0.352077],
|
168
|
+
[94, 0.394686],
|
169
|
+
[95, 0.157513],
|
170
|
+
[96, 0.339370],
|
171
|
+
[97, 0.321558],
|
172
|
+
[98, 0.341373],
|
173
|
+
[99, 0.247969],
|
174
|
+
[100, 0.206070],
|
175
|
+
[101, 0.061001],
|
176
|
+
[102, 0.216793],
|
177
|
+
[103, 0.112389],
|
178
|
+
[104, 0.273648],
|
179
|
+
[105, 0.152745],
|
180
|
+
[106, 0.598081],
|
181
|
+
[107, 0.621687],
|
182
|
+
[108, 0.607213],
|
183
|
+
[109, 0.644025],
|
184
|
+
[110, 0.394948],
|
185
|
+
[111, 0.593651],
|
186
|
+
[112, 0.551529],
|
187
|
+
[113, 0.574392],
|
188
|
+
[114, 0.511032],
|
189
|
+
[115, 0.463997],
|
190
|
+
[116, 0.202034],
|
191
|
+
[117, 0.492341],
|
192
|
+
[118, 0.317983],
|
193
|
+
[119, 0.547807],
|
194
|
+
[120, 0.393778],
|
195
|
+
].each do |i, value|
|
196
|
+
hash[i - 1] = value
|
197
|
+
end
|
198
|
+
assert_equal(hash,
|
199
|
+
dataset.first.to_h)
|
200
|
+
end
|
201
|
+
|
202
|
+
test("string") do
|
203
|
+
# TODO
|
204
|
+
end
|
205
|
+
end
|
data/test/test-postal-code-japan.rb
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
class PostalCodeJapanTest < Test::Unit::TestCase
|
2
|
+
sub_test_case(":reading") do
|
3
|
+
test(":lowercase") do
|
4
|
+
dataset = Datasets::PostalCodeJapan.new(reading: :lowercase)
|
5
|
+
assert_equal({
|
6
|
+
organization_code: "01101",
|
7
|
+
old_postal_code: "060",
|
8
|
+
postal_code: "0600000",
|
9
|
+
prefecture_reading: "ホッカイドウ",
|
10
|
+
city_reading: "サッポロシチュウオウク",
|
11
|
+
address_reading: "イカニケイサイガナイバアイ",
|
12
|
+
prefecture: "北海道",
|
13
|
+
city: "札幌市中央区",
|
14
|
+
address: "以下に掲載がない場合",
|
15
|
+
have_multiple_postal_codes: false,
|
16
|
+
have_address_number_per_koaza: false,
|
17
|
+
have_chome: false,
|
18
|
+
postal_code_is_shared: false,
|
19
|
+
changed: false,
|
20
|
+
change_reason: nil,
|
21
|
+
},
|
22
|
+
dataset.first.to_h)
|
23
|
+
end
|
24
|
+
|
25
|
+
test(":uppercase") do
|
26
|
+
dataset = Datasets::PostalCodeJapan.new(reading: :uppercase)
|
27
|
+
assert_equal({
|
28
|
+
organization_code: "01101",
|
29
|
+
old_postal_code: "060",
|
30
|
+
postal_code: "0600000",
|
31
|
+
prefecture_reading: "ホツカイドウ",
|
32
|
+
city_reading: "サツポロシチユウオウク",
|
33
|
+
address_reading: "イカニケイサイガナイバアイ",
|
34
|
+
prefecture: "北海道",
|
35
|
+
city: "札幌市中央区",
|
36
|
+
address: "以下に掲載がない場合",
|
37
|
+
have_multiple_postal_codes: false,
|
38
|
+
have_address_number_per_koaza: false,
|
39
|
+
have_chome: false,
|
40
|
+
postal_code_is_shared: false,
|
41
|
+
changed: false,
|
42
|
+
change_reason: nil,
|
43
|
+
},
|
44
|
+
dataset.first.to_h)
|
45
|
+
end
|
46
|
+
|
47
|
+
test(":romaji") do
|
48
|
+
dataset = Datasets::PostalCodeJapan.new(reading: :romaji)
|
49
|
+
assert_equal({
|
50
|
+
organization_code: nil,
|
51
|
+
old_postal_code: nil,
|
52
|
+
postal_code: "0600000",
|
53
|
+
prefecture_reading: "HOKKAIDO",
|
54
|
+
city_reading: "SAPPORO SHI CHUO KU",
|
55
|
+
address_reading: "IKANIKEISAIGANAIBAAI",
|
56
|
+
prefecture: "北海道",
|
57
|
+
city: "札幌市 中央区",
|
58
|
+
address: "以下に掲載がない場合",
|
59
|
+
have_multiple_postal_codes: false,
|
60
|
+
have_address_number_per_koaza: false,
|
61
|
+
have_chome: false,
|
62
|
+
postal_code_is_shared: false,
|
63
|
+
changed: false,
|
64
|
+
change_reason: nil,
|
65
|
+
},
|
66
|
+
dataset.first.to_h)
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: red-datasets
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.7
|
4
|
+
version: 0.0.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- tomisuker
|
@@ -9,8 +9,36 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2019-03-24 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: csv
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
requirements:
|
18
|
+
- - ">="
|
19
|
+
- !ruby/object:Gem::Version
|
20
|
+
version: 3.0.5
|
21
|
+
type: :runtime
|
22
|
+
prerelease: false
|
23
|
+
version_requirements: !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - ">="
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
version: 3.0.5
|
28
|
+
- !ruby/object:Gem::Dependency
|
29
|
+
name: rubyzip
|
30
|
+
requirement: !ruby/object:Gem::Requirement
|
31
|
+
requirements:
|
32
|
+
- - ">="
|
33
|
+
- !ruby/object:Gem::Version
|
34
|
+
version: '0'
|
35
|
+
type: :runtime
|
36
|
+
prerelease: false
|
37
|
+
version_requirements: !ruby/object:Gem::Requirement
|
38
|
+
requirements:
|
39
|
+
- - ">="
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: '0'
|
14
42
|
- !ruby/object:Gem::Dependency
|
15
43
|
name: bundler
|
16
44
|
requirement: !ruby/object:Gem::Requirement
|
@@ -106,9 +134,12 @@ files:
|
|
106
134
|
- lib/datasets/downloader.rb
|
107
135
|
- lib/datasets/fashion-mnist.rb
|
108
136
|
- lib/datasets/iris.rb
|
137
|
+
- lib/datasets/libsvm-dataset-list.rb
|
138
|
+
- lib/datasets/libsvm.rb
|
109
139
|
- lib/datasets/metadata.rb
|
110
140
|
- lib/datasets/mnist.rb
|
111
141
|
- lib/datasets/penn-treebank.rb
|
142
|
+
- lib/datasets/postal-code-japan.rb
|
112
143
|
- lib/datasets/table.rb
|
113
144
|
- lib/datasets/version.rb
|
114
145
|
- lib/datasets/wikipedia.rb
|
@@ -121,8 +152,11 @@ files:
|
|
121
152
|
- test/test-dictionary.rb
|
122
153
|
- test/test-fashion-mnist.rb
|
123
154
|
- test/test-iris.rb
|
155
|
+
- test/test-libsvm-dataset-list.rb
|
156
|
+
- test/test-libsvm.rb
|
124
157
|
- test/test-mnist.rb
|
125
158
|
- test/test-penn-treebank.rb
|
159
|
+
- test/test-postal-code-japan.rb
|
126
160
|
- test/test-table.rb
|
127
161
|
- test/test-wikipedia.rb
|
128
162
|
- test/test-wine.rb
|
@@ -146,20 +180,23 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
146
180
|
version: '0'
|
147
181
|
requirements: []
|
148
182
|
rubyforge_project:
|
149
|
-
rubygems_version:
|
183
|
+
rubygems_version: 2.7.6
|
150
184
|
signing_key:
|
151
185
|
specification_version: 4
|
152
186
|
summary: Red Datasets provides classes that provide common datasets such as iris dataset.
|
153
187
|
test_files:
|
154
|
-
- test/test-wine.rb
|
155
|
-
- test/run-test.rb
|
156
|
-
- test/test-cifar.rb
|
157
|
-
- test/test-fashion-mnist.rb
|
158
|
-
- test/test-wikipedia.rb
|
159
188
|
- test/test-iris.rb
|
160
|
-
- test/
|
189
|
+
- test/test-wikipedia.rb
|
190
|
+
- test/test-fashion-mnist.rb
|
191
|
+
- test/test-wine.rb
|
192
|
+
- test/test-postal-code-japan.rb
|
161
193
|
- test/test-mnist.rb
|
162
|
-
- test/
|
194
|
+
- test/helper.rb
|
163
195
|
- test/test-adult.rb
|
196
|
+
- test/test-libsvm.rb
|
197
|
+
- test/run-test.rb
|
198
|
+
- test/test-table.rb
|
199
|
+
- test/test-cifar.rb
|
200
|
+
- test/test-libsvm-dataset-list.rb
|
164
201
|
- test/test-penn-treebank.rb
|
165
202
|
- test/test-dictionary.rb
|