red-datasets 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/doc/text/news.md +9 -0
- data/lib/datasets.rb +1 -0
- data/lib/datasets/cifar.rb +4 -12
- data/lib/datasets/dataset.rb +7 -1
- data/lib/datasets/downloader.rb +53 -27
- data/lib/datasets/penguins.rb +37 -16
- data/lib/datasets/rdatasets.rb +95 -0
- data/lib/datasets/tar_gz_readable.rb +14 -0
- data/lib/datasets/version.rb +1 -1
- data/test/test-dataset.rb +27 -0
- data/test/test-downloader.rb +29 -0
- data/test/test-penguins.rb +29 -17
- data/test/test-rdatasets.rb +136 -0
- metadata +10 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6fbd4d11063f89ba2e09250b751886086c953ec8bc92c75a6a351c31a36da0c4
|
4
|
+
data.tar.gz: acc6ff31f0f4ae3a6c6565fe569233c01615718c01300b0838ff744571edc34d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 26361511155b447ffed56a79b2336a9a1db96494bf856b23e7b39cc6a8b6a2039e7ed27564140761bdb2daaae7ee563b3695c464a7a7b21ff93b0636f6b8338d
|
7
|
+
data.tar.gz: 40446f90e410e0d86abeec186a1d7adcc5375e29c19dc934f823befb26a87d904458ef5ea18c9d64055493d29ed305dba53d6e4d86bd7d84488baf3745ebd792
|
data/doc/text/news.md
CHANGED
@@ -1,5 +1,14 @@
|
|
1
1
|
# News
|
2
2
|
|
3
|
+
## 0.1.2 - 2021-06-03
|
4
|
+
|
5
|
+
### Improvements
|
6
|
+
|
7
|
+
* `Datasets::Rdatasets` and `Datasets::RdatasetsList`: Added.
|
8
|
+
|
9
|
+
* `Datasets::Penguins`: Changed for compatibility with seaborn's
|
10
|
+
penguins dataset.
|
11
|
+
|
3
12
|
## 0.1.1 - 2021-04-11
|
4
13
|
|
5
14
|
### Improvements
|
data/lib/datasets.rb
CHANGED
@@ -15,5 +15,6 @@ require_relative "datasets/mushroom"
|
|
15
15
|
require_relative "datasets/penguins"
|
16
16
|
require_relative "datasets/penn-treebank"
|
17
17
|
require_relative "datasets/postal-code-japan"
|
18
|
+
require_relative "datasets/rdatasets"
|
18
19
|
require_relative "datasets/wikipedia"
|
19
20
|
require_relative "datasets/wine"
|
data/lib/datasets/cifar.rb
CHANGED
@@ -1,10 +1,10 @@
|
|
1
|
-
|
2
|
-
require "zlib"
|
3
|
-
|
1
|
+
require_relative "tar_gz_readable"
|
4
2
|
require_relative "dataset"
|
5
3
|
|
6
4
|
module Datasets
|
7
5
|
class CIFAR < Dataset
|
6
|
+
include TarGzReadable
|
7
|
+
|
8
8
|
module Pixelable
|
9
9
|
def pixels
|
10
10
|
data.unpack("C*")
|
@@ -61,7 +61,7 @@ module Datasets
|
|
61
61
|
private
|
62
62
|
|
63
63
|
def parse_data(data_path, &block)
|
64
|
-
|
64
|
+
open_tar_gz(data_path) do |tar|
|
65
65
|
target_file_names.each do |target_file_name|
|
66
66
|
tar.seek(target_file_name) do |entry|
|
67
67
|
parse_entry(entry, &block)
|
@@ -124,14 +124,6 @@ module Datasets
|
|
124
124
|
end
|
125
125
|
end
|
126
126
|
end
|
127
|
-
|
128
|
-
def open_tar(data_path)
|
129
|
-
Zlib::GzipReader.open(data_path) do |f|
|
130
|
-
Gem::Package::TarReader.new(f) do |tar|
|
131
|
-
yield(tar)
|
132
|
-
end
|
133
|
-
end
|
134
|
-
end
|
135
127
|
end
|
136
128
|
end
|
137
129
|
|
data/lib/datasets/dataset.rb
CHANGED
@@ -18,11 +18,17 @@ module Datasets
|
|
18
18
|
Table.new(self)
|
19
19
|
end
|
20
20
|
|
21
|
+
def clear_cache!
|
22
|
+
if cache_dir_path.exist?
|
23
|
+
FileUtils.rmtree(cache_dir_path.to_s, secure: true)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
21
27
|
private
|
22
28
|
def cache_dir_path
|
23
29
|
case RUBY_PLATFORM
|
24
30
|
when /mswin/, /mingw/
|
25
|
-
base_dir = ENV["LOCALAPPDATA"] || "~/AppData"
|
31
|
+
base_dir = ENV["LOCALAPPDATA"] || "~/AppData/Local"
|
26
32
|
when /darwin/
|
27
33
|
base_dir = "~/Library/Caches"
|
28
34
|
else
|
data/lib/datasets/downloader.rb
CHANGED
@@ -8,6 +8,8 @@ require "pathname"
|
|
8
8
|
|
9
9
|
module Datasets
|
10
10
|
class Downloader
|
11
|
+
class TooManyRedirects < StandardError; end
|
12
|
+
|
11
13
|
def initialize(url)
|
12
14
|
if url.is_a?(URI::Generic)
|
13
15
|
url = url.dup
|
@@ -31,41 +33,65 @@ module Datasets
|
|
31
33
|
headers["Range"] = "bytes=#{start}-"
|
32
34
|
end
|
33
35
|
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
36
|
+
start_http(@url, headers) do |response|
|
37
|
+
if response.is_a?(Net::HTTPPartialContent)
|
38
|
+
mode = "ab"
|
39
|
+
else
|
40
|
+
start = nil
|
41
|
+
mode = "wb"
|
42
|
+
end
|
43
|
+
|
44
|
+
base_name = @url.path.split("/").last
|
45
|
+
size_current = 0
|
46
|
+
size_max = response.content_length
|
47
|
+
if start
|
48
|
+
size_current += start
|
49
|
+
size_max += start
|
50
|
+
end
|
51
|
+
progress_reporter = ProgressReporter.new(base_name, size_max)
|
52
|
+
partial_output_path.open(mode) do |output|
|
53
|
+
response.read_body do |chunk|
|
54
|
+
size_current += chunk.bytesize
|
55
|
+
progress_reporter.report(size_current)
|
56
|
+
output.write(chunk)
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
60
|
+
FileUtils.mv(partial_output_path, output_path)
|
61
|
+
rescue TooManyRedirects => error
|
62
|
+
last_url = error.message[/\Atoo many redirections: (.+)\z/, 1]
|
63
|
+
raise TooManyRedirects, "too many redirections: #{@url} .. #{last_url}"
|
64
|
+
end
|
65
|
+
|
66
|
+
private def start_http(url, headers, limit = 10, &block)
|
67
|
+
if limit == 0
|
68
|
+
raise TooManyRedirects, "too many redirections: #{url}"
|
69
|
+
end
|
70
|
+
http = Net::HTTP.new(url.hostname, url.port)
|
71
|
+
# http.set_debug_output($stderr)
|
72
|
+
http.use_ssl = (url.scheme == "https")
|
73
|
+
http.start do
|
74
|
+
path = url.path
|
75
|
+
path += "?#{url.query}" if url.query
|
39
76
|
request = Net::HTTP::Get.new(path, headers)
|
40
77
|
http.request(request) do |response|
|
41
78
|
case response
|
42
|
-
when Net::HTTPPartialContent
|
43
|
-
|
44
|
-
when Net::
|
45
|
-
|
46
|
-
|
79
|
+
when Net::HTTPSuccess, Net::HTTPPartialContent
|
80
|
+
return block.call(response)
|
81
|
+
when Net::HTTPRedirection
|
82
|
+
url = URI.parse(response[:location])
|
83
|
+
$stderr.puts "Redirect to #{url}"
|
84
|
+
return start_http(url, headers, limit - 1, &block)
|
47
85
|
else
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
base_name = @url.path.split("/").last
|
52
|
-
size_current = 0
|
53
|
-
size_max = response.content_length
|
54
|
-
if start
|
55
|
-
size_current += start
|
56
|
-
size_max += start
|
57
|
-
end
|
58
|
-
progress_reporter = ProgressReporter.new(base_name, size_max)
|
59
|
-
partial_output_path.open(mode) do |output|
|
60
|
-
response.read_body do |chunk|
|
61
|
-
size_current += chunk.bytesize
|
62
|
-
progress_reporter.report(size_current)
|
63
|
-
output.write(chunk)
|
86
|
+
message = response.code
|
87
|
+
if response.message and not response.message.empty?
|
88
|
+
message += ": #{response.message}"
|
64
89
|
end
|
90
|
+
message += ": #{url}"
|
91
|
+
raise response.error_type.new(message, response)
|
65
92
|
end
|
66
93
|
end
|
67
94
|
end
|
68
|
-
FileUtils.mv(partial_output_path, output_path)
|
69
95
|
end
|
70
96
|
|
71
97
|
class ProgressReporter
|
data/lib/datasets/penguins.rb
CHANGED
@@ -19,7 +19,6 @@ module Datasets
|
|
19
19
|
:delta_15_n_permil,
|
20
20
|
:delta_13_c_permil,
|
21
21
|
:comments)
|
22
|
-
|
23
22
|
class SpeciesBase < Dataset
|
24
23
|
def initialize
|
25
24
|
super
|
@@ -62,17 +61,17 @@ module Datasets
|
|
62
61
|
URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.219.3&entityid=002f3893385f710df69eeebe893144ff".freeze
|
63
62
|
end
|
64
63
|
|
65
|
-
# Gentoo penguin data from: https://doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce
|
66
|
-
class Gentoo < SpeciesBase
|
67
|
-
DOI = "doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce".freeze
|
68
|
-
URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.220.3&entityid=e03b43c924f226486f2f0ab6709d2381".freeze
|
69
|
-
end
|
70
|
-
|
71
64
|
# Chinstrap penguin data from: https://doi.org/10.6073/pasta/409c808f8fc9899d02401bdb04580af7
|
72
65
|
class Chinstrap < SpeciesBase
|
73
66
|
DOI = "doi.org/10.6073/pasta/409c808f8fc9899d02401bdb04580af7".freeze
|
74
67
|
URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.221.2&entityid=fe853aa8f7a59aa84cdd3197619ef462".freeze
|
75
68
|
end
|
69
|
+
|
70
|
+
# Gentoo penguin data from: https://doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce
|
71
|
+
class Gentoo < SpeciesBase
|
72
|
+
DOI = "doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce".freeze
|
73
|
+
URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.220.3&entityid=e03b43c924f226486f2f0ab6709d2381".freeze
|
74
|
+
end
|
76
75
|
end
|
77
76
|
|
78
77
|
# This dataset provides the same dataset as https://github.com/allisonhorst/palmerpenguins
|
@@ -100,8 +99,8 @@ module Datasets
|
|
100
99
|
|
101
100
|
species_classes = [
|
102
101
|
PenguinsRawData::Adelie,
|
102
|
+
PenguinsRawData::Chinstrap,
|
103
103
|
PenguinsRawData::Gentoo,
|
104
|
-
PenguinsRawData::Chinstrap
|
105
104
|
]
|
106
105
|
|
107
106
|
species_classes.each do |species_class|
|
@@ -112,14 +111,36 @@ module Datasets
|
|
112
111
|
end
|
113
112
|
|
114
113
|
private def convert_record(raw_record)
|
115
|
-
Record.new(raw_record
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
114
|
+
Record.new(*cleanse_fields(raw_record))
|
115
|
+
end
|
116
|
+
|
117
|
+
private def cleanse_fields(raw_record)
|
118
|
+
species = raw_record.species.split(' ')[0]
|
119
|
+
flipper_length_mm = raw_record.flipper_length_mm&.to_i
|
120
|
+
body_mass_g = raw_record.body_mass_g&.to_i
|
121
|
+
sex = normalize_sex(raw_record.sex)
|
122
|
+
year = raw_record.date_egg&.year
|
123
|
+
|
124
|
+
[
|
125
|
+
species,
|
126
|
+
raw_record.island,
|
127
|
+
raw_record.culmen_length_mm,
|
128
|
+
raw_record.culmen_depth_mm,
|
129
|
+
flipper_length_mm,
|
130
|
+
body_mass_g,
|
131
|
+
sex,
|
132
|
+
year
|
133
|
+
]
|
134
|
+
end
|
135
|
+
|
136
|
+
private def normalize_sex(val)
|
137
|
+
val = val&.downcase
|
138
|
+
case val
|
139
|
+
when "female", "male", nil
|
140
|
+
val
|
141
|
+
else
|
142
|
+
nil
|
143
|
+
end
|
123
144
|
end
|
124
145
|
end
|
125
146
|
end
|
data/lib/datasets/rdatasets.rb
ADDED
@@ -0,0 +1,95 @@
|
|
1
|
+
require_relative "dataset"
|
2
|
+
require_relative "tar_gz_readable"
|
3
|
+
|
4
|
+
module Datasets
|
5
|
+
class RdatasetsList < Dataset
|
6
|
+
Record = Struct.new(:package,
|
7
|
+
:dataset,
|
8
|
+
:title,
|
9
|
+
:rows,
|
10
|
+
:cols,
|
11
|
+
:n_binary,
|
12
|
+
:n_character,
|
13
|
+
:n_factor,
|
14
|
+
:n_logical,
|
15
|
+
:n_numeric,
|
16
|
+
:csv,
|
17
|
+
:doc)
|
18
|
+
|
19
|
+
def initialize
|
20
|
+
super
|
21
|
+
@metadata.id = "rdatasets"
|
22
|
+
@metadata.name = "Rdatasets"
|
23
|
+
@metadata.url = "https://vincentarelbundock.github.io/Rdatasets/"
|
24
|
+
@metadata.licenses = ["GPL-3"]
|
25
|
+
@data_url = "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/datasets.csv"
|
26
|
+
@data_path = cache_dir_path + "datasets.csv"
|
27
|
+
end
|
28
|
+
|
29
|
+
def filter(package: nil, dataset: nil)
|
30
|
+
return to_enum(__method__, package: package, dataset: dataset) unless block_given?
|
31
|
+
|
32
|
+
conds = {}
|
33
|
+
conds["Package"] = package if package
|
34
|
+
conds["Item"] = dataset if dataset
|
35
|
+
if conds.empty?
|
36
|
+
each_row {|row| yield Record.new(*row.fields) }
|
37
|
+
else
|
38
|
+
each_row do |row|
|
39
|
+
if conds.all? {|k, v| row[k] == v }
|
40
|
+
yield Record.new(*row.fields)
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
def each(&block)
|
47
|
+
filter(&block)
|
48
|
+
end
|
49
|
+
|
50
|
+
private def each_row(&block)
|
51
|
+
download(@data_path, @data_url) unless @data_path.exist?
|
52
|
+
CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
|
53
|
+
csv.each(&block)
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
class Rdatasets < Dataset
|
59
|
+
def initialize(package_name, dataset_name)
|
60
|
+
list = RdatasetsList.new
|
61
|
+
|
62
|
+
info = list.filter(package: package_name, dataset: dataset_name).first
|
63
|
+
unless info
|
64
|
+
raise ArgumentError, "Unable to locate dataset #{package_name}/#{dataset_name}"
|
65
|
+
end
|
66
|
+
|
67
|
+
super()
|
68
|
+
@metadata.id = "rdatasets-#{package_name}-#{dataset_name}"
|
69
|
+
@metadata.name = "Rdatasets: #{package_name}: #{dataset_name}"
|
70
|
+
@metadata.url = info.csv
|
71
|
+
@metadata.licenses = ["GPL-3"]
|
72
|
+
@metadata.description = info.title
|
73
|
+
|
74
|
+
# Follow the original directory structure in the cache directory
|
75
|
+
@data_path = cache_dir_path + (dataset_name + ".csv")
|
76
|
+
|
77
|
+
@package_name = package_name
|
78
|
+
@dataset_name = dataset_name
|
79
|
+
end
|
80
|
+
|
81
|
+
def each(&block)
|
82
|
+
return to_enum(__method__) unless block_given?
|
83
|
+
|
84
|
+
download(@data_path, @metadata.url) unless @data_path.exist?
|
85
|
+
CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
|
86
|
+
csv.each do |row|
|
87
|
+
record = row.to_h
|
88
|
+
record.delete("")
|
89
|
+
record.transform_keys!(&:to_sym)
|
90
|
+
yield record
|
91
|
+
end
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
data/lib/datasets/version.rb
CHANGED
data/test/test-dataset.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
class TestDataset < Test::Unit::TestCase
|
2
|
+
sub_test_case("#clear_cache!") do
|
3
|
+
def setup
|
4
|
+
@dataset = Datasets::Iris.new
|
5
|
+
@cache_dir_path = @dataset.send(:cache_dir_path)
|
6
|
+
end
|
7
|
+
|
8
|
+
test("when the dataset is downloaded") do
|
9
|
+
@dataset.first # This ensures the dataset downloaded
|
10
|
+
existence = {before: @cache_dir_path.join("iris.csv").exist?}
|
11
|
+
|
12
|
+
@dataset.clear_cache!
|
13
|
+
existence[:after] = @cache_dir_path.join("iris.csv").exist?
|
14
|
+
|
15
|
+
assert_equal({before: true, after: false},
|
16
|
+
existence)
|
17
|
+
end
|
18
|
+
|
19
|
+
test("when the dataset is not downloaded") do
|
20
|
+
FileUtils.rmtree(@cache_dir_path.to_s, secure: true) if @cache_dir_path.exist?
|
21
|
+
|
22
|
+
assert_nothing_raised do
|
23
|
+
@dataset.clear_cache!
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
data/test/test-downloader.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
class DownloaderTest < Test::Unit::TestCase
|
2
|
+
include Helper::Sandbox
|
3
|
+
|
4
|
+
sub_test_case("#download") do
|
5
|
+
def setup
|
6
|
+
setup_sandbox
|
7
|
+
end
|
8
|
+
|
9
|
+
def teardown
|
10
|
+
teardown_sandbox
|
11
|
+
end
|
12
|
+
|
13
|
+
test("too many redirection") do
|
14
|
+
first_url = "https://example.com/file"
|
15
|
+
last_url = "https://example.com/last_redirection"
|
16
|
+
expected_message = "too many redirections: #{first_url} .. #{last_url}"
|
17
|
+
output_path = @tmp_dir + "file"
|
18
|
+
downloader = Datasets::Downloader.new(first_url)
|
19
|
+
|
20
|
+
downloader.define_singleton_method(:start_http) do |url, headers|
|
21
|
+
raise Datasets::Downloader::TooManyRedirects, "too many redirections: #{last_url}"
|
22
|
+
end
|
23
|
+
|
24
|
+
assert_raise(Datasets::Downloader::TooManyRedirects.new(expected_message)) do
|
25
|
+
downloader.download(output_path)
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
data/test/test-penguins.rb
CHANGED
@@ -182,6 +182,18 @@ class PenguinsTest < Test::Unit::TestCase
|
|
182
182
|
@dataset = Datasets::Penguins.new
|
183
183
|
end
|
184
184
|
|
185
|
+
test("order of species") do
|
186
|
+
species_values = @dataset.map {|r| r.species }.uniq
|
187
|
+
assert_equal(["Adelie", "Chinstrap", "Gentoo"],
|
188
|
+
species_values)
|
189
|
+
end
|
190
|
+
|
191
|
+
test("data cleansing") do
|
192
|
+
sex_values = @dataset.map {|r| r.sex }.uniq.compact.sort
|
193
|
+
assert_equal(["female", "male"],
|
194
|
+
sex_values)
|
195
|
+
end
|
196
|
+
|
185
197
|
test("#each") do
|
186
198
|
records = @dataset.each.to_a
|
187
199
|
assert_equal([
|
@@ -196,16 +208,6 @@ class PenguinsTest < Test::Unit::TestCase
|
|
196
208
|
sex: "male",
|
197
209
|
year: 2007
|
198
210
|
},
|
199
|
-
{
|
200
|
-
species: "Gentoo",
|
201
|
-
island: "Biscoe",
|
202
|
-
bill_length_mm: 46.1,
|
203
|
-
bill_depth_mm: 13.2,
|
204
|
-
flipper_length_mm: 211,
|
205
|
-
body_mass_g: 4500,
|
206
|
-
sex: "female",
|
207
|
-
year: 2007
|
208
|
-
},
|
209
211
|
{
|
210
212
|
species: "Chinstrap",
|
211
213
|
island: "Dream",
|
@@ -217,13 +219,23 @@ class PenguinsTest < Test::Unit::TestCase
|
|
217
219
|
year: 2007
|
218
220
|
},
|
219
221
|
{
|
220
|
-
species: "
|
221
|
-
island: "
|
222
|
-
bill_length_mm:
|
223
|
-
bill_depth_mm:
|
224
|
-
flipper_length_mm:
|
225
|
-
body_mass_g:
|
222
|
+
species: "Gentoo",
|
223
|
+
island: "Biscoe",
|
224
|
+
bill_length_mm: 46.1,
|
225
|
+
bill_depth_mm: 13.2,
|
226
|
+
flipper_length_mm: 211,
|
227
|
+
body_mass_g: 4500,
|
226
228
|
sex: "female",
|
229
|
+
year: 2007
|
230
|
+
},
|
231
|
+
{
|
232
|
+
species: "Gentoo",
|
233
|
+
island: "Biscoe",
|
234
|
+
bill_length_mm: 49.9,
|
235
|
+
bill_depth_mm: 16.1,
|
236
|
+
flipper_length_mm: 213,
|
237
|
+
body_mass_g: 5400,
|
238
|
+
sex: "male",
|
227
239
|
year: 2009
|
228
240
|
}
|
229
241
|
],
|
@@ -231,7 +243,7 @@ class PenguinsTest < Test::Unit::TestCase
|
|
231
243
|
records.size,
|
232
244
|
records[0].to_h,
|
233
245
|
records[152].to_h,
|
234
|
-
records[
|
246
|
+
records[220].to_h,
|
235
247
|
records[-1].to_h,
|
236
248
|
])
|
237
249
|
end
|
data/test/test-rdatasets.rb
ADDED
@@ -0,0 +1,136 @@
|
|
1
|
+
class RdatasetsTest < Test::Unit::TestCase
|
2
|
+
sub_test_case("RdatasetsList") do
|
3
|
+
def setup
|
4
|
+
@dataset = Datasets::RdatasetsList.new
|
5
|
+
end
|
6
|
+
|
7
|
+
sub_test_case("#each") do
|
8
|
+
test("with package_name") do
|
9
|
+
records = @dataset.filter(package: "datasets").to_a
|
10
|
+
assert_equal([
|
11
|
+
84,
|
12
|
+
{
|
13
|
+
package: "datasets",
|
14
|
+
dataset: "ability.cov",
|
15
|
+
title: "Ability and Intelligence Tests",
|
16
|
+
rows: 6,
|
17
|
+
cols: 8,
|
18
|
+
n_binary: 0,
|
19
|
+
n_character: 0,
|
20
|
+
n_factor: 0,
|
21
|
+
n_logical: 0,
|
22
|
+
n_numeric: 8,
|
23
|
+
csv: "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/ability.cov.csv",
|
24
|
+
doc: "https://vincentarelbundock.github.io/Rdatasets/doc/datasets/ability.cov.html"
|
25
|
+
},
|
26
|
+
{
|
27
|
+
package: "datasets",
|
28
|
+
dataset: "WWWusage",
|
29
|
+
title: "Internet Usage per Minute",
|
30
|
+
rows: 100,
|
31
|
+
cols: 2,
|
32
|
+
n_binary: 0,
|
33
|
+
n_character: 0,
|
34
|
+
n_factor: 0,
|
35
|
+
n_logical: 0,
|
36
|
+
n_numeric: 2,
|
37
|
+
csv: "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/WWWusage.csv",
|
38
|
+
doc: "https://vincentarelbundock.github.io/Rdatasets/doc/datasets/WWWusage.html"
|
39
|
+
}
|
40
|
+
],
|
41
|
+
[
|
42
|
+
records.size,
|
43
|
+
records[0].to_h,
|
44
|
+
records[-1].to_h
|
45
|
+
])
|
46
|
+
end
|
47
|
+
|
48
|
+
test("without package_name") do
|
49
|
+
records = @dataset.each.to_a
|
50
|
+
assert_equal([
|
51
|
+
1478,
|
52
|
+
{
|
53
|
+
package: "AER",
|
54
|
+
dataset: "Affairs",
|
55
|
+
title: "Fair's Extramarital Affairs Data",
|
56
|
+
rows: 601,
|
57
|
+
cols: 9,
|
58
|
+
n_binary: 2,
|
59
|
+
n_character: 0,
|
60
|
+
n_factor: 2,
|
61
|
+
n_logical: 0,
|
62
|
+
n_numeric: 7,
|
63
|
+
csv: "https://vincentarelbundock.github.io/Rdatasets/csv/AER/Affairs.csv",
|
64
|
+
doc: "https://vincentarelbundock.github.io/Rdatasets/doc/AER/Affairs.html"
|
65
|
+
},
|
66
|
+
{
|
67
|
+
package: "vcd",
|
68
|
+
dataset: "WomenQueue",
|
69
|
+
title: "Women in Queues",
|
70
|
+
rows: 11,
|
71
|
+
cols: 2,
|
72
|
+
n_binary: 0,
|
73
|
+
n_character: 0,
|
74
|
+
n_factor: 1,
|
75
|
+
n_logical: 0,
|
76
|
+
n_numeric: 1,
|
77
|
+
csv: "https://vincentarelbundock.github.io/Rdatasets/csv/vcd/WomenQueue.csv",
|
78
|
+
doc: "https://vincentarelbundock.github.io/Rdatasets/doc/vcd/WomenQueue.html"
|
79
|
+
},
|
80
|
+
],
|
81
|
+
[
|
82
|
+
records.size,
|
83
|
+
records[0].to_h,
|
84
|
+
records[-1].to_h
|
85
|
+
])
|
86
|
+
end
|
87
|
+
end
|
88
|
+
end
|
89
|
+
|
90
|
+
sub_test_case("Rdatasets") do
|
91
|
+
sub_test_case("datasets") do
|
92
|
+
sub_test_case("AirPassengers") do
|
93
|
+
def setup
|
94
|
+
@dataset = Datasets::Rdatasets.new("datasets", "AirPassengers")
|
95
|
+
end
|
96
|
+
|
97
|
+
test("#each") do
|
98
|
+
records = @dataset.each.to_a
|
99
|
+
assert_equal([
|
100
|
+
144,
|
101
|
+
{ time: 1949, value: 112 },
|
102
|
+
{ time: 1960.91666666667, value: 432 },
|
103
|
+
],
|
104
|
+
[
|
105
|
+
records.size,
|
106
|
+
records[0],
|
107
|
+
records[-1]
|
108
|
+
])
|
109
|
+
end
|
110
|
+
|
111
|
+
test("#metadata.id") do
|
112
|
+
assert_equal("rdatasets-datasets-AirPassengers", @dataset.metadata.id)
|
113
|
+
end
|
114
|
+
|
115
|
+
test("#metadata.description") do
|
116
|
+
description = @dataset.metadata.description
|
117
|
+
assert do
|
118
|
+
description.include?("Monthly Airline Passenger Numbers 1949-1960")
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
122
|
+
|
123
|
+
test("invalid dataset name") do
|
124
|
+
assert_raise(ArgumentError) do
|
125
|
+
Datasets::Rdatasets.new("datasets", "invalid datasets name")
|
126
|
+
end
|
127
|
+
end
|
128
|
+
end
|
129
|
+
|
130
|
+
test("invalid package name") do
|
131
|
+
assert_raise(ArgumentError) do
|
132
|
+
Datasets::Rdatasets.new("invalid package name", "AirPassengers")
|
133
|
+
end
|
134
|
+
end
|
135
|
+
end
|
136
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: red-datasets
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- tomisuker
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2021-
|
12
|
+
date: 2021-06-03 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: csv
|
@@ -161,7 +161,9 @@ files:
|
|
161
161
|
- lib/datasets/penguins.rb
|
162
162
|
- lib/datasets/penn-treebank.rb
|
163
163
|
- lib/datasets/postal-code-japan.rb
|
164
|
+
- lib/datasets/rdatasets.rb
|
164
165
|
- lib/datasets/table.rb
|
166
|
+
- lib/datasets/tar_gz_readable.rb
|
165
167
|
- lib/datasets/version.rb
|
166
168
|
- lib/datasets/wikipedia.rb
|
167
169
|
- lib/datasets/wine.rb
|
@@ -172,7 +174,9 @@ files:
|
|
172
174
|
- test/test-cifar.rb
|
173
175
|
- test/test-cldr-plurals.rb
|
174
176
|
- test/test-communities.rb
|
177
|
+
- test/test-dataset.rb
|
175
178
|
- test/test-dictionary.rb
|
179
|
+
- test/test-downloader.rb
|
176
180
|
- test/test-e-stat-japan.rb
|
177
181
|
- test/test-fashion-mnist.rb
|
178
182
|
- test/test-hepatitis.rb
|
@@ -184,6 +188,7 @@ files:
|
|
184
188
|
- test/test-penguins.rb
|
185
189
|
- test/test-penn-treebank.rb
|
186
190
|
- test/test-postal-code-japan.rb
|
191
|
+
- test/test-rdatasets.rb
|
187
192
|
- test/test-table.rb
|
188
193
|
- test/test-wikipedia.rb
|
189
194
|
- test/test-wine.rb
|
@@ -217,7 +222,9 @@ test_files:
|
|
217
222
|
- test/test-cifar.rb
|
218
223
|
- test/test-cldr-plurals.rb
|
219
224
|
- test/test-communities.rb
|
225
|
+
- test/test-dataset.rb
|
220
226
|
- test/test-dictionary.rb
|
227
|
+
- test/test-downloader.rb
|
221
228
|
- test/test-e-stat-japan.rb
|
222
229
|
- test/test-fashion-mnist.rb
|
223
230
|
- test/test-hepatitis.rb
|
@@ -229,6 +236,7 @@ test_files:
|
|
229
236
|
- test/test-penguins.rb
|
230
237
|
- test/test-penn-treebank.rb
|
231
238
|
- test/test-postal-code-japan.rb
|
239
|
+
- test/test-rdatasets.rb
|
232
240
|
- test/test-table.rb
|
233
241
|
- test/test-wikipedia.rb
|
234
242
|
- test/test-wine.rb
|