red-datasets 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/doc/text/news.md +9 -0
- data/lib/datasets.rb +1 -0
- data/lib/datasets/cifar.rb +4 -12
- data/lib/datasets/dataset.rb +7 -1
- data/lib/datasets/downloader.rb +53 -27
- data/lib/datasets/penguins.rb +37 -16
- data/lib/datasets/rdatasets.rb +95 -0
- data/lib/datasets/tar_gz_readable.rb +14 -0
- data/lib/datasets/version.rb +1 -1
- data/test/test-dataset.rb +27 -0
- data/test/test-downloader.rb +29 -0
- data/test/test-penguins.rb +29 -17
- data/test/test-rdatasets.rb +136 -0
- metadata +10 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6fbd4d11063f89ba2e09250b751886086c953ec8bc92c75a6a351c31a36da0c4
|
4
|
+
data.tar.gz: acc6ff31f0f4ae3a6c6565fe569233c01615718c01300b0838ff744571edc34d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 26361511155b447ffed56a79b2336a9a1db96494bf856b23e7b39cc6a8b6a2039e7ed27564140761bdb2daaae7ee563b3695c464a7a7b21ff93b0636f6b8338d
|
7
|
+
data.tar.gz: 40446f90e410e0d86abeec186a1d7adcc5375e29c19dc934f823befb26a87d904458ef5ea18c9d64055493d29ed305dba53d6e4d86bd7d84488baf3745ebd792
|
data/doc/text/news.md
CHANGED
@@ -1,5 +1,14 @@
|
|
1
1
|
# News
|
2
2
|
|
3
|
+
## 0.1.2 - 2021-06-03
|
4
|
+
|
5
|
+
### Improvements
|
6
|
+
|
7
|
+
* `Datasets::Rdatasets` and `Datasets::RdatasetsList`: Added.
|
8
|
+
|
9
|
+
* `Datasets::Penguins`: Changed for compatibility with seaborn's
|
10
|
+
penguins dataset.
|
11
|
+
|
3
12
|
## 0.1.1 - 2021-04-11
|
4
13
|
|
5
14
|
### Improvements
|
data/lib/datasets.rb
CHANGED
@@ -15,5 +15,6 @@ require_relative "datasets/mushroom"
|
|
15
15
|
require_relative "datasets/penguins"
|
16
16
|
require_relative "datasets/penn-treebank"
|
17
17
|
require_relative "datasets/postal-code-japan"
|
18
|
+
require_relative "datasets/rdatasets"
|
18
19
|
require_relative "datasets/wikipedia"
|
19
20
|
require_relative "datasets/wine"
|
data/lib/datasets/cifar.rb
CHANGED
@@ -1,10 +1,10 @@
|
|
1
|
-
|
2
|
-
require "zlib"
|
3
|
-
|
1
|
+
require_relative "tar_gz_readable"
|
4
2
|
require_relative "dataset"
|
5
3
|
|
6
4
|
module Datasets
|
7
5
|
class CIFAR < Dataset
|
6
|
+
include TarGzReadable
|
7
|
+
|
8
8
|
module Pixelable
|
9
9
|
def pixels
|
10
10
|
data.unpack("C*")
|
@@ -61,7 +61,7 @@ module Datasets
|
|
61
61
|
private
|
62
62
|
|
63
63
|
def parse_data(data_path, &block)
|
64
|
-
|
64
|
+
open_tar_gz(data_path) do |tar|
|
65
65
|
target_file_names.each do |target_file_name|
|
66
66
|
tar.seek(target_file_name) do |entry|
|
67
67
|
parse_entry(entry, &block)
|
@@ -124,14 +124,6 @@ module Datasets
|
|
124
124
|
end
|
125
125
|
end
|
126
126
|
end
|
127
|
-
|
128
|
-
def open_tar(data_path)
|
129
|
-
Zlib::GzipReader.open(data_path) do |f|
|
130
|
-
Gem::Package::TarReader.new(f) do |tar|
|
131
|
-
yield(tar)
|
132
|
-
end
|
133
|
-
end
|
134
|
-
end
|
135
127
|
end
|
136
128
|
end
|
137
129
|
|
data/lib/datasets/dataset.rb
CHANGED
@@ -18,11 +18,17 @@ module Datasets
|
|
18
18
|
Table.new(self)
|
19
19
|
end
|
20
20
|
|
21
|
+
def clear_cache!
|
22
|
+
if cache_dir_path.exist?
|
23
|
+
FileUtils.rmtree(cache_dir_path.to_s, secure: true)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
21
27
|
private
|
22
28
|
def cache_dir_path
|
23
29
|
case RUBY_PLATFORM
|
24
30
|
when /mswin/, /mingw/
|
25
|
-
base_dir = ENV["LOCALAPPDATA"] || "~/AppData"
|
31
|
+
base_dir = ENV["LOCALAPPDATA"] || "~/AppData/Local"
|
26
32
|
when /darwin/
|
27
33
|
base_dir = "~/Library/Caches"
|
28
34
|
else
|
data/lib/datasets/downloader.rb
CHANGED
@@ -8,6 +8,8 @@ require "pathname"
|
|
8
8
|
|
9
9
|
module Datasets
|
10
10
|
class Downloader
|
11
|
+
class TooManyRedirects < StandardError; end
|
12
|
+
|
11
13
|
def initialize(url)
|
12
14
|
if url.is_a?(URI::Generic)
|
13
15
|
url = url.dup
|
@@ -31,41 +33,65 @@ module Datasets
|
|
31
33
|
headers["Range"] = "bytes=#{start}-"
|
32
34
|
end
|
33
35
|
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
36
|
+
start_http(@url, headers) do |response|
|
37
|
+
if response.is_a?(Net::HTTPPartialContent)
|
38
|
+
mode = "ab"
|
39
|
+
else
|
40
|
+
start = nil
|
41
|
+
mode = "wb"
|
42
|
+
end
|
43
|
+
|
44
|
+
base_name = @url.path.split("/").last
|
45
|
+
size_current = 0
|
46
|
+
size_max = response.content_length
|
47
|
+
if start
|
48
|
+
size_current += start
|
49
|
+
size_max += start
|
50
|
+
end
|
51
|
+
progress_reporter = ProgressReporter.new(base_name, size_max)
|
52
|
+
partial_output_path.open(mode) do |output|
|
53
|
+
response.read_body do |chunk|
|
54
|
+
size_current += chunk.bytesize
|
55
|
+
progress_reporter.report(size_current)
|
56
|
+
output.write(chunk)
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
60
|
+
FileUtils.mv(partial_output_path, output_path)
|
61
|
+
rescue TooManyRedirects => error
|
62
|
+
last_url = error.message[/\Atoo many redirections: (.+)\z/, 1]
|
63
|
+
raise TooManyRedirects, "too many redirections: #{@url} .. #{last_url}"
|
64
|
+
end
|
65
|
+
|
66
|
+
private def start_http(url, headers, limit = 10, &block)
|
67
|
+
if limit == 0
|
68
|
+
raise TooManyRedirects, "too many redirections: #{url}"
|
69
|
+
end
|
70
|
+
http = Net::HTTP.new(url.hostname, url.port)
|
71
|
+
# http.set_debug_output($stderr)
|
72
|
+
http.use_ssl = (url.scheme == "https")
|
73
|
+
http.start do
|
74
|
+
path = url.path
|
75
|
+
path += "?#{url.query}" if url.query
|
39
76
|
request = Net::HTTP::Get.new(path, headers)
|
40
77
|
http.request(request) do |response|
|
41
78
|
case response
|
42
|
-
when Net::HTTPPartialContent
|
43
|
-
|
44
|
-
when Net::
|
45
|
-
|
46
|
-
|
79
|
+
when Net::HTTPSuccess, Net::HTTPPartialContent
|
80
|
+
return block.call(response)
|
81
|
+
when Net::HTTPRedirection
|
82
|
+
url = URI.parse(response[:location])
|
83
|
+
$stderr.puts "Redirect to #{url}"
|
84
|
+
return start_http(url, headers, limit - 1, &block)
|
47
85
|
else
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
base_name = @url.path.split("/").last
|
52
|
-
size_current = 0
|
53
|
-
size_max = response.content_length
|
54
|
-
if start
|
55
|
-
size_current += start
|
56
|
-
size_max += start
|
57
|
-
end
|
58
|
-
progress_reporter = ProgressReporter.new(base_name, size_max)
|
59
|
-
partial_output_path.open(mode) do |output|
|
60
|
-
response.read_body do |chunk|
|
61
|
-
size_current += chunk.bytesize
|
62
|
-
progress_reporter.report(size_current)
|
63
|
-
output.write(chunk)
|
86
|
+
message = response.code
|
87
|
+
if response.message and not response.message.empty?
|
88
|
+
message += ": #{response.message}"
|
64
89
|
end
|
90
|
+
message += ": #{url}"
|
91
|
+
raise response.error_type.new(message, response)
|
65
92
|
end
|
66
93
|
end
|
67
94
|
end
|
68
|
-
FileUtils.mv(partial_output_path, output_path)
|
69
95
|
end
|
70
96
|
|
71
97
|
class ProgressReporter
|
data/lib/datasets/penguins.rb
CHANGED
@@ -19,7 +19,6 @@ module Datasets
|
|
19
19
|
:delta_15_n_permil,
|
20
20
|
:delta_13_c_permil,
|
21
21
|
:comments)
|
22
|
-
|
23
22
|
class SpeciesBase < Dataset
|
24
23
|
def initialize
|
25
24
|
super
|
@@ -62,17 +61,17 @@ module Datasets
|
|
62
61
|
URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.219.3&entityid=002f3893385f710df69eeebe893144ff".freeze
|
63
62
|
end
|
64
63
|
|
65
|
-
# Gentoo penguin data from: https://doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce
|
66
|
-
class Gentoo < SpeciesBase
|
67
|
-
DOI = "doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce".freeze
|
68
|
-
URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.220.3&entityid=e03b43c924f226486f2f0ab6709d2381".freeze
|
69
|
-
end
|
70
|
-
|
71
64
|
# Chinstrap penguin data from: https://doi.org/10.6073/pasta/409c808f8fc9899d02401bdb04580af7
|
72
65
|
class Chinstrap < SpeciesBase
|
73
66
|
DOI = "doi.org/10.6073/pasta/409c808f8fc9899d02401bdb04580af7".freeze
|
74
67
|
URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.221.2&entityid=fe853aa8f7a59aa84cdd3197619ef462".freeze
|
75
68
|
end
|
69
|
+
|
70
|
+
# Gentoo penguin data from: https://doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce
|
71
|
+
class Gentoo < SpeciesBase
|
72
|
+
DOI = "doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce".freeze
|
73
|
+
URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.220.3&entityid=e03b43c924f226486f2f0ab6709d2381".freeze
|
74
|
+
end
|
76
75
|
end
|
77
76
|
|
78
77
|
# This dataset provides the same dataset as https://github.com/allisonhorst/palmerpenguins
|
@@ -100,8 +99,8 @@ module Datasets
|
|
100
99
|
|
101
100
|
species_classes = [
|
102
101
|
PenguinsRawData::Adelie,
|
102
|
+
PenguinsRawData::Chinstrap,
|
103
103
|
PenguinsRawData::Gentoo,
|
104
|
-
PenguinsRawData::Chinstrap
|
105
104
|
]
|
106
105
|
|
107
106
|
species_classes.each do |species_class|
|
@@ -112,14 +111,36 @@ module Datasets
|
|
112
111
|
end
|
113
112
|
|
114
113
|
private def convert_record(raw_record)
|
115
|
-
Record.new(raw_record
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
114
|
+
Record.new(*cleanse_fields(raw_record))
|
115
|
+
end
|
116
|
+
|
117
|
+
private def cleanse_fields(raw_record)
|
118
|
+
species = raw_record.species.split(' ')[0]
|
119
|
+
flipper_length_mm = raw_record.flipper_length_mm&.to_i
|
120
|
+
body_mass_g = raw_record.body_mass_g&.to_i
|
121
|
+
sex = normalize_sex(raw_record.sex)
|
122
|
+
year = raw_record.date_egg&.year
|
123
|
+
|
124
|
+
[
|
125
|
+
species,
|
126
|
+
raw_record.island,
|
127
|
+
raw_record.culmen_length_mm,
|
128
|
+
raw_record.culmen_depth_mm,
|
129
|
+
flipper_length_mm,
|
130
|
+
body_mass_g,
|
131
|
+
sex,
|
132
|
+
year
|
133
|
+
]
|
134
|
+
end
|
135
|
+
|
136
|
+
private def normalize_sex(val)
|
137
|
+
val = val&.downcase
|
138
|
+
case val
|
139
|
+
when "female", "male", nil
|
140
|
+
val
|
141
|
+
else
|
142
|
+
nil
|
143
|
+
end
|
123
144
|
end
|
124
145
|
end
|
125
146
|
end
|
data/lib/datasets/rdatasets.rb
ADDED
@@ -0,0 +1,95 @@
|
|
1
|
+
require_relative "dataset"
|
2
|
+
require_relative "tar_gz_readable"
|
3
|
+
|
4
|
+
module Datasets
|
5
|
+
class RdatasetsList < Dataset
|
6
|
+
Record = Struct.new(:package,
|
7
|
+
:dataset,
|
8
|
+
:title,
|
9
|
+
:rows,
|
10
|
+
:cols,
|
11
|
+
:n_binary,
|
12
|
+
:n_character,
|
13
|
+
:n_factor,
|
14
|
+
:n_logical,
|
15
|
+
:n_numeric,
|
16
|
+
:csv,
|
17
|
+
:doc)
|
18
|
+
|
19
|
+
def initialize
|
20
|
+
super
|
21
|
+
@metadata.id = "rdatasets"
|
22
|
+
@metadata.name = "Rdatasets"
|
23
|
+
@metadata.url = "https://vincentarelbundock.github.io/Rdatasets/"
|
24
|
+
@metadata.licenses = ["GPL-3"]
|
25
|
+
@data_url = "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/datasets.csv"
|
26
|
+
@data_path = cache_dir_path + "datasets.csv"
|
27
|
+
end
|
28
|
+
|
29
|
+
def filter(package: nil, dataset: nil)
|
30
|
+
return to_enum(__method__, package: package, dataset: dataset) unless block_given?
|
31
|
+
|
32
|
+
conds = {}
|
33
|
+
conds["Package"] = package if package
|
34
|
+
conds["Item"] = dataset if dataset
|
35
|
+
if conds.empty?
|
36
|
+
each_row {|row| yield Record.new(*row.fields) }
|
37
|
+
else
|
38
|
+
each_row do |row|
|
39
|
+
if conds.all? {|k, v| row[k] == v }
|
40
|
+
yield Record.new(*row.fields)
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
def each(&block)
|
47
|
+
filter(&block)
|
48
|
+
end
|
49
|
+
|
50
|
+
private def each_row(&block)
|
51
|
+
download(@data_path, @data_url) unless @data_path.exist?
|
52
|
+
CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
|
53
|
+
csv.each(&block)
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
class Rdatasets < Dataset
|
59
|
+
def initialize(package_name, dataset_name)
|
60
|
+
list = RdatasetsList.new
|
61
|
+
|
62
|
+
info = list.filter(package: package_name, dataset: dataset_name).first
|
63
|
+
unless info
|
64
|
+
raise ArgumentError, "Unable to locate dataset #{package_name}/#{dataset_name}"
|
65
|
+
end
|
66
|
+
|
67
|
+
super()
|
68
|
+
@metadata.id = "rdatasets-#{package_name}-#{dataset_name}"
|
69
|
+
@metadata.name = "Rdatasets: #{package_name}: #{dataset_name}"
|
70
|
+
@metadata.url = info.csv
|
71
|
+
@metadata.licenses = ["GPL-3"]
|
72
|
+
@metadata.description = info.title
|
73
|
+
|
74
|
+
# Follow the original directory structure in the cache directory
|
75
|
+
@data_path = cache_dir_path + (dataset_name + ".csv")
|
76
|
+
|
77
|
+
@package_name = package_name
|
78
|
+
@dataset_name = dataset_name
|
79
|
+
end
|
80
|
+
|
81
|
+
def each(&block)
|
82
|
+
return to_enum(__method__) unless block_given?
|
83
|
+
|
84
|
+
download(@data_path, @metadata.url) unless @data_path.exist?
|
85
|
+
CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
|
86
|
+
csv.each do |row|
|
87
|
+
record = row.to_h
|
88
|
+
record.delete("")
|
89
|
+
record.transform_keys!(&:to_sym)
|
90
|
+
yield record
|
91
|
+
end
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
data/lib/datasets/version.rb
CHANGED
@@ -0,0 +1,27 @@
|
|
1
|
+
class TestDataset < Test::Unit::TestCase
|
2
|
+
sub_test_case("#clear_cache!") do
|
3
|
+
def setup
|
4
|
+
@dataset = Datasets::Iris.new
|
5
|
+
@cache_dir_path = @dataset.send(:cache_dir_path)
|
6
|
+
end
|
7
|
+
|
8
|
+
test("when the dataset is downloaded") do
|
9
|
+
@dataset.first # This ensures the dataset downloaded
|
10
|
+
existence = {before: @cache_dir_path.join("iris.csv").exist?}
|
11
|
+
|
12
|
+
@dataset.clear_cache!
|
13
|
+
existence[:after] = @cache_dir_path.join("iris.csv").exist?
|
14
|
+
|
15
|
+
assert_equal({before: true, after: false},
|
16
|
+
existence)
|
17
|
+
end
|
18
|
+
|
19
|
+
test("when the dataset is not downloaded") do
|
20
|
+
FileUtils.rmtree(@cache_dir_path.to_s, secure: true) if @cache_dir_path.exist?
|
21
|
+
|
22
|
+
assert_nothing_raised do
|
23
|
+
@dataset.clear_cache!
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
data/test/test-downloader.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
class DownloaderTest < Test::Unit::TestCase
|
2
|
+
include Helper::Sandbox
|
3
|
+
|
4
|
+
sub_test_case("#download") do
|
5
|
+
def setup
|
6
|
+
setup_sandbox
|
7
|
+
end
|
8
|
+
|
9
|
+
def teardown
|
10
|
+
teardown_sandbox
|
11
|
+
end
|
12
|
+
|
13
|
+
test("too many redirection") do
|
14
|
+
first_url = "https://example.com/file"
|
15
|
+
last_url = "https://example.com/last_redirection"
|
16
|
+
expected_message = "too many redirections: #{first_url} .. #{last_url}"
|
17
|
+
output_path = @tmp_dir + "file"
|
18
|
+
downloader = Datasets::Downloader.new(first_url)
|
19
|
+
|
20
|
+
downloader.define_singleton_method(:start_http) do |url, headers|
|
21
|
+
raise Datasets::Downloader::TooManyRedirects, "too many redirections: #{last_url}"
|
22
|
+
end
|
23
|
+
|
24
|
+
assert_raise(Datasets::Downloader::TooManyRedirects.new(expected_message)) do
|
25
|
+
downloader.download(output_path)
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
data/test/test-penguins.rb
CHANGED
@@ -182,6 +182,18 @@ class PenguinsTest < Test::Unit::TestCase
|
|
182
182
|
@dataset = Datasets::Penguins.new
|
183
183
|
end
|
184
184
|
|
185
|
+
test("order of species") do
|
186
|
+
species_values = @dataset.map {|r| r.species }.uniq
|
187
|
+
assert_equal(["Adelie", "Chinstrap", "Gentoo"],
|
188
|
+
species_values)
|
189
|
+
end
|
190
|
+
|
191
|
+
test("data cleansing") do
|
192
|
+
sex_values = @dataset.map {|r| r.sex }.uniq.compact.sort
|
193
|
+
assert_equal(["female", "male"],
|
194
|
+
sex_values)
|
195
|
+
end
|
196
|
+
|
185
197
|
test("#each") do
|
186
198
|
records = @dataset.each.to_a
|
187
199
|
assert_equal([
|
@@ -196,16 +208,6 @@ class PenguinsTest < Test::Unit::TestCase
|
|
196
208
|
sex: "male",
|
197
209
|
year: 2007
|
198
210
|
},
|
199
|
-
{
|
200
|
-
species: "Gentoo",
|
201
|
-
island: "Biscoe",
|
202
|
-
bill_length_mm: 46.1,
|
203
|
-
bill_depth_mm: 13.2,
|
204
|
-
flipper_length_mm: 211,
|
205
|
-
body_mass_g: 4500,
|
206
|
-
sex: "female",
|
207
|
-
year: 2007
|
208
|
-
},
|
209
211
|
{
|
210
212
|
species: "Chinstrap",
|
211
213
|
island: "Dream",
|
@@ -217,13 +219,23 @@ class PenguinsTest < Test::Unit::TestCase
|
|
217
219
|
year: 2007
|
218
220
|
},
|
219
221
|
{
|
220
|
-
species: "
|
221
|
-
island: "
|
222
|
-
bill_length_mm:
|
223
|
-
bill_depth_mm:
|
224
|
-
flipper_length_mm:
|
225
|
-
body_mass_g:
|
222
|
+
species: "Gentoo",
|
223
|
+
island: "Biscoe",
|
224
|
+
bill_length_mm: 46.1,
|
225
|
+
bill_depth_mm: 13.2,
|
226
|
+
flipper_length_mm: 211,
|
227
|
+
body_mass_g: 4500,
|
226
228
|
sex: "female",
|
229
|
+
year: 2007
|
230
|
+
},
|
231
|
+
{
|
232
|
+
species: "Gentoo",
|
233
|
+
island: "Biscoe",
|
234
|
+
bill_length_mm: 49.9,
|
235
|
+
bill_depth_mm: 16.1,
|
236
|
+
flipper_length_mm: 213,
|
237
|
+
body_mass_g: 5400,
|
238
|
+
sex: "male",
|
227
239
|
year: 2009
|
228
240
|
}
|
229
241
|
],
|
@@ -231,7 +243,7 @@ class PenguinsTest < Test::Unit::TestCase
|
|
231
243
|
records.size,
|
232
244
|
records[0].to_h,
|
233
245
|
records[152].to_h,
|
234
|
-
records[
|
246
|
+
records[220].to_h,
|
235
247
|
records[-1].to_h,
|
236
248
|
])
|
237
249
|
end
|
data/test/test-rdatasets.rb
ADDED
@@ -0,0 +1,136 @@
|
|
1
|
+
class RdatasetsTest < Test::Unit::TestCase
|
2
|
+
sub_test_case("RdatasetsList") do
|
3
|
+
def setup
|
4
|
+
@dataset = Datasets::RdatasetsList.new
|
5
|
+
end
|
6
|
+
|
7
|
+
sub_test_case("#each") do
|
8
|
+
test("with package_name") do
|
9
|
+
records = @dataset.filter(package: "datasets").to_a
|
10
|
+
assert_equal([
|
11
|
+
84,
|
12
|
+
{
|
13
|
+
package: "datasets",
|
14
|
+
dataset: "ability.cov",
|
15
|
+
title: "Ability and Intelligence Tests",
|
16
|
+
rows: 6,
|
17
|
+
cols: 8,
|
18
|
+
n_binary: 0,
|
19
|
+
n_character: 0,
|
20
|
+
n_factor: 0,
|
21
|
+
n_logical: 0,
|
22
|
+
n_numeric: 8,
|
23
|
+
csv: "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/ability.cov.csv",
|
24
|
+
doc: "https://vincentarelbundock.github.io/Rdatasets/doc/datasets/ability.cov.html"
|
25
|
+
},
|
26
|
+
{
|
27
|
+
package: "datasets",
|
28
|
+
dataset: "WWWusage",
|
29
|
+
title: "Internet Usage per Minute",
|
30
|
+
rows: 100,
|
31
|
+
cols: 2,
|
32
|
+
n_binary: 0,
|
33
|
+
n_character: 0,
|
34
|
+
n_factor: 0,
|
35
|
+
n_logical: 0,
|
36
|
+
n_numeric: 2,
|
37
|
+
csv: "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/WWWusage.csv",
|
38
|
+
doc: "https://vincentarelbundock.github.io/Rdatasets/doc/datasets/WWWusage.html"
|
39
|
+
}
|
40
|
+
],
|
41
|
+
[
|
42
|
+
records.size,
|
43
|
+
records[0].to_h,
|
44
|
+
records[-1].to_h
|
45
|
+
])
|
46
|
+
end
|
47
|
+
|
48
|
+
test("without package_name") do
|
49
|
+
records = @dataset.each.to_a
|
50
|
+
assert_equal([
|
51
|
+
1478,
|
52
|
+
{
|
53
|
+
package: "AER",
|
54
|
+
dataset: "Affairs",
|
55
|
+
title: "Fair's Extramarital Affairs Data",
|
56
|
+
rows: 601,
|
57
|
+
cols: 9,
|
58
|
+
n_binary: 2,
|
59
|
+
n_character: 0,
|
60
|
+
n_factor: 2,
|
61
|
+
n_logical: 0,
|
62
|
+
n_numeric: 7,
|
63
|
+
csv: "https://vincentarelbundock.github.io/Rdatasets/csv/AER/Affairs.csv",
|
64
|
+
doc: "https://vincentarelbundock.github.io/Rdatasets/doc/AER/Affairs.html"
|
65
|
+
},
|
66
|
+
{
|
67
|
+
package: "vcd",
|
68
|
+
dataset: "WomenQueue",
|
69
|
+
title: "Women in Queues",
|
70
|
+
rows: 11,
|
71
|
+
cols: 2,
|
72
|
+
n_binary: 0,
|
73
|
+
n_character: 0,
|
74
|
+
n_factor: 1,
|
75
|
+
n_logical: 0,
|
76
|
+
n_numeric: 1,
|
77
|
+
csv: "https://vincentarelbundock.github.io/Rdatasets/csv/vcd/WomenQueue.csv",
|
78
|
+
doc: "https://vincentarelbundock.github.io/Rdatasets/doc/vcd/WomenQueue.html"
|
79
|
+
},
|
80
|
+
],
|
81
|
+
[
|
82
|
+
records.size,
|
83
|
+
records[0].to_h,
|
84
|
+
records[-1].to_h
|
85
|
+
])
|
86
|
+
end
|
87
|
+
end
|
88
|
+
end
|
89
|
+
|
90
|
+
sub_test_case("Rdatasets") do
|
91
|
+
sub_test_case("datasets") do
|
92
|
+
sub_test_case("AirPassengers") do
|
93
|
+
def setup
|
94
|
+
@dataset = Datasets::Rdatasets.new("datasets", "AirPassengers")
|
95
|
+
end
|
96
|
+
|
97
|
+
test("#each") do
|
98
|
+
records = @dataset.each.to_a
|
99
|
+
assert_equal([
|
100
|
+
144,
|
101
|
+
{ time: 1949, value: 112 },
|
102
|
+
{ time: 1960.91666666667, value: 432 },
|
103
|
+
],
|
104
|
+
[
|
105
|
+
records.size,
|
106
|
+
records[0],
|
107
|
+
records[-1]
|
108
|
+
])
|
109
|
+
end
|
110
|
+
|
111
|
+
test("#metadata.id") do
|
112
|
+
assert_equal("rdatasets-datasets-AirPassengers", @dataset.metadata.id)
|
113
|
+
end
|
114
|
+
|
115
|
+
test("#metadata.description") do
|
116
|
+
description = @dataset.metadata.description
|
117
|
+
assert do
|
118
|
+
description.include?("Monthly Airline Passenger Numbers 1949-1960")
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
122
|
+
|
123
|
+
test("invalid dataset name") do
|
124
|
+
assert_raise(ArgumentError) do
|
125
|
+
Datasets::Rdatasets.new("datasets", "invalid datasets name")
|
126
|
+
end
|
127
|
+
end
|
128
|
+
end
|
129
|
+
|
130
|
+
test("invalid package name") do
|
131
|
+
assert_raise(ArgumentError) do
|
132
|
+
Datasets::Rdatasets.new("invalid package name", "AirPassengers")
|
133
|
+
end
|
134
|
+
end
|
135
|
+
end
|
136
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: red-datasets
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- tomisuker
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2021-04-11 00:00:00.000000000 Z
|
12
|
+
date: 2021-06-03 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: csv
|
@@ -161,7 +161,9 @@ files:
|
|
161
161
|
- lib/datasets/penguins.rb
|
162
162
|
- lib/datasets/penn-treebank.rb
|
163
163
|
- lib/datasets/postal-code-japan.rb
|
164
|
+
- lib/datasets/rdatasets.rb
|
164
165
|
- lib/datasets/table.rb
|
166
|
+
- lib/datasets/tar_gz_readable.rb
|
165
167
|
- lib/datasets/version.rb
|
166
168
|
- lib/datasets/wikipedia.rb
|
167
169
|
- lib/datasets/wine.rb
|
@@ -172,7 +174,9 @@ files:
|
|
172
174
|
- test/test-cifar.rb
|
173
175
|
- test/test-cldr-plurals.rb
|
174
176
|
- test/test-communities.rb
|
177
|
+
- test/test-dataset.rb
|
175
178
|
- test/test-dictionary.rb
|
179
|
+
- test/test-downloader.rb
|
176
180
|
- test/test-e-stat-japan.rb
|
177
181
|
- test/test-fashion-mnist.rb
|
178
182
|
- test/test-hepatitis.rb
|
@@ -184,6 +188,7 @@ files:
|
|
184
188
|
- test/test-penguins.rb
|
185
189
|
- test/test-penn-treebank.rb
|
186
190
|
- test/test-postal-code-japan.rb
|
191
|
+
- test/test-rdatasets.rb
|
187
192
|
- test/test-table.rb
|
188
193
|
- test/test-wikipedia.rb
|
189
194
|
- test/test-wine.rb
|
@@ -217,7 +222,9 @@ test_files:
|
|
217
222
|
- test/test-cifar.rb
|
218
223
|
- test/test-cldr-plurals.rb
|
219
224
|
- test/test-communities.rb
|
225
|
+
- test/test-dataset.rb
|
220
226
|
- test/test-dictionary.rb
|
227
|
+
- test/test-downloader.rb
|
221
228
|
- test/test-e-stat-japan.rb
|
222
229
|
- test/test-fashion-mnist.rb
|
223
230
|
- test/test-hepatitis.rb
|
@@ -229,6 +236,7 @@ test_files:
|
|
229
236
|
- test/test-penguins.rb
|
230
237
|
- test/test-penn-treebank.rb
|
231
238
|
- test/test-postal-code-japan.rb
|
239
|
+
- test/test-rdatasets.rb
|
232
240
|
- test/test-table.rb
|
233
241
|
- test/test-wikipedia.rb
|
234
242
|
- test/test-wine.rb
|