red-datasets 0.1.4 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +23 -2
- data/doc/text/news.md +86 -0
- data/lib/datasets/adult.rb +6 -9
- data/lib/datasets/afinn.rb +48 -0
- data/lib/datasets/aozora-bunko.rb +196 -0
- data/lib/datasets/cache-path.rb +28 -0
- data/lib/datasets/california-housing.rb +60 -0
- data/lib/datasets/cifar.rb +2 -4
- data/lib/datasets/cldr-plurals.rb +2 -4
- data/lib/datasets/communities.rb +5 -8
- data/lib/datasets/dataset.rb +8 -12
- data/lib/datasets/diamonds.rb +26 -0
- data/lib/datasets/downloader.rb +6 -1
- data/lib/datasets/e-stat-japan.rb +2 -1
- data/lib/datasets/fashion-mnist.rb +4 -0
- data/lib/datasets/fuel-economy.rb +35 -0
- data/lib/datasets/geolonia.rb +67 -0
- data/lib/datasets/ggplot2-dataset.rb +79 -0
- data/lib/datasets/hepatitis.rb +5 -8
- data/lib/datasets/iris.rb +5 -8
- data/lib/datasets/ita-corpus.rb +57 -0
- data/lib/datasets/kuzushiji-mnist.rb +16 -0
- data/lib/datasets/libsvm-dataset-list.rb +5 -8
- data/lib/datasets/libsvm.rb +3 -4
- data/lib/datasets/license.rb +26 -0
- data/lib/datasets/livedoor-news.rb +80 -0
- data/lib/datasets/metadata.rb +14 -0
- data/lib/datasets/mnist.rb +7 -7
- data/lib/datasets/mushroom.rb +5 -8
- data/lib/datasets/penguins.rb +4 -8
- data/lib/datasets/penn-treebank.rb +2 -4
- data/lib/datasets/pmjt-dataset-list.rb +67 -0
- data/lib/datasets/postal-code-japan.rb +2 -6
- data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
- data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
- data/lib/datasets/seaborn.rb +90 -0
- data/lib/datasets/sudachi-synonym-dictionary.rb +5 -11
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
- data/lib/datasets/wikipedia.rb +4 -5
- data/lib/datasets/wine.rb +6 -9
- data/lib/datasets/zip-extractor.rb +36 -0
- data/lib/datasets.rb +14 -2
- data/red-datasets.gemspec +1 -1
- data/test/helper.rb +21 -0
- data/test/test-afinn.rb +60 -0
- data/test/test-aozora-bunko.rb +190 -0
- data/test/test-california-housing.rb +56 -0
- data/test/test-cldr-plurals.rb +1 -1
- data/test/test-dataset.rb +15 -7
- data/test/test-diamonds.rb +71 -0
- data/test/test-fuel-economy.rb +75 -0
- data/test/test-geolonia.rb +64 -0
- data/test/test-ita-corpus.rb +69 -0
- data/test/test-kuzushiji-mnist.rb +137 -0
- data/test/test-license.rb +24 -0
- data/test/test-livedoor-news.rb +351 -0
- data/test/test-metadata.rb +36 -0
- data/test/test-penguins.rb +1 -1
- data/test/test-pmjt-dataset-list.rb +50 -0
- data/test/test-quora-duplicate-question-pair.rb +33 -0
- data/test/test-rdataset.rb +246 -0
- data/test/{test-seaborn-data.rb → test-seaborn.rb} +70 -4
- data/test/test-sudachi-synonym-dictionary.rb +5 -5
- data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
- metadata +58 -14
- data/lib/datasets/seaborn-data.rb +0 -49
- data/test/test-rdatasets.rb +0 -136
metadata
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: red-datasets
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.4
|
4
|
+
version: 0.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- tomisuker
|
8
8
|
- Kouhei Sutou
|
9
|
-
autorequire:
|
9
|
+
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2022-09-23 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: csv
|
@@ -17,14 +17,14 @@ dependencies:
|
|
17
17
|
requirements:
|
18
18
|
- - ">="
|
19
19
|
- !ruby/object:Gem::Version
|
20
|
-
version: 3.
|
20
|
+
version: 3.2.4
|
21
21
|
type: :runtime
|
22
22
|
prerelease: false
|
23
23
|
version_requirements: !ruby/object:Gem::Requirement
|
24
24
|
requirements:
|
25
25
|
- - ">="
|
26
26
|
- !ruby/object:Gem::Version
|
27
|
-
version: 3.
|
27
|
+
version: 3.2.4
|
28
28
|
- !ruby/object:Gem::Dependency
|
29
29
|
name: rexml
|
30
30
|
requirement: !ruby/object:Gem::Requirement
|
@@ -142,65 +142,95 @@ files:
|
|
142
142
|
- doc/text/news.md
|
143
143
|
- lib/datasets.rb
|
144
144
|
- lib/datasets/adult.rb
|
145
|
+
- lib/datasets/afinn.rb
|
146
|
+
- lib/datasets/aozora-bunko.rb
|
147
|
+
- lib/datasets/cache-path.rb
|
148
|
+
- lib/datasets/california-housing.rb
|
145
149
|
- lib/datasets/cifar.rb
|
146
150
|
- lib/datasets/cldr-plurals.rb
|
147
151
|
- lib/datasets/communities.rb
|
148
152
|
- lib/datasets/dataset.rb
|
153
|
+
- lib/datasets/diamonds.rb
|
149
154
|
- lib/datasets/dictionary.rb
|
150
155
|
- lib/datasets/downloader.rb
|
151
156
|
- lib/datasets/e-stat-japan.rb
|
152
157
|
- lib/datasets/error.rb
|
153
158
|
- lib/datasets/fashion-mnist.rb
|
159
|
+
- lib/datasets/fuel-economy.rb
|
160
|
+
- lib/datasets/geolonia.rb
|
161
|
+
- lib/datasets/ggplot2-dataset.rb
|
154
162
|
- lib/datasets/hepatitis.rb
|
155
163
|
- lib/datasets/iris.rb
|
164
|
+
- lib/datasets/ita-corpus.rb
|
165
|
+
- lib/datasets/kuzushiji-mnist.rb
|
156
166
|
- lib/datasets/libsvm-dataset-list.rb
|
157
167
|
- lib/datasets/libsvm.rb
|
168
|
+
- lib/datasets/license.rb
|
169
|
+
- lib/datasets/livedoor-news.rb
|
158
170
|
- lib/datasets/metadata.rb
|
159
171
|
- lib/datasets/mnist.rb
|
160
172
|
- lib/datasets/mushroom.rb
|
161
173
|
- lib/datasets/penguins.rb
|
162
174
|
- lib/datasets/penn-treebank.rb
|
175
|
+
- lib/datasets/pmjt-dataset-list.rb
|
163
176
|
- lib/datasets/postal-code-japan.rb
|
164
|
-
- lib/datasets/rdatasets.rb
|
165
|
-
- lib/datasets/seaborn-data.rb
|
177
|
+
- lib/datasets/quora-duplicate-question-pair.rb
|
178
|
+
- lib/datasets/rdataset.rb
|
179
|
+
- lib/datasets/seaborn.rb
|
166
180
|
- lib/datasets/sudachi-synonym-dictionary.rb
|
167
181
|
- lib/datasets/table.rb
|
168
182
|
- lib/datasets/tar-gz-readable.rb
|
169
183
|
- lib/datasets/version.rb
|
184
|
+
- lib/datasets/wikipedia-kyoto-japanese-english.rb
|
170
185
|
- lib/datasets/wikipedia.rb
|
171
186
|
- lib/datasets/wine.rb
|
187
|
+
- lib/datasets/zip-extractor.rb
|
172
188
|
- red-datasets.gemspec
|
173
189
|
- test/helper.rb
|
174
190
|
- test/run-test.rb
|
175
191
|
- test/test-adult.rb
|
192
|
+
- test/test-afinn.rb
|
193
|
+
- test/test-aozora-bunko.rb
|
194
|
+
- test/test-california-housing.rb
|
176
195
|
- test/test-cifar.rb
|
177
196
|
- test/test-cldr-plurals.rb
|
178
197
|
- test/test-communities.rb
|
179
198
|
- test/test-dataset.rb
|
199
|
+
- test/test-diamonds.rb
|
180
200
|
- test/test-dictionary.rb
|
181
201
|
- test/test-downloader.rb
|
182
202
|
- test/test-e-stat-japan.rb
|
183
203
|
- test/test-fashion-mnist.rb
|
204
|
+
- test/test-fuel-economy.rb
|
205
|
+
- test/test-geolonia.rb
|
184
206
|
- test/test-hepatitis.rb
|
185
207
|
- test/test-iris.rb
|
208
|
+
- test/test-ita-corpus.rb
|
209
|
+
- test/test-kuzushiji-mnist.rb
|
186
210
|
- test/test-libsvm-dataset-list.rb
|
187
211
|
- test/test-libsvm.rb
|
212
|
+
- test/test-license.rb
|
213
|
+
- test/test-livedoor-news.rb
|
214
|
+
- test/test-metadata.rb
|
188
215
|
- test/test-mnist.rb
|
189
216
|
- test/test-mushroom.rb
|
190
217
|
- test/test-penguins.rb
|
191
218
|
- test/test-penn-treebank.rb
|
219
|
+
- test/test-pmjt-dataset-list.rb
|
192
220
|
- test/test-postal-code-japan.rb
|
193
|
-
- test/test-rdatasets.rb
|
194
|
-
- test/test-seaborn-data.rb
|
221
|
+
- test/test-quora-duplicate-question-pair.rb
|
222
|
+
- test/test-rdataset.rb
|
223
|
+
- test/test-seaborn.rb
|
195
224
|
- test/test-sudachi-synonym-dictionary.rb
|
196
225
|
- test/test-table.rb
|
226
|
+
- test/test-wikipedia-kyoto-japanese-english.rb
|
197
227
|
- test/test-wikipedia.rb
|
198
228
|
- test/test-wine.rb
|
199
229
|
homepage: https://github.com/red-data-tools/red-datasets
|
200
230
|
licenses:
|
201
231
|
- MIT
|
202
232
|
metadata: {}
|
203
|
-
post_install_message:
|
233
|
+
post_install_message:
|
204
234
|
rdoc_options: []
|
205
235
|
require_paths:
|
206
236
|
- lib
|
@@ -215,34 +245,48 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
215
245
|
- !ruby/object:Gem::Version
|
216
246
|
version: '0'
|
217
247
|
requirements: []
|
218
|
-
rubygems_version: 3.
|
219
|
-
signing_key:
|
248
|
+
rubygems_version: 3.2.32
|
249
|
+
signing_key:
|
220
250
|
specification_version: 4
|
221
251
|
summary: Red Datasets provides classes that provide common datasets such as iris dataset.
|
222
252
|
test_files:
|
223
253
|
- test/helper.rb
|
224
254
|
- test/run-test.rb
|
225
255
|
- test/test-adult.rb
|
256
|
+
- test/test-afinn.rb
|
257
|
+
- test/test-aozora-bunko.rb
|
258
|
+
- test/test-california-housing.rb
|
226
259
|
- test/test-cifar.rb
|
227
260
|
- test/test-cldr-plurals.rb
|
228
261
|
- test/test-communities.rb
|
229
262
|
- test/test-dataset.rb
|
263
|
+
- test/test-diamonds.rb
|
230
264
|
- test/test-dictionary.rb
|
231
265
|
- test/test-downloader.rb
|
232
266
|
- test/test-e-stat-japan.rb
|
233
267
|
- test/test-fashion-mnist.rb
|
268
|
+
- test/test-fuel-economy.rb
|
269
|
+
- test/test-geolonia.rb
|
234
270
|
- test/test-hepatitis.rb
|
235
271
|
- test/test-iris.rb
|
272
|
+
- test/test-ita-corpus.rb
|
273
|
+
- test/test-kuzushiji-mnist.rb
|
236
274
|
- test/test-libsvm-dataset-list.rb
|
237
275
|
- test/test-libsvm.rb
|
276
|
+
- test/test-license.rb
|
277
|
+
- test/test-livedoor-news.rb
|
278
|
+
- test/test-metadata.rb
|
238
279
|
- test/test-mnist.rb
|
239
280
|
- test/test-mushroom.rb
|
240
281
|
- test/test-penguins.rb
|
241
282
|
- test/test-penn-treebank.rb
|
283
|
+
- test/test-pmjt-dataset-list.rb
|
242
284
|
- test/test-postal-code-japan.rb
|
243
|
-
- test/test-rdatasets.rb
|
244
|
-
- test/test-seaborn-data.rb
|
285
|
+
- test/test-quora-duplicate-question-pair.rb
|
286
|
+
- test/test-rdataset.rb
|
287
|
+
- test/test-seaborn.rb
|
245
288
|
- test/test-sudachi-synonym-dictionary.rb
|
246
289
|
- test/test-table.rb
|
290
|
+
- test/test-wikipedia-kyoto-japanese-english.rb
|
247
291
|
- test/test-wikipedia.rb
|
248
292
|
- test/test-wine.rb
|
data/lib/datasets/seaborn-data.rb
DELETED
@@ -1,49 +0,0 @@
|
|
1
|
-
module Datasets
|
2
|
-
class SeabornData < Dataset
|
3
|
-
URL_FORMAT = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/%{name}.csv".freeze
|
4
|
-
|
5
|
-
def initialize(name)
|
6
|
-
super()
|
7
|
-
@metadata.id = "seaborn-data-#{name}"
|
8
|
-
@metadata.name = "SeabornData: #{name}"
|
9
|
-
@metadata.url = URL_FORMAT % {name: name}
|
10
|
-
|
11
|
-
@data_path = cache_dir_path + (name + ".csv")
|
12
|
-
@name = name
|
13
|
-
end
|
14
|
-
|
15
|
-
def each(&block)
|
16
|
-
return to_enum(__method__) unless block_given?
|
17
|
-
|
18
|
-
download(@data_path, @metadata.url) unless @data_path.exist?
|
19
|
-
CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
|
20
|
-
csv.each do |row|
|
21
|
-
record = prepare_record(row)
|
22
|
-
yield record
|
23
|
-
end
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
|
-
private
|
28
|
-
def prepare_record(csv_row)
|
29
|
-
record = csv_row.to_h
|
30
|
-
record.transform_keys!(&:to_sym)
|
31
|
-
|
32
|
-
# Perform the same preprocessing as seaborn's load_dataset function
|
33
|
-
preprocessor = :"preprocess_#{@name}_record"
|
34
|
-
__send__(preprocessor, record) if respond_to?(preprocessor, true)
|
35
|
-
|
36
|
-
record
|
37
|
-
end
|
38
|
-
|
39
|
-
# The same preprocessing as seaborn.load_dataset
|
40
|
-
def preprocess_flights_record(record)
|
41
|
-
record[:month] &&= record[:month][0,3]
|
42
|
-
end
|
43
|
-
|
44
|
-
# The same preprocessing as seaborn.load_dataset
|
45
|
-
def preprocess_penguins_record(record)
|
46
|
-
record[:sex] &&= record[:sex].capitalize
|
47
|
-
end
|
48
|
-
end
|
49
|
-
end
|
data/test/test-rdatasets.rb
DELETED
@@ -1,136 +0,0 @@
|
|
1
|
-
class RdatasetsTest < Test::Unit::TestCase
|
2
|
-
sub_test_case("RdatasetsList") do
|
3
|
-
def setup
|
4
|
-
@dataset = Datasets::RdatasetsList.new
|
5
|
-
end
|
6
|
-
|
7
|
-
sub_test_case("#each") do
|
8
|
-
test("with package_name") do
|
9
|
-
records = @dataset.filter(package: "datasets").to_a
|
10
|
-
assert_equal([
|
11
|
-
84,
|
12
|
-
{
|
13
|
-
package: "datasets",
|
14
|
-
dataset: "ability.cov",
|
15
|
-
title: "Ability and Intelligence Tests",
|
16
|
-
rows: 6,
|
17
|
-
cols: 8,
|
18
|
-
n_binary: 0,
|
19
|
-
n_character: 0,
|
20
|
-
n_factor: 0,
|
21
|
-
n_logical: 0,
|
22
|
-
n_numeric: 8,
|
23
|
-
csv: "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/ability.cov.csv",
|
24
|
-
doc: "https://vincentarelbundock.github.io/Rdatasets/doc/datasets/ability.cov.html"
|
25
|
-
},
|
26
|
-
{
|
27
|
-
package: "datasets",
|
28
|
-
dataset: "WWWusage",
|
29
|
-
title: "Internet Usage per Minute",
|
30
|
-
rows: 100,
|
31
|
-
cols: 2,
|
32
|
-
n_binary: 0,
|
33
|
-
n_character: 0,
|
34
|
-
n_factor: 0,
|
35
|
-
n_logical: 0,
|
36
|
-
n_numeric: 2,
|
37
|
-
csv: "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/WWWusage.csv",
|
38
|
-
doc: "https://vincentarelbundock.github.io/Rdatasets/doc/datasets/WWWusage.html"
|
39
|
-
}
|
40
|
-
],
|
41
|
-
[
|
42
|
-
records.size,
|
43
|
-
records[0].to_h,
|
44
|
-
records[-1].to_h
|
45
|
-
])
|
46
|
-
end
|
47
|
-
|
48
|
-
test("without package_name") do
|
49
|
-
records = @dataset.each.to_a
|
50
|
-
assert_equal([
|
51
|
-
1714,
|
52
|
-
{
|
53
|
-
package: "AER",
|
54
|
-
dataset: "Affairs",
|
55
|
-
title: "Fair's Extramarital Affairs Data",
|
56
|
-
rows: 601,
|
57
|
-
cols: 9,
|
58
|
-
n_binary: 2,
|
59
|
-
n_character: 0,
|
60
|
-
n_factor: 2,
|
61
|
-
n_logical: 0,
|
62
|
-
n_numeric: 7,
|
63
|
-
csv: "https://vincentarelbundock.github.io/Rdatasets/csv/AER/Affairs.csv",
|
64
|
-
doc: "https://vincentarelbundock.github.io/Rdatasets/doc/AER/Affairs.html"
|
65
|
-
},
|
66
|
-
{
|
67
|
-
package: "vcd",
|
68
|
-
dataset: "WomenQueue",
|
69
|
-
title: "Women in Queues",
|
70
|
-
rows: 11,
|
71
|
-
cols: 2,
|
72
|
-
n_binary: 0,
|
73
|
-
n_character: 0,
|
74
|
-
n_factor: 1,
|
75
|
-
n_logical: 0,
|
76
|
-
n_numeric: 1,
|
77
|
-
csv: "https://vincentarelbundock.github.io/Rdatasets/csv/vcd/WomenQueue.csv",
|
78
|
-
doc: "https://vincentarelbundock.github.io/Rdatasets/doc/vcd/WomenQueue.html"
|
79
|
-
},
|
80
|
-
],
|
81
|
-
[
|
82
|
-
records.size,
|
83
|
-
records[0].to_h,
|
84
|
-
records[-1].to_h
|
85
|
-
])
|
86
|
-
end
|
87
|
-
end
|
88
|
-
end
|
89
|
-
|
90
|
-
sub_test_case("Rdatasets") do
|
91
|
-
sub_test_case("datasets") do
|
92
|
-
sub_test_case("AirPassengers") do
|
93
|
-
def setup
|
94
|
-
@dataset = Datasets::Rdatasets.new("datasets", "AirPassengers")
|
95
|
-
end
|
96
|
-
|
97
|
-
test("#each") do
|
98
|
-
records = @dataset.each.to_a
|
99
|
-
assert_equal([
|
100
|
-
144,
|
101
|
-
{ time: 1949, value: 112 },
|
102
|
-
{ time: 1960.91666666667, value: 432 },
|
103
|
-
],
|
104
|
-
[
|
105
|
-
records.size,
|
106
|
-
records[0],
|
107
|
-
records[-1]
|
108
|
-
])
|
109
|
-
end
|
110
|
-
|
111
|
-
test("#metadata.id") do
|
112
|
-
assert_equal("rdatasets-datasets-AirPassengers", @dataset.metadata.id)
|
113
|
-
end
|
114
|
-
|
115
|
-
test("#metadata.description") do
|
116
|
-
description = @dataset.metadata.description
|
117
|
-
assert do
|
118
|
-
description.include?("Monthly Airline Passenger Numbers 1949-1960")
|
119
|
-
end
|
120
|
-
end
|
121
|
-
end
|
122
|
-
|
123
|
-
test("invalid dataset name") do
|
124
|
-
assert_raise(ArgumentError) do
|
125
|
-
Datasets::Rdatasets.new("datasets", "invalid datasets name")
|
126
|
-
end
|
127
|
-
end
|
128
|
-
end
|
129
|
-
|
130
|
-
test("invalid package name") do
|
131
|
-
assert_raise(ArgumentError) do
|
132
|
-
Datasets::Rdatasets.new("invalid package name", "AirPassengers")
|
133
|
-
end
|
134
|
-
end
|
135
|
-
end
|
136
|
-
end
|