red-datasets 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +23 -2
- data/doc/text/news.md +86 -0
- data/lib/datasets/adult.rb +6 -9
- data/lib/datasets/afinn.rb +48 -0
- data/lib/datasets/aozora-bunko.rb +196 -0
- data/lib/datasets/cache-path.rb +28 -0
- data/lib/datasets/california-housing.rb +60 -0
- data/lib/datasets/cifar.rb +2 -4
- data/lib/datasets/cldr-plurals.rb +2 -4
- data/lib/datasets/communities.rb +5 -8
- data/lib/datasets/dataset.rb +8 -12
- data/lib/datasets/diamonds.rb +26 -0
- data/lib/datasets/downloader.rb +6 -1
- data/lib/datasets/e-stat-japan.rb +2 -1
- data/lib/datasets/fashion-mnist.rb +4 -0
- data/lib/datasets/fuel-economy.rb +35 -0
- data/lib/datasets/geolonia.rb +67 -0
- data/lib/datasets/ggplot2-dataset.rb +79 -0
- data/lib/datasets/hepatitis.rb +5 -8
- data/lib/datasets/iris.rb +5 -8
- data/lib/datasets/ita-corpus.rb +57 -0
- data/lib/datasets/kuzushiji-mnist.rb +16 -0
- data/lib/datasets/libsvm-dataset-list.rb +5 -8
- data/lib/datasets/libsvm.rb +3 -4
- data/lib/datasets/license.rb +26 -0
- data/lib/datasets/livedoor-news.rb +80 -0
- data/lib/datasets/metadata.rb +14 -0
- data/lib/datasets/mnist.rb +7 -7
- data/lib/datasets/mushroom.rb +5 -8
- data/lib/datasets/penguins.rb +4 -8
- data/lib/datasets/penn-treebank.rb +2 -4
- data/lib/datasets/pmjt-dataset-list.rb +67 -0
- data/lib/datasets/postal-code-japan.rb +2 -6
- data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
- data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
- data/lib/datasets/seaborn.rb +90 -0
- data/lib/datasets/sudachi-synonym-dictionary.rb +5 -11
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
- data/lib/datasets/wikipedia.rb +4 -5
- data/lib/datasets/wine.rb +6 -9
- data/lib/datasets/zip-extractor.rb +36 -0
- data/lib/datasets.rb +14 -2
- data/red-datasets.gemspec +1 -1
- data/test/helper.rb +21 -0
- data/test/test-afinn.rb +60 -0
- data/test/test-aozora-bunko.rb +190 -0
- data/test/test-california-housing.rb +56 -0
- data/test/test-cldr-plurals.rb +1 -1
- data/test/test-dataset.rb +15 -7
- data/test/test-diamonds.rb +71 -0
- data/test/test-fuel-economy.rb +75 -0
- data/test/test-geolonia.rb +64 -0
- data/test/test-ita-corpus.rb +69 -0
- data/test/test-kuzushiji-mnist.rb +137 -0
- data/test/test-license.rb +24 -0
- data/test/test-livedoor-news.rb +351 -0
- data/test/test-metadata.rb +36 -0
- data/test/test-penguins.rb +1 -1
- data/test/test-pmjt-dataset-list.rb +50 -0
- data/test/test-quora-duplicate-question-pair.rb +33 -0
- data/test/test-rdataset.rb +246 -0
- data/test/{test-seaborn-data.rb → test-seaborn.rb} +70 -4
- data/test/test-sudachi-synonym-dictionary.rb +5 -5
- data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
- metadata +58 -14
- data/lib/datasets/seaborn-data.rb +0 -49
- data/test/test-rdatasets.rb +0 -136
metadata
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: red-datasets
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.4
|
4
|
+
version: 0.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- tomisuker
|
8
8
|
- Kouhei Sutou
|
9
|
-
autorequire:
|
9
|
+
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2022-09-23 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: csv
|
@@ -17,14 +17,14 @@ dependencies:
|
|
17
17
|
requirements:
|
18
18
|
- - ">="
|
19
19
|
- !ruby/object:Gem::Version
|
20
|
-
version: 3.
|
20
|
+
version: 3.2.4
|
21
21
|
type: :runtime
|
22
22
|
prerelease: false
|
23
23
|
version_requirements: !ruby/object:Gem::Requirement
|
24
24
|
requirements:
|
25
25
|
- - ">="
|
26
26
|
- !ruby/object:Gem::Version
|
27
|
-
version: 3.
|
27
|
+
version: 3.2.4
|
28
28
|
- !ruby/object:Gem::Dependency
|
29
29
|
name: rexml
|
30
30
|
requirement: !ruby/object:Gem::Requirement
|
@@ -142,65 +142,95 @@ files:
|
|
142
142
|
- doc/text/news.md
|
143
143
|
- lib/datasets.rb
|
144
144
|
- lib/datasets/adult.rb
|
145
|
+
- lib/datasets/afinn.rb
|
146
|
+
- lib/datasets/aozora-bunko.rb
|
147
|
+
- lib/datasets/cache-path.rb
|
148
|
+
- lib/datasets/california-housing.rb
|
145
149
|
- lib/datasets/cifar.rb
|
146
150
|
- lib/datasets/cldr-plurals.rb
|
147
151
|
- lib/datasets/communities.rb
|
148
152
|
- lib/datasets/dataset.rb
|
153
|
+
- lib/datasets/diamonds.rb
|
149
154
|
- lib/datasets/dictionary.rb
|
150
155
|
- lib/datasets/downloader.rb
|
151
156
|
- lib/datasets/e-stat-japan.rb
|
152
157
|
- lib/datasets/error.rb
|
153
158
|
- lib/datasets/fashion-mnist.rb
|
159
|
+
- lib/datasets/fuel-economy.rb
|
160
|
+
- lib/datasets/geolonia.rb
|
161
|
+
- lib/datasets/ggplot2-dataset.rb
|
154
162
|
- lib/datasets/hepatitis.rb
|
155
163
|
- lib/datasets/iris.rb
|
164
|
+
- lib/datasets/ita-corpus.rb
|
165
|
+
- lib/datasets/kuzushiji-mnist.rb
|
156
166
|
- lib/datasets/libsvm-dataset-list.rb
|
157
167
|
- lib/datasets/libsvm.rb
|
168
|
+
- lib/datasets/license.rb
|
169
|
+
- lib/datasets/livedoor-news.rb
|
158
170
|
- lib/datasets/metadata.rb
|
159
171
|
- lib/datasets/mnist.rb
|
160
172
|
- lib/datasets/mushroom.rb
|
161
173
|
- lib/datasets/penguins.rb
|
162
174
|
- lib/datasets/penn-treebank.rb
|
175
|
+
- lib/datasets/pmjt-dataset-list.rb
|
163
176
|
- lib/datasets/postal-code-japan.rb
|
164
|
-
- lib/datasets/rdatasets.rb
|
165
|
-
- lib/datasets/seaborn-data.rb
|
177
|
+
- lib/datasets/quora-duplicate-question-pair.rb
|
178
|
+
- lib/datasets/rdataset.rb
|
179
|
+
- lib/datasets/seaborn.rb
|
166
180
|
- lib/datasets/sudachi-synonym-dictionary.rb
|
167
181
|
- lib/datasets/table.rb
|
168
182
|
- lib/datasets/tar-gz-readable.rb
|
169
183
|
- lib/datasets/version.rb
|
184
|
+
- lib/datasets/wikipedia-kyoto-japanese-english.rb
|
170
185
|
- lib/datasets/wikipedia.rb
|
171
186
|
- lib/datasets/wine.rb
|
187
|
+
- lib/datasets/zip-extractor.rb
|
172
188
|
- red-datasets.gemspec
|
173
189
|
- test/helper.rb
|
174
190
|
- test/run-test.rb
|
175
191
|
- test/test-adult.rb
|
192
|
+
- test/test-afinn.rb
|
193
|
+
- test/test-aozora-bunko.rb
|
194
|
+
- test/test-california-housing.rb
|
176
195
|
- test/test-cifar.rb
|
177
196
|
- test/test-cldr-plurals.rb
|
178
197
|
- test/test-communities.rb
|
179
198
|
- test/test-dataset.rb
|
199
|
+
- test/test-diamonds.rb
|
180
200
|
- test/test-dictionary.rb
|
181
201
|
- test/test-downloader.rb
|
182
202
|
- test/test-e-stat-japan.rb
|
183
203
|
- test/test-fashion-mnist.rb
|
204
|
+
- test/test-fuel-economy.rb
|
205
|
+
- test/test-geolonia.rb
|
184
206
|
- test/test-hepatitis.rb
|
185
207
|
- test/test-iris.rb
|
208
|
+
- test/test-ita-corpus.rb
|
209
|
+
- test/test-kuzushiji-mnist.rb
|
186
210
|
- test/test-libsvm-dataset-list.rb
|
187
211
|
- test/test-libsvm.rb
|
212
|
+
- test/test-license.rb
|
213
|
+
- test/test-livedoor-news.rb
|
214
|
+
- test/test-metadata.rb
|
188
215
|
- test/test-mnist.rb
|
189
216
|
- test/test-mushroom.rb
|
190
217
|
- test/test-penguins.rb
|
191
218
|
- test/test-penn-treebank.rb
|
219
|
+
- test/test-pmjt-dataset-list.rb
|
192
220
|
- test/test-postal-code-japan.rb
|
193
|
-
- test/test-rdatasets.rb
|
194
|
-
- test/test-seaborn-data.rb
|
221
|
+
- test/test-quora-duplicate-question-pair.rb
|
222
|
+
- test/test-rdataset.rb
|
223
|
+
- test/test-seaborn.rb
|
195
224
|
- test/test-sudachi-synonym-dictionary.rb
|
196
225
|
- test/test-table.rb
|
226
|
+
- test/test-wikipedia-kyoto-japanese-english.rb
|
197
227
|
- test/test-wikipedia.rb
|
198
228
|
- test/test-wine.rb
|
199
229
|
homepage: https://github.com/red-data-tools/red-datasets
|
200
230
|
licenses:
|
201
231
|
- MIT
|
202
232
|
metadata: {}
|
203
|
-
post_install_message:
|
233
|
+
post_install_message:
|
204
234
|
rdoc_options: []
|
205
235
|
require_paths:
|
206
236
|
- lib
|
@@ -215,34 +245,48 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
215
245
|
- !ruby/object:Gem::Version
|
216
246
|
version: '0'
|
217
247
|
requirements: []
|
218
|
-
rubygems_version: 3.
|
219
|
-
signing_key:
|
248
|
+
rubygems_version: 3.2.32
|
249
|
+
signing_key:
|
220
250
|
specification_version: 4
|
221
251
|
summary: Red Datasets provides classes that provide common datasets such as iris dataset.
|
222
252
|
test_files:
|
223
253
|
- test/helper.rb
|
224
254
|
- test/run-test.rb
|
225
255
|
- test/test-adult.rb
|
256
|
+
- test/test-afinn.rb
|
257
|
+
- test/test-aozora-bunko.rb
|
258
|
+
- test/test-california-housing.rb
|
226
259
|
- test/test-cifar.rb
|
227
260
|
- test/test-cldr-plurals.rb
|
228
261
|
- test/test-communities.rb
|
229
262
|
- test/test-dataset.rb
|
263
|
+
- test/test-diamonds.rb
|
230
264
|
- test/test-dictionary.rb
|
231
265
|
- test/test-downloader.rb
|
232
266
|
- test/test-e-stat-japan.rb
|
233
267
|
- test/test-fashion-mnist.rb
|
268
|
+
- test/test-fuel-economy.rb
|
269
|
+
- test/test-geolonia.rb
|
234
270
|
- test/test-hepatitis.rb
|
235
271
|
- test/test-iris.rb
|
272
|
+
- test/test-ita-corpus.rb
|
273
|
+
- test/test-kuzushiji-mnist.rb
|
236
274
|
- test/test-libsvm-dataset-list.rb
|
237
275
|
- test/test-libsvm.rb
|
276
|
+
- test/test-license.rb
|
277
|
+
- test/test-livedoor-news.rb
|
278
|
+
- test/test-metadata.rb
|
238
279
|
- test/test-mnist.rb
|
239
280
|
- test/test-mushroom.rb
|
240
281
|
- test/test-penguins.rb
|
241
282
|
- test/test-penn-treebank.rb
|
283
|
+
- test/test-pmjt-dataset-list.rb
|
242
284
|
- test/test-postal-code-japan.rb
|
243
|
-
- test/test-rdatasets.rb
|
244
|
-
- test/test-seaborn-data.rb
|
285
|
+
- test/test-quora-duplicate-question-pair.rb
|
286
|
+
- test/test-rdataset.rb
|
287
|
+
- test/test-seaborn.rb
|
245
288
|
- test/test-sudachi-synonym-dictionary.rb
|
246
289
|
- test/test-table.rb
|
290
|
+
- test/test-wikipedia-kyoto-japanese-english.rb
|
247
291
|
- test/test-wikipedia.rb
|
248
292
|
- test/test-wine.rb
|
data/lib/datasets/seaborn-data.rb
DELETED
@@ -1,49 +0,0 @@
|
|
1
|
-
module Datasets
|
2
|
-
class SeabornData < Dataset
|
3
|
-
URL_FORMAT = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/%{name}.csv".freeze
|
4
|
-
|
5
|
-
def initialize(name)
|
6
|
-
super()
|
7
|
-
@metadata.id = "seaborn-data-#{name}"
|
8
|
-
@metadata.name = "SeabornData: #{name}"
|
9
|
-
@metadata.url = URL_FORMAT % {name: name}
|
10
|
-
|
11
|
-
@data_path = cache_dir_path + (name + ".csv")
|
12
|
-
@name = name
|
13
|
-
end
|
14
|
-
|
15
|
-
def each(&block)
|
16
|
-
return to_enum(__method__) unless block_given?
|
17
|
-
|
18
|
-
download(@data_path, @metadata.url) unless @data_path.exist?
|
19
|
-
CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
|
20
|
-
csv.each do |row|
|
21
|
-
record = prepare_record(row)
|
22
|
-
yield record
|
23
|
-
end
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
|
-
private
|
28
|
-
def prepare_record(csv_row)
|
29
|
-
record = csv_row.to_h
|
30
|
-
record.transform_keys!(&:to_sym)
|
31
|
-
|
32
|
-
# Perform the same preprocessing as seaborn's load_dataset function
|
33
|
-
preprocessor = :"preprocess_#{@name}_record"
|
34
|
-
__send__(preprocessor, record) if respond_to?(preprocessor, true)
|
35
|
-
|
36
|
-
record
|
37
|
-
end
|
38
|
-
|
39
|
-
# The same preprocessing as seaborn.load_dataset
|
40
|
-
def preprocess_flights_record(record)
|
41
|
-
record[:month] &&= record[:month][0,3]
|
42
|
-
end
|
43
|
-
|
44
|
-
# The same preprocessing as seaborn.load_dataset
|
45
|
-
def preprocess_penguins_record(record)
|
46
|
-
record[:sex] &&= record[:sex].capitalize
|
47
|
-
end
|
48
|
-
end
|
49
|
-
end
|
data/test/test-rdatasets.rb
DELETED
@@ -1,136 +0,0 @@
|
|
1
|
-
class RdatasetsTest < Test::Unit::TestCase
|
2
|
-
sub_test_case("RdatasetsList") do
|
3
|
-
def setup
|
4
|
-
@dataset = Datasets::RdatasetsList.new
|
5
|
-
end
|
6
|
-
|
7
|
-
sub_test_case("#each") do
|
8
|
-
test("with package_name") do
|
9
|
-
records = @dataset.filter(package: "datasets").to_a
|
10
|
-
assert_equal([
|
11
|
-
84,
|
12
|
-
{
|
13
|
-
package: "datasets",
|
14
|
-
dataset: "ability.cov",
|
15
|
-
title: "Ability and Intelligence Tests",
|
16
|
-
rows: 6,
|
17
|
-
cols: 8,
|
18
|
-
n_binary: 0,
|
19
|
-
n_character: 0,
|
20
|
-
n_factor: 0,
|
21
|
-
n_logical: 0,
|
22
|
-
n_numeric: 8,
|
23
|
-
csv: "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/ability.cov.csv",
|
24
|
-
doc: "https://vincentarelbundock.github.io/Rdatasets/doc/datasets/ability.cov.html"
|
25
|
-
},
|
26
|
-
{
|
27
|
-
package: "datasets",
|
28
|
-
dataset: "WWWusage",
|
29
|
-
title: "Internet Usage per Minute",
|
30
|
-
rows: 100,
|
31
|
-
cols: 2,
|
32
|
-
n_binary: 0,
|
33
|
-
n_character: 0,
|
34
|
-
n_factor: 0,
|
35
|
-
n_logical: 0,
|
36
|
-
n_numeric: 2,
|
37
|
-
csv: "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/WWWusage.csv",
|
38
|
-
doc: "https://vincentarelbundock.github.io/Rdatasets/doc/datasets/WWWusage.html"
|
39
|
-
}
|
40
|
-
],
|
41
|
-
[
|
42
|
-
records.size,
|
43
|
-
records[0].to_h,
|
44
|
-
records[-1].to_h
|
45
|
-
])
|
46
|
-
end
|
47
|
-
|
48
|
-
test("without package_name") do
|
49
|
-
records = @dataset.each.to_a
|
50
|
-
assert_equal([
|
51
|
-
1714,
|
52
|
-
{
|
53
|
-
package: "AER",
|
54
|
-
dataset: "Affairs",
|
55
|
-
title: "Fair's Extramarital Affairs Data",
|
56
|
-
rows: 601,
|
57
|
-
cols: 9,
|
58
|
-
n_binary: 2,
|
59
|
-
n_character: 0,
|
60
|
-
n_factor: 2,
|
61
|
-
n_logical: 0,
|
62
|
-
n_numeric: 7,
|
63
|
-
csv: "https://vincentarelbundock.github.io/Rdatasets/csv/AER/Affairs.csv",
|
64
|
-
doc: "https://vincentarelbundock.github.io/Rdatasets/doc/AER/Affairs.html"
|
65
|
-
},
|
66
|
-
{
|
67
|
-
package: "vcd",
|
68
|
-
dataset: "WomenQueue",
|
69
|
-
title: "Women in Queues",
|
70
|
-
rows: 11,
|
71
|
-
cols: 2,
|
72
|
-
n_binary: 0,
|
73
|
-
n_character: 0,
|
74
|
-
n_factor: 1,
|
75
|
-
n_logical: 0,
|
76
|
-
n_numeric: 1,
|
77
|
-
csv: "https://vincentarelbundock.github.io/Rdatasets/csv/vcd/WomenQueue.csv",
|
78
|
-
doc: "https://vincentarelbundock.github.io/Rdatasets/doc/vcd/WomenQueue.html"
|
79
|
-
},
|
80
|
-
],
|
81
|
-
[
|
82
|
-
records.size,
|
83
|
-
records[0].to_h,
|
84
|
-
records[-1].to_h
|
85
|
-
])
|
86
|
-
end
|
87
|
-
end
|
88
|
-
end
|
89
|
-
|
90
|
-
sub_test_case("Rdatasets") do
|
91
|
-
sub_test_case("datasets") do
|
92
|
-
sub_test_case("AirPassengers") do
|
93
|
-
def setup
|
94
|
-
@dataset = Datasets::Rdatasets.new("datasets", "AirPassengers")
|
95
|
-
end
|
96
|
-
|
97
|
-
test("#each") do
|
98
|
-
records = @dataset.each.to_a
|
99
|
-
assert_equal([
|
100
|
-
144,
|
101
|
-
{ time: 1949, value: 112 },
|
102
|
-
{ time: 1960.91666666667, value: 432 },
|
103
|
-
],
|
104
|
-
[
|
105
|
-
records.size,
|
106
|
-
records[0],
|
107
|
-
records[-1]
|
108
|
-
])
|
109
|
-
end
|
110
|
-
|
111
|
-
test("#metadata.id") do
|
112
|
-
assert_equal("rdatasets-datasets-AirPassengers", @dataset.metadata.id)
|
113
|
-
end
|
114
|
-
|
115
|
-
test("#metadata.description") do
|
116
|
-
description = @dataset.metadata.description
|
117
|
-
assert do
|
118
|
-
description.include?("Monthly Airline Passenger Numbers 1949-1960")
|
119
|
-
end
|
120
|
-
end
|
121
|
-
end
|
122
|
-
|
123
|
-
test("invalid dataset name") do
|
124
|
-
assert_raise(ArgumentError) do
|
125
|
-
Datasets::Rdatasets.new("datasets", "invalid datasets name")
|
126
|
-
end
|
127
|
-
end
|
128
|
-
end
|
129
|
-
|
130
|
-
test("invalid package name") do
|
131
|
-
assert_raise(ArgumentError) do
|
132
|
-
Datasets::Rdatasets.new("invalid package name", "AirPassengers")
|
133
|
-
end
|
134
|
-
end
|
135
|
-
end
|
136
|
-
end
|