red-datasets 0.1.3 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (69) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +23 -2
  3. data/doc/text/news.md +92 -0
  4. data/lib/datasets/adult.rb +6 -9
  5. data/lib/datasets/afinn.rb +48 -0
  6. data/lib/datasets/aozora-bunko.rb +196 -0
  7. data/lib/datasets/cache-path.rb +28 -0
  8. data/lib/datasets/california-housing.rb +60 -0
  9. data/lib/datasets/cifar.rb +2 -4
  10. data/lib/datasets/cldr-plurals.rb +2 -4
  11. data/lib/datasets/communities.rb +5 -8
  12. data/lib/datasets/dataset.rb +8 -12
  13. data/lib/datasets/diamonds.rb +26 -0
  14. data/lib/datasets/downloader.rb +6 -1
  15. data/lib/datasets/e-stat-japan.rb +2 -1
  16. data/lib/datasets/fashion-mnist.rb +4 -0
  17. data/lib/datasets/fuel-economy.rb +35 -0
  18. data/lib/datasets/geolonia.rb +67 -0
  19. data/lib/datasets/ggplot2-dataset.rb +79 -0
  20. data/lib/datasets/hepatitis.rb +5 -8
  21. data/lib/datasets/iris.rb +5 -8
  22. data/lib/datasets/ita-corpus.rb +57 -0
  23. data/lib/datasets/kuzushiji-mnist.rb +16 -0
  24. data/lib/datasets/libsvm-dataset-list.rb +5 -8
  25. data/lib/datasets/libsvm.rb +3 -4
  26. data/lib/datasets/license.rb +26 -0
  27. data/lib/datasets/livedoor-news.rb +80 -0
  28. data/lib/datasets/metadata.rb +14 -0
  29. data/lib/datasets/mnist.rb +7 -7
  30. data/lib/datasets/mushroom.rb +5 -8
  31. data/lib/datasets/penguins.rb +4 -8
  32. data/lib/datasets/penn-treebank.rb +2 -4
  33. data/lib/datasets/pmjt-dataset-list.rb +67 -0
  34. data/lib/datasets/postal-code-japan.rb +2 -6
  35. data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
  36. data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
  37. data/lib/datasets/seaborn.rb +90 -0
  38. data/lib/datasets/sudachi-synonym-dictionary.rb +8 -12
  39. data/lib/datasets/version.rb +1 -1
  40. data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
  41. data/lib/datasets/wikipedia.rb +4 -5
  42. data/lib/datasets/wine.rb +6 -9
  43. data/lib/datasets/zip-extractor.rb +36 -0
  44. data/lib/datasets.rb +14 -2
  45. data/red-datasets.gemspec +1 -1
  46. data/test/helper.rb +21 -0
  47. data/test/test-afinn.rb +60 -0
  48. data/test/test-aozora-bunko.rb +190 -0
  49. data/test/test-california-housing.rb +56 -0
  50. data/test/test-cldr-plurals.rb +1 -1
  51. data/test/test-dataset.rb +15 -7
  52. data/test/test-diamonds.rb +71 -0
  53. data/test/test-fuel-economy.rb +75 -0
  54. data/test/test-geolonia.rb +64 -0
  55. data/test/test-ita-corpus.rb +69 -0
  56. data/test/test-kuzushiji-mnist.rb +137 -0
  57. data/test/test-license.rb +24 -0
  58. data/test/test-livedoor-news.rb +351 -0
  59. data/test/test-metadata.rb +36 -0
  60. data/test/test-penguins.rb +1 -1
  61. data/test/test-pmjt-dataset-list.rb +50 -0
  62. data/test/test-quora-duplicate-question-pair.rb +33 -0
  63. data/test/test-rdataset.rb +246 -0
  64. data/test/{test-seaborn-data.rb → test-seaborn.rb} +70 -4
  65. data/test/test-sudachi-synonym-dictionary.rb +5 -5
  66. data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
  67. metadata +58 -14
  68. data/lib/datasets/seaborn-data.rb +0 -49
  69. data/test/test-rdatasets.rb +0 -136
metadata CHANGED
@@ -1,15 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: red-datasets
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - tomisuker
8
8
  - Kouhei Sutou
9
- autorequire:
9
+ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2021-07-09 00:00:00.000000000 Z
12
+ date: 2022-09-23 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: csv
@@ -17,14 +17,14 @@ dependencies:
17
17
  requirements:
18
18
  - - ">="
19
19
  - !ruby/object:Gem::Version
20
- version: 3.0.5
20
+ version: 3.2.4
21
21
  type: :runtime
22
22
  prerelease: false
23
23
  version_requirements: !ruby/object:Gem::Requirement
24
24
  requirements:
25
25
  - - ">="
26
26
  - !ruby/object:Gem::Version
27
- version: 3.0.5
27
+ version: 3.2.4
28
28
  - !ruby/object:Gem::Dependency
29
29
  name: rexml
30
30
  requirement: !ruby/object:Gem::Requirement
@@ -142,65 +142,95 @@ files:
142
142
  - doc/text/news.md
143
143
  - lib/datasets.rb
144
144
  - lib/datasets/adult.rb
145
+ - lib/datasets/afinn.rb
146
+ - lib/datasets/aozora-bunko.rb
147
+ - lib/datasets/cache-path.rb
148
+ - lib/datasets/california-housing.rb
145
149
  - lib/datasets/cifar.rb
146
150
  - lib/datasets/cldr-plurals.rb
147
151
  - lib/datasets/communities.rb
148
152
  - lib/datasets/dataset.rb
153
+ - lib/datasets/diamonds.rb
149
154
  - lib/datasets/dictionary.rb
150
155
  - lib/datasets/downloader.rb
151
156
  - lib/datasets/e-stat-japan.rb
152
157
  - lib/datasets/error.rb
153
158
  - lib/datasets/fashion-mnist.rb
159
+ - lib/datasets/fuel-economy.rb
160
+ - lib/datasets/geolonia.rb
161
+ - lib/datasets/ggplot2-dataset.rb
154
162
  - lib/datasets/hepatitis.rb
155
163
  - lib/datasets/iris.rb
164
+ - lib/datasets/ita-corpus.rb
165
+ - lib/datasets/kuzushiji-mnist.rb
156
166
  - lib/datasets/libsvm-dataset-list.rb
157
167
  - lib/datasets/libsvm.rb
168
+ - lib/datasets/license.rb
169
+ - lib/datasets/livedoor-news.rb
158
170
  - lib/datasets/metadata.rb
159
171
  - lib/datasets/mnist.rb
160
172
  - lib/datasets/mushroom.rb
161
173
  - lib/datasets/penguins.rb
162
174
  - lib/datasets/penn-treebank.rb
175
+ - lib/datasets/pmjt-dataset-list.rb
163
176
  - lib/datasets/postal-code-japan.rb
164
- - lib/datasets/rdatasets.rb
165
- - lib/datasets/seaborn-data.rb
177
+ - lib/datasets/quora-duplicate-question-pair.rb
178
+ - lib/datasets/rdataset.rb
179
+ - lib/datasets/seaborn.rb
166
180
  - lib/datasets/sudachi-synonym-dictionary.rb
167
181
  - lib/datasets/table.rb
168
182
  - lib/datasets/tar-gz-readable.rb
169
183
  - lib/datasets/version.rb
184
+ - lib/datasets/wikipedia-kyoto-japanese-english.rb
170
185
  - lib/datasets/wikipedia.rb
171
186
  - lib/datasets/wine.rb
187
+ - lib/datasets/zip-extractor.rb
172
188
  - red-datasets.gemspec
173
189
  - test/helper.rb
174
190
  - test/run-test.rb
175
191
  - test/test-adult.rb
192
+ - test/test-afinn.rb
193
+ - test/test-aozora-bunko.rb
194
+ - test/test-california-housing.rb
176
195
  - test/test-cifar.rb
177
196
  - test/test-cldr-plurals.rb
178
197
  - test/test-communities.rb
179
198
  - test/test-dataset.rb
199
+ - test/test-diamonds.rb
180
200
  - test/test-dictionary.rb
181
201
  - test/test-downloader.rb
182
202
  - test/test-e-stat-japan.rb
183
203
  - test/test-fashion-mnist.rb
204
+ - test/test-fuel-economy.rb
205
+ - test/test-geolonia.rb
184
206
  - test/test-hepatitis.rb
185
207
  - test/test-iris.rb
208
+ - test/test-ita-corpus.rb
209
+ - test/test-kuzushiji-mnist.rb
186
210
  - test/test-libsvm-dataset-list.rb
187
211
  - test/test-libsvm.rb
212
+ - test/test-license.rb
213
+ - test/test-livedoor-news.rb
214
+ - test/test-metadata.rb
188
215
  - test/test-mnist.rb
189
216
  - test/test-mushroom.rb
190
217
  - test/test-penguins.rb
191
218
  - test/test-penn-treebank.rb
219
+ - test/test-pmjt-dataset-list.rb
192
220
  - test/test-postal-code-japan.rb
193
- - test/test-rdatasets.rb
194
- - test/test-seaborn-data.rb
221
+ - test/test-quora-duplicate-question-pair.rb
222
+ - test/test-rdataset.rb
223
+ - test/test-seaborn.rb
195
224
  - test/test-sudachi-synonym-dictionary.rb
196
225
  - test/test-table.rb
226
+ - test/test-wikipedia-kyoto-japanese-english.rb
197
227
  - test/test-wikipedia.rb
198
228
  - test/test-wine.rb
199
229
  homepage: https://github.com/red-data-tools/red-datasets
200
230
  licenses:
201
231
  - MIT
202
232
  metadata: {}
203
- post_install_message:
233
+ post_install_message:
204
234
  rdoc_options: []
205
235
  require_paths:
206
236
  - lib
@@ -215,34 +245,48 @@ required_rubygems_version: !ruby/object:Gem::Requirement
215
245
  - !ruby/object:Gem::Version
216
246
  version: '0'
217
247
  requirements: []
218
- rubygems_version: 3.3.0.dev
219
- signing_key:
248
+ rubygems_version: 3.2.32
249
+ signing_key:
220
250
  specification_version: 4
221
251
  summary: Red Datasets provides classes that provide common datasets such as iris dataset.
222
252
  test_files:
223
253
  - test/helper.rb
224
254
  - test/run-test.rb
225
255
  - test/test-adult.rb
256
+ - test/test-afinn.rb
257
+ - test/test-aozora-bunko.rb
258
+ - test/test-california-housing.rb
226
259
  - test/test-cifar.rb
227
260
  - test/test-cldr-plurals.rb
228
261
  - test/test-communities.rb
229
262
  - test/test-dataset.rb
263
+ - test/test-diamonds.rb
230
264
  - test/test-dictionary.rb
231
265
  - test/test-downloader.rb
232
266
  - test/test-e-stat-japan.rb
233
267
  - test/test-fashion-mnist.rb
268
+ - test/test-fuel-economy.rb
269
+ - test/test-geolonia.rb
234
270
  - test/test-hepatitis.rb
235
271
  - test/test-iris.rb
272
+ - test/test-ita-corpus.rb
273
+ - test/test-kuzushiji-mnist.rb
236
274
  - test/test-libsvm-dataset-list.rb
237
275
  - test/test-libsvm.rb
276
+ - test/test-license.rb
277
+ - test/test-livedoor-news.rb
278
+ - test/test-metadata.rb
238
279
  - test/test-mnist.rb
239
280
  - test/test-mushroom.rb
240
281
  - test/test-penguins.rb
241
282
  - test/test-penn-treebank.rb
283
+ - test/test-pmjt-dataset-list.rb
242
284
  - test/test-postal-code-japan.rb
243
- - test/test-rdatasets.rb
244
- - test/test-seaborn-data.rb
285
+ - test/test-quora-duplicate-question-pair.rb
286
+ - test/test-rdataset.rb
287
+ - test/test-seaborn.rb
245
288
  - test/test-sudachi-synonym-dictionary.rb
246
289
  - test/test-table.rb
290
+ - test/test-wikipedia-kyoto-japanese-english.rb
247
291
  - test/test-wikipedia.rb
248
292
  - test/test-wine.rb
@@ -1,49 +0,0 @@
1
- module Datasets
2
- class SeabornData < Dataset
3
- URL_FORMAT = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/%{name}.csv".freeze
4
-
5
- def initialize(name)
6
- super()
7
- @metadata.id = "seaborn-data-#{name}"
8
- @metadata.name = "SeabornData: #{name}"
9
- @metadata.url = URL_FORMAT % {name: name}
10
-
11
- @data_path = cache_dir_path + (name + ".csv")
12
- @name = name
13
- end
14
-
15
- def each(&block)
16
- return to_enum(__method__) unless block_given?
17
-
18
- download(@data_path, @metadata.url) unless @data_path.exist?
19
- CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
20
- csv.each do |row|
21
- record = prepare_record(row)
22
- yield record
23
- end
24
- end
25
- end
26
-
27
- private
28
- def prepare_record(csv_row)
29
- record = csv_row.to_h
30
- record.transform_keys!(&:to_sym)
31
-
32
- # Perform the same preprocessing as seaborn's load_dataset function
33
- preprocessor = :"preprocess_#{@name}_record"
34
- __send__(preprocessor, record) if respond_to?(preprocessor, true)
35
-
36
- record
37
- end
38
-
39
- # The same preprocessing as seaborn.load_dataset
40
- def preprocess_flights_record(record)
41
- record[:month] &&= record[:month][0,3]
42
- end
43
-
44
- # The same preprocessing as seaborn.load_dataset
45
- def preprocess_penguins_record(record)
46
- record[:sex] &&= record[:sex].capitalize
47
- end
48
- end
49
- end
@@ -1,136 +0,0 @@
1
- class RdatasetsTest < Test::Unit::TestCase
2
- sub_test_case("RdatasetsList") do
3
- def setup
4
- @dataset = Datasets::RdatasetsList.new
5
- end
6
-
7
- sub_test_case("#each") do
8
- test("with package_name") do
9
- records = @dataset.filter(package: "datasets").to_a
10
- assert_equal([
11
- 84,
12
- {
13
- package: "datasets",
14
- dataset: "ability.cov",
15
- title: "Ability and Intelligence Tests",
16
- rows: 6,
17
- cols: 8,
18
- n_binary: 0,
19
- n_character: 0,
20
- n_factor: 0,
21
- n_logical: 0,
22
- n_numeric: 8,
23
- csv: "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/ability.cov.csv",
24
- doc: "https://vincentarelbundock.github.io/Rdatasets/doc/datasets/ability.cov.html"
25
- },
26
- {
27
- package: "datasets",
28
- dataset: "WWWusage",
29
- title: "Internet Usage per Minute",
30
- rows: 100,
31
- cols: 2,
32
- n_binary: 0,
33
- n_character: 0,
34
- n_factor: 0,
35
- n_logical: 0,
36
- n_numeric: 2,
37
- csv: "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/WWWusage.csv",
38
- doc: "https://vincentarelbundock.github.io/Rdatasets/doc/datasets/WWWusage.html"
39
- }
40
- ],
41
- [
42
- records.size,
43
- records[0].to_h,
44
- records[-1].to_h
45
- ])
46
- end
47
-
48
- test("without package_name") do
49
- records = @dataset.each.to_a
50
- assert_equal([
51
- 1714,
52
- {
53
- package: "AER",
54
- dataset: "Affairs",
55
- title: "Fair's Extramarital Affairs Data",
56
- rows: 601,
57
- cols: 9,
58
- n_binary: 2,
59
- n_character: 0,
60
- n_factor: 2,
61
- n_logical: 0,
62
- n_numeric: 7,
63
- csv: "https://vincentarelbundock.github.io/Rdatasets/csv/AER/Affairs.csv",
64
- doc: "https://vincentarelbundock.github.io/Rdatasets/doc/AER/Affairs.html"
65
- },
66
- {
67
- package: "vcd",
68
- dataset: "WomenQueue",
69
- title: "Women in Queues",
70
- rows: 11,
71
- cols: 2,
72
- n_binary: 0,
73
- n_character: 0,
74
- n_factor: 1,
75
- n_logical: 0,
76
- n_numeric: 1,
77
- csv: "https://vincentarelbundock.github.io/Rdatasets/csv/vcd/WomenQueue.csv",
78
- doc: "https://vincentarelbundock.github.io/Rdatasets/doc/vcd/WomenQueue.html"
79
- },
80
- ],
81
- [
82
- records.size,
83
- records[0].to_h,
84
- records[-1].to_h
85
- ])
86
- end
87
- end
88
- end
89
-
90
- sub_test_case("Rdatasets") do
91
- sub_test_case("datasets") do
92
- sub_test_case("AirPassengers") do
93
- def setup
94
- @dataset = Datasets::Rdatasets.new("datasets", "AirPassengers")
95
- end
96
-
97
- test("#each") do
98
- records = @dataset.each.to_a
99
- assert_equal([
100
- 144,
101
- { time: 1949, value: 112 },
102
- { time: 1960.91666666667, value: 432 },
103
- ],
104
- [
105
- records.size,
106
- records[0],
107
- records[-1]
108
- ])
109
- end
110
-
111
- test("#metadata.id") do
112
- assert_equal("rdatasets-datasets-AirPassengers", @dataset.metadata.id)
113
- end
114
-
115
- test("#metadata.description") do
116
- description = @dataset.metadata.description
117
- assert do
118
- description.include?("Monthly Airline Passenger Numbers 1949-1960")
119
- end
120
- end
121
- end
122
-
123
- test("invalid dataset name") do
124
- assert_raise(ArgumentError) do
125
- Datasets::Rdatasets.new("datasets", "invalid datasets name")
126
- end
127
- end
128
- end
129
-
130
- test("invalid package name") do
131
- assert_raise(ArgumentError) do
132
- Datasets::Rdatasets.new("invalid package name", "AirPassengers")
133
- end
134
- end
135
- end
136
- end