red-datasets 0.1.4 → 0.1.5

This diff shows the differences between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions exactly as they appear in their respective public registries.
Files changed (69)
  1. checksums.yaml +4 -4
  2. data/README.md +23 -2
  3. data/doc/text/news.md +86 -0
  4. data/lib/datasets/adult.rb +6 -9
  5. data/lib/datasets/afinn.rb +48 -0
  6. data/lib/datasets/aozora-bunko.rb +196 -0
  7. data/lib/datasets/cache-path.rb +28 -0
  8. data/lib/datasets/california-housing.rb +60 -0
  9. data/lib/datasets/cifar.rb +2 -4
  10. data/lib/datasets/cldr-plurals.rb +2 -4
  11. data/lib/datasets/communities.rb +5 -8
  12. data/lib/datasets/dataset.rb +8 -12
  13. data/lib/datasets/diamonds.rb +26 -0
  14. data/lib/datasets/downloader.rb +6 -1
  15. data/lib/datasets/e-stat-japan.rb +2 -1
  16. data/lib/datasets/fashion-mnist.rb +4 -0
  17. data/lib/datasets/fuel-economy.rb +35 -0
  18. data/lib/datasets/geolonia.rb +67 -0
  19. data/lib/datasets/ggplot2-dataset.rb +79 -0
  20. data/lib/datasets/hepatitis.rb +5 -8
  21. data/lib/datasets/iris.rb +5 -8
  22. data/lib/datasets/ita-corpus.rb +57 -0
  23. data/lib/datasets/kuzushiji-mnist.rb +16 -0
  24. data/lib/datasets/libsvm-dataset-list.rb +5 -8
  25. data/lib/datasets/libsvm.rb +3 -4
  26. data/lib/datasets/license.rb +26 -0
  27. data/lib/datasets/livedoor-news.rb +80 -0
  28. data/lib/datasets/metadata.rb +14 -0
  29. data/lib/datasets/mnist.rb +7 -7
  30. data/lib/datasets/mushroom.rb +5 -8
  31. data/lib/datasets/penguins.rb +4 -8
  32. data/lib/datasets/penn-treebank.rb +2 -4
  33. data/lib/datasets/pmjt-dataset-list.rb +67 -0
  34. data/lib/datasets/postal-code-japan.rb +2 -6
  35. data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
  36. data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
  37. data/lib/datasets/seaborn.rb +90 -0
  38. data/lib/datasets/sudachi-synonym-dictionary.rb +5 -11
  39. data/lib/datasets/version.rb +1 -1
  40. data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
  41. data/lib/datasets/wikipedia.rb +4 -5
  42. data/lib/datasets/wine.rb +6 -9
  43. data/lib/datasets/zip-extractor.rb +36 -0
  44. data/lib/datasets.rb +14 -2
  45. data/red-datasets.gemspec +1 -1
  46. data/test/helper.rb +21 -0
  47. data/test/test-afinn.rb +60 -0
  48. data/test/test-aozora-bunko.rb +190 -0
  49. data/test/test-california-housing.rb +56 -0
  50. data/test/test-cldr-plurals.rb +1 -1
  51. data/test/test-dataset.rb +15 -7
  52. data/test/test-diamonds.rb +71 -0
  53. data/test/test-fuel-economy.rb +75 -0
  54. data/test/test-geolonia.rb +64 -0
  55. data/test/test-ita-corpus.rb +69 -0
  56. data/test/test-kuzushiji-mnist.rb +137 -0
  57. data/test/test-license.rb +24 -0
  58. data/test/test-livedoor-news.rb +351 -0
  59. data/test/test-metadata.rb +36 -0
  60. data/test/test-penguins.rb +1 -1
  61. data/test/test-pmjt-dataset-list.rb +50 -0
  62. data/test/test-quora-duplicate-question-pair.rb +33 -0
  63. data/test/test-rdataset.rb +246 -0
  64. data/test/{test-seaborn-data.rb → test-seaborn.rb} +70 -4
  65. data/test/test-sudachi-synonym-dictionary.rb +5 -5
  66. data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
  67. metadata +58 -14
  68. data/lib/datasets/seaborn-data.rb +0 -49
  69. data/test/test-rdatasets.rb +0 -136
metadata CHANGED
@@ -1,15 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: red-datasets
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.1.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - tomisuker
8
8
  - Kouhei Sutou
9
- autorequire:
9
+ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2021-07-13 00:00:00.000000000 Z
12
+ date: 2022-09-23 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: csv
@@ -17,14 +17,14 @@ dependencies:
17
17
  requirements:
18
18
  - - ">="
19
19
  - !ruby/object:Gem::Version
20
- version: 3.0.5
20
+ version: 3.2.4
21
21
  type: :runtime
22
22
  prerelease: false
23
23
  version_requirements: !ruby/object:Gem::Requirement
24
24
  requirements:
25
25
  - - ">="
26
26
  - !ruby/object:Gem::Version
27
- version: 3.0.5
27
+ version: 3.2.4
28
28
  - !ruby/object:Gem::Dependency
29
29
  name: rexml
30
30
  requirement: !ruby/object:Gem::Requirement
@@ -142,65 +142,95 @@ files:
142
142
  - doc/text/news.md
143
143
  - lib/datasets.rb
144
144
  - lib/datasets/adult.rb
145
+ - lib/datasets/afinn.rb
146
+ - lib/datasets/aozora-bunko.rb
147
+ - lib/datasets/cache-path.rb
148
+ - lib/datasets/california-housing.rb
145
149
  - lib/datasets/cifar.rb
146
150
  - lib/datasets/cldr-plurals.rb
147
151
  - lib/datasets/communities.rb
148
152
  - lib/datasets/dataset.rb
153
+ - lib/datasets/diamonds.rb
149
154
  - lib/datasets/dictionary.rb
150
155
  - lib/datasets/downloader.rb
151
156
  - lib/datasets/e-stat-japan.rb
152
157
  - lib/datasets/error.rb
153
158
  - lib/datasets/fashion-mnist.rb
159
+ - lib/datasets/fuel-economy.rb
160
+ - lib/datasets/geolonia.rb
161
+ - lib/datasets/ggplot2-dataset.rb
154
162
  - lib/datasets/hepatitis.rb
155
163
  - lib/datasets/iris.rb
164
+ - lib/datasets/ita-corpus.rb
165
+ - lib/datasets/kuzushiji-mnist.rb
156
166
  - lib/datasets/libsvm-dataset-list.rb
157
167
  - lib/datasets/libsvm.rb
168
+ - lib/datasets/license.rb
169
+ - lib/datasets/livedoor-news.rb
158
170
  - lib/datasets/metadata.rb
159
171
  - lib/datasets/mnist.rb
160
172
  - lib/datasets/mushroom.rb
161
173
  - lib/datasets/penguins.rb
162
174
  - lib/datasets/penn-treebank.rb
175
+ - lib/datasets/pmjt-dataset-list.rb
163
176
  - lib/datasets/postal-code-japan.rb
164
- - lib/datasets/rdatasets.rb
165
- - lib/datasets/seaborn-data.rb
177
+ - lib/datasets/quora-duplicate-question-pair.rb
178
+ - lib/datasets/rdataset.rb
179
+ - lib/datasets/seaborn.rb
166
180
  - lib/datasets/sudachi-synonym-dictionary.rb
167
181
  - lib/datasets/table.rb
168
182
  - lib/datasets/tar-gz-readable.rb
169
183
  - lib/datasets/version.rb
184
+ - lib/datasets/wikipedia-kyoto-japanese-english.rb
170
185
  - lib/datasets/wikipedia.rb
171
186
  - lib/datasets/wine.rb
187
+ - lib/datasets/zip-extractor.rb
172
188
  - red-datasets.gemspec
173
189
  - test/helper.rb
174
190
  - test/run-test.rb
175
191
  - test/test-adult.rb
192
+ - test/test-afinn.rb
193
+ - test/test-aozora-bunko.rb
194
+ - test/test-california-housing.rb
176
195
  - test/test-cifar.rb
177
196
  - test/test-cldr-plurals.rb
178
197
  - test/test-communities.rb
179
198
  - test/test-dataset.rb
199
+ - test/test-diamonds.rb
180
200
  - test/test-dictionary.rb
181
201
  - test/test-downloader.rb
182
202
  - test/test-e-stat-japan.rb
183
203
  - test/test-fashion-mnist.rb
204
+ - test/test-fuel-economy.rb
205
+ - test/test-geolonia.rb
184
206
  - test/test-hepatitis.rb
185
207
  - test/test-iris.rb
208
+ - test/test-ita-corpus.rb
209
+ - test/test-kuzushiji-mnist.rb
186
210
  - test/test-libsvm-dataset-list.rb
187
211
  - test/test-libsvm.rb
212
+ - test/test-license.rb
213
+ - test/test-livedoor-news.rb
214
+ - test/test-metadata.rb
188
215
  - test/test-mnist.rb
189
216
  - test/test-mushroom.rb
190
217
  - test/test-penguins.rb
191
218
  - test/test-penn-treebank.rb
219
+ - test/test-pmjt-dataset-list.rb
192
220
  - test/test-postal-code-japan.rb
193
- - test/test-rdatasets.rb
194
- - test/test-seaborn-data.rb
221
+ - test/test-quora-duplicate-question-pair.rb
222
+ - test/test-rdataset.rb
223
+ - test/test-seaborn.rb
195
224
  - test/test-sudachi-synonym-dictionary.rb
196
225
  - test/test-table.rb
226
+ - test/test-wikipedia-kyoto-japanese-english.rb
197
227
  - test/test-wikipedia.rb
198
228
  - test/test-wine.rb
199
229
  homepage: https://github.com/red-data-tools/red-datasets
200
230
  licenses:
201
231
  - MIT
202
232
  metadata: {}
203
- post_install_message:
233
+ post_install_message:
204
234
  rdoc_options: []
205
235
  require_paths:
206
236
  - lib
@@ -215,34 +245,48 @@ required_rubygems_version: !ruby/object:Gem::Requirement
215
245
  - !ruby/object:Gem::Version
216
246
  version: '0'
217
247
  requirements: []
218
- rubygems_version: 3.3.0.dev
219
- signing_key:
248
+ rubygems_version: 3.2.32
249
+ signing_key:
220
250
  specification_version: 4
221
251
  summary: Red Datasets provides classes that provide common datasets such as iris dataset.
222
252
  test_files:
223
253
  - test/helper.rb
224
254
  - test/run-test.rb
225
255
  - test/test-adult.rb
256
+ - test/test-afinn.rb
257
+ - test/test-aozora-bunko.rb
258
+ - test/test-california-housing.rb
226
259
  - test/test-cifar.rb
227
260
  - test/test-cldr-plurals.rb
228
261
  - test/test-communities.rb
229
262
  - test/test-dataset.rb
263
+ - test/test-diamonds.rb
230
264
  - test/test-dictionary.rb
231
265
  - test/test-downloader.rb
232
266
  - test/test-e-stat-japan.rb
233
267
  - test/test-fashion-mnist.rb
268
+ - test/test-fuel-economy.rb
269
+ - test/test-geolonia.rb
234
270
  - test/test-hepatitis.rb
235
271
  - test/test-iris.rb
272
+ - test/test-ita-corpus.rb
273
+ - test/test-kuzushiji-mnist.rb
236
274
  - test/test-libsvm-dataset-list.rb
237
275
  - test/test-libsvm.rb
276
+ - test/test-license.rb
277
+ - test/test-livedoor-news.rb
278
+ - test/test-metadata.rb
238
279
  - test/test-mnist.rb
239
280
  - test/test-mushroom.rb
240
281
  - test/test-penguins.rb
241
282
  - test/test-penn-treebank.rb
283
+ - test/test-pmjt-dataset-list.rb
242
284
  - test/test-postal-code-japan.rb
243
- - test/test-rdatasets.rb
244
- - test/test-seaborn-data.rb
285
+ - test/test-quora-duplicate-question-pair.rb
286
+ - test/test-rdataset.rb
287
+ - test/test-seaborn.rb
245
288
  - test/test-sudachi-synonym-dictionary.rb
246
289
  - test/test-table.rb
290
+ - test/test-wikipedia-kyoto-japanese-english.rb
247
291
  - test/test-wikipedia.rb
248
292
  - test/test-wine.rb
@@ -1,49 +0,0 @@
1
- module Datasets
2
- class SeabornData < Dataset
3
- URL_FORMAT = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/%{name}.csv".freeze
4
-
5
- def initialize(name)
6
- super()
7
- @metadata.id = "seaborn-data-#{name}"
8
- @metadata.name = "SeabornData: #{name}"
9
- @metadata.url = URL_FORMAT % {name: name}
10
-
11
- @data_path = cache_dir_path + (name + ".csv")
12
- @name = name
13
- end
14
-
15
- def each(&block)
16
- return to_enum(__method__) unless block_given?
17
-
18
- download(@data_path, @metadata.url) unless @data_path.exist?
19
- CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
20
- csv.each do |row|
21
- record = prepare_record(row)
22
- yield record
23
- end
24
- end
25
- end
26
-
27
- private
28
- def prepare_record(csv_row)
29
- record = csv_row.to_h
30
- record.transform_keys!(&:to_sym)
31
-
32
- # Perform the same preprocessing as seaborn's load_dataset function
33
- preprocessor = :"preprocess_#{@name}_record"
34
- __send__(preprocessor, record) if respond_to?(preprocessor, true)
35
-
36
- record
37
- end
38
-
39
- # The same preprocessing as seaborn.load_dataset
40
- def preprocess_flights_record(record)
41
- record[:month] &&= record[:month][0,3]
42
- end
43
-
44
- # The same preprocessing as seaborn.load_dataset
45
- def preprocess_penguins_record(record)
46
- record[:sex] &&= record[:sex].capitalize
47
- end
48
- end
49
- end
@@ -1,136 +0,0 @@
1
- class RdatasetsTest < Test::Unit::TestCase
2
- sub_test_case("RdatasetsList") do
3
- def setup
4
- @dataset = Datasets::RdatasetsList.new
5
- end
6
-
7
- sub_test_case("#each") do
8
- test("with package_name") do
9
- records = @dataset.filter(package: "datasets").to_a
10
- assert_equal([
11
- 84,
12
- {
13
- package: "datasets",
14
- dataset: "ability.cov",
15
- title: "Ability and Intelligence Tests",
16
- rows: 6,
17
- cols: 8,
18
- n_binary: 0,
19
- n_character: 0,
20
- n_factor: 0,
21
- n_logical: 0,
22
- n_numeric: 8,
23
- csv: "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/ability.cov.csv",
24
- doc: "https://vincentarelbundock.github.io/Rdatasets/doc/datasets/ability.cov.html"
25
- },
26
- {
27
- package: "datasets",
28
- dataset: "WWWusage",
29
- title: "Internet Usage per Minute",
30
- rows: 100,
31
- cols: 2,
32
- n_binary: 0,
33
- n_character: 0,
34
- n_factor: 0,
35
- n_logical: 0,
36
- n_numeric: 2,
37
- csv: "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/WWWusage.csv",
38
- doc: "https://vincentarelbundock.github.io/Rdatasets/doc/datasets/WWWusage.html"
39
- }
40
- ],
41
- [
42
- records.size,
43
- records[0].to_h,
44
- records[-1].to_h
45
- ])
46
- end
47
-
48
- test("without package_name") do
49
- records = @dataset.each.to_a
50
- assert_equal([
51
- 1714,
52
- {
53
- package: "AER",
54
- dataset: "Affairs",
55
- title: "Fair's Extramarital Affairs Data",
56
- rows: 601,
57
- cols: 9,
58
- n_binary: 2,
59
- n_character: 0,
60
- n_factor: 2,
61
- n_logical: 0,
62
- n_numeric: 7,
63
- csv: "https://vincentarelbundock.github.io/Rdatasets/csv/AER/Affairs.csv",
64
- doc: "https://vincentarelbundock.github.io/Rdatasets/doc/AER/Affairs.html"
65
- },
66
- {
67
- package: "vcd",
68
- dataset: "WomenQueue",
69
- title: "Women in Queues",
70
- rows: 11,
71
- cols: 2,
72
- n_binary: 0,
73
- n_character: 0,
74
- n_factor: 1,
75
- n_logical: 0,
76
- n_numeric: 1,
77
- csv: "https://vincentarelbundock.github.io/Rdatasets/csv/vcd/WomenQueue.csv",
78
- doc: "https://vincentarelbundock.github.io/Rdatasets/doc/vcd/WomenQueue.html"
79
- },
80
- ],
81
- [
82
- records.size,
83
- records[0].to_h,
84
- records[-1].to_h
85
- ])
86
- end
87
- end
88
- end
89
-
90
- sub_test_case("Rdatasets") do
91
- sub_test_case("datasets") do
92
- sub_test_case("AirPassengers") do
93
- def setup
94
- @dataset = Datasets::Rdatasets.new("datasets", "AirPassengers")
95
- end
96
-
97
- test("#each") do
98
- records = @dataset.each.to_a
99
- assert_equal([
100
- 144,
101
- { time: 1949, value: 112 },
102
- { time: 1960.91666666667, value: 432 },
103
- ],
104
- [
105
- records.size,
106
- records[0],
107
- records[-1]
108
- ])
109
- end
110
-
111
- test("#metadata.id") do
112
- assert_equal("rdatasets-datasets-AirPassengers", @dataset.metadata.id)
113
- end
114
-
115
- test("#metadata.description") do
116
- description = @dataset.metadata.description
117
- assert do
118
- description.include?("Monthly Airline Passenger Numbers 1949-1960")
119
- end
120
- end
121
- end
122
-
123
- test("invalid dataset name") do
124
- assert_raise(ArgumentError) do
125
- Datasets::Rdatasets.new("datasets", "invalid datasets name")
126
- end
127
- end
128
- end
129
-
130
- test("invalid package name") do
131
- assert_raise(ArgumentError) do
132
- Datasets::Rdatasets.new("invalid package name", "AirPassengers")
133
- end
134
- end
135
- end
136
- end