red-datasets 0.1.4 → 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (74) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +23 -3
  3. data/Rakefile +56 -1
  4. data/doc/text/news.md +102 -0
  5. data/lib/datasets/adult.rb +6 -9
  6. data/lib/datasets/afinn.rb +48 -0
  7. data/lib/datasets/aozora-bunko.rb +196 -0
  8. data/lib/datasets/cache-path.rb +28 -0
  9. data/lib/datasets/california-housing.rb +60 -0
  10. data/lib/datasets/cifar.rb +2 -4
  11. data/lib/datasets/cldr-plurals.rb +2 -4
  12. data/lib/datasets/communities.rb +5 -8
  13. data/lib/datasets/dataset.rb +58 -23
  14. data/lib/datasets/diamonds.rb +26 -0
  15. data/lib/datasets/downloader.rb +110 -30
  16. data/lib/datasets/e-stat-japan.rb +2 -1
  17. data/lib/datasets/fashion-mnist.rb +4 -0
  18. data/lib/datasets/fuel-economy.rb +35 -0
  19. data/lib/datasets/geolonia.rb +67 -0
  20. data/lib/datasets/ggplot2-dataset.rb +79 -0
  21. data/lib/datasets/hepatitis.rb +5 -8
  22. data/lib/datasets/iris.rb +5 -8
  23. data/lib/datasets/ita-corpus.rb +57 -0
  24. data/lib/datasets/kuzushiji-mnist.rb +16 -0
  25. data/lib/datasets/lazy.rb +90 -0
  26. data/lib/datasets/libsvm-dataset-list.rb +5 -8
  27. data/lib/datasets/libsvm.rb +3 -4
  28. data/lib/datasets/license.rb +26 -0
  29. data/lib/datasets/livedoor-news.rb +80 -0
  30. data/lib/datasets/metadata.rb +14 -0
  31. data/lib/datasets/mnist.rb +7 -7
  32. data/lib/datasets/mushroom.rb +5 -8
  33. data/lib/datasets/nagoya-university-conversation-corpus.rb +109 -0
  34. data/lib/datasets/penguins.rb +6 -8
  35. data/lib/datasets/penn-treebank.rb +2 -4
  36. data/lib/datasets/pmjt-dataset-list.rb +67 -0
  37. data/lib/datasets/postal-code-japan.rb +2 -6
  38. data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
  39. data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
  40. data/lib/datasets/seaborn.rb +90 -0
  41. data/lib/datasets/sudachi-synonym-dictionary.rb +5 -11
  42. data/lib/datasets/version.rb +1 -1
  43. data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
  44. data/lib/datasets/wikipedia.rb +16 -8
  45. data/lib/datasets/wine.rb +6 -9
  46. data/lib/datasets/zip-extractor.rb +48 -0
  47. data/lib/datasets.rb +2 -22
  48. data/red-datasets.gemspec +1 -1
  49. data/test/helper.rb +21 -0
  50. data/test/test-afinn.rb +60 -0
  51. data/test/test-aozora-bunko.rb +190 -0
  52. data/test/test-california-housing.rb +56 -0
  53. data/test/test-cldr-plurals.rb +1 -1
  54. data/test/test-dataset.rb +15 -7
  55. data/test/test-diamonds.rb +71 -0
  56. data/test/test-fuel-economy.rb +75 -0
  57. data/test/test-geolonia.rb +65 -0
  58. data/test/test-ita-corpus.rb +69 -0
  59. data/test/test-kuzushiji-mnist.rb +137 -0
  60. data/test/test-license.rb +24 -0
  61. data/test/test-livedoor-news.rb +351 -0
  62. data/test/test-metadata.rb +36 -0
  63. data/test/test-nagoya-university-conversation-corpus.rb +132 -0
  64. data/test/test-penguins.rb +1 -1
  65. data/test/test-pmjt-dataset-list.rb +50 -0
  66. data/test/test-quora-duplicate-question-pair.rb +33 -0
  67. data/test/test-rdataset.rb +246 -0
  68. data/test/{test-seaborn-data.rb → test-seaborn.rb} +71 -4
  69. data/test/test-sudachi-synonym-dictionary.rb +5 -5
  70. data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
  71. data/test/test-wikipedia.rb +25 -71
  72. metadata +62 -14
  73. data/lib/datasets/seaborn-data.rb +0 -49
  74. data/test/test-rdatasets.rb +0 -136
@@ -1,100 +1,54 @@
1
1
  class WikipediaTest < Test::Unit::TestCase
2
- sub_test_case("ja") do
2
+ sub_test_case("en") do
3
3
  sub_test_case("articles") do
4
- include Helper::Sandbox
5
-
6
4
  def setup
7
- setup_sandbox
8
- @dataset = Datasets::Wikipedia.new(language: :ja,
5
+ @dataset = Datasets::Wikipedia.new(language: :en,
9
6
  type: :articles)
10
- def @dataset.cache_dir_path
11
- @cache_dir_path
12
- end
13
- def @dataset.cache_dir_path=(path)
14
- @cache_dir_path = path
15
- end
16
- @dataset.cache_dir_path = @tmp_dir
17
- end
18
-
19
- def teardown
20
- teardown_sandbox
21
7
  end
22
8
 
23
9
  test("#each") do
24
- def @dataset.download(output_path, url)
25
- xml_path = output_path.sub_ext("")
26
- xml_path.open("w") do |xml_file|
27
- xml_file.puts(<<-XML)
28
- <mediawiki
29
- xmlns="http://www.mediawiki.org/xml/export-0.10/"
30
- xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
31
- xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd"
32
- version="0.10" xml:lang="ja">
33
- <siteinfo>
34
- <sitename>Wikipedia</sitename>
35
- </siteinfo>
36
- <page>
37
- <title>タイトル</title>
38
- <ns>4</ns>
39
- <id>1</id>
40
- <restrictions>sysop</restrictions>
41
- <revision>
42
- <id>3</id>
43
- <parentid>2</parentid>
44
- <timestamp>2004-04-30T14:46:00Z</timestamp>
45
- <contributor>
46
- <username>user</username>
47
- <id>10</id>
48
- </contributor>
49
- <minor />
50
- <comment>コメント</comment>
51
- <model>wikitext</model>
52
- <format>text/x-wiki</format>
53
- <text xml:space="preserve">テキスト</text>
54
- <sha1>a9674b19f8c56f785c91a555d0a144522bb318e6</sha1>
55
- </revision>
56
- </page>
57
- </mediawiki>
58
- XML
59
- end
60
- unless system("bzip2", xml_path.to_s)
61
- raise "failed to run bzip2"
62
- end
63
- end
64
-
65
- contributor = Datasets::Wikipedia::Contributor.new("user", 10)
10
+ contributor = Datasets::Wikipedia::Contributor.new("Elli", 20842734)
66
11
  revision = Datasets::Wikipedia::Revision.new
67
- revision.id = 3
68
- revision.parent_id = 2
69
- revision.timestamp = Time.iso8601("2004-04-30T14:46:00Z")
12
+ revision.id = 1002250816
13
+ revision.parent_id = 854851586
14
+ revision.timestamp = Time.iso8601("2021-01-23T15:15:01Z")
70
15
  revision.contributor = contributor
71
- revision.comment = "コメント"
16
+ revision.comment = "shel"
72
17
  revision.model = "wikitext"
73
18
  revision.format = "text/x-wiki"
74
- revision.text = "テキスト"
75
- revision.sha1 = "a9674b19f8c56f785c91a555d0a144522bb318e6"
19
+ revision.text = <<-TEXT.chomp
20
+ #REDIRECT [[Computer accessibility]]
21
+
22
+ {{rcat shell|
23
+ {{R from move}}
24
+ {{R from CamelCase}}
25
+ {{R unprintworthy}}
26
+ }}
27
+ TEXT
28
+ revision.sha1 = "kmysdltgexdwkv2xsml3j44jb56dxvn"
76
29
  page = Datasets::Wikipedia::Page.new
77
- page.title = "タイトル"
78
- page.namespace = 4
79
- page.id = 1
80
- page.restrictions = ["sysop"]
30
+ page.title = "AccessibleComputing"
31
+ page.namespace = 0
32
+ page.id = 10
33
+ page.restrictions = nil
34
+ page.redirect = "Computer accessibility"
81
35
  page.revision = revision
82
36
  assert_equal(page, @dataset.each.first)
83
37
  end
84
38
 
85
39
  sub_test_case("#metadata") do
86
40
  test("#id") do
87
- assert_equal("wikipedia-ja-articles",
41
+ assert_equal("wikipedia-en-articles",
88
42
  @dataset.metadata.id)
89
43
  end
90
44
 
91
45
  test("#name") do
92
- assert_equal("Wikipedia articles (ja)",
46
+ assert_equal("Wikipedia articles (en)",
93
47
  @dataset.metadata.name)
94
48
  end
95
49
 
96
50
  test("#description") do
97
- assert_equal("Wikipedia articles in ja",
51
+ assert_equal("Wikipedia articles in en",
98
52
  @dataset.metadata.description)
99
53
  end
100
54
  end
metadata CHANGED
@@ -1,15 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: red-datasets
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.1.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - tomisuker
8
8
  - Kouhei Sutou
9
- autorequire:
9
+ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2021-07-13 00:00:00.000000000 Z
12
+ date: 2023-05-24 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: csv
@@ -17,14 +17,14 @@ dependencies:
17
17
  requirements:
18
18
  - - ">="
19
19
  - !ruby/object:Gem::Version
20
- version: 3.0.5
20
+ version: 3.2.4
21
21
  type: :runtime
22
22
  prerelease: false
23
23
  version_requirements: !ruby/object:Gem::Requirement
24
24
  requirements:
25
25
  - - ">="
26
26
  - !ruby/object:Gem::Version
27
- version: 3.0.5
27
+ version: 3.2.4
28
28
  - !ruby/object:Gem::Dependency
29
29
  name: rexml
30
30
  requirement: !ruby/object:Gem::Requirement
@@ -142,65 +142,98 @@ files:
142
142
  - doc/text/news.md
143
143
  - lib/datasets.rb
144
144
  - lib/datasets/adult.rb
145
+ - lib/datasets/afinn.rb
146
+ - lib/datasets/aozora-bunko.rb
147
+ - lib/datasets/cache-path.rb
148
+ - lib/datasets/california-housing.rb
145
149
  - lib/datasets/cifar.rb
146
150
  - lib/datasets/cldr-plurals.rb
147
151
  - lib/datasets/communities.rb
148
152
  - lib/datasets/dataset.rb
153
+ - lib/datasets/diamonds.rb
149
154
  - lib/datasets/dictionary.rb
150
155
  - lib/datasets/downloader.rb
151
156
  - lib/datasets/e-stat-japan.rb
152
157
  - lib/datasets/error.rb
153
158
  - lib/datasets/fashion-mnist.rb
159
+ - lib/datasets/fuel-economy.rb
160
+ - lib/datasets/geolonia.rb
161
+ - lib/datasets/ggplot2-dataset.rb
154
162
  - lib/datasets/hepatitis.rb
155
163
  - lib/datasets/iris.rb
164
+ - lib/datasets/ita-corpus.rb
165
+ - lib/datasets/kuzushiji-mnist.rb
166
+ - lib/datasets/lazy.rb
156
167
  - lib/datasets/libsvm-dataset-list.rb
157
168
  - lib/datasets/libsvm.rb
169
+ - lib/datasets/license.rb
170
+ - lib/datasets/livedoor-news.rb
158
171
  - lib/datasets/metadata.rb
159
172
  - lib/datasets/mnist.rb
160
173
  - lib/datasets/mushroom.rb
174
+ - lib/datasets/nagoya-university-conversation-corpus.rb
161
175
  - lib/datasets/penguins.rb
162
176
  - lib/datasets/penn-treebank.rb
177
+ - lib/datasets/pmjt-dataset-list.rb
163
178
  - lib/datasets/postal-code-japan.rb
164
- - lib/datasets/rdatasets.rb
165
- - lib/datasets/seaborn-data.rb
179
+ - lib/datasets/quora-duplicate-question-pair.rb
180
+ - lib/datasets/rdataset.rb
181
+ - lib/datasets/seaborn.rb
166
182
  - lib/datasets/sudachi-synonym-dictionary.rb
167
183
  - lib/datasets/table.rb
168
184
  - lib/datasets/tar-gz-readable.rb
169
185
  - lib/datasets/version.rb
186
+ - lib/datasets/wikipedia-kyoto-japanese-english.rb
170
187
  - lib/datasets/wikipedia.rb
171
188
  - lib/datasets/wine.rb
189
+ - lib/datasets/zip-extractor.rb
172
190
  - red-datasets.gemspec
173
191
  - test/helper.rb
174
192
  - test/run-test.rb
175
193
  - test/test-adult.rb
194
+ - test/test-afinn.rb
195
+ - test/test-aozora-bunko.rb
196
+ - test/test-california-housing.rb
176
197
  - test/test-cifar.rb
177
198
  - test/test-cldr-plurals.rb
178
199
  - test/test-communities.rb
179
200
  - test/test-dataset.rb
201
+ - test/test-diamonds.rb
180
202
  - test/test-dictionary.rb
181
203
  - test/test-downloader.rb
182
204
  - test/test-e-stat-japan.rb
183
205
  - test/test-fashion-mnist.rb
206
+ - test/test-fuel-economy.rb
207
+ - test/test-geolonia.rb
184
208
  - test/test-hepatitis.rb
185
209
  - test/test-iris.rb
210
+ - test/test-ita-corpus.rb
211
+ - test/test-kuzushiji-mnist.rb
186
212
  - test/test-libsvm-dataset-list.rb
187
213
  - test/test-libsvm.rb
214
+ - test/test-license.rb
215
+ - test/test-livedoor-news.rb
216
+ - test/test-metadata.rb
188
217
  - test/test-mnist.rb
189
218
  - test/test-mushroom.rb
219
+ - test/test-nagoya-university-conversation-corpus.rb
190
220
  - test/test-penguins.rb
191
221
  - test/test-penn-treebank.rb
222
+ - test/test-pmjt-dataset-list.rb
192
223
  - test/test-postal-code-japan.rb
193
- - test/test-rdatasets.rb
194
- - test/test-seaborn-data.rb
224
+ - test/test-quora-duplicate-question-pair.rb
225
+ - test/test-rdataset.rb
226
+ - test/test-seaborn.rb
195
227
  - test/test-sudachi-synonym-dictionary.rb
196
228
  - test/test-table.rb
229
+ - test/test-wikipedia-kyoto-japanese-english.rb
197
230
  - test/test-wikipedia.rb
198
231
  - test/test-wine.rb
199
232
  homepage: https://github.com/red-data-tools/red-datasets
200
233
  licenses:
201
234
  - MIT
202
235
  metadata: {}
203
- post_install_message:
236
+ post_install_message:
204
237
  rdoc_options: []
205
238
  require_paths:
206
239
  - lib
@@ -215,34 +248,49 @@ required_rubygems_version: !ruby/object:Gem::Requirement
215
248
  - !ruby/object:Gem::Version
216
249
  version: '0'
217
250
  requirements: []
218
- rubygems_version: 3.3.0.dev
219
- signing_key:
251
+ rubygems_version: 3.5.0.dev
252
+ signing_key:
220
253
  specification_version: 4
221
254
  summary: Red Datasets provides classes that provide common datasets such as iris dataset.
222
255
  test_files:
223
256
  - test/helper.rb
224
257
  - test/run-test.rb
225
258
  - test/test-adult.rb
259
+ - test/test-afinn.rb
260
+ - test/test-aozora-bunko.rb
261
+ - test/test-california-housing.rb
226
262
  - test/test-cifar.rb
227
263
  - test/test-cldr-plurals.rb
228
264
  - test/test-communities.rb
229
265
  - test/test-dataset.rb
266
+ - test/test-diamonds.rb
230
267
  - test/test-dictionary.rb
231
268
  - test/test-downloader.rb
232
269
  - test/test-e-stat-japan.rb
233
270
  - test/test-fashion-mnist.rb
271
+ - test/test-fuel-economy.rb
272
+ - test/test-geolonia.rb
234
273
  - test/test-hepatitis.rb
235
274
  - test/test-iris.rb
275
+ - test/test-ita-corpus.rb
276
+ - test/test-kuzushiji-mnist.rb
236
277
  - test/test-libsvm-dataset-list.rb
237
278
  - test/test-libsvm.rb
279
+ - test/test-license.rb
280
+ - test/test-livedoor-news.rb
281
+ - test/test-metadata.rb
238
282
  - test/test-mnist.rb
239
283
  - test/test-mushroom.rb
284
+ - test/test-nagoya-university-conversation-corpus.rb
240
285
  - test/test-penguins.rb
241
286
  - test/test-penn-treebank.rb
287
+ - test/test-pmjt-dataset-list.rb
242
288
  - test/test-postal-code-japan.rb
243
- - test/test-rdatasets.rb
244
- - test/test-seaborn-data.rb
289
+ - test/test-quora-duplicate-question-pair.rb
290
+ - test/test-rdataset.rb
291
+ - test/test-seaborn.rb
245
292
  - test/test-sudachi-synonym-dictionary.rb
246
293
  - test/test-table.rb
294
+ - test/test-wikipedia-kyoto-japanese-english.rb
247
295
  - test/test-wikipedia.rb
248
296
  - test/test-wine.rb
@@ -1,49 +0,0 @@
1
- module Datasets
2
- class SeabornData < Dataset
3
- URL_FORMAT = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/%{name}.csv".freeze
4
-
5
- def initialize(name)
6
- super()
7
- @metadata.id = "seaborn-data-#{name}"
8
- @metadata.name = "SeabornData: #{name}"
9
- @metadata.url = URL_FORMAT % {name: name}
10
-
11
- @data_path = cache_dir_path + (name + ".csv")
12
- @name = name
13
- end
14
-
15
- def each(&block)
16
- return to_enum(__method__) unless block_given?
17
-
18
- download(@data_path, @metadata.url) unless @data_path.exist?
19
- CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
20
- csv.each do |row|
21
- record = prepare_record(row)
22
- yield record
23
- end
24
- end
25
- end
26
-
27
- private
28
- def prepare_record(csv_row)
29
- record = csv_row.to_h
30
- record.transform_keys!(&:to_sym)
31
-
32
- # Perform the same preprocessing as seaborn's load_dataset function
33
- preprocessor = :"preprocess_#{@name}_record"
34
- __send__(preprocessor, record) if respond_to?(preprocessor, true)
35
-
36
- record
37
- end
38
-
39
- # The same preprocessing as seaborn.load_dataset
40
- def preprocess_flights_record(record)
41
- record[:month] &&= record[:month][0,3]
42
- end
43
-
44
- # The same preprocessing as seaborn.load_dataset
45
- def preprocess_penguins_record(record)
46
- record[:sex] &&= record[:sex].capitalize
47
- end
48
- end
49
- end
@@ -1,136 +0,0 @@
1
- class RdatasetsTest < Test::Unit::TestCase
2
- sub_test_case("RdatasetsList") do
3
- def setup
4
- @dataset = Datasets::RdatasetsList.new
5
- end
6
-
7
- sub_test_case("#each") do
8
- test("with package_name") do
9
- records = @dataset.filter(package: "datasets").to_a
10
- assert_equal([
11
- 84,
12
- {
13
- package: "datasets",
14
- dataset: "ability.cov",
15
- title: "Ability and Intelligence Tests",
16
- rows: 6,
17
- cols: 8,
18
- n_binary: 0,
19
- n_character: 0,
20
- n_factor: 0,
21
- n_logical: 0,
22
- n_numeric: 8,
23
- csv: "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/ability.cov.csv",
24
- doc: "https://vincentarelbundock.github.io/Rdatasets/doc/datasets/ability.cov.html"
25
- },
26
- {
27
- package: "datasets",
28
- dataset: "WWWusage",
29
- title: "Internet Usage per Minute",
30
- rows: 100,
31
- cols: 2,
32
- n_binary: 0,
33
- n_character: 0,
34
- n_factor: 0,
35
- n_logical: 0,
36
- n_numeric: 2,
37
- csv: "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/WWWusage.csv",
38
- doc: "https://vincentarelbundock.github.io/Rdatasets/doc/datasets/WWWusage.html"
39
- }
40
- ],
41
- [
42
- records.size,
43
- records[0].to_h,
44
- records[-1].to_h
45
- ])
46
- end
47
-
48
- test("without package_name") do
49
- records = @dataset.each.to_a
50
- assert_equal([
51
- 1714,
52
- {
53
- package: "AER",
54
- dataset: "Affairs",
55
- title: "Fair's Extramarital Affairs Data",
56
- rows: 601,
57
- cols: 9,
58
- n_binary: 2,
59
- n_character: 0,
60
- n_factor: 2,
61
- n_logical: 0,
62
- n_numeric: 7,
63
- csv: "https://vincentarelbundock.github.io/Rdatasets/csv/AER/Affairs.csv",
64
- doc: "https://vincentarelbundock.github.io/Rdatasets/doc/AER/Affairs.html"
65
- },
66
- {
67
- package: "vcd",
68
- dataset: "WomenQueue",
69
- title: "Women in Queues",
70
- rows: 11,
71
- cols: 2,
72
- n_binary: 0,
73
- n_character: 0,
74
- n_factor: 1,
75
- n_logical: 0,
76
- n_numeric: 1,
77
- csv: "https://vincentarelbundock.github.io/Rdatasets/csv/vcd/WomenQueue.csv",
78
- doc: "https://vincentarelbundock.github.io/Rdatasets/doc/vcd/WomenQueue.html"
79
- },
80
- ],
81
- [
82
- records.size,
83
- records[0].to_h,
84
- records[-1].to_h
85
- ])
86
- end
87
- end
88
- end
89
-
90
- sub_test_case("Rdatasets") do
91
- sub_test_case("datasets") do
92
- sub_test_case("AirPassengers") do
93
- def setup
94
- @dataset = Datasets::Rdatasets.new("datasets", "AirPassengers")
95
- end
96
-
97
- test("#each") do
98
- records = @dataset.each.to_a
99
- assert_equal([
100
- 144,
101
- { time: 1949, value: 112 },
102
- { time: 1960.91666666667, value: 432 },
103
- ],
104
- [
105
- records.size,
106
- records[0],
107
- records[-1]
108
- ])
109
- end
110
-
111
- test("#metadata.id") do
112
- assert_equal("rdatasets-datasets-AirPassengers", @dataset.metadata.id)
113
- end
114
-
115
- test("#metadata.description") do
116
- description = @dataset.metadata.description
117
- assert do
118
- description.include?("Monthly Airline Passenger Numbers 1949-1960")
119
- end
120
- end
121
- end
122
-
123
- test("invalid dataset name") do
124
- assert_raise(ArgumentError) do
125
- Datasets::Rdatasets.new("datasets", "invalid datasets name")
126
- end
127
- end
128
- end
129
-
130
- test("invalid package name") do
131
- assert_raise(ArgumentError) do
132
- Datasets::Rdatasets.new("invalid package name", "AirPassengers")
133
- end
134
- end
135
- end
136
- end