red-datasets 0.1.4 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. checksums.yaml +4 -4
  2. data/README.md +23 -3
  3. data/Rakefile +56 -1
  4. data/doc/text/news.md +102 -0
  5. data/lib/datasets/adult.rb +6 -9
  6. data/lib/datasets/afinn.rb +48 -0
  7. data/lib/datasets/aozora-bunko.rb +196 -0
  8. data/lib/datasets/cache-path.rb +28 -0
  9. data/lib/datasets/california-housing.rb +60 -0
  10. data/lib/datasets/cifar.rb +2 -4
  11. data/lib/datasets/cldr-plurals.rb +2 -4
  12. data/lib/datasets/communities.rb +5 -8
  13. data/lib/datasets/dataset.rb +58 -23
  14. data/lib/datasets/diamonds.rb +26 -0
  15. data/lib/datasets/downloader.rb +110 -30
  16. data/lib/datasets/e-stat-japan.rb +2 -1
  17. data/lib/datasets/fashion-mnist.rb +4 -0
  18. data/lib/datasets/fuel-economy.rb +35 -0
  19. data/lib/datasets/geolonia.rb +67 -0
  20. data/lib/datasets/ggplot2-dataset.rb +79 -0
  21. data/lib/datasets/hepatitis.rb +5 -8
  22. data/lib/datasets/iris.rb +5 -8
  23. data/lib/datasets/ita-corpus.rb +57 -0
  24. data/lib/datasets/kuzushiji-mnist.rb +16 -0
  25. data/lib/datasets/lazy.rb +90 -0
  26. data/lib/datasets/libsvm-dataset-list.rb +5 -8
  27. data/lib/datasets/libsvm.rb +3 -4
  28. data/lib/datasets/license.rb +26 -0
  29. data/lib/datasets/livedoor-news.rb +80 -0
  30. data/lib/datasets/metadata.rb +14 -0
  31. data/lib/datasets/mnist.rb +7 -7
  32. data/lib/datasets/mushroom.rb +5 -8
  33. data/lib/datasets/nagoya-university-conversation-corpus.rb +109 -0
  34. data/lib/datasets/penguins.rb +6 -8
  35. data/lib/datasets/penn-treebank.rb +2 -4
  36. data/lib/datasets/pmjt-dataset-list.rb +67 -0
  37. data/lib/datasets/postal-code-japan.rb +2 -6
  38. data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
  39. data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
  40. data/lib/datasets/seaborn.rb +90 -0
  41. data/lib/datasets/sudachi-synonym-dictionary.rb +5 -11
  42. data/lib/datasets/version.rb +1 -1
  43. data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
  44. data/lib/datasets/wikipedia.rb +16 -8
  45. data/lib/datasets/wine.rb +6 -9
  46. data/lib/datasets/zip-extractor.rb +48 -0
  47. data/lib/datasets.rb +2 -22
  48. data/red-datasets.gemspec +1 -1
  49. data/test/helper.rb +21 -0
  50. data/test/test-afinn.rb +60 -0
  51. data/test/test-aozora-bunko.rb +190 -0
  52. data/test/test-california-housing.rb +56 -0
  53. data/test/test-cldr-plurals.rb +1 -1
  54. data/test/test-dataset.rb +15 -7
  55. data/test/test-diamonds.rb +71 -0
  56. data/test/test-fuel-economy.rb +75 -0
  57. data/test/test-geolonia.rb +65 -0
  58. data/test/test-ita-corpus.rb +69 -0
  59. data/test/test-kuzushiji-mnist.rb +137 -0
  60. data/test/test-license.rb +24 -0
  61. data/test/test-livedoor-news.rb +351 -0
  62. data/test/test-metadata.rb +36 -0
  63. data/test/test-nagoya-university-conversation-corpus.rb +132 -0
  64. data/test/test-penguins.rb +1 -1
  65. data/test/test-pmjt-dataset-list.rb +50 -0
  66. data/test/test-quora-duplicate-question-pair.rb +33 -0
  67. data/test/test-rdataset.rb +246 -0
  68. data/test/{test-seaborn-data.rb → test-seaborn.rb} +71 -4
  69. data/test/test-sudachi-synonym-dictionary.rb +5 -5
  70. data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
  71. data/test/test-wikipedia.rb +25 -71
  72. metadata +62 -14
  73. data/lib/datasets/seaborn-data.rb +0 -49
  74. data/test/test-rdatasets.rb +0 -136
@@ -1,100 +1,54 @@
1
1
  class WikipediaTest < Test::Unit::TestCase
2
- sub_test_case("ja") do
2
+ sub_test_case("en") do
3
3
  sub_test_case("articles") do
4
- include Helper::Sandbox
5
-
6
4
  def setup
7
- setup_sandbox
8
- @dataset = Datasets::Wikipedia.new(language: :ja,
5
+ @dataset = Datasets::Wikipedia.new(language: :en,
9
6
  type: :articles)
10
- def @dataset.cache_dir_path
11
- @cache_dir_path
12
- end
13
- def @dataset.cache_dir_path=(path)
14
- @cache_dir_path = path
15
- end
16
- @dataset.cache_dir_path = @tmp_dir
17
- end
18
-
19
- def teardown
20
- teardown_sandbox
21
7
  end
22
8
 
23
9
  test("#each") do
24
- def @dataset.download(output_path, url)
25
- xml_path = output_path.sub_ext("")
26
- xml_path.open("w") do |xml_file|
27
- xml_file.puts(<<-XML)
28
- <mediawiki
29
- xmlns="http://www.mediawiki.org/xml/export-0.10/"
30
- xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
31
- xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd"
32
- version="0.10" xml:lang="ja">
33
- <siteinfo>
34
- <sitename>Wikipedia</sitename>
35
- </siteinfo>
36
- <page>
37
- <title>タイトル</title>
38
- <ns>4</ns>
39
- <id>1</id>
40
- <restrictions>sysop</restrictions>
41
- <revision>
42
- <id>3</id>
43
- <parentid>2</parentid>
44
- <timestamp>2004-04-30T14:46:00Z</timestamp>
45
- <contributor>
46
- <username>user</username>
47
- <id>10</id>
48
- </contributor>
49
- <minor />
50
- <comment>コメント</comment>
51
- <model>wikitext</model>
52
- <format>text/x-wiki</format>
53
- <text xml:space="preserve">テキスト</text>
54
- <sha1>a9674b19f8c56f785c91a555d0a144522bb318e6</sha1>
55
- </revision>
56
- </page>
57
- </mediawiki>
58
- XML
59
- end
60
- unless system("bzip2", xml_path.to_s)
61
- raise "failed to run bzip2"
62
- end
63
- end
64
-
65
- contributor = Datasets::Wikipedia::Contributor.new("user", 10)
10
+ contributor = Datasets::Wikipedia::Contributor.new("Elli", 20842734)
66
11
  revision = Datasets::Wikipedia::Revision.new
67
- revision.id = 3
68
- revision.parent_id = 2
69
- revision.timestamp = Time.iso8601("2004-04-30T14:46:00Z")
12
+ revision.id = 1002250816
13
+ revision.parent_id = 854851586
14
+ revision.timestamp = Time.iso8601("2021-01-23T15:15:01Z")
70
15
  revision.contributor = contributor
71
- revision.comment = "コメント"
16
+ revision.comment = "shel"
72
17
  revision.model = "wikitext"
73
18
  revision.format = "text/x-wiki"
74
- revision.text = "テキスト"
75
- revision.sha1 = "a9674b19f8c56f785c91a555d0a144522bb318e6"
19
+ revision.text = <<-TEXT.chomp
20
+ #REDIRECT [[Computer accessibility]]
21
+
22
+ {{rcat shell|
23
+ {{R from move}}
24
+ {{R from CamelCase}}
25
+ {{R unprintworthy}}
26
+ }}
27
+ TEXT
28
+ revision.sha1 = "kmysdltgexdwkv2xsml3j44jb56dxvn"
76
29
  page = Datasets::Wikipedia::Page.new
77
- page.title = "タイトル"
78
- page.namespace = 4
79
- page.id = 1
80
- page.restrictions = ["sysop"]
30
+ page.title = "AccessibleComputing"
31
+ page.namespace = 0
32
+ page.id = 10
33
+ page.restrictions = nil
34
+ page.redirect = "Computer accessibility"
81
35
  page.revision = revision
82
36
  assert_equal(page, @dataset.each.first)
83
37
  end
84
38
 
85
39
  sub_test_case("#metadata") do
86
40
  test("#id") do
87
- assert_equal("wikipedia-ja-articles",
41
+ assert_equal("wikipedia-en-articles",
88
42
  @dataset.metadata.id)
89
43
  end
90
44
 
91
45
  test("#name") do
92
- assert_equal("Wikipedia articles (ja)",
46
+ assert_equal("Wikipedia articles (en)",
93
47
  @dataset.metadata.name)
94
48
  end
95
49
 
96
50
  test("#description") do
97
- assert_equal("Wikipedia articles in ja",
51
+ assert_equal("Wikipedia articles in en",
98
52
  @dataset.metadata.description)
99
53
  end
100
54
  end
metadata CHANGED
@@ -1,15 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: red-datasets
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.1.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - tomisuker
8
8
  - Kouhei Sutou
9
- autorequire:
9
+ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2021-07-13 00:00:00.000000000 Z
12
+ date: 2023-05-24 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: csv
@@ -17,14 +17,14 @@ dependencies:
17
17
  requirements:
18
18
  - - ">="
19
19
  - !ruby/object:Gem::Version
20
- version: 3.0.5
20
+ version: 3.2.4
21
21
  type: :runtime
22
22
  prerelease: false
23
23
  version_requirements: !ruby/object:Gem::Requirement
24
24
  requirements:
25
25
  - - ">="
26
26
  - !ruby/object:Gem::Version
27
- version: 3.0.5
27
+ version: 3.2.4
28
28
  - !ruby/object:Gem::Dependency
29
29
  name: rexml
30
30
  requirement: !ruby/object:Gem::Requirement
@@ -142,65 +142,98 @@ files:
142
142
  - doc/text/news.md
143
143
  - lib/datasets.rb
144
144
  - lib/datasets/adult.rb
145
+ - lib/datasets/afinn.rb
146
+ - lib/datasets/aozora-bunko.rb
147
+ - lib/datasets/cache-path.rb
148
+ - lib/datasets/california-housing.rb
145
149
  - lib/datasets/cifar.rb
146
150
  - lib/datasets/cldr-plurals.rb
147
151
  - lib/datasets/communities.rb
148
152
  - lib/datasets/dataset.rb
153
+ - lib/datasets/diamonds.rb
149
154
  - lib/datasets/dictionary.rb
150
155
  - lib/datasets/downloader.rb
151
156
  - lib/datasets/e-stat-japan.rb
152
157
  - lib/datasets/error.rb
153
158
  - lib/datasets/fashion-mnist.rb
159
+ - lib/datasets/fuel-economy.rb
160
+ - lib/datasets/geolonia.rb
161
+ - lib/datasets/ggplot2-dataset.rb
154
162
  - lib/datasets/hepatitis.rb
155
163
  - lib/datasets/iris.rb
164
+ - lib/datasets/ita-corpus.rb
165
+ - lib/datasets/kuzushiji-mnist.rb
166
+ - lib/datasets/lazy.rb
156
167
  - lib/datasets/libsvm-dataset-list.rb
157
168
  - lib/datasets/libsvm.rb
169
+ - lib/datasets/license.rb
170
+ - lib/datasets/livedoor-news.rb
158
171
  - lib/datasets/metadata.rb
159
172
  - lib/datasets/mnist.rb
160
173
  - lib/datasets/mushroom.rb
174
+ - lib/datasets/nagoya-university-conversation-corpus.rb
161
175
  - lib/datasets/penguins.rb
162
176
  - lib/datasets/penn-treebank.rb
177
+ - lib/datasets/pmjt-dataset-list.rb
163
178
  - lib/datasets/postal-code-japan.rb
164
- - lib/datasets/rdatasets.rb
165
- - lib/datasets/seaborn-data.rb
179
+ - lib/datasets/quora-duplicate-question-pair.rb
180
+ - lib/datasets/rdataset.rb
181
+ - lib/datasets/seaborn.rb
166
182
  - lib/datasets/sudachi-synonym-dictionary.rb
167
183
  - lib/datasets/table.rb
168
184
  - lib/datasets/tar-gz-readable.rb
169
185
  - lib/datasets/version.rb
186
+ - lib/datasets/wikipedia-kyoto-japanese-english.rb
170
187
  - lib/datasets/wikipedia.rb
171
188
  - lib/datasets/wine.rb
189
+ - lib/datasets/zip-extractor.rb
172
190
  - red-datasets.gemspec
173
191
  - test/helper.rb
174
192
  - test/run-test.rb
175
193
  - test/test-adult.rb
194
+ - test/test-afinn.rb
195
+ - test/test-aozora-bunko.rb
196
+ - test/test-california-housing.rb
176
197
  - test/test-cifar.rb
177
198
  - test/test-cldr-plurals.rb
178
199
  - test/test-communities.rb
179
200
  - test/test-dataset.rb
201
+ - test/test-diamonds.rb
180
202
  - test/test-dictionary.rb
181
203
  - test/test-downloader.rb
182
204
  - test/test-e-stat-japan.rb
183
205
  - test/test-fashion-mnist.rb
206
+ - test/test-fuel-economy.rb
207
+ - test/test-geolonia.rb
184
208
  - test/test-hepatitis.rb
185
209
  - test/test-iris.rb
210
+ - test/test-ita-corpus.rb
211
+ - test/test-kuzushiji-mnist.rb
186
212
  - test/test-libsvm-dataset-list.rb
187
213
  - test/test-libsvm.rb
214
+ - test/test-license.rb
215
+ - test/test-livedoor-news.rb
216
+ - test/test-metadata.rb
188
217
  - test/test-mnist.rb
189
218
  - test/test-mushroom.rb
219
+ - test/test-nagoya-university-conversation-corpus.rb
190
220
  - test/test-penguins.rb
191
221
  - test/test-penn-treebank.rb
222
+ - test/test-pmjt-dataset-list.rb
192
223
  - test/test-postal-code-japan.rb
193
- - test/test-rdatasets.rb
194
- - test/test-seaborn-data.rb
224
+ - test/test-quora-duplicate-question-pair.rb
225
+ - test/test-rdataset.rb
226
+ - test/test-seaborn.rb
195
227
  - test/test-sudachi-synonym-dictionary.rb
196
228
  - test/test-table.rb
229
+ - test/test-wikipedia-kyoto-japanese-english.rb
197
230
  - test/test-wikipedia.rb
198
231
  - test/test-wine.rb
199
232
  homepage: https://github.com/red-data-tools/red-datasets
200
233
  licenses:
201
234
  - MIT
202
235
  metadata: {}
203
- post_install_message:
236
+ post_install_message:
204
237
  rdoc_options: []
205
238
  require_paths:
206
239
  - lib
@@ -215,34 +248,49 @@ required_rubygems_version: !ruby/object:Gem::Requirement
215
248
  - !ruby/object:Gem::Version
216
249
  version: '0'
217
250
  requirements: []
218
- rubygems_version: 3.3.0.dev
219
- signing_key:
251
+ rubygems_version: 3.5.0.dev
252
+ signing_key:
220
253
  specification_version: 4
221
254
  summary: Red Datasets provides classes that provide common datasets such as iris dataset.
222
255
  test_files:
223
256
  - test/helper.rb
224
257
  - test/run-test.rb
225
258
  - test/test-adult.rb
259
+ - test/test-afinn.rb
260
+ - test/test-aozora-bunko.rb
261
+ - test/test-california-housing.rb
226
262
  - test/test-cifar.rb
227
263
  - test/test-cldr-plurals.rb
228
264
  - test/test-communities.rb
229
265
  - test/test-dataset.rb
266
+ - test/test-diamonds.rb
230
267
  - test/test-dictionary.rb
231
268
  - test/test-downloader.rb
232
269
  - test/test-e-stat-japan.rb
233
270
  - test/test-fashion-mnist.rb
271
+ - test/test-fuel-economy.rb
272
+ - test/test-geolonia.rb
234
273
  - test/test-hepatitis.rb
235
274
  - test/test-iris.rb
275
+ - test/test-ita-corpus.rb
276
+ - test/test-kuzushiji-mnist.rb
236
277
  - test/test-libsvm-dataset-list.rb
237
278
  - test/test-libsvm.rb
279
+ - test/test-license.rb
280
+ - test/test-livedoor-news.rb
281
+ - test/test-metadata.rb
238
282
  - test/test-mnist.rb
239
283
  - test/test-mushroom.rb
284
+ - test/test-nagoya-university-conversation-corpus.rb
240
285
  - test/test-penguins.rb
241
286
  - test/test-penn-treebank.rb
287
+ - test/test-pmjt-dataset-list.rb
242
288
  - test/test-postal-code-japan.rb
243
- - test/test-rdatasets.rb
244
- - test/test-seaborn-data.rb
289
+ - test/test-quora-duplicate-question-pair.rb
290
+ - test/test-rdataset.rb
291
+ - test/test-seaborn.rb
245
292
  - test/test-sudachi-synonym-dictionary.rb
246
293
  - test/test-table.rb
294
+ - test/test-wikipedia-kyoto-japanese-english.rb
247
295
  - test/test-wikipedia.rb
248
296
  - test/test-wine.rb
@@ -1,49 +0,0 @@
1
- module Datasets
2
- class SeabornData < Dataset
3
- URL_FORMAT = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/%{name}.csv".freeze
4
-
5
- def initialize(name)
6
- super()
7
- @metadata.id = "seaborn-data-#{name}"
8
- @metadata.name = "SeabornData: #{name}"
9
- @metadata.url = URL_FORMAT % {name: name}
10
-
11
- @data_path = cache_dir_path + (name + ".csv")
12
- @name = name
13
- end
14
-
15
- def each(&block)
16
- return to_enum(__method__) unless block_given?
17
-
18
- download(@data_path, @metadata.url) unless @data_path.exist?
19
- CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
20
- csv.each do |row|
21
- record = prepare_record(row)
22
- yield record
23
- end
24
- end
25
- end
26
-
27
- private
28
- def prepare_record(csv_row)
29
- record = csv_row.to_h
30
- record.transform_keys!(&:to_sym)
31
-
32
- # Perform the same preprocessing as seaborn's load_dataset function
33
- preprocessor = :"preprocess_#{@name}_record"
34
- __send__(preprocessor, record) if respond_to?(preprocessor, true)
35
-
36
- record
37
- end
38
-
39
- # The same preprocessing as seaborn.load_dataset
40
- def preprocess_flights_record(record)
41
- record[:month] &&= record[:month][0,3]
42
- end
43
-
44
- # The same preprocessing as seaborn.load_dataset
45
- def preprocess_penguins_record(record)
46
- record[:sex] &&= record[:sex].capitalize
47
- end
48
- end
49
- end
@@ -1,136 +0,0 @@
1
- class RdatasetsTest < Test::Unit::TestCase
2
- sub_test_case("RdatasetsList") do
3
- def setup
4
- @dataset = Datasets::RdatasetsList.new
5
- end
6
-
7
- sub_test_case("#each") do
8
- test("with package_name") do
9
- records = @dataset.filter(package: "datasets").to_a
10
- assert_equal([
11
- 84,
12
- {
13
- package: "datasets",
14
- dataset: "ability.cov",
15
- title: "Ability and Intelligence Tests",
16
- rows: 6,
17
- cols: 8,
18
- n_binary: 0,
19
- n_character: 0,
20
- n_factor: 0,
21
- n_logical: 0,
22
- n_numeric: 8,
23
- csv: "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/ability.cov.csv",
24
- doc: "https://vincentarelbundock.github.io/Rdatasets/doc/datasets/ability.cov.html"
25
- },
26
- {
27
- package: "datasets",
28
- dataset: "WWWusage",
29
- title: "Internet Usage per Minute",
30
- rows: 100,
31
- cols: 2,
32
- n_binary: 0,
33
- n_character: 0,
34
- n_factor: 0,
35
- n_logical: 0,
36
- n_numeric: 2,
37
- csv: "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/WWWusage.csv",
38
- doc: "https://vincentarelbundock.github.io/Rdatasets/doc/datasets/WWWusage.html"
39
- }
40
- ],
41
- [
42
- records.size,
43
- records[0].to_h,
44
- records[-1].to_h
45
- ])
46
- end
47
-
48
- test("without package_name") do
49
- records = @dataset.each.to_a
50
- assert_equal([
51
- 1714,
52
- {
53
- package: "AER",
54
- dataset: "Affairs",
55
- title: "Fair's Extramarital Affairs Data",
56
- rows: 601,
57
- cols: 9,
58
- n_binary: 2,
59
- n_character: 0,
60
- n_factor: 2,
61
- n_logical: 0,
62
- n_numeric: 7,
63
- csv: "https://vincentarelbundock.github.io/Rdatasets/csv/AER/Affairs.csv",
64
- doc: "https://vincentarelbundock.github.io/Rdatasets/doc/AER/Affairs.html"
65
- },
66
- {
67
- package: "vcd",
68
- dataset: "WomenQueue",
69
- title: "Women in Queues",
70
- rows: 11,
71
- cols: 2,
72
- n_binary: 0,
73
- n_character: 0,
74
- n_factor: 1,
75
- n_logical: 0,
76
- n_numeric: 1,
77
- csv: "https://vincentarelbundock.github.io/Rdatasets/csv/vcd/WomenQueue.csv",
78
- doc: "https://vincentarelbundock.github.io/Rdatasets/doc/vcd/WomenQueue.html"
79
- },
80
- ],
81
- [
82
- records.size,
83
- records[0].to_h,
84
- records[-1].to_h
85
- ])
86
- end
87
- end
88
- end
89
-
90
- sub_test_case("Rdatasets") do
91
- sub_test_case("datasets") do
92
- sub_test_case("AirPassengers") do
93
- def setup
94
- @dataset = Datasets::Rdatasets.new("datasets", "AirPassengers")
95
- end
96
-
97
- test("#each") do
98
- records = @dataset.each.to_a
99
- assert_equal([
100
- 144,
101
- { time: 1949, value: 112 },
102
- { time: 1960.91666666667, value: 432 },
103
- ],
104
- [
105
- records.size,
106
- records[0],
107
- records[-1]
108
- ])
109
- end
110
-
111
- test("#metadata.id") do
112
- assert_equal("rdatasets-datasets-AirPassengers", @dataset.metadata.id)
113
- end
114
-
115
- test("#metadata.description") do
116
- description = @dataset.metadata.description
117
- assert do
118
- description.include?("Monthly Airline Passenger Numbers 1949-1960")
119
- end
120
- end
121
- end
122
-
123
- test("invalid dataset name") do
124
- assert_raise(ArgumentError) do
125
- Datasets::Rdatasets.new("datasets", "invalid datasets name")
126
- end
127
- end
128
- end
129
-
130
- test("invalid package name") do
131
- assert_raise(ArgumentError) do
132
- Datasets::Rdatasets.new("invalid package name", "AirPassengers")
133
- end
134
- end
135
- end
136
- end