red-datasets 0.1.3 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (69) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +23 -2
  3. data/doc/text/news.md +92 -0
  4. data/lib/datasets/adult.rb +6 -9
  5. data/lib/datasets/afinn.rb +48 -0
  6. data/lib/datasets/aozora-bunko.rb +196 -0
  7. data/lib/datasets/cache-path.rb +28 -0
  8. data/lib/datasets/california-housing.rb +60 -0
  9. data/lib/datasets/cifar.rb +2 -4
  10. data/lib/datasets/cldr-plurals.rb +2 -4
  11. data/lib/datasets/communities.rb +5 -8
  12. data/lib/datasets/dataset.rb +8 -12
  13. data/lib/datasets/diamonds.rb +26 -0
  14. data/lib/datasets/downloader.rb +6 -1
  15. data/lib/datasets/e-stat-japan.rb +2 -1
  16. data/lib/datasets/fashion-mnist.rb +4 -0
  17. data/lib/datasets/fuel-economy.rb +35 -0
  18. data/lib/datasets/geolonia.rb +67 -0
  19. data/lib/datasets/ggplot2-dataset.rb +79 -0
  20. data/lib/datasets/hepatitis.rb +5 -8
  21. data/lib/datasets/iris.rb +5 -8
  22. data/lib/datasets/ita-corpus.rb +57 -0
  23. data/lib/datasets/kuzushiji-mnist.rb +16 -0
  24. data/lib/datasets/libsvm-dataset-list.rb +5 -8
  25. data/lib/datasets/libsvm.rb +3 -4
  26. data/lib/datasets/license.rb +26 -0
  27. data/lib/datasets/livedoor-news.rb +80 -0
  28. data/lib/datasets/metadata.rb +14 -0
  29. data/lib/datasets/mnist.rb +7 -7
  30. data/lib/datasets/mushroom.rb +5 -8
  31. data/lib/datasets/penguins.rb +4 -8
  32. data/lib/datasets/penn-treebank.rb +2 -4
  33. data/lib/datasets/pmjt-dataset-list.rb +67 -0
  34. data/lib/datasets/postal-code-japan.rb +2 -6
  35. data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
  36. data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
  37. data/lib/datasets/seaborn.rb +90 -0
  38. data/lib/datasets/sudachi-synonym-dictionary.rb +8 -12
  39. data/lib/datasets/version.rb +1 -1
  40. data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
  41. data/lib/datasets/wikipedia.rb +4 -5
  42. data/lib/datasets/wine.rb +6 -9
  43. data/lib/datasets/zip-extractor.rb +36 -0
  44. data/lib/datasets.rb +14 -2
  45. data/red-datasets.gemspec +1 -1
  46. data/test/helper.rb +21 -0
  47. data/test/test-afinn.rb +60 -0
  48. data/test/test-aozora-bunko.rb +190 -0
  49. data/test/test-california-housing.rb +56 -0
  50. data/test/test-cldr-plurals.rb +1 -1
  51. data/test/test-dataset.rb +15 -7
  52. data/test/test-diamonds.rb +71 -0
  53. data/test/test-fuel-economy.rb +75 -0
  54. data/test/test-geolonia.rb +64 -0
  55. data/test/test-ita-corpus.rb +69 -0
  56. data/test/test-kuzushiji-mnist.rb +137 -0
  57. data/test/test-license.rb +24 -0
  58. data/test/test-livedoor-news.rb +351 -0
  59. data/test/test-metadata.rb +36 -0
  60. data/test/test-penguins.rb +1 -1
  61. data/test/test-pmjt-dataset-list.rb +50 -0
  62. data/test/test-quora-duplicate-question-pair.rb +33 -0
  63. data/test/test-rdataset.rb +246 -0
  64. data/test/{test-seaborn-data.rb → test-seaborn.rb} +70 -4
  65. data/test/test-sudachi-synonym-dictionary.rb +5 -5
  66. data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
  67. metadata +58 -14
  68. data/lib/datasets/seaborn-data.rb +0 -49
  69. data/test/test-rdatasets.rb +0 -136
@@ -0,0 +1,33 @@
1
+ class QuoraDuplicateQuestionPairTest < Test::Unit::TestCase
2
+ def setup
3
+ @dataset = Datasets::QuoraDuplicateQuestionPair.new
4
+ end
5
+
6
+ def record(*args)
7
+ Datasets::QuoraDuplicateQuestionPair::Record.new(*args)
8
+ end
9
+
10
+ test("#each") do
11
+ records = @dataset.each.to_a
12
+ assert_equal([
13
+ 404290,
14
+ record(0,
15
+ 1,
16
+ 2,
17
+ "What is the step by step guide to invest in share market in india?",
18
+ "What is the step by step guide to invest in share market?",
19
+ false),
20
+ record(404289,
21
+ 537932,
22
+ 537933,
23
+ "What is like to have sex with cousin?",
24
+ "What is it like to have sex with your cousin?",
25
+ false),
26
+ ],
27
+ [
28
+ records.size,
29
+ records.first,
30
+ records.last,
31
+ ])
32
+ end
33
+ end
@@ -0,0 +1,246 @@
1
+ class RdatasetTest < Test::Unit::TestCase
2
+ sub_test_case("RdatasetList") do
3
+ def setup
4
+ @dataset = Datasets::RdatasetList.new
5
+ end
6
+
7
+ sub_test_case("#each") do
8
+ test("with package_name") do
9
+ records = @dataset.filter(package: "datasets").to_a
10
+ assert_equal([
11
+ 84,
12
+ {
13
+ package: "datasets",
14
+ dataset: "ability.cov",
15
+ title: "Ability and Intelligence Tests",
16
+ rows: 6,
17
+ cols: 8,
18
+ n_binary: 0,
19
+ n_character: 0,
20
+ n_factor: 0,
21
+ n_logical: 0,
22
+ n_numeric: 8,
23
+ csv: "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/ability.cov.csv",
24
+ doc: "https://vincentarelbundock.github.io/Rdatasets/doc/datasets/ability.cov.html"
25
+ },
26
+ {
27
+ package: "datasets",
28
+ dataset: "WWWusage",
29
+ title: "Internet Usage per Minute",
30
+ rows: 100,
31
+ cols: 2,
32
+ n_binary: 0,
33
+ n_character: 0,
34
+ n_factor: 0,
35
+ n_logical: 0,
36
+ n_numeric: 2,
37
+ csv: "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/WWWusage.csv",
38
+ doc: "https://vincentarelbundock.github.io/Rdatasets/doc/datasets/WWWusage.html"
39
+ }
40
+ ],
41
+ [
42
+ records.size,
43
+ records[0].to_h,
44
+ records[-1].to_h
45
+ ])
46
+ end
47
+
48
+ test("without package_name") do
49
+ records = @dataset.each.to_a
50
+ assert_equal([
51
+ 1892,
52
+ {
53
+ package: "AER",
54
+ dataset: "Affairs",
55
+ title: "Fair's Extramarital Affairs Data",
56
+ rows: 601,
57
+ cols: 9,
58
+ n_binary: 2,
59
+ n_character: 0,
60
+ n_factor: 2,
61
+ n_logical: 0,
62
+ n_numeric: 7,
63
+ csv: "https://vincentarelbundock.github.io/Rdatasets/csv/AER/Affairs.csv",
64
+ doc: "https://vincentarelbundock.github.io/Rdatasets/doc/AER/Affairs.html"
65
+ },
66
+ {
67
+ package: "wooldridge",
68
+ dataset: "wine",
69
+ title: "wine",
70
+ rows: 21,
71
+ cols: 5,
72
+ n_binary: 0,
73
+ n_character: 1,
74
+ n_factor: 0,
75
+ n_logical: 0,
76
+ n_numeric: 4,
77
+ csv: "https://vincentarelbundock.github.io/Rdatasets/csv/wooldridge/wine.csv",
78
+ doc: "https://vincentarelbundock.github.io/Rdatasets/doc/wooldridge/wine.html"
79
+ },
80
+ ],
81
+ [
82
+ records.size,
83
+ records[0].to_h,
84
+ records[-1].to_h
85
+ ])
86
+ end
87
+ end
88
+ end
89
+
90
+ sub_test_case("Rdataset") do
91
+ test('invalid package name') do
92
+ assert_raise(ArgumentError) do
93
+ Datasets::Rdataset.new('invalid package name', 'AirPassengers')
94
+ end
95
+ end
96
+
97
+ sub_test_case("datasets") do
98
+ test("invalid dataset name") do
99
+ assert_raise(ArgumentError) do
100
+ Datasets::Rdataset.new("datasets", "invalid datasets name")
101
+ end
102
+ end
103
+
104
+ sub_test_case("AirPassengers") do
105
+ def setup
106
+ @dataset = Datasets::Rdataset.new("datasets", "AirPassengers")
107
+ end
108
+
109
+ test("#each") do
110
+ records = @dataset.each.to_a
111
+ assert_equal([
112
+ 144,
113
+ { time: 1949, value: 112 },
114
+ { time: 1960.91666666667, value: 432 },
115
+ ],
116
+ [
117
+ records.size,
118
+ records[0],
119
+ records[-1]
120
+ ])
121
+ end
122
+
123
+ test("#metadata.id") do
124
+ assert_equal("rdataset-datasets-AirPassengers", @dataset.metadata.id)
125
+ end
126
+
127
+ test("#metadata.description") do
128
+ description = @dataset.metadata.description
129
+ assert do
130
+ description.include?("Monthly Airline Passenger Numbers 1949-1960")
131
+ end
132
+ end
133
+ end
134
+
135
+ sub_test_case("airquality") do
136
+ def setup
137
+ @dataset = Datasets::Rdataset.new("datasets", "airquality")
138
+ end
139
+
140
+ test("#each") do
141
+ records = @dataset.each.to_a
142
+ assert_equal([
143
+ 153,
144
+ { Ozone: nil, "Solar.R": nil, Wind: 14.3, Temp: 56, Month: 5, Day: 5 },
145
+ { Ozone: 20, "Solar.R": 223, Wind: 11.5, Temp: 68, Month: 9, Day: 30 },
146
+ ],
147
+ [
148
+ records.size,
149
+ records[4],
150
+ records[-1]
151
+ ])
152
+ end
153
+ end
154
+
155
+ sub_test_case('attenu') do
156
+ def setup
157
+ @dataset = Datasets::Rdataset.new('datasets', 'attenu')
158
+ end
159
+
160
+ test('#each') do
161
+ records = @dataset.each.to_a
162
+ assert_equal([
163
+ 182,
164
+ { event: 1, mag: 7, station: "117", dist: 12, accel: 0.359 },
165
+ { event: 16, mag: 5.1, station: nil, dist: 7.6, accel: 0.28 },
166
+ { event: 23, mag: 5.3, station: "c168", dist: 25.3, accel: 0.23 },
167
+ { event: 23, mag: 5.3, station: "5072", dist: 53.1, accel: 0.022 }
168
+ ],
169
+ [
170
+ records.size,
171
+ records[0],
172
+ records[78],
173
+ records[169],
174
+ records[-1]
175
+ ])
176
+ end
177
+ end
178
+ end
179
+
180
+ sub_test_case('drc') do
181
+ sub_test_case('germination') do
182
+ def setup
183
+ @dataset = Datasets::Rdataset.new('drc', 'germination')
184
+ end
185
+
186
+ test('#each') do
187
+ records = @dataset.each.to_a
188
+ assert_equal([
189
+ 192,
190
+ { temp: 10, species: 'wheat', start: 0, end: 1.0, germinated: 0 },
191
+ { temp: 40, species: 'rice', start: 18, end: Float::INFINITY, germinated: 12 }
192
+ ],
193
+ [
194
+ records.size,
195
+ records[0],
196
+ records[-1]
197
+ ])
198
+ end
199
+ end
200
+ end
201
+
202
+ sub_test_case('validate') do
203
+ sub_test_case('nace_rev2') do
204
+ def setup
205
+ @dataset = Datasets::Rdataset.new('validate', 'nace_rev2')
206
+ end
207
+
208
+ test('#each') do
209
+ records = @dataset.each.to_a
210
+ assert_equal([
211
+ 996,
212
+ {
213
+ Order: 398_481,
214
+ Level: 1,
215
+ Code: 'A',
216
+ Parent: '',
217
+ Description: 'AGRICULTURE, FORESTRY AND FISHING',
218
+ This_item_includes: 'This section includes the exploitation of vegetal and animal natural resources, comprising the activities of growing of crops, raising and breeding of animals, harvesting of timber and other plants, animals or animal products from a farm or their natural habitats.',
219
+ This_item_also_includes: '',
220
+ Rulings: '',
221
+ This_item_excludes: '',
222
+ "Reference_to_ISIC_Rev._4": 'A'
223
+ },
224
+ {
225
+ Order: 399_476,
226
+ Level: 4,
227
+ Code: '99.00',
228
+ Parent: '99.0',
229
+ Description: 'Activities of extraterritorial organisations and bodies',
230
+ This_item_includes: "This class includes:\n- activities of international organisations such as the United Nations and the specialised agencies of the United Nations system, regional bodies etc., the International Monetary Fund, the World Bank, the World Customs Organisation, the Organisation for Economic Co-operation and Development, the organisation of Petroleum Exporting Countries, the European Communities, the European Free Trade Association etc.",
231
+ This_item_also_includes: "This class also includes:\n- activities of diplomatic and consular missions when being determined by the country of their location rather than by the country they represent",
232
+ Rulings: '',
233
+ This_item_excludes: '',
234
+ "Reference_to_ISIC_Rev._4": '9900'
235
+ }
236
+ ],
237
+ [
238
+ records.size,
239
+ records[0],
240
+ records[-1]
241
+ ])
242
+ end
243
+ end
244
+ end
245
+ end
246
+ end
@@ -1,7 +1,41 @@
1
- class SeabornDataTest < Test::Unit::TestCase
1
+ class SeabornTest < Test::Unit::TestCase
2
+ sub_test_case("list") do
3
+ def setup
4
+ @dataset = Datasets::SeabornList.new
5
+ end
6
+
7
+ def test_each
8
+ records = @dataset.each.to_a
9
+ assert_equal([
10
+ {dataset: "anagrams"},
11
+ {dataset: "anscombe"},
12
+ {dataset: "attention"},
13
+ {dataset: "brain_networks"},
14
+ {dataset: "car_crashes"},
15
+ {dataset: "diamonds"},
16
+ {dataset: "dots"},
17
+ {dataset: "exercise"},
18
+ {dataset: "flights"},
19
+ {dataset: "fmri"},
20
+ {dataset: "geyser"},
21
+ {dataset: "glue"},
22
+ {dataset: "healthexp"},
23
+ {dataset: "iris"},
24
+ {dataset: "mpg"},
25
+ {dataset: "penguins"},
26
+ {dataset: "planets"},
27
+ {dataset: "seaice"},
28
+ {dataset: "taxis"},
29
+ {dataset: "tips"},
30
+ {dataset: "titanic"},
31
+ ],
32
+ records)
33
+ end
34
+ end
35
+
2
36
  sub_test_case("fmri") do
3
37
  def setup
4
- @dataset = Datasets::SeabornData.new("fmri")
38
+ @dataset = Datasets::Seaborn.new("fmri")
5
39
  end
6
40
 
7
41
  def test_each
@@ -33,7 +67,7 @@ class SeabornDataTest < Test::Unit::TestCase
33
67
 
34
68
  sub_test_case("flights") do
35
69
  def setup
36
- @dataset = Datasets::SeabornData.new("flights")
70
+ @dataset = Datasets::Seaborn.new("flights")
37
71
  end
38
72
 
39
73
  def test_each
@@ -61,7 +95,7 @@ class SeabornDataTest < Test::Unit::TestCase
61
95
 
62
96
  sub_test_case("penguins") do
63
97
  def setup
64
- @dataset = Datasets::SeabornData.new("penguins")
98
+ @dataset = Datasets::Seaborn.new("penguins")
65
99
  end
66
100
 
67
101
  def test_each
@@ -94,4 +128,36 @@ class SeabornDataTest < Test::Unit::TestCase
94
128
  ])
95
129
  end
96
130
  end
131
+
132
+ sub_test_case("attention") do
133
+ def setup
134
+ @dataset = Datasets::Seaborn.new("attention")
135
+ end
136
+
137
+ def test_each
138
+ records = @dataset.to_a
139
+ assert_equal([
140
+ 60,
141
+ {
142
+ index: 1,
143
+ subject: 2,
144
+ attention: "divided",
145
+ solutions: 1,
146
+ score: 3.0
147
+ },
148
+ {
149
+ index: 59,
150
+ subject: 20,
151
+ attention: "focused",
152
+ solutions: 3,
153
+ score: 5.0
154
+ }
155
+ ],
156
+ [
157
+ records.size,
158
+ records[1],
159
+ records[-1]
160
+ ])
161
+ end
162
+ end
97
163
  end
@@ -6,7 +6,7 @@ class SudachiSynonymDictionaryTest < Test::Unit::TestCase
6
6
  test('#each') do
7
7
  records = @dataset.each.to_a
8
8
  assert_equal([
9
- 61335,
9
+ 65182,
10
10
  {
11
11
  group_id: "000001",
12
12
  is_noun: true,
@@ -19,15 +19,15 @@ class SudachiSynonymDictionaryTest < Test::Unit::TestCase
19
19
  notation: "曖昧",
20
20
  },
21
21
  {
22
- group_id: "023705",
22
+ group_id: "024909",
23
23
  is_noun: true,
24
- expansion_type: :always,
24
+ expansion_type: :expanded,
25
25
  lexeme_id: 1,
26
26
  form_type: :typical,
27
27
  acronym_type: :alphabet,
28
28
  variant_type: :typical,
29
- categories: ["単位"],
30
- notation: "GB",
29
+ categories: ["ビジネス"],
30
+ notation: "BPO",
31
31
  },
32
32
  ],
33
33
  [
@@ -0,0 +1,178 @@
1
+ class WikipediaKyotoJapaneseEnglishTest < Test::Unit::TestCase
2
+ sub_test_case("article") do
3
+ def setup
4
+ @dataset = Datasets::WikipediaKyotoJapaneseEnglish.new(type: :article)
5
+ end
6
+
7
# Returns +text+ unchanged when it is at most 20 characters long; otherwise
# returns its first 20 characters followed by "...". Keeps the expected
# values in the assertions below short.
def shorten_text(text)
  limit = 20
  return text if text.size <= limit

  "#{text[0, limit]}..."
end
15
+
16
# Converts a dataset record into a plain Hash so that it can be compared
# against a literal expectation.
#
# The result always carries a :class entry holding the record's class name
# without its namespace. Title and Sentence records are reduced to the IDs
# of the section (and, for Sentence, paragraph) they reference plus their
# shortened Japanese/English text. Any other record is walked through
# #members: Array values are converted element by element, String values
# are shortened, Struct values are converted recursively, and every other
# value is stored unchanged.
#
# @param record [Object] a Title, a Sentence, or an object that responds to
#   #members and #[] (a Struct)
# @return [Hash] Hash representation keyed by Symbol
def hashify(record)
  hash = {class: record.class.name.split("::").last}
  case record
  when Datasets::WikipediaKyotoJapaneseEnglish::Title
    # section may be nil, hence the safe navigation.
    hash[:section] = record.section&.id
    hash[:japanese] = shorten_text(record.japanese)
    hash[:english] = shorten_text(record.english)
  when Datasets::WikipediaKyotoJapaneseEnglish::Sentence
    hash[:id] = record.id
    hash[:section] = record.section&.id
    hash[:paragraph] = record.paragraph&.id
    hash[:japanese] = shorten_text(record.japanese)
    hash[:english] = shorten_text(record.english)
  else
    record.members.each do |member|
      value = record[member]
      case value
      when Array
        value = value.collect do |v|
          hashify(v)
        end
      when String
        value = shorten_text(value)
      when Struct
        # This used to call the misspelled "hasify", which raised
        # NoMethodError for any record holding a Struct-valued member.
        value = hashify(value)
      end
      hash[member] = value
    end
  end
  hash
end
47
+
48
+ test("#each") do
49
+ first_record = @dataset.each.first
50
+ assert_equal({
51
+ class: "Article",
52
+ copyright: "copyright (c) 2010 前...",
53
+ sections: [],
54
+ source: "jawiki-20080607-page...",
55
+ contents: [
56
+ {
57
+ class: "Title",
58
+ section: nil,
59
+ english: "Genkitsu SANYO",
60
+ japanese: "三要元佶",
61
+ },
62
+ {
63
+ class: "Sentence",
64
+ id: "1",
65
+ section: nil,
66
+ paragraph: "1",
67
+ english: "Genkitsu SANYO (1548...",
68
+ japanese: "三要元佶(さんよう げんきつ, 天文 (...",
69
+ },
70
+ {
71
+ class: "Sentence",
72
+ id: "2",
73
+ section: nil,
74
+ paragraph: "2",
75
+ english: "He was originally fr...",
76
+ japanese: "肥前国(佐賀県)の出身。",
77
+ },
78
+ {
79
+ class: "Sentence",
80
+ id: "3",
81
+ section: nil,
82
+ paragraph: "2",
83
+ english: "His Go (pen name) wa...",
84
+ japanese: "号は閑室。",
85
+ },
86
+ {
87
+ class: "Sentence",
88
+ id: "4",
89
+ section: nil,
90
+ paragraph: "2",
91
+ english: "He was called Kiccho...",
92
+ japanese: "佶長老、閑室和尚と呼ばれた。",
93
+ },
94
+ {
95
+ class: "Sentence",
96
+ id: "5",
97
+ section: nil,
98
+ paragraph: "3",
99
+ english: "He went up to the ca...",
100
+ japanese: "幼少時に都に上り、岩倉の円通寺 (京都市...",
101
+ },
102
+ {
103
+ class: "Sentence",
104
+ id: "6",
105
+ section: nil,
106
+ paragraph: "4",
107
+ english: "After assuming the p...",
108
+ japanese: "足利学校の長となるが、関ヶ原の戦いの折に...",
109
+ },
110
+ {
111
+ class: "Sentence",
112
+ id: "7",
113
+ section: nil,
114
+ paragraph: "5",
115
+ english: "He assumed the posit...",
116
+ japanese: "金地院崇伝と寺社奉行の任に当たり、西笑承...",
117
+ },
118
+ {
119
+ class: "Sentence",
120
+ id: "8",
121
+ section: nil,
122
+ paragraph: "6",
123
+ english: "Later, he was invite...",
124
+ japanese: "家康によって、伏見区の学校に招かれ、円光...",
125
+ },
126
+ ],
127
+ },
128
+ hashify(first_record))
129
+ end
130
+ end
131
+
132
+ sub_test_case("lexicon") do
133
+ def setup
134
+ @dataset = Datasets::WikipediaKyotoJapaneseEnglish.new(type: :lexicon)
135
+ end
136
+
137
+ test("#each") do
138
+ records = @dataset.each.to_a
139
+ assert_equal([
140
+ 51982,
141
+ {
142
+ :japanese => "102世吉田日厚貫首",
143
+ :english => "the 102nd head priest, Nikko TOSHIDA"
144
+ },
145
+ {
146
+ :japanese => "龗神社",
147
+ :english => "Okami-jinja Shrine"
148
+ },
149
+ ],
150
+ [
151
+ records.size,
152
+ records[0].to_h,
153
+ records[-1].to_h,
154
+ ])
155
+ end
156
+ end
157
+
158
+ test("invalid") do
159
+ message = "Please set type :article or :lexicon: :invalid"
160
+ assert_raise(ArgumentError.new(message)) do
161
+ Datasets::WikipediaKyotoJapaneseEnglish.new(type: :invalid)
162
+ end
163
+ end
164
+
165
+ test("description") do
166
+ dataset = Datasets::WikipediaKyotoJapaneseEnglish.new
167
+ description = dataset.metadata.description
168
+ assert_equal(<<-DESCRIPTION, description)
169
+ "The Japanese-English Bilingual Corpus of Wikipedia's Kyoto Articles"
170
+ aims mainly at supporting research and development relevant to
171
+ high-performance multilingual machine translation, information
172
+ extraction, and other language processing technologies. The National
173
+ Institute of Information and Communications Technology (NICT) has
174
+ created this corpus by manually translating Japanese Wikipedia
175
+ articles (related to Kyoto) into English.
176
+ DESCRIPTION
177
+ end
178
+ end