red-datasets 0.1.4 → 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (74)
  1. checksums.yaml +4 -4
  2. data/README.md +23 -3
  3. data/Rakefile +56 -1
  4. data/doc/text/news.md +102 -0
  5. data/lib/datasets/adult.rb +6 -9
  6. data/lib/datasets/afinn.rb +48 -0
  7. data/lib/datasets/aozora-bunko.rb +196 -0
  8. data/lib/datasets/cache-path.rb +28 -0
  9. data/lib/datasets/california-housing.rb +60 -0
  10. data/lib/datasets/cifar.rb +2 -4
  11. data/lib/datasets/cldr-plurals.rb +2 -4
  12. data/lib/datasets/communities.rb +5 -8
  13. data/lib/datasets/dataset.rb +58 -23
  14. data/lib/datasets/diamonds.rb +26 -0
  15. data/lib/datasets/downloader.rb +110 -30
  16. data/lib/datasets/e-stat-japan.rb +2 -1
  17. data/lib/datasets/fashion-mnist.rb +4 -0
  18. data/lib/datasets/fuel-economy.rb +35 -0
  19. data/lib/datasets/geolonia.rb +67 -0
  20. data/lib/datasets/ggplot2-dataset.rb +79 -0
  21. data/lib/datasets/hepatitis.rb +5 -8
  22. data/lib/datasets/iris.rb +5 -8
  23. data/lib/datasets/ita-corpus.rb +57 -0
  24. data/lib/datasets/kuzushiji-mnist.rb +16 -0
  25. data/lib/datasets/lazy.rb +90 -0
  26. data/lib/datasets/libsvm-dataset-list.rb +5 -8
  27. data/lib/datasets/libsvm.rb +3 -4
  28. data/lib/datasets/license.rb +26 -0
  29. data/lib/datasets/livedoor-news.rb +80 -0
  30. data/lib/datasets/metadata.rb +14 -0
  31. data/lib/datasets/mnist.rb +7 -7
  32. data/lib/datasets/mushroom.rb +5 -8
  33. data/lib/datasets/nagoya-university-conversation-corpus.rb +109 -0
  34. data/lib/datasets/penguins.rb +6 -8
  35. data/lib/datasets/penn-treebank.rb +2 -4
  36. data/lib/datasets/pmjt-dataset-list.rb +67 -0
  37. data/lib/datasets/postal-code-japan.rb +2 -6
  38. data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
  39. data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
  40. data/lib/datasets/seaborn.rb +90 -0
  41. data/lib/datasets/sudachi-synonym-dictionary.rb +5 -11
  42. data/lib/datasets/version.rb +1 -1
  43. data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
  44. data/lib/datasets/wikipedia.rb +16 -8
  45. data/lib/datasets/wine.rb +6 -9
  46. data/lib/datasets/zip-extractor.rb +48 -0
  47. data/lib/datasets.rb +2 -22
  48. data/red-datasets.gemspec +1 -1
  49. data/test/helper.rb +21 -0
  50. data/test/test-afinn.rb +60 -0
  51. data/test/test-aozora-bunko.rb +190 -0
  52. data/test/test-california-housing.rb +56 -0
  53. data/test/test-cldr-plurals.rb +1 -1
  54. data/test/test-dataset.rb +15 -7
  55. data/test/test-diamonds.rb +71 -0
  56. data/test/test-fuel-economy.rb +75 -0
  57. data/test/test-geolonia.rb +65 -0
  58. data/test/test-ita-corpus.rb +69 -0
  59. data/test/test-kuzushiji-mnist.rb +137 -0
  60. data/test/test-license.rb +24 -0
  61. data/test/test-livedoor-news.rb +351 -0
  62. data/test/test-metadata.rb +36 -0
  63. data/test/test-nagoya-university-conversation-corpus.rb +132 -0
  64. data/test/test-penguins.rb +1 -1
  65. data/test/test-pmjt-dataset-list.rb +50 -0
  66. data/test/test-quora-duplicate-question-pair.rb +33 -0
  67. data/test/test-rdataset.rb +246 -0
  68. data/test/{test-seaborn-data.rb → test-seaborn.rb} +71 -4
  69. data/test/test-sudachi-synonym-dictionary.rb +5 -5
  70. data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
  71. data/test/test-wikipedia.rb +25 -71
  72. metadata +62 -14
  73. data/lib/datasets/seaborn-data.rb +0 -49
  74. data/test/test-rdatasets.rb +0 -136
@@ -0,0 +1,60 @@
1
+ class AFINNTest < Test::Unit::TestCase
2
+ def setup
3
+ @dataset = Datasets::AFINN.new
4
+ end
5
+
6
+ test('#each') do
7
+ records = @dataset.each.to_a
8
+ assert_equal([
9
+ 2477,
10
+ {
11
+ :valence => -2,
12
+ :word => "abandon"
13
+ },
14
+ {
15
+ :valence => 2,
16
+ :word => "zealous"
17
+ },
18
+ ],
19
+ [
20
+ records.size,
21
+ records[0].to_h,
22
+ records[-1].to_h,
23
+ ])
24
+ end
25
+
26
+ sub_test_case('#metadata') do
27
+ test('#description') do
28
+ description = @dataset.metadata.description
29
+ assert_equal(<<-DESCRIPTION.chomp, description)
30
+ AFINN is a list of English words rated for valence with an integer
31
+ between minus five (negative) and plus five (positive). The words have
32
+ been manually labeled by Finn Årup Nielsen in 2009-2011. The file
33
+ is tab-separated. There are two versions:
34
+
35
+ AFINN-111: Newest version with 2477 words and phrases.
36
+
37
+ An evaluation of the word list is available in:
38
+
39
+ Finn Årup Nielsen, "A new ANEW: Evaluation of a word list for
40
+ sentiment analysis in microblogs", http://arxiv.org/abs/1103.2903
41
+
42
+ The list was used in:
43
+
44
+ Lars Kai Hansen, Adam Arvidsson, Finn Årup Nielsen, Elanor Colleoni,
45
+ Michael Etter, "Good Friends, Bad News - Affect and Virality in
46
+ Twitter", The 2011 International Workshop on Social Computing,
47
+ Network, and Services (SocialComNet 2011).
48
+
49
+
50
+ This database of words is copyright protected and distributed under
51
+ "Open Database License (ODbL) v1.0"
52
+ http://www.opendatacommons.org/licenses/odbl/1.0/ or a similar
53
+ copyleft license.
54
+
55
+ See comments on the word list here:
56
+ http://fnielsen.posterous.com/old-anew-a-sentiment-about-sentiment-analysis
57
+ DESCRIPTION
58
+ end
59
+ end
60
+ end
@@ -0,0 +1,190 @@
1
+ class AozoraBunkoTest < Test::Unit::TestCase
2
+ include Helper::PathRestorable
3
+
4
+ def setup
5
+ @dataset = Datasets::AozoraBunko.new
6
+ @cache_path = @dataset.send(:cache_path)
7
+ end
8
+
9
+ test('#new') do
10
+ assert_equal({
11
+ title_id: '059898',
12
+ title: 'ウェストミンスター寺院',
13
+ title_reading: 'ウェストミンスターじいん',
14
+ title_reading_collation: 'うえすとみんすたあしいん',
15
+ subtitle: '',
16
+ subtitle_reading: '',
17
+ original_title: '',
18
+ first_appearance: '',
19
+ ndc_code: 'NDC 933',
20
+ syllabary_spelling_type: '新字新仮名',
21
+ copyrighted: false,
22
+ published_date: '2020-04-03',
23
+ last_updated_date: '2020-03-28',
24
+ detail_url: 'https://www.aozora.gr.jp/cards/001257/card59898.html',
25
+ person_id: '001257',
26
+ person_family_name: 'アーヴィング',
27
+ person_first_name: 'ワシントン',
28
+ person_family_name_reading: 'アーヴィング',
29
+ person_first_name_reading: 'ワシントン',
30
+ person_family_name_reading_collation: 'ああういんく',
31
+ person_first_name_reading_collation: 'わしんとん',
32
+ person_family_name_romaji: 'Irving',
33
+ person_first_name_romaji: 'Washington',
34
+ person_type: '著者',
35
+ person_birthday: '1783-04-03',
36
+ person_date_of_death: '1859-11-28',
37
+ person_copyrighted: false,
38
+ original_book_name1: 'スケッチ・ブック',
39
+ original_book_publisher_name1: '新潮文庫、新潮社',
40
+ original_book_first_published_date1: '1957(昭和32)年5月20日',
41
+ used_version_for_registration1: '2000(平成12)年2月20日33刷改版',
42
+ used_version_for_proofreading1: '2000(平成12)年2月20日33刷改版',
43
+ base_of_original_book_name1: '',
44
+ base_of_original_book_publisher_name1: '',
45
+ base_of_original_book_first_published_date1: '',
46
+ original_book_name2: '',
47
+ original_book_publisher_name2: '',
48
+ original_book_first_published_date2: '',
49
+ used_version_for_registration2: '',
50
+ used_version_for_proofreading2: '',
51
+ base_of_original_book_name2: '',
52
+ base_of_original_book_publisher_name2: '',
53
+ base_of_original_book_first_published_date2: '',
54
+ registered_person_name: 'えにしだ',
55
+ proofreader_name: '砂場清隆',
56
+ text_file_url: 'https://www.aozora.gr.jp/cards/001257/files/59898_ruby_70679.zip',
57
+ last_text_file_updated_date: '2020-03-28',
58
+ text_file_character_encoding: 'ShiftJIS',
59
+ text_file_character_set: 'JIS X 0208',
60
+ text_file_updating_count: '0',
61
+ html_file_url: 'https://www.aozora.gr.jp/cards/001257/files/59898_70731.html',
62
+ last_html_file_updated_date: '2020-03-28',
63
+ html_file_character_encoding: 'ShiftJIS',
64
+ html_file_character_set: 'JIS X 0208',
65
+ html_file_updating_count: '0'
66
+
67
+ },
68
+ @dataset.first.to_h)
69
+ end
70
+
71
+ sub_test_case(:Book) do
72
+ sub_test_case('#text') do
73
+ test('readable') do
74
+ book = Datasets::AozoraBunko::Book.new
75
+ book.cache_path = @cache_path
76
+ book.title_id = '059898'
77
+ book.person_id = '001257'
78
+ book.text_file_url = 'https://www.aozora.gr.jp/cards/001257/files/59898_ruby_70679.zip'
79
+ book.text_file_character_encoding = 'ShiftJIS'
80
+
81
+ assert_equal([
82
+ 'ウェストミンスター寺',
83
+ "アの皆さんです。\r\n"
84
+ ],
85
+ [
86
+ book.text[0, 10],
87
+ book.text[-10, 10]
88
+ ])
89
+ end
90
+
91
+ test('not readable') do
92
+ book = Datasets::AozoraBunko::Book.new
93
+ book.text_file_url = 'https://mega.nz/file/6tMxgAjZ#PglDDyJL0syRhnULqK0qhTMC7cktsgqwObj5fY_knpE'
94
+
95
+ assert_equal(nil, book.text)
96
+ end
97
+ end
98
+
99
+ sub_test_case('#html') do
100
+ sub_test_case('readable') do
101
+ test('encoding is ShiftJIS') do
102
+ book = Datasets::AozoraBunko::Book.new
103
+ book.cache_path = @cache_path
104
+ book.title_id = '059898'
105
+ book.person_id = '001257'
106
+ book.html_file_url = 'https://www.aozora.gr.jp/cards/001257/files/59898_70731.html'
107
+ book.html_file_character_encoding = 'ShiftJIS'
108
+
109
+ assert_equal("<title>ワシントン・アーヴィング Washington Irving 吉田甲子太郎訳 ウェストミンスター寺院</title>",
110
+ book.html.split("\n")[8].strip)
111
+ end
112
+
113
+ test('encoding is UTF-8') do
114
+ book = Datasets::AozoraBunko::Book.new
115
+ book.cache_path = @cache_path
116
+
117
+ book.title_id = '000750'
118
+ book.person_id = '000146'
119
+ book.html_file_url = 'http://www.lcv.ne.jp/~ibs52086/fire/'
120
+ book.html_file_character_encoding = 'UTF-8'
121
+
122
+ assert_equal('<title>種田山頭火句集 | 『草木塔抄』他 FIRE ON THE MOUNTAIN</title>',
123
+ book.html.split("\n")[7])
124
+ end
125
+ end
126
+
127
+ test('not readable') do
128
+ book = Datasets::AozoraBunko::Book.new
129
+ book.html_file_url = ''
130
+
131
+ assert_equal(nil, book.html)
132
+ end
133
+ end
134
+
135
+ sub_test_case('converting boolean') do
136
+ test('#person_copyrighted?') do
137
+ book = @dataset.first
138
+ assert_equal([
139
+ false,
140
+ false,
141
+ false,
142
+ ],
143
+ [
144
+ book.person_copyrighted?,
145
+ book.person_copyrighted,
146
+ book.to_h[:person_copyrighted],
147
+ ])
148
+ end
149
+
150
+ test('#copyrighted?') do
151
+ book = @dataset.first
152
+ assert_equal([
153
+ false,
154
+ false,
155
+ false,
156
+ ],
157
+ [
158
+ book.copyrighted?,
159
+ book.copyrighted,
160
+ book.to_h[:copyrighted],
161
+ ])
162
+ end
163
+ end
164
+
165
+ test('#clear_cache! removes all cache files') do
166
+ book = Datasets::AozoraBunko::Book.new
167
+ book.cache_path = @cache_path
168
+
169
+ book.title_id = '059898'
170
+ book.person_id = '001257'
171
+ book.text_file_url = 'https://www.aozora.gr.jp/cards/001257/files/59898_ruby_70679.zip'
172
+ book.text_file_character_encoding = 'ShiftJIS'
173
+ book.html_file_url = 'https://www.aozora.gr.jp/cards/001257/files/59898_70731.html'
174
+ book.html_file_character_encoding = 'ShiftJIS'
175
+
176
+ book.text
177
+ book.html
178
+
179
+ restore_path(@cache_path.base_dir) do
180
+ assert_equal(true, @cache_path.base_dir.exist?)
181
+ assert_equal(true, book.send(:text_file_output_path).exist?)
182
+ assert_equal(true, book.send(:html_file_output_path).exist?)
183
+ @dataset.clear_cache!
184
+ assert_equal(false, book.send(:html_file_output_path).exist?)
185
+ assert_equal(false, book.send(:text_file_output_path).exist?)
186
+ assert_equal(false, @cache_path.base_dir.exist?)
187
+ end
188
+ end
189
+ end
190
+ end
@@ -0,0 +1,56 @@
1
+ class CaliforniaHousingTest < Test::Unit::TestCase
2
+ def setup
3
+ @dataset = Datasets::CaliforniaHousing.new
4
+ end
5
+
6
+ def record(*args)
7
+ Datasets::CaliforniaHousing::Record.new(*args)
8
+ end
9
+
10
+ test("#each") do
11
+ records = @dataset.each.to_a
12
+ assert_equal([
13
+ 20640,
14
+ {
15
+ median_house_value: 452600.000000,
16
+ median_income: 8.325200,
17
+ housing_median_age: 41.000000,
18
+ total_rooms: 880.000000,
19
+ total_bedrooms: 129.000000,
20
+ population: 322.000000,
21
+ households: 126.000000,
22
+ latitude: 37.880000,
23
+ longitude: -122.230000
24
+ },
25
+ {
26
+ median_house_value: 89400.000000,
27
+ median_income: 2.388600,
28
+ housing_median_age: 16.000000,
29
+ total_rooms: 2785.000000,
30
+ total_bedrooms: 616.000000,
31
+ population: 1387.000000,
32
+ households: 530.000000,
33
+ latitude: 39.370000,
34
+ longitude: -121.240000
35
+ },
36
+ ],
37
+ [
38
+ records.size,
39
+ records[0].to_h,
40
+ records[-1].to_h
41
+ ])
42
+ end
43
+
44
+ sub_test_case("#metadata") do
45
+ test("#description") do
46
+ description = @dataset.metadata.description
47
+ assert_equal(<<-DESCRIPTION, description)
48
+ Housing information from the 1990 census used in
49
+ Pace, R. Kelley and Ronald Barry,
50
+ "Sparse Spatial Autoregressions",
51
+ Statistics and Probability Letters, 33 (1997) 291-297.
52
+ Available from http://lib.stat.cmu.edu/datasets/.
53
+ DESCRIPTION
54
+ end
55
+ end
56
+ end
@@ -14,7 +14,7 @@ class CLDRPluralsTest < Test::Unit::TestCase
14
14
  test("#each") do
15
15
  locales = @dataset.each.to_a
16
16
  assert_equal([
17
- 218,
17
+ 219,
18
18
  locale("bm",
19
19
  [
20
20
  rule("other",
data/test/test-dataset.rb CHANGED
@@ -1,5 +1,7 @@
1
1
  class TestDataset < Test::Unit::TestCase
2
2
  sub_test_case("#clear_cache!") do
3
+ include Helper::PathRestorable
4
+
3
5
  def setup
4
6
  @dataset = Datasets::Iris.new
5
7
  @cache_dir_path = @dataset.send(:cache_dir_path)
@@ -9,18 +11,24 @@ class TestDataset < Test::Unit::TestCase
9
11
  @dataset.first # This ensures the dataset downloaded
10
12
  existence = {before: @cache_dir_path.join("iris.csv").exist?}
11
13
 
12
- @dataset.clear_cache!
13
- existence[:after] = @cache_dir_path.join("iris.csv").exist?
14
+ restore_path(@cache_dir_path) do
15
+ @dataset.clear_cache!
16
+ existence[:after] = @cache_dir_path.join("iris.csv").exist?
14
17
 
15
- assert_equal({before: true, after: false},
16
- existence)
18
+ assert_equal({before: true, after: false},
19
+ existence)
20
+ end
17
21
  end
18
22
 
19
23
  test("when the dataset is not downloaded") do
20
- FileUtils.rmtree(@cache_dir_path.to_s, secure: true) if @cache_dir_path.exist?
24
+ restore_path(@cache_dir_path) do
25
+ if @cache_dir_path.exist?
26
+ FileUtils.rmtree(@cache_dir_path.to_s, secure: true)
27
+ end
21
28
 
22
- assert_nothing_raised do
23
- @dataset.clear_cache!
29
+ assert_nothing_raised do
30
+ @dataset.clear_cache!
31
+ end
24
32
  end
25
33
  end
26
34
  end
@@ -0,0 +1,71 @@
1
+ class DiamondsTest < Test::Unit::TestCase
2
+ def setup
3
+ @dataset = Datasets::Diamonds.new
4
+ end
5
+
6
+ def record(*args)
7
+ Datasets::Diamonds::Record.new(*args)
8
+ end
9
+
10
+ test("#each") do
11
+ records = @dataset.each.to_a
12
+ assert_equal([
13
+ 53940,
14
+ {
15
+ carat: 0.23,
16
+ clarity: "SI2",
17
+ color: "E",
18
+ cut: "Ideal",
19
+ depth: 61.5,
20
+ price: 326,
21
+ table: 55.0,
22
+ x: 3.95,
23
+ y: 3.98,
24
+ z: 2.43,
25
+ },
26
+ {
27
+ carat: 0.75,
28
+ clarity: "SI2",
29
+ color: "D",
30
+ cut: "Ideal",
31
+ depth: 62.2,
32
+ price: 2757,
33
+ table: 55.0,
34
+ x: 5.83,
35
+ y: 5.87,
36
+ z: 3.64,
37
+ },
38
+ ],
39
+ [
40
+ records.size,
41
+ records[0].to_h,
42
+ records[-1].to_h
43
+ ])
44
+ end
45
+
46
+ sub_test_case("#metadata") do
47
+ test("#description") do
48
+ description = @dataset.metadata.description
49
+ assert_equal(<<-DESCRIPTION, description)
50
+ Prices of over 50,000 round cut diamonds
51
+
52
+ A dataset containing the prices and other attributes of almost 54,000
53
+ diamonds. The variables are as follows:
54
+
55
+ A data frame with 53940 rows and 10 variables:
56
+
57
+ * price: price in US dollars ($326--$18,823)
58
+ * carat: weight of the diamond (0.2--5.01)
59
+ * cut: quality of the cut (Fair, Good, Very Good, Premium, Ideal)
60
+ * color: diamond colour, from D (best) to J (worst)
61
+ * clarity: a measurement of how clear the diamond is (I1 (worst), SI2,
62
+ SI1, VS2, VS1, VVS2, VVS1, IF (best))
63
+ * x: length in mm (0--10.74)
64
+ * y: width in mm (0--58.9)
65
+ * z: depth in mm (0--31.8)
66
+ * depth: total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43--79)
67
+ * table: width of top of diamond relative to widest point (43--95)
68
+ DESCRIPTION
69
+ end
70
+ end
71
+ end
@@ -0,0 +1,75 @@
1
+ class FuelEconomyTest < Test::Unit::TestCase
2
+ def setup
3
+ @dataset = Datasets::FuelEconomy.new
4
+ end
5
+
6
+ def record(*args)
7
+ Datasets::FuelEconomy::Record.new(*args)
8
+ end
9
+
10
+ test("#each") do
11
+ records = @dataset.each.to_a
12
+ assert_equal([
13
+ 234,
14
+ {
15
+ city_mpg: 18,
16
+ displacement: 1.8,
17
+ drive_train: "f",
18
+ fuel: "p",
19
+ highway_mpg: 29,
20
+ manufacturer: "audi",
21
+ model: "a4",
22
+ n_cylinders: 4,
23
+ transmission: "auto(l5)",
24
+ type: "compact",
25
+ year: 1999
26
+ },
27
+ {
28
+ city_mpg: 17,
29
+ displacement: 3.6,
30
+ drive_train: "f",
31
+ fuel: "p",
32
+ highway_mpg: 26,
33
+ manufacturer: "volkswagen",
34
+ model: "passat",
35
+ n_cylinders: 6,
36
+ transmission: "auto(s6)",
37
+ type: "midsize",
38
+ year: 2008
39
+ },
40
+ ],
41
+ [
42
+ records.size,
43
+ records[0].to_h,
44
+ records[-1].to_h
45
+ ])
46
+ end
47
+
48
+ sub_test_case("#metadata") do
49
+ test("#description") do
50
+ description = @dataset.metadata.description
51
+ assert_equal(<<-DESCRIPTION, description)
52
+ Fuel economy data from 1999 to 2008 for 38 popular models of cars
53
+
54
+ This dataset contains a subset of the fuel economy data that the EPA makes
55
+ available on https://fueleconomy.gov/. It contains only models which
56
+ had a new release every year between 1999 and 2008 - this was used as a
57
+ proxy for the popularity of the car.
58
+
59
+ A data frame with 234 rows and 11 variables:
60
+
61
+ * manufacturer: manufacturer name
62
+ * model: model name
63
+ * displacement: engine displacement, in litres
64
+ * year: year of manufacture
65
+ * n_cylinders: number of cylinders
66
+ * transmissions: type of transmission
67
+ * drive_train: the type of drive train, where f = front-wheel drive, r = rear wheel drive, 4 = 4wd
68
+ * city_mpg: city miles per gallon
69
+ * highway_mpg: highway miles per gallon
70
+ * fuel: fuel type
71
+ * type: "type" of car
72
+ DESCRIPTION
73
+ end
74
+ end
75
+ end
@@ -0,0 +1,65 @@
1
+ class GeoloniaTest < Test::Unit::TestCase
2
+ def setup
3
+ @dataset = Datasets::Geolonia.new
4
+ end
5
+
6
+ test('#each') do
7
+ records = @dataset.each.to_a
8
+ assert_equal([
9
+ 277616,
10
+ {
11
+ :prefecture_code => "01",
12
+ :prefecture_name => "北海道",
13
+ :prefecture_kana => "ホッカイドウ",
14
+ :prefecture_romaji => "HOKKAIDO",
15
+ :municipality_code => "01101",
16
+ :municipality_name => "札幌市中央区",
17
+ :municipality_kana => "サッポロシチュウオウク",
18
+ :municipality_romaji => "SAPPORO SHI CHUO KU",
19
+ :street_name => "旭ケ丘一丁目",
20
+ :street_kana => "アサヒガオカ 1",
21
+ :street_romaji => "ASAHIGAOKA 1",
22
+ :alias => nil,
23
+ :latitude => "43.04223",
24
+ :longitude => "141.319722"
25
+ },
26
+ {
27
+ :prefecture_code => "47",
28
+ :prefecture_name => "沖縄県",
29
+ :prefecture_kana => "オキナワケン",
30
+ :prefecture_romaji => "OKINAWA KEN",
31
+ :municipality_code => "47382",
32
+ :municipality_name => "八重山郡与那国町",
33
+ :municipality_kana => "ヤエヤマグンヨナグニチョウ",
34
+ :municipality_romaji => "YAEYAMA GUN YONAGUNI CHO",
35
+ :street_name => "字与那国",
36
+ :street_kana => nil,
37
+ :street_romaji => nil,
38
+ :alias => nil,
39
+ :latitude => "24.455925",
40
+ :longitude => "122.987678",
41
+ },
42
+ ],
43
+ [
44
+ records.size,
45
+ records[0].to_h,
46
+ records[-1].to_h,
47
+ ])
48
+ end
49
+
50
+ sub_test_case("#metadata") do
51
+ test("#description") do
52
+ description = @dataset.metadata.description
53
+ assert_equal([
54
+ "# Geolonia 住所データ",
55
+ "## 住所データ仕様",
56
+ "### ファイルフォーマット",
57
+ "### 列",
58
+ "### ソート順",
59
+ ],
60
+ description.scan(/^#.*$/),
61
+ description)
62
+ end
63
+ end
64
+
65
+ end
@@ -0,0 +1,69 @@
1
+ class ITACorpusTest < Test::Unit::TestCase
2
+
3
+ sub_test_case("type") do
4
+ test("emotion") do
5
+ dataset = Datasets::ITACorpus.new(type: :emotion)
6
+ records = dataset.to_a
7
+ assert_equal([
8
+ 100,
9
+ {
10
+ :id => "EMOTION100_001",
11
+ :sentence => "えっ嘘でしょ。,エッウソデショ。"
12
+ },
13
+ {
14
+ :id => "EMOTION100_100",
15
+ :sentence => "ラーテャン。,ラーテャン。",
16
+ },
17
+ ],
18
+ [
19
+ records.size,
20
+ records[0].to_h,
21
+ records[-1].to_h,
22
+ ])
23
+ end
24
+
25
+ test("recitation") do
26
+ dataset = Datasets::ITACorpus.new(type: :recitation)
27
+ records = dataset.to_a
28
+ assert_equal([
29
+ 324,
30
+ {
31
+ :id => "RECITATION324_001",
32
+ :sentence => "女の子がキッキッ嬉しそう。,オンナノコガキッキッウレシソー。"
33
+ },
34
+ {
35
+ :id => "RECITATION324_324",
36
+ :sentence => "チュクンの波長は、パツンと共通している。,チュクンノハチョーワ、パツントキョーツウシテイル。",
37
+ },
38
+ ],
39
+ [
40
+ records.size,
41
+ records[0].to_h,
42
+ records[-1].to_h,
43
+ ])
44
+ end
45
+
46
+ test("invalid") do
47
+ message = "Please set type :emotion or :recitation: :invalid"
48
+ assert_raise(ArgumentError.new(message)) do
49
+ Datasets::ITACorpus.new(type: :invalid)
50
+ end
51
+ end
52
+
53
+ end
54
+
55
+ sub_test_case("#metadata") do
56
+ test("#description") do
57
+ dataset = Datasets::ITACorpus.new(type: :emotion)
58
+ description = dataset.metadata.description
59
+ assert_equal([
60
+ "# ITAコーパスの文章リスト公開用リポジトリ",
61
+ "## ITAコーパスとは",
62
+ "## ITAコーパスの文献情報"
63
+ ],
64
+ description.scan(/^#.*$/),
65
+ description)
66
+ end
67
+ end
68
+
69
+ end