red-datasets 0.1.4 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. checksums.yaml +4 -4
  2. data/README.md +23 -3
  3. data/Rakefile +56 -1
  4. data/doc/text/news.md +102 -0
  5. data/lib/datasets/adult.rb +6 -9
  6. data/lib/datasets/afinn.rb +48 -0
  7. data/lib/datasets/aozora-bunko.rb +196 -0
  8. data/lib/datasets/cache-path.rb +28 -0
  9. data/lib/datasets/california-housing.rb +60 -0
  10. data/lib/datasets/cifar.rb +2 -4
  11. data/lib/datasets/cldr-plurals.rb +2 -4
  12. data/lib/datasets/communities.rb +5 -8
  13. data/lib/datasets/dataset.rb +58 -23
  14. data/lib/datasets/diamonds.rb +26 -0
  15. data/lib/datasets/downloader.rb +110 -30
  16. data/lib/datasets/e-stat-japan.rb +2 -1
  17. data/lib/datasets/fashion-mnist.rb +4 -0
  18. data/lib/datasets/fuel-economy.rb +35 -0
  19. data/lib/datasets/geolonia.rb +67 -0
  20. data/lib/datasets/ggplot2-dataset.rb +79 -0
  21. data/lib/datasets/hepatitis.rb +5 -8
  22. data/lib/datasets/iris.rb +5 -8
  23. data/lib/datasets/ita-corpus.rb +57 -0
  24. data/lib/datasets/kuzushiji-mnist.rb +16 -0
  25. data/lib/datasets/lazy.rb +90 -0
  26. data/lib/datasets/libsvm-dataset-list.rb +5 -8
  27. data/lib/datasets/libsvm.rb +3 -4
  28. data/lib/datasets/license.rb +26 -0
  29. data/lib/datasets/livedoor-news.rb +80 -0
  30. data/lib/datasets/metadata.rb +14 -0
  31. data/lib/datasets/mnist.rb +7 -7
  32. data/lib/datasets/mushroom.rb +5 -8
  33. data/lib/datasets/nagoya-university-conversation-corpus.rb +109 -0
  34. data/lib/datasets/penguins.rb +6 -8
  35. data/lib/datasets/penn-treebank.rb +2 -4
  36. data/lib/datasets/pmjt-dataset-list.rb +67 -0
  37. data/lib/datasets/postal-code-japan.rb +2 -6
  38. data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
  39. data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
  40. data/lib/datasets/seaborn.rb +90 -0
  41. data/lib/datasets/sudachi-synonym-dictionary.rb +5 -11
  42. data/lib/datasets/version.rb +1 -1
  43. data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
  44. data/lib/datasets/wikipedia.rb +16 -8
  45. data/lib/datasets/wine.rb +6 -9
  46. data/lib/datasets/zip-extractor.rb +48 -0
  47. data/lib/datasets.rb +2 -22
  48. data/red-datasets.gemspec +1 -1
  49. data/test/helper.rb +21 -0
  50. data/test/test-afinn.rb +60 -0
  51. data/test/test-aozora-bunko.rb +190 -0
  52. data/test/test-california-housing.rb +56 -0
  53. data/test/test-cldr-plurals.rb +1 -1
  54. data/test/test-dataset.rb +15 -7
  55. data/test/test-diamonds.rb +71 -0
  56. data/test/test-fuel-economy.rb +75 -0
  57. data/test/test-geolonia.rb +65 -0
  58. data/test/test-ita-corpus.rb +69 -0
  59. data/test/test-kuzushiji-mnist.rb +137 -0
  60. data/test/test-license.rb +24 -0
  61. data/test/test-livedoor-news.rb +351 -0
  62. data/test/test-metadata.rb +36 -0
  63. data/test/test-nagoya-university-conversation-corpus.rb +132 -0
  64. data/test/test-penguins.rb +1 -1
  65. data/test/test-pmjt-dataset-list.rb +50 -0
  66. data/test/test-quora-duplicate-question-pair.rb +33 -0
  67. data/test/test-rdataset.rb +246 -0
  68. data/test/{test-seaborn-data.rb → test-seaborn.rb} +71 -4
  69. data/test/test-sudachi-synonym-dictionary.rb +5 -5
  70. data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
  71. data/test/test-wikipedia.rb +25 -71
  72. metadata +62 -14
  73. data/lib/datasets/seaborn-data.rb +0 -49
  74. data/test/test-rdatasets.rb +0 -136
@@ -0,0 +1,60 @@
1
+ class AFINNTest < Test::Unit::TestCase
2
+ def setup
3
+ @dataset = Datasets::AFINN.new
4
+ end
5
+
6
+ test('#each') do
7
+ records = @dataset.each.to_a
8
+ assert_equal([
9
+ 2477,
10
+ {
11
+ :valence => -2,
12
+ :word => "abandon"
13
+ },
14
+ {
15
+ :valence => 2,
16
+ :word => "zealous"
17
+ },
18
+ ],
19
+ [
20
+ records.size,
21
+ records[0].to_h,
22
+ records[-1].to_h,
23
+ ])
24
+ end
25
+
26
+ sub_test_case('#metadata') do
27
+ test('#description') do
28
+ description = @dataset.metadata.description
29
+ assert_equal(<<-DESCRIPTION.chomp, description)
30
+ AFINN is a list of English words rated for valence with an integer
31
+ between minus five (negative) and plus five (positive). The words have
32
+ been manually labeled by Finn Årup Nielsen in 2009-2011. The file
33
+ is tab-separated. There are two versions:
34
+
35
+ AFINN-111: Newest version with 2477 words and phrases.
36
+
37
+ An evaluation of the word list is available in:
38
+
39
+ Finn Årup Nielsen, "A new ANEW: Evaluation of a word list for
40
+ sentiment analysis in microblogs", http://arxiv.org/abs/1103.2903
41
+
42
+ The list was used in:
43
+
44
+ Lars Kai Hansen, Adam Arvidsson, Finn Årup Nielsen, Elanor Colleoni,
45
+ Michael Etter, "Good Friends, Bad News - Affect and Virality in
46
+ Twitter", The 2011 International Workshop on Social Computing,
47
+ Network, and Services (SocialComNet 2011).
48
+
49
+
50
+ This database of words is copyright protected and distributed under
51
+ "Open Database License (ODbL) v1.0"
52
+ http://www.opendatacommons.org/licenses/odbl/1.0/ or a similar
53
+ copyleft license.
54
+
55
+ See comments on the word list here:
56
+ http://fnielsen.posterous.com/old-anew-a-sentiment-about-sentiment-analysis
57
+ DESCRIPTION
58
+ end
59
+ end
60
+ end
@@ -0,0 +1,190 @@
1
+ class AozoraBunkoTest < Test::Unit::TestCase
2
+ include Helper::PathRestorable
3
+
4
+ def setup
5
+ @dataset = Datasets::AozoraBunko.new
6
+ @cache_path = @dataset.send(:cache_path)
7
+ end
8
+
9
+ test('#new') do
10
+ assert_equal({
11
+ title_id: '059898',
12
+ title: 'ウェストミンスター寺院',
13
+ title_reading: 'ウェストミンスターじいん',
14
+ title_reading_collation: 'うえすとみんすたあしいん',
15
+ subtitle: '',
16
+ subtitle_reading: '',
17
+ original_title: '',
18
+ first_appearance: '',
19
+ ndc_code: 'NDC 933',
20
+ syllabary_spelling_type: '新字新仮名',
21
+ copyrighted: false,
22
+ published_date: '2020-04-03',
23
+ last_updated_date: '2020-03-28',
24
+ detail_url: 'https://www.aozora.gr.jp/cards/001257/card59898.html',
25
+ person_id: '001257',
26
+ person_family_name: 'アーヴィング',
27
+ person_first_name: 'ワシントン',
28
+ person_family_name_reading: 'アーヴィング',
29
+ person_first_name_reading: 'ワシントン',
30
+ person_family_name_reading_collation: 'ああういんく',
31
+ person_first_name_reading_collation: 'わしんとん',
32
+ person_family_name_romaji: 'Irving',
33
+ person_first_name_romaji: 'Washington',
34
+ person_type: '著者',
35
+ person_birthday: '1783-04-03',
36
+ person_date_of_death: '1859-11-28',
37
+ person_copyrighted: false,
38
+ original_book_name1: 'スケッチ・ブック',
39
+ original_book_publisher_name1: '新潮文庫、新潮社',
40
+ original_book_first_published_date1: '1957(昭和32)年5月20日',
41
+ used_version_for_registration1: '2000(平成12)年2月20日33刷改版',
42
+ used_version_for_proofreading1: '2000(平成12)年2月20日33刷改版',
43
+ base_of_original_book_name1: '',
44
+ base_of_original_book_publisher_name1: '',
45
+ base_of_original_book_first_published_date1: '',
46
+ original_book_name2: '',
47
+ original_book_publisher_name2: '',
48
+ original_book_first_published_date2: '',
49
+ used_version_for_registration2: '',
50
+ used_version_for_proofreading2: '',
51
+ base_of_original_book_name2: '',
52
+ base_of_original_book_publisher_name2: '',
53
+ base_of_original_book_first_published_date2: '',
54
+ registered_person_name: 'えにしだ',
55
+ proofreader_name: '砂場清隆',
56
+ text_file_url: 'https://www.aozora.gr.jp/cards/001257/files/59898_ruby_70679.zip',
57
+ last_text_file_updated_date: '2020-03-28',
58
+ text_file_character_encoding: 'ShiftJIS',
59
+ text_file_character_set: 'JIS X 0208',
60
+ text_file_updating_count: '0',
61
+ html_file_url: 'https://www.aozora.gr.jp/cards/001257/files/59898_70731.html',
62
+ last_html_file_updated_date: '2020-03-28',
63
+ html_file_character_encoding: 'ShiftJIS',
64
+ html_file_character_set: 'JIS X 0208',
65
+ html_file_updating_count: '0'
66
+
67
+ },
68
+ @dataset.first.to_h)
69
+ end
70
+
71
+ sub_test_case(:Book) do
72
+ sub_test_case('#text') do
73
+ test('readable') do
74
+ book = Datasets::AozoraBunko::Book.new
75
+ book.cache_path = @cache_path
76
+ book.title_id = '059898'
77
+ book.person_id = '001257'
78
+ book.text_file_url = 'https://www.aozora.gr.jp/cards/001257/files/59898_ruby_70679.zip'
79
+ book.text_file_character_encoding = 'ShiftJIS'
80
+
81
+ assert_equal([
82
+ 'ウェストミンスター寺',
83
+ "アの皆さんです。\r\n"
84
+ ],
85
+ [
86
+ book.text[0, 10],
87
+ book.text[-10, 10]
88
+ ])
89
+ end
90
+
91
+ test('not readable') do
92
+ book = Datasets::AozoraBunko::Book.new
93
+ book.text_file_url = 'https://mega.nz/file/6tMxgAjZ#PglDDyJL0syRhnULqK0qhTMC7cktsgqwObj5fY_knpE'
94
+
95
+ assert_equal(nil, book.text)
96
+ end
97
+ end
98
+
99
+ sub_test_case('#html') do
100
+ sub_test_case('readable') do
101
+ test('encoding is ShiftJIS') do
102
+ book = Datasets::AozoraBunko::Book.new
103
+ book.cache_path = @cache_path
104
+ book.title_id = '059898'
105
+ book.person_id = '001257'
106
+ book.html_file_url = 'https://www.aozora.gr.jp/cards/001257/files/59898_70731.html'
107
+ book.html_file_character_encoding = 'ShiftJIS'
108
+
109
+ assert_equal("<title>ワシントン・アーヴィング Washington Irving 吉田甲子太郎訳 ウェストミンスター寺院</title>",
110
+ book.html.split("\n")[8].strip)
111
+ end
112
+
113
+ test('encoding is UTF-8') do
114
+ book = Datasets::AozoraBunko::Book.new
115
+ book.cache_path = @cache_path
116
+
117
+ book.title_id = '000750'
118
+ book.person_id = '000146'
119
+ book.html_file_url = 'http://www.lcv.ne.jp/~ibs52086/fire/'
120
+ book.html_file_character_encoding = 'UTF-8'
121
+
122
+ assert_equal('<title>種田山頭火句集 | 『草木塔抄』他 FIRE ON THE MOUNTAIN</title>',
123
+ book.html.split("\n")[7])
124
+ end
125
+ end
126
+
127
+ test('not readable') do
128
+ book = Datasets::AozoraBunko::Book.new
129
+ book.html_file_url = ''
130
+
131
+ assert_equal(nil, book.html)
132
+ end
133
+ end
134
+
135
+ sub_test_case('converting boolean') do
136
+ test('#person_copyrighted?') do
137
+ book = @dataset.first
138
+ assert_equal([
139
+ false,
140
+ false,
141
+ false,
142
+ ],
143
+ [
144
+ book.person_copyrighted?,
145
+ book.person_copyrighted,
146
+ book.to_h[:person_copyrighted],
147
+ ])
148
+ end
149
+
150
+ test('#copyrighted?') do
151
+ book = @dataset.first
152
+ assert_equal([
153
+ false,
154
+ false,
155
+ false,
156
+ ],
157
+ [
158
+ book.copyrighted?,
159
+ book.copyrighted,
160
+ book.to_h[:copyrighted],
161
+ ])
162
+ end
163
+ end
164
+
165
+ test('#clear_cache! removes all cache files') do
166
+ book = Datasets::AozoraBunko::Book.new
167
+ book.cache_path = @cache_path
168
+
169
+ book.title_id = '059898'
170
+ book.person_id = '001257'
171
+ book.text_file_url = 'https://www.aozora.gr.jp/cards/001257/files/59898_ruby_70679.zip'
172
+ book.text_file_character_encoding = 'ShiftJIS'
173
+ book.html_file_url = 'https://www.aozora.gr.jp/cards/001257/files/59898_70731.html'
174
+ book.html_file_character_encoding = 'ShiftJIS'
175
+
176
+ book.text
177
+ book.html
178
+
179
+ restore_path(@cache_path.base_dir) do
180
+ assert_equal(true, @cache_path.base_dir.exist?)
181
+ assert_equal(true, book.send(:text_file_output_path).exist?)
182
+ assert_equal(true, book.send(:html_file_output_path).exist?)
183
+ @dataset.clear_cache!
184
+ assert_equal(false, book.send(:html_file_output_path).exist?)
185
+ assert_equal(false, book.send(:text_file_output_path).exist?)
186
+ assert_equal(false, @cache_path.base_dir.exist?)
187
+ end
188
+ end
189
+ end
190
+ end
@@ -0,0 +1,56 @@
1
+ class CaliforniaHousingTest < Test::Unit::TestCase
2
+ def setup
3
+ @dataset = Datasets::CaliforniaHousing.new
4
+ end
5
+
6
+ def record(*args)
7
+ Datasets::CaliforniaHousing::Record.new(*args)
8
+ end
9
+
10
+ test("#each") do
11
+ records = @dataset.each.to_a
12
+ assert_equal([
13
+ 20640,
14
+ {
15
+ median_house_value: 452600.000000,
16
+ median_income: 8.325200,
17
+ housing_median_age: 41.000000,
18
+ total_rooms: 880.000000,
19
+ total_bedrooms: 129.000000,
20
+ population: 322.000000,
21
+ households: 126.000000,
22
+ latitude: 37.880000,
23
+ longitude: -122.230000
24
+ },
25
+ {
26
+ median_house_value: 89400.000000,
27
+ median_income: 2.388600,
28
+ housing_median_age: 16.000000,
29
+ total_rooms: 2785.000000,
30
+ total_bedrooms: 616.000000,
31
+ population: 1387.000000,
32
+ households: 530.000000,
33
+ latitude: 39.370000,
34
+ longitude: -121.240000
35
+ },
36
+ ],
37
+ [
38
+ records.size,
39
+ records[0].to_h,
40
+ records[-1].to_h
41
+ ])
42
+ end
43
+
44
+ sub_test_case("#metadata") do
45
+ test("#description") do
46
+ description = @dataset.metadata.description
47
+ assert_equal(<<-DESCRIPTION, description)
48
+ Housing information from the 1990 census used in
49
+ Pace, R. Kelley and Ronald Barry,
50
+ "Sparse Spatial Autoregressions",
51
+ Statistics and Probability Letters, 33 (1997) 291-297.
52
+ Available from http://lib.stat.cmu.edu/datasets/.
53
+ DESCRIPTION
54
+ end
55
+ end
56
+ end
@@ -14,7 +14,7 @@ class CLDRPluralsTest < Test::Unit::TestCase
14
14
  test("#each") do
15
15
  locales = @dataset.each.to_a
16
16
  assert_equal([
17
- 218,
17
+ 219,
18
18
  locale("bm",
19
19
  [
20
20
  rule("other",
data/test/test-dataset.rb CHANGED
@@ -1,5 +1,7 @@
1
1
  class TestDataset < Test::Unit::TestCase
2
2
  sub_test_case("#clear_cache!") do
3
+ include Helper::PathRestorable
4
+
3
5
  def setup
4
6
  @dataset = Datasets::Iris.new
5
7
  @cache_dir_path = @dataset.send(:cache_dir_path)
@@ -9,18 +11,24 @@ class TestDataset < Test::Unit::TestCase
9
11
  @dataset.first # This ensures the dataset downloaded
10
12
  existence = {before: @cache_dir_path.join("iris.csv").exist?}
11
13
 
12
- @dataset.clear_cache!
13
- existence[:after] = @cache_dir_path.join("iris.csv").exist?
14
+ restore_path(@cache_dir_path) do
15
+ @dataset.clear_cache!
16
+ existence[:after] = @cache_dir_path.join("iris.csv").exist?
14
17
 
15
- assert_equal({before: true, after: false},
16
- existence)
18
+ assert_equal({before: true, after: false},
19
+ existence)
20
+ end
17
21
  end
18
22
 
19
23
  test("when the dataset is not downloaded") do
20
- FileUtils.rmtree(@cache_dir_path.to_s, secure: true) if @cache_dir_path.exist?
24
+ restore_path(@cache_dir_path) do
25
+ if @cache_dir_path.exist?
26
+ FileUtils.rmtree(@cache_dir_path.to_s, secure: true)
27
+ end
21
28
 
22
- assert_nothing_raised do
23
- @dataset.clear_cache!
29
+ assert_nothing_raised do
30
+ @dataset.clear_cache!
31
+ end
24
32
  end
25
33
  end
26
34
  end
@@ -0,0 +1,71 @@
1
+ class DiamondsTest < Test::Unit::TestCase
2
+ def setup
3
+ @dataset = Datasets::Diamonds.new
4
+ end
5
+
6
+ def record(*args)
7
+ Datasets::Diamonds::Record.new(*args)
8
+ end
9
+
10
+ test("#each") do
11
+ records = @dataset.each.to_a
12
+ assert_equal([
13
+ 53940,
14
+ {
15
+ carat: 0.23,
16
+ clarity: "SI2",
17
+ color: "E",
18
+ cut: "Ideal",
19
+ depth: 61.5,
20
+ price: 326,
21
+ table: 55.0,
22
+ x: 3.95,
23
+ y: 3.98,
24
+ z: 2.43,
25
+ },
26
+ {
27
+ carat: 0.75,
28
+ clarity: "SI2",
29
+ color: "D",
30
+ cut: "Ideal",
31
+ depth: 62.2,
32
+ price: 2757,
33
+ table: 55.0,
34
+ x: 5.83,
35
+ y: 5.87,
36
+ z: 3.64,
37
+ },
38
+ ],
39
+ [
40
+ records.size,
41
+ records[0].to_h,
42
+ records[-1].to_h
43
+ ])
44
+ end
45
+
46
+ sub_test_case("#metadata") do
47
+ test("#description") do
48
+ description = @dataset.metadata.description
49
+ assert_equal(<<-DESCRIPTION, description)
50
+ Prices of over 50,000 round cut diamonds
51
+
52
+ A dataset containing the prices and other attributes of almost 54,000
53
+ diamonds. The variables are as follows:
54
+
55
+ A data frame with 53940 rows and 10 variables:
56
+
57
+ * price: price in US dollars ($326--$18,823)
58
+ * carat: weight of the diamond (0.2--5.01)
59
+ * cut: quality of the cut (Fair, Good, Very Good, Premium, Ideal)
60
+ * color: diamond colour, from D (best) to J (worst)
61
+ * clarity: a measurement of how clear the diamond is (I1 (worst), SI2,
62
+ SI1, VS2, VS1, VVS2, VVS1, IF (best))
63
+ * x: length in mm (0--10.74)
64
+ * y: width in mm (0--58.9)
65
+ * z: depth in mm (0--31.8)
66
+ * depth: total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43--79)
67
+ * table: width of top of diamond relative to widest point (43--95)
68
+ DESCRIPTION
69
+ end
70
+ end
71
+ end
@@ -0,0 +1,75 @@
1
+ class FuelEconomyTest < Test::Unit::TestCase
2
+ def setup
3
+ @dataset = Datasets::FuelEconomy.new
4
+ end
5
+
6
+ def record(*args)
7
+ Datasets::FuelEconomy::Record.new(*args)
8
+ end
9
+
10
+ test("#each") do
11
+ records = @dataset.each.to_a
12
+ assert_equal([
13
+ 234,
14
+ {
15
+ city_mpg: 18,
16
+ displacement: 1.8,
17
+ drive_train: "f",
18
+ fuel: "p",
19
+ highway_mpg: 29,
20
+ manufacturer: "audi",
21
+ model: "a4",
22
+ n_cylinders: 4,
23
+ transmission: "auto(l5)",
24
+ type: "compact",
25
+ year: 1999
26
+ },
27
+ {
28
+ city_mpg: 17,
29
+ displacement: 3.6,
30
+ drive_train: "f",
31
+ fuel: "p",
32
+ highway_mpg: 26,
33
+ manufacturer: "volkswagen",
34
+ model: "passat",
35
+ n_cylinders: 6,
36
+ transmission: "auto(s6)",
37
+ type: "midsize",
38
+ year: 2008
39
+ },
40
+ ],
41
+ [
42
+ records.size,
43
+ records[0].to_h,
44
+ records[-1].to_h
45
+ ])
46
+ end
47
+
48
+ sub_test_case("#metadata") do
49
+ test("#description") do
50
+ description = @dataset.metadata.description
51
+ assert_equal(<<-DESCRIPTION, description)
52
+ Fuel economy data from 1999 to 2008 for 38 popular models of cars
53
+
54
+ This dataset contains a subset of the fuel economy data that the EPA makes
55
+ available on https://fueleconomy.gov/. It contains only models which
56
+ had a new release every year between 1999 and 2008 - this was used as a
57
+ proxy for the popularity of the car.
58
+
59
+ A data frame with 234 rows and 11 variables:
60
+
61
+ * manufacturer: manufacturer name
62
+ * model: model name
63
+ * displacement: engine displacement, in litres
64
+ * year: year of manufacture
65
+ * n_cylinders: number of cylinders
66
+ * transmissions: type of transmission
67
+ * drive_train: the type of drive train, where f = front-wheel drive, r = rear wheel drive, 4 = 4wd
68
+ * city_mpg: city miles per gallon
69
+ * highway_mpg: highway miles per gallon
70
+ * fuel: fuel type
71
+ * type: "type" of car
72
+ DESCRIPTION
73
+ end
74
+ end
75
+ end
@@ -0,0 +1,65 @@
1
+ class GeoloniaTest < Test::Unit::TestCase
2
+ def setup
3
+ @dataset = Datasets::Geolonia.new
4
+ end
5
+
6
+ test('#each') do
7
+ records = @dataset.each.to_a
8
+ assert_equal([
9
+ 277616,
10
+ {
11
+ :prefecture_code => "01",
12
+ :prefecture_name => "北海道",
13
+ :prefecture_kana => "ホッカイドウ",
14
+ :prefecture_romaji => "HOKKAIDO",
15
+ :municipality_code => "01101",
16
+ :municipality_name => "札幌市中央区",
17
+ :municipality_kana => "サッポロシチュウオウク",
18
+ :municipality_romaji => "SAPPORO SHI CHUO KU",
19
+ :street_name => "旭ケ丘一丁目",
20
+ :street_kana => "アサヒガオカ 1",
21
+ :street_romaji => "ASAHIGAOKA 1",
22
+ :alias => nil,
23
+ :latitude => "43.04223",
24
+ :longitude => "141.319722"
25
+ },
26
+ {
27
+ :prefecture_code => "47",
28
+ :prefecture_name => "沖縄県",
29
+ :prefecture_kana => "オキナワケン",
30
+ :prefecture_romaji => "OKINAWA KEN",
31
+ :municipality_code => "47382",
32
+ :municipality_name => "八重山郡与那国町",
33
+ :municipality_kana => "ヤエヤマグンヨナグニチョウ",
34
+ :municipality_romaji => "YAEYAMA GUN YONAGUNI CHO",
35
+ :street_name => "字与那国",
36
+ :street_kana => nil,
37
+ :street_romaji => nil,
38
+ :alias => nil,
39
+ :latitude => "24.455925",
40
+ :longitude => "122.987678",
41
+ },
42
+ ],
43
+ [
44
+ records.size,
45
+ records[0].to_h,
46
+ records[-1].to_h,
47
+ ])
48
+ end
49
+
50
+ sub_test_case("#metadata") do
51
+ test("#description") do
52
+ description = @dataset.metadata.description
53
+ assert_equal([
54
+ "# Geolonia 住所データ",
55
+ "## 住所データ仕様",
56
+ "### ファイルフォーマット",
57
+ "### 列",
58
+ "### ソート順",
59
+ ],
60
+ description.scan(/^#.*$/),
61
+ description)
62
+ end
63
+ end
64
+
65
+ end
@@ -0,0 +1,69 @@
1
+ class ITACorpusTest < Test::Unit::TestCase
2
+
3
+ sub_test_case("type") do
4
+ test("emotion") do
5
+ dataset = Datasets::ITACorpus.new(type: :emotion)
6
+ records = dataset.to_a
7
+ assert_equal([
8
+ 100,
9
+ {
10
+ :id => "EMOTION100_001",
11
+ :sentence => "えっ嘘でしょ。,エッウソデショ。"
12
+ },
13
+ {
14
+ :id => "EMOTION100_100",
15
+ :sentence => "ラーテャン。,ラーテャン。",
16
+ },
17
+ ],
18
+ [
19
+ records.size,
20
+ records[0].to_h,
21
+ records[-1].to_h,
22
+ ])
23
+ end
24
+
25
+ test("recitation") do
26
+ dataset = Datasets::ITACorpus.new(type: :recitation)
27
+ records = dataset.to_a
28
+ assert_equal([
29
+ 324,
30
+ {
31
+ :id => "RECITATION324_001",
32
+ :sentence => "女の子がキッキッ嬉しそう。,オンナノコガキッキッウレシソー。"
33
+ },
34
+ {
35
+ :id => "RECITATION324_324",
36
+ :sentence => "チュクンの波長は、パツンと共通している。,チュクンノハチョーワ、パツントキョーツウシテイル。",
37
+ },
38
+ ],
39
+ [
40
+ records.size,
41
+ records[0].to_h,
42
+ records[-1].to_h,
43
+ ])
44
+ end
45
+
46
+ test("invalid") do
47
+ message = "Please set type :emotion or :recitation: :invalid"
48
+ assert_raise(ArgumentError.new(message)) do
49
+ Datasets::ITACorpus.new(type: :invalid)
50
+ end
51
+ end
52
+
53
+ end
54
+
55
+ sub_test_case("#metadata") do
56
+ test("#description") do
57
+ dataset = Datasets::ITACorpus.new(type: :emotion)
58
+ description = dataset.metadata.description
59
+ assert_equal([
60
+ "# ITAコーパスの文章リスト公開用リポジトリ",
61
+ "## ITAコーパスとは",
62
+ "## ITAコーパスの文献情報"
63
+ ],
64
+ description.scan(/^#.*$/),
65
+ description)
66
+ end
67
+ end
68
+
69
+ end