red-datasets 0.1.4 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +23 -2
- data/doc/text/news.md +86 -0
- data/lib/datasets/adult.rb +6 -9
- data/lib/datasets/afinn.rb +48 -0
- data/lib/datasets/aozora-bunko.rb +196 -0
- data/lib/datasets/cache-path.rb +28 -0
- data/lib/datasets/california-housing.rb +60 -0
- data/lib/datasets/cifar.rb +2 -4
- data/lib/datasets/cldr-plurals.rb +2 -4
- data/lib/datasets/communities.rb +5 -8
- data/lib/datasets/dataset.rb +8 -12
- data/lib/datasets/diamonds.rb +26 -0
- data/lib/datasets/downloader.rb +6 -1
- data/lib/datasets/e-stat-japan.rb +2 -1
- data/lib/datasets/fashion-mnist.rb +4 -0
- data/lib/datasets/fuel-economy.rb +35 -0
- data/lib/datasets/geolonia.rb +67 -0
- data/lib/datasets/ggplot2-dataset.rb +79 -0
- data/lib/datasets/hepatitis.rb +5 -8
- data/lib/datasets/iris.rb +5 -8
- data/lib/datasets/ita-corpus.rb +57 -0
- data/lib/datasets/kuzushiji-mnist.rb +16 -0
- data/lib/datasets/libsvm-dataset-list.rb +5 -8
- data/lib/datasets/libsvm.rb +3 -4
- data/lib/datasets/license.rb +26 -0
- data/lib/datasets/livedoor-news.rb +80 -0
- data/lib/datasets/metadata.rb +14 -0
- data/lib/datasets/mnist.rb +7 -7
- data/lib/datasets/mushroom.rb +5 -8
- data/lib/datasets/penguins.rb +4 -8
- data/lib/datasets/penn-treebank.rb +2 -4
- data/lib/datasets/pmjt-dataset-list.rb +67 -0
- data/lib/datasets/postal-code-japan.rb +2 -6
- data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
- data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
- data/lib/datasets/seaborn.rb +90 -0
- data/lib/datasets/sudachi-synonym-dictionary.rb +5 -11
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
- data/lib/datasets/wikipedia.rb +4 -5
- data/lib/datasets/wine.rb +6 -9
- data/lib/datasets/zip-extractor.rb +36 -0
- data/lib/datasets.rb +14 -2
- data/red-datasets.gemspec +1 -1
- data/test/helper.rb +21 -0
- data/test/test-afinn.rb +60 -0
- data/test/test-aozora-bunko.rb +190 -0
- data/test/test-california-housing.rb +56 -0
- data/test/test-cldr-plurals.rb +1 -1
- data/test/test-dataset.rb +15 -7
- data/test/test-diamonds.rb +71 -0
- data/test/test-fuel-economy.rb +75 -0
- data/test/test-geolonia.rb +64 -0
- data/test/test-ita-corpus.rb +69 -0
- data/test/test-kuzushiji-mnist.rb +137 -0
- data/test/test-license.rb +24 -0
- data/test/test-livedoor-news.rb +351 -0
- data/test/test-metadata.rb +36 -0
- data/test/test-penguins.rb +1 -1
- data/test/test-pmjt-dataset-list.rb +50 -0
- data/test/test-quora-duplicate-question-pair.rb +33 -0
- data/test/test-rdataset.rb +246 -0
- data/test/{test-seaborn-data.rb → test-seaborn.rb} +70 -4
- data/test/test-sudachi-synonym-dictionary.rb +5 -5
- data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
- metadata +58 -14
- data/lib/datasets/seaborn-data.rb +0 -49
- data/test/test-rdatasets.rb +0 -136
@@ -0,0 +1,33 @@
|
|
1
|
+
class QuoraDuplicateQuestionPairTest < Test::Unit::TestCase
|
2
|
+
def setup
|
3
|
+
@dataset = Datasets::QuoraDuplicateQuestionPair.new
|
4
|
+
end
|
5
|
+
|
6
|
+
# Builds a Datasets::QuoraDuplicateQuestionPair::Record from the given
# positional field values, so expected records can be written compactly
# in assertions.
def record(*fields)
  record_class = Datasets::QuoraDuplicateQuestionPair::Record
  record_class.new(*fields)
end
|
9
|
+
|
10
|
+
test("#each") do
|
11
|
+
records = @dataset.each.to_a
|
12
|
+
assert_equal([
|
13
|
+
404290,
|
14
|
+
record(0,
|
15
|
+
1,
|
16
|
+
2,
|
17
|
+
"What is the step by step guide to invest in share market in india?",
|
18
|
+
"What is the step by step guide to invest in share market?",
|
19
|
+
false),
|
20
|
+
record(404289,
|
21
|
+
537932,
|
22
|
+
537933,
|
23
|
+
"What is like to have sex with cousin?",
|
24
|
+
"What is it like to have sex with your cousin?",
|
25
|
+
false),
|
26
|
+
],
|
27
|
+
[
|
28
|
+
records.size,
|
29
|
+
records.first,
|
30
|
+
records.last,
|
31
|
+
])
|
32
|
+
end
|
33
|
+
end
|
@@ -0,0 +1,246 @@
|
|
1
|
+
class RdatasetTest < Test::Unit::TestCase
|
2
|
+
sub_test_case("RdatasetList") do
|
3
|
+
def setup
|
4
|
+
@dataset = Datasets::RdatasetList.new
|
5
|
+
end
|
6
|
+
|
7
|
+
sub_test_case("#each") do
|
8
|
+
test("with package_name") do
|
9
|
+
records = @dataset.filter(package: "datasets").to_a
|
10
|
+
assert_equal([
|
11
|
+
84,
|
12
|
+
{
|
13
|
+
package: "datasets",
|
14
|
+
dataset: "ability.cov",
|
15
|
+
title: "Ability and Intelligence Tests",
|
16
|
+
rows: 6,
|
17
|
+
cols: 8,
|
18
|
+
n_binary: 0,
|
19
|
+
n_character: 0,
|
20
|
+
n_factor: 0,
|
21
|
+
n_logical: 0,
|
22
|
+
n_numeric: 8,
|
23
|
+
csv: "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/ability.cov.csv",
|
24
|
+
doc: "https://vincentarelbundock.github.io/Rdatasets/doc/datasets/ability.cov.html"
|
25
|
+
},
|
26
|
+
{
|
27
|
+
package: "datasets",
|
28
|
+
dataset: "WWWusage",
|
29
|
+
title: "Internet Usage per Minute",
|
30
|
+
rows: 100,
|
31
|
+
cols: 2,
|
32
|
+
n_binary: 0,
|
33
|
+
n_character: 0,
|
34
|
+
n_factor: 0,
|
35
|
+
n_logical: 0,
|
36
|
+
n_numeric: 2,
|
37
|
+
csv: "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/WWWusage.csv",
|
38
|
+
doc: "https://vincentarelbundock.github.io/Rdatasets/doc/datasets/WWWusage.html"
|
39
|
+
}
|
40
|
+
],
|
41
|
+
[
|
42
|
+
records.size,
|
43
|
+
records[0].to_h,
|
44
|
+
records[-1].to_h
|
45
|
+
])
|
46
|
+
end
|
47
|
+
|
48
|
+
test("without package_name") do
|
49
|
+
records = @dataset.each.to_a
|
50
|
+
assert_equal([
|
51
|
+
1892,
|
52
|
+
{
|
53
|
+
package: "AER",
|
54
|
+
dataset: "Affairs",
|
55
|
+
title: "Fair's Extramarital Affairs Data",
|
56
|
+
rows: 601,
|
57
|
+
cols: 9,
|
58
|
+
n_binary: 2,
|
59
|
+
n_character: 0,
|
60
|
+
n_factor: 2,
|
61
|
+
n_logical: 0,
|
62
|
+
n_numeric: 7,
|
63
|
+
csv: "https://vincentarelbundock.github.io/Rdatasets/csv/AER/Affairs.csv",
|
64
|
+
doc: "https://vincentarelbundock.github.io/Rdatasets/doc/AER/Affairs.html"
|
65
|
+
},
|
66
|
+
{
|
67
|
+
package: "wooldridge",
|
68
|
+
dataset: "wine",
|
69
|
+
title: "wine",
|
70
|
+
rows: 21,
|
71
|
+
cols: 5,
|
72
|
+
n_binary: 0,
|
73
|
+
n_character: 1,
|
74
|
+
n_factor: 0,
|
75
|
+
n_logical: 0,
|
76
|
+
n_numeric: 4,
|
77
|
+
csv: "https://vincentarelbundock.github.io/Rdatasets/csv/wooldridge/wine.csv",
|
78
|
+
doc: "https://vincentarelbundock.github.io/Rdatasets/doc/wooldridge/wine.html"
|
79
|
+
},
|
80
|
+
],
|
81
|
+
[
|
82
|
+
records.size,
|
83
|
+
records[0].to_h,
|
84
|
+
records[-1].to_h
|
85
|
+
])
|
86
|
+
end
|
87
|
+
end
|
88
|
+
end
|
89
|
+
|
90
|
+
sub_test_case("Rdataset") do
|
91
|
+
test('invalid package name') do
|
92
|
+
assert_raise(ArgumentError) do
|
93
|
+
Datasets::Rdataset.new('invalid package name', 'AirPassengers')
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
sub_test_case("datasets") do
|
98
|
+
test("invalid dataset name") do
|
99
|
+
assert_raise(ArgumentError) do
|
100
|
+
Datasets::Rdataset.new("datasets", "invalid datasets name")
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
104
|
+
sub_test_case("AirPassengers") do
|
105
|
+
def setup
|
106
|
+
@dataset = Datasets::Rdataset.new("datasets", "AirPassengers")
|
107
|
+
end
|
108
|
+
|
109
|
+
test("#each") do
|
110
|
+
records = @dataset.each.to_a
|
111
|
+
assert_equal([
|
112
|
+
144,
|
113
|
+
{ time: 1949, value: 112 },
|
114
|
+
{ time: 1960.91666666667, value: 432 },
|
115
|
+
],
|
116
|
+
[
|
117
|
+
records.size,
|
118
|
+
records[0],
|
119
|
+
records[-1]
|
120
|
+
])
|
121
|
+
end
|
122
|
+
|
123
|
+
test("#metadata.id") do
|
124
|
+
assert_equal("rdataset-datasets-AirPassengers", @dataset.metadata.id)
|
125
|
+
end
|
126
|
+
|
127
|
+
test("#metadata.description") do
|
128
|
+
description = @dataset.metadata.description
|
129
|
+
assert do
|
130
|
+
description.include?("Monthly Airline Passenger Numbers 1949-1960")
|
131
|
+
end
|
132
|
+
end
|
133
|
+
end
|
134
|
+
|
135
|
+
sub_test_case("airquality") do
|
136
|
+
def setup
|
137
|
+
@dataset = Datasets::Rdataset.new("datasets", "airquality")
|
138
|
+
end
|
139
|
+
|
140
|
+
test("#each") do
|
141
|
+
records = @dataset.each.to_a
|
142
|
+
assert_equal([
|
143
|
+
153,
|
144
|
+
{ Ozone: nil, "Solar.R": nil, Wind: 14.3, Temp: 56, Month: 5, Day: 5 },
|
145
|
+
{ Ozone: 20, "Solar.R": 223, Wind: 11.5, Temp: 68, Month: 9, Day: 30 },
|
146
|
+
],
|
147
|
+
[
|
148
|
+
records.size,
|
149
|
+
records[4],
|
150
|
+
records[-1]
|
151
|
+
])
|
152
|
+
end
|
153
|
+
end
|
154
|
+
|
155
|
+
sub_test_case('attenu') do
|
156
|
+
def setup
|
157
|
+
@dataset = Datasets::Rdataset.new('datasets', 'attenu')
|
158
|
+
end
|
159
|
+
|
160
|
+
test('#each') do
|
161
|
+
records = @dataset.each.to_a
|
162
|
+
assert_equal([
|
163
|
+
182,
|
164
|
+
{ event: 1, mag: 7, station: "117", dist: 12, accel: 0.359 },
|
165
|
+
{ event: 16, mag: 5.1, station: nil, dist: 7.6, accel: 0.28 },
|
166
|
+
{ event: 23, mag: 5.3, station: "c168", dist: 25.3, accel: 0.23 },
|
167
|
+
{ event: 23, mag: 5.3, station: "5072", dist: 53.1, accel: 0.022 }
|
168
|
+
],
|
169
|
+
[
|
170
|
+
records.size,
|
171
|
+
records[0],
|
172
|
+
records[78],
|
173
|
+
records[169],
|
174
|
+
records[-1]
|
175
|
+
])
|
176
|
+
end
|
177
|
+
end
|
178
|
+
end
|
179
|
+
|
180
|
+
sub_test_case('drc') do
|
181
|
+
sub_test_case('germination') do
|
182
|
+
def setup
|
183
|
+
@dataset = Datasets::Rdataset.new('drc', 'germination')
|
184
|
+
end
|
185
|
+
|
186
|
+
test('#each') do
|
187
|
+
records = @dataset.each.to_a
|
188
|
+
assert_equal([
|
189
|
+
192,
|
190
|
+
{ temp: 10, species: 'wheat', start: 0, end: 1.0, germinated: 0 },
|
191
|
+
{ temp: 40, species: 'rice', start: 18, end: Float::INFINITY, germinated: 12 }
|
192
|
+
],
|
193
|
+
[
|
194
|
+
records.size,
|
195
|
+
records[0],
|
196
|
+
records[-1]
|
197
|
+
])
|
198
|
+
end
|
199
|
+
end
|
200
|
+
end
|
201
|
+
|
202
|
+
sub_test_case('validate') do
|
203
|
+
sub_test_case('nace_rev2') do
|
204
|
+
def setup
|
205
|
+
@dataset = Datasets::Rdataset.new('validate', 'nace_rev2')
|
206
|
+
end
|
207
|
+
|
208
|
+
test('#each') do
|
209
|
+
records = @dataset.each.to_a
|
210
|
+
assert_equal([
|
211
|
+
996,
|
212
|
+
{
|
213
|
+
Order: 398_481,
|
214
|
+
Level: 1,
|
215
|
+
Code: 'A',
|
216
|
+
Parent: '',
|
217
|
+
Description: 'AGRICULTURE, FORESTRY AND FISHING',
|
218
|
+
This_item_includes: 'This section includes the exploitation of vegetal and animal natural resources, comprising the activities of growing of crops, raising and breeding of animals, harvesting of timber and other plants, animals or animal products from a farm or their natural habitats.',
|
219
|
+
This_item_also_includes: '',
|
220
|
+
Rulings: '',
|
221
|
+
This_item_excludes: '',
|
222
|
+
"Reference_to_ISIC_Rev._4": 'A'
|
223
|
+
},
|
224
|
+
{
|
225
|
+
Order: 399_476,
|
226
|
+
Level: 4,
|
227
|
+
Code: '99.00',
|
228
|
+
Parent: '99.0',
|
229
|
+
Description: 'Activities of extraterritorial organisations and bodies',
|
230
|
+
This_item_includes: "This class includes:\n- activities of international organisations such as the United Nations and the specialised agencies of the United Nations system, regional bodies etc., the International Monetary Fund, the World Bank, the World Customs Organisation, the Organisation for Economic Co-operation and Development, the organisation of Petroleum Exporting Countries, the European Communities, the European Free Trade Association etc.",
|
231
|
+
This_item_also_includes: "This class also includes:\n- activities of diplomatic and consular missions when being determined by the country of their location rather than by the country they represent",
|
232
|
+
Rulings: '',
|
233
|
+
This_item_excludes: '',
|
234
|
+
"Reference_to_ISIC_Rev._4": '9900'
|
235
|
+
}
|
236
|
+
],
|
237
|
+
[
|
238
|
+
records.size,
|
239
|
+
records[0],
|
240
|
+
records[-1]
|
241
|
+
])
|
242
|
+
end
|
243
|
+
end
|
244
|
+
end
|
245
|
+
end
|
246
|
+
end
|
@@ -1,7 +1,41 @@
|
|
1
|
-
class
|
1
|
+
class SeabornTest < Test::Unit::TestCase
|
2
|
+
sub_test_case("list") do
|
3
|
+
def setup
|
4
|
+
@dataset = Datasets::SeabornList.new
|
5
|
+
end
|
6
|
+
|
7
|
+
def test_each
|
8
|
+
records = @dataset.each.to_a
|
9
|
+
assert_equal([
|
10
|
+
{dataset: "anagrams"},
|
11
|
+
{dataset: "anscombe"},
|
12
|
+
{dataset: "attention"},
|
13
|
+
{dataset: "brain_networks"},
|
14
|
+
{dataset: "car_crashes"},
|
15
|
+
{dataset: "diamonds"},
|
16
|
+
{dataset: "dots"},
|
17
|
+
{dataset: "exercise"},
|
18
|
+
{dataset: "flights"},
|
19
|
+
{dataset: "fmri"},
|
20
|
+
{dataset: "geyser"},
|
21
|
+
{dataset: "glue"},
|
22
|
+
{dataset: "healthexp"},
|
23
|
+
{dataset: "iris"},
|
24
|
+
{dataset: "mpg"},
|
25
|
+
{dataset: "penguins"},
|
26
|
+
{dataset: "planets"},
|
27
|
+
{dataset: "seaice"},
|
28
|
+
{dataset: "taxis"},
|
29
|
+
{dataset: "tips"},
|
30
|
+
{dataset: "titanic"},
|
31
|
+
],
|
32
|
+
records)
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
2
36
|
sub_test_case("fmri") do
|
3
37
|
def setup
|
4
|
-
@dataset = Datasets::
|
38
|
+
@dataset = Datasets::Seaborn.new("fmri")
|
5
39
|
end
|
6
40
|
|
7
41
|
def test_each
|
@@ -33,7 +67,7 @@ class SeabornDataTest < Test::Unit::TestCase
|
|
33
67
|
|
34
68
|
sub_test_case("flights") do
|
35
69
|
def setup
|
36
|
-
@dataset = Datasets::
|
70
|
+
@dataset = Datasets::Seaborn.new("flights")
|
37
71
|
end
|
38
72
|
|
39
73
|
def test_each
|
@@ -61,7 +95,7 @@ class SeabornDataTest < Test::Unit::TestCase
|
|
61
95
|
|
62
96
|
sub_test_case("penguins") do
|
63
97
|
def setup
|
64
|
-
@dataset = Datasets::
|
98
|
+
@dataset = Datasets::Seaborn.new("penguins")
|
65
99
|
end
|
66
100
|
|
67
101
|
def test_each
|
@@ -94,4 +128,36 @@ class SeabornDataTest < Test::Unit::TestCase
|
|
94
128
|
])
|
95
129
|
end
|
96
130
|
end
|
131
|
+
|
132
|
+
sub_test_case("attention") do
|
133
|
+
def setup
|
134
|
+
@dataset = Datasets::Seaborn.new("attention")
|
135
|
+
end
|
136
|
+
|
137
|
+
def test_each
|
138
|
+
records = @dataset.to_a
|
139
|
+
assert_equal([
|
140
|
+
60,
|
141
|
+
{
|
142
|
+
index: 1,
|
143
|
+
subject: 2,
|
144
|
+
attention: "divided",
|
145
|
+
solutions: 1,
|
146
|
+
score: 3.0
|
147
|
+
},
|
148
|
+
{
|
149
|
+
index: 59,
|
150
|
+
subject: 20,
|
151
|
+
attention: "focused",
|
152
|
+
solutions: 3,
|
153
|
+
score: 5.0
|
154
|
+
}
|
155
|
+
],
|
156
|
+
[
|
157
|
+
records.size,
|
158
|
+
records[1],
|
159
|
+
records[-1]
|
160
|
+
])
|
161
|
+
end
|
162
|
+
end
|
97
163
|
end
|
@@ -6,7 +6,7 @@ class SudachiSynonymDictionaryTest < Test::Unit::TestCase
|
|
6
6
|
test('#each') do
|
7
7
|
records = @dataset.each.to_a
|
8
8
|
assert_equal([
|
9
|
-
|
9
|
+
65182,
|
10
10
|
{
|
11
11
|
group_id: "000001",
|
12
12
|
is_noun: true,
|
@@ -19,15 +19,15 @@ class SudachiSynonymDictionaryTest < Test::Unit::TestCase
|
|
19
19
|
notation: "曖昧",
|
20
20
|
},
|
21
21
|
{
|
22
|
-
group_id: "
|
22
|
+
group_id: "024909",
|
23
23
|
is_noun: true,
|
24
|
-
expansion_type: :
|
24
|
+
expansion_type: :expanded,
|
25
25
|
lexeme_id: 1,
|
26
26
|
form_type: :typical,
|
27
27
|
acronym_type: :alphabet,
|
28
28
|
variant_type: :typical,
|
29
|
-
categories: ["
|
30
|
-
notation: "
|
29
|
+
categories: ["ビジネス"],
|
30
|
+
notation: "BPO",
|
31
31
|
},
|
32
32
|
],
|
33
33
|
[
|
@@ -0,0 +1,178 @@
|
|
1
|
+
class WikipediaKyotoJapaneseEnglishTest < Test::Unit::TestCase
|
2
|
+
sub_test_case("article") do
|
3
|
+
def setup
|
4
|
+
@dataset = Datasets::WikipediaKyotoJapaneseEnglish.new(type: :article)
|
5
|
+
end
|
6
|
+
|
7
|
+
# Returns +text+ unchanged when it is at most 20 characters long;
# otherwise returns its first 20 characters followed by "...".
# Keeps the expected values in the assertions short.
def shorten_text(text)
  limit = 20
  return text if text.size <= limit

  "#{text[0, limit]}..."
end
|
15
|
+
|
16
|
+
# Converts a dataset record into a plain Hash that is easy to compare in
# assertions.
#
# The result always has a :class key holding the last "::"-separated
# segment of the record's class name.
#
# * Title records contribute :section (the section's id, or nil),
#   :japanese and :english.
# * Sentence records contribute :id, :section, :paragraph (ids, or nil),
#   :japanese and :english.
# * Any other record must respond to #members and #[] (Struct-like).
#   Each member is copied: Array members are converted element by
#   element with hashify, String members are shortened, and Struct
#   members are converted recursively. Other values are kept as-is.
#
# All texts are passed through shorten_text.
def hashify(record)
  hash = {class: record.class.name.split("::").last}
  case record
  when Datasets::WikipediaKyotoJapaneseEnglish::Title
    hash[:section] = record.section&.id
    hash[:japanese] = shorten_text(record.japanese)
    hash[:english] = shorten_text(record.english)
  when Datasets::WikipediaKyotoJapaneseEnglish::Sentence
    hash[:id] = record.id
    hash[:section] = record.section&.id
    hash[:paragraph] = record.paragraph&.id
    hash[:japanese] = shorten_text(record.japanese)
    hash[:english] = shorten_text(record.english)
  else
    record.members.each do |member|
      value = record[member]
      case value
      when Array
        # Array elements are expected to be records themselves.
        value = value.collect do |v|
          hashify(v)
        end
      when String
        value = shorten_text(value)
      when Struct
        # Nested records are converted recursively.
        value = hashify(value)
      end
      hash[member] = value
    end
  end
  hash
end
|
47
|
+
|
48
|
+
test("#each") do
|
49
|
+
first_record = @dataset.each.first
|
50
|
+
assert_equal({
|
51
|
+
class: "Article",
|
52
|
+
copyright: "copyright (c) 2010 前...",
|
53
|
+
sections: [],
|
54
|
+
source: "jawiki-20080607-page...",
|
55
|
+
contents: [
|
56
|
+
{
|
57
|
+
class: "Title",
|
58
|
+
section: nil,
|
59
|
+
english: "Genkitsu SANYO",
|
60
|
+
japanese: "三要元佶",
|
61
|
+
},
|
62
|
+
{
|
63
|
+
class: "Sentence",
|
64
|
+
id: "1",
|
65
|
+
section: nil,
|
66
|
+
paragraph: "1",
|
67
|
+
english: "Genkitsu SANYO (1548...",
|
68
|
+
japanese: "三要元佶(さんよう げんきつ, 天文 (...",
|
69
|
+
},
|
70
|
+
{
|
71
|
+
class: "Sentence",
|
72
|
+
id: "2",
|
73
|
+
section: nil,
|
74
|
+
paragraph: "2",
|
75
|
+
english: "He was originally fr...",
|
76
|
+
japanese: "肥前国(佐賀県)の出身。",
|
77
|
+
},
|
78
|
+
{
|
79
|
+
class: "Sentence",
|
80
|
+
id: "3",
|
81
|
+
section: nil,
|
82
|
+
paragraph: "2",
|
83
|
+
english: "His Go (pen name) wa...",
|
84
|
+
japanese: "号は閑室。",
|
85
|
+
},
|
86
|
+
{
|
87
|
+
class: "Sentence",
|
88
|
+
id: "4",
|
89
|
+
section: nil,
|
90
|
+
paragraph: "2",
|
91
|
+
english: "He was called Kiccho...",
|
92
|
+
japanese: "佶長老、閑室和尚と呼ばれた。",
|
93
|
+
},
|
94
|
+
{
|
95
|
+
class: "Sentence",
|
96
|
+
id: "5",
|
97
|
+
section: nil,
|
98
|
+
paragraph: "3",
|
99
|
+
english: "He went up to the ca...",
|
100
|
+
japanese: "幼少時に都に上り、岩倉の円通寺 (京都市...",
|
101
|
+
},
|
102
|
+
{
|
103
|
+
class: "Sentence",
|
104
|
+
id: "6",
|
105
|
+
section: nil,
|
106
|
+
paragraph: "4",
|
107
|
+
english: "After assuming the p...",
|
108
|
+
japanese: "足利学校の長となるが、関ヶ原の戦いの折に...",
|
109
|
+
},
|
110
|
+
{
|
111
|
+
class: "Sentence",
|
112
|
+
id: "7",
|
113
|
+
section: nil,
|
114
|
+
paragraph: "5",
|
115
|
+
english: "He assumed the posit...",
|
116
|
+
japanese: "金地院崇伝と寺社奉行の任に当たり、西笑承...",
|
117
|
+
},
|
118
|
+
{
|
119
|
+
class: "Sentence",
|
120
|
+
id: "8",
|
121
|
+
section: nil,
|
122
|
+
paragraph: "6",
|
123
|
+
english: "Later, he was invite...",
|
124
|
+
japanese: "家康によって、伏見区の学校に招かれ、円光...",
|
125
|
+
},
|
126
|
+
],
|
127
|
+
},
|
128
|
+
hashify(first_record))
|
129
|
+
end
|
130
|
+
end
|
131
|
+
|
132
|
+
sub_test_case("lexicon") do
|
133
|
+
def setup
|
134
|
+
@dataset = Datasets::WikipediaKyotoJapaneseEnglish.new(type: :lexicon)
|
135
|
+
end
|
136
|
+
|
137
|
+
test("#each") do
|
138
|
+
records = @dataset.each.to_a
|
139
|
+
assert_equal([
|
140
|
+
51982,
|
141
|
+
{
|
142
|
+
:japanese => "102世吉田日厚貫首",
|
143
|
+
:english => "the 102nd head priest, Nikko TOSHIDA"
|
144
|
+
},
|
145
|
+
{
|
146
|
+
:japanese => "龗神社",
|
147
|
+
:english => "Okami-jinja Shrine"
|
148
|
+
},
|
149
|
+
],
|
150
|
+
[
|
151
|
+
records.size,
|
152
|
+
records[0].to_h,
|
153
|
+
records[-1].to_h,
|
154
|
+
])
|
155
|
+
end
|
156
|
+
end
|
157
|
+
|
158
|
+
test("invalid") do
|
159
|
+
message = "Please set type :article or :lexicon: :invalid"
|
160
|
+
assert_raise(ArgumentError.new(message)) do
|
161
|
+
Datasets::WikipediaKyotoJapaneseEnglish.new(type: :invalid)
|
162
|
+
end
|
163
|
+
end
|
164
|
+
|
165
|
+
test("description") do
|
166
|
+
dataset = Datasets::WikipediaKyotoJapaneseEnglish.new
|
167
|
+
description = dataset.metadata.description
|
168
|
+
assert_equal(<<-DESCRIPTION, description)
|
169
|
+
"The Japanese-English Bilingual Corpus of Wikipedia's Kyoto Articles"
|
170
|
+
aims mainly at supporting research and development relevant to
|
171
|
+
high-performance multilingual machine translation, information
|
172
|
+
extraction, and other language processing technologies. The National
|
173
|
+
Institute of Information and Communications Technology (NICT) has
|
174
|
+
created this corpus by manually translating Japanese Wikipedia
|
175
|
+
articles (related to Kyoto) into English.
|
176
|
+
DESCRIPTION
|
177
|
+
end
|
178
|
+
end
|