red-datasets 0.1.6 → 0.1.8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (39)
  1. checksums.yaml +4 -4
  2. data/README.md +2 -0
  3. data/Rakefile +10 -0
  4. data/doc/text/news.md +36 -0
  5. data/lib/datasets/california-housing.rb +1 -1
  6. data/lib/datasets/dataset.rb +2 -2
  7. data/lib/datasets/downloader.rb +34 -16
  8. data/lib/datasets/fashion-mnist.rb +6 -2
  9. data/lib/datasets/ggplot2-dataset.rb +3 -3
  10. data/lib/datasets/house-of-councillor.rb +169 -0
  11. data/lib/datasets/house-of-representative.rb +107 -0
  12. data/lib/datasets/japanese-date-parser.rb +38 -0
  13. data/lib/datasets/kuzushiji-mnist.rb +6 -2
  14. data/lib/datasets/lazy.rb +2 -0
  15. data/lib/datasets/libsvm-dataset-list.rb +1 -1
  16. data/lib/datasets/mnist.rb +12 -6
  17. data/lib/datasets/nagoya-university-conversation-corpus.rb +6 -6
  18. data/lib/datasets/postal-code-japan.rb +3 -3
  19. data/lib/datasets/quora-duplicate-question-pair.rb +1 -1
  20. data/lib/datasets/version.rb +1 -1
  21. data/lib/datasets/wikipedia-kyoto-japanese-english.rb +2 -2
  22. data/lib/datasets/wikipedia.rb +2 -2
  23. data/test/japanese-date-parser-test.rb +27 -0
  24. data/test/test-adult.rb +36 -86
  25. data/test/test-aozora-bunko.rb +5 -5
  26. data/test/test-california-housing.rb +12 -31
  27. data/test/test-cldr-plurals.rb +1 -1
  28. data/test/test-diamonds.rb +13 -33
  29. data/test/test-downloader.rb +1 -1
  30. data/test/test-geolonia.rb +17 -41
  31. data/test/test-house-of-councillor.rb +223 -0
  32. data/test/test-house-of-representative.rb +54 -0
  33. data/test/test-nagoya-university-conversation-corpus.rb +17 -69
  34. data/test/test-postal-code-japan.rb +7 -0
  35. data/test/test-quora-duplicate-question-pair.rb +7 -21
  36. data/test/test-rdataset.rb +24 -22
  37. data/test/test-sudachi-synonym-dictionary.rb +12 -31
  38. data/test/test-wikipedia.rb +5 -5
  39. metadata +12 -6
@@ -41,7 +41,7 @@ module Datasets
41
41
  super()
42
42
  @reading = reading
43
43
  unless VALID_READINGS.include?(@reading)
44
- message = ":reading must be one of ["
44
+ message = +":reading must be one of ["
45
45
  message << VALID_READINGS.collect(&:inspect).join(", ")
46
46
  message << "]: #{@reading.inspect}"
47
47
  raise ArgumentError, message
@@ -104,14 +104,14 @@ module Datasets
104
104
 
105
105
  private
106
106
  def open_data
107
- data_url = "https://www.post.japanpost.jp/zipcode/dl"
107
+ data_url = +"https://www.post.japanpost.jp/zipcode/dl"
108
108
  case @reading
109
109
  when :lowercase
110
110
  data_url << "/kogaki/zip/ken_all.zip"
111
111
  when :uppercase
112
112
  data_url << "/oogaki/zip/ken_all.zip"
113
113
  when :romaji
114
- data_url << "/roman/ken_all_rome.zip"
114
+ data_url << "/roman/KEN_ALL_ROME.zip"
115
115
  end
116
116
  data_path = cache_dir_path + "#{@reading}-ken-all.zip"
117
117
  download(data_path, data_url)
@@ -43,7 +43,7 @@ module Datasets
43
43
  data_path = cache_dir_path + "quora_duplicate_questions.tsv"
44
44
  data_url = "https://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
45
45
  download(data_path, data_url)
46
- CSV.open(data_path, col_sep: "\t", headers: true, converters: :all) do |csv|
46
+ CSV.open(data_path, col_sep: "\t", headers: true, converters: :integer) do |csv|
47
47
  yield(csv)
48
48
  end
49
49
  end
@@ -1,3 +1,3 @@
1
1
  module Datasets
2
- VERSION = "0.1.6"
2
+ VERSION = "0.1.8"
3
3
  end
@@ -126,7 +126,7 @@ articles (related to Kyoto) into English.
126
126
  @sentence = nil
127
127
  @text_container_stack = []
128
128
  @element_stack = []
129
- @text_stack = [""]
129
+ @text_stack = [+""]
130
130
  end
131
131
 
132
132
  def tag_start(name, attributes)
@@ -207,7 +207,7 @@ articles (related to Kyoto) into English.
207
207
  private
208
208
  def push_stacks(name, attributes)
209
209
  @element_stack.push({name: name, attributes: attributes})
210
- @text_stack.push("")
210
+ @text_stack.push(+"")
211
211
  end
212
212
 
213
213
  def pop_stacks
@@ -90,7 +90,7 @@ module Datasets
90
90
  @contributor = nil
91
91
  @current_tag = nil
92
92
  @tag_stack = []
93
- @text_stack = [""]
93
+ @text_stack = [+""]
94
94
  @first_page = true
95
95
  end
96
96
 
@@ -172,7 +172,7 @@ module Datasets
172
172
 
173
173
  def push_stacks(tag)
174
174
  @tag_stack << tag
175
- @text_stack << ""
175
+ @text_stack << +""
176
176
  end
177
177
 
178
178
  def pop_stacks
@@ -0,0 +1,27 @@
1
+ class JapaneseDateParserTest < Test::Unit::TestCase
2
+ def setup
3
+ @parser = Datasets::JapaneseDateParser.new
4
+ end
5
+
6
+ data("month and day with leading a space in Heisei", ["H10.01.01", "平成10年 1月 1日"])
7
+ data("month with leading a space in Heisei", ["H10.01.10", "平成10年 1月10日"])
8
+ data(" day with leading a space in Heisei", ["H10.10.01", "平成10年10月 1日"])
9
+ data(" without leading a space in Heisei", ["H10.10.10", "平成10年10月10日"])
10
+ data("year, month and day with leading a space in Reiwa", ["R02.01.01", "令和 2年 1月 1日"])
11
+ data("year, month with leading a space in Reiwa", ["R02.01.10", "令和 2年 1月10日"])
12
+ data("year, day with leading a space in Reiwa", ["R02.10.01", "令和 2年10月 1日"])
13
+ data("year, without leading a space in Reiwa", ["R02.10.10", "令和 2年10月10日"])
14
+ data("boundary within Heisei", ["H31.04.30", "平成31年 4月30日"])
15
+ data("boundary within Reiwa", ["R01.05.01", "令和元年 5月 1日"])
16
+ test("#parse") do
17
+ expected_jisx0301, japanese_date_string = data
18
+ assert_equal(expected_jisx0301, @parser.parse(japanese_date_string).jisx0301)
19
+ end
20
+
21
+ test("unsupported era initial range") do
22
+ expected_message = "era must be one of [平成, 令和]: 昭和"
23
+ assert_raise(Datasets::JapaneseDateParser::UnsupportedEraInitialRange.new(expected_message)) do
24
+ @parser.parse("昭和元年 1月 1日")
25
+ end
26
+ end
27
+ end
data/test/test-adult.rb CHANGED
@@ -9,49 +9,24 @@ class AdultTest < Test::Unit::TestCase
9
9
  end
10
10
 
11
11
  test("#each") do
12
- records = @dataset.each.to_a
13
- assert_equal([
14
- 32561,
15
- {
16
- :age => 39,
17
- :work_class => "State-gov",
18
- :final_weight => 77516,
19
- :education => "Bachelors",
20
- :n_education_years => 13,
21
- :marital_status => "Never-married",
22
- :occupation => "Adm-clerical",
23
- :relationship => "Not-in-family",
24
- :race => "White",
25
- :sex => "Male",
26
- :capital_gain => 2174,
27
- :capital_loss => 0,
28
- :hours_per_week => 40,
29
- :native_country => "United-States",
30
- :label => "<=50K"
31
- },
32
- {
33
- :age => 52,
34
- :work_class => "Self-emp-inc",
35
- :final_weight => 287927,
36
- :education => "HS-grad",
37
- :n_education_years => 9,
38
- :marital_status => "Married-civ-spouse",
39
- :occupation => "Exec-managerial",
40
- :relationship => "Wife",
41
- :race => "White",
42
- :sex => "Female",
43
- :capital_gain => 15024,
44
- :capital_loss => 0,
45
- :hours_per_week => 40,
46
- :native_country => "United-States",
47
- :label => ">50K"
48
- }
49
- ],
50
- [
51
- records.size,
52
- records[0].to_h,
53
- records[-1].to_h
54
- ])
12
+ assert_equal({
13
+ :age => 39,
14
+ :work_class => "State-gov",
15
+ :final_weight => 77516,
16
+ :education => "Bachelors",
17
+ :n_education_years => 13,
18
+ :marital_status => "Never-married",
19
+ :occupation => "Adm-clerical",
20
+ :relationship => "Not-in-family",
21
+ :race => "White",
22
+ :sex => "Male",
23
+ :capital_gain => 2174,
24
+ :capital_loss => 0,
25
+ :hours_per_week => 40,
26
+ :native_country => "United-States",
27
+ :label => "<=50K"
28
+ },
29
+ @dataset.each.next.to_h)
55
30
  end
56
31
  end
57
32
 
@@ -65,49 +40,24 @@ class AdultTest < Test::Unit::TestCase
65
40
  end
66
41
 
67
42
  test("#each") do
68
- records = @dataset.each.to_a
69
- assert_equal([
70
- 16281,
71
- {
72
- :age => 25,
73
- :work_class => "Private",
74
- :final_weight => 226802,
75
- :education => "11th",
76
- :n_education_years => 7,
77
- :marital_status => "Never-married",
78
- :occupation => "Machine-op-inspct",
79
- :relationship => "Own-child",
80
- :race => "Black",
81
- :sex => "Male",
82
- :capital_gain => 0,
83
- :capital_loss => 0,
84
- :hours_per_week => 40,
85
- :native_country => "United-States",
86
- :label => "<=50K."
87
- },
88
- {
89
- :age => 35,
90
- :work_class => "Self-emp-inc",
91
- :final_weight => 182148,
92
- :education => "Bachelors",
93
- :n_education_years => 13,
94
- :marital_status => "Married-civ-spouse",
95
- :occupation => "Exec-managerial",
96
- :relationship => "Husband",
97
- :race => "White",
98
- :sex => "Male",
99
- :capital_gain => 0,
100
- :capital_loss => 0,
101
- :hours_per_week => 60,
102
- :native_country => "United-States",
103
- :label => ">50K."
104
- }
105
- ],
106
- [
107
- records.size,
108
- records[0].to_h,
109
- records[-1].to_h
110
- ])
43
+ assert_equal({
44
+ :age => 25,
45
+ :work_class => "Private",
46
+ :final_weight => 226802,
47
+ :education => "11th",
48
+ :n_education_years => 7,
49
+ :marital_status => "Never-married",
50
+ :occupation => "Machine-op-inspct",
51
+ :relationship => "Own-child",
52
+ :race => "Black",
53
+ :sex => "Male",
54
+ :capital_gain => 0,
55
+ :capital_loss => 0,
56
+ :hours_per_week => 40,
57
+ :native_country => "United-States",
58
+ :label => "<=50K.",
59
+ },
60
+ @dataset.each.next.to_h)
111
61
  end
112
62
  end
113
63
 
@@ -114,13 +114,13 @@ class AozoraBunkoTest < Test::Unit::TestCase
114
114
  book = Datasets::AozoraBunko::Book.new
115
115
  book.cache_path = @cache_path
116
116
 
117
- book.title_id = '000750'
118
- book.person_id = '000146'
119
- book.html_file_url = 'http://www.lcv.ne.jp/~ibs52086/fire/'
117
+ book.title_id = '061551'
118
+ book.person_id = '002225'
119
+ book.html_file_url = 'http://minken.party/2019/03/09/yakunin/'
120
120
  book.html_file_character_encoding = 'UTF-8'
121
121
 
122
- assert_equal('<title>種田山頭火句集 | 『草木塔抄』他 FIRE ON THE MOUNTAIN</title>',
123
- book.html.split("\n")[7])
122
+ assert_equal('<title>自由民権現代研究会</title>',
123
+ book.html[/<title>.*?<\/title>/])
124
124
  end
125
125
  end
126
126
 
@@ -8,37 +8,18 @@ class CaliforniaHousingTest < Test::Unit::TestCase
8
8
  end
9
9
 
10
10
  test("#each") do
11
- records = @dataset.each.to_a
12
- assert_equal([
13
- 20640,
14
- {
15
- median_house_value: 452600.000000,
16
- median_income: 8.325200,
17
- housing_median_age: 41.000000,
18
- total_rooms: 880.000000,
19
- total_bedrooms: 129.000000,
20
- population: 322.000000,
21
- households: 126.000000,
22
- latitude: 37.880000,
23
- longitude: -122.230000
24
- },
25
- {
26
- median_house_value: 89400.000000,
27
- median_income: 2.388600,
28
- housing_median_age: 16.000000,
29
- total_rooms: 2785.000000,
30
- total_bedrooms: 616.000000,
31
- population: 1387.000000,
32
- households: 530.000000,
33
- latitude: 39.370000,
34
- longitude: -121.240000
35
- },
36
- ],
37
- [
38
- records.size,
39
- records[0].to_h,
40
- records[-1].to_h
41
- ])
11
+ assert_equal({
12
+ median_house_value: 452600.000000,
13
+ median_income: 8.325200,
14
+ housing_median_age: 41.000000,
15
+ total_rooms: 880.000000,
16
+ total_bedrooms: 129.000000,
17
+ population: 322.000000,
18
+ households: 126.000000,
19
+ latitude: 37.880000,
20
+ longitude: -122.230000,
21
+ },
22
+ @dataset.each.next.to_h)
42
23
  end
43
24
 
44
25
  sub_test_case("#metadata") do
@@ -14,7 +14,7 @@ class CLDRPluralsTest < Test::Unit::TestCase
14
14
  test("#each") do
15
15
  locales = @dataset.each.to_a
16
16
  assert_equal([
17
- 219,
17
+ 222,
18
18
  locale("bm",
19
19
  [
20
20
  rule("other",
@@ -8,39 +8,19 @@ class DiamondsTest < Test::Unit::TestCase
8
8
  end
9
9
 
10
10
  test("#each") do
11
- records = @dataset.each.to_a
12
- assert_equal([
13
- 53940,
14
- {
15
- carat: 0.23,
16
- clarity: "SI2",
17
- color: "E",
18
- cut: "Ideal",
19
- depth: 61.5,
20
- price: 326,
21
- table: 55.0,
22
- x: 3.95,
23
- y: 3.98,
24
- z: 2.43,
25
- },
26
- {
27
- carat: 0.75,
28
- clarity: "SI2",
29
- color: "D",
30
- cut: "Ideal",
31
- depth: 62.2,
32
- price: 2757,
33
- table: 55.0,
34
- x: 5.83,
35
- y: 5.87,
36
- z: 3.64,
37
- },
38
- ],
39
- [
40
- records.size,
41
- records[0].to_h,
42
- records[-1].to_h
43
- ])
11
+ assert_equal({
12
+ carat: 0.23,
13
+ clarity: "SI2",
14
+ color: "E",
15
+ cut: "Ideal",
16
+ depth: 61.5,
17
+ price: 326,
18
+ table: 55.0,
19
+ x: 3.95,
20
+ y: 3.98,
21
+ z: 2.43,
22
+ },
23
+ @dataset.each.next.to_h)
44
24
  end
45
25
 
46
26
  sub_test_case("#metadata") do
@@ -17,7 +17,7 @@ class DownloaderTest < Test::Unit::TestCase
17
17
  output_path = @tmp_dir + "file"
18
18
  downloader = Datasets::Downloader.new(first_url)
19
19
 
20
- downloader.define_singleton_method(:start_http) do |url, headers|
20
+ downloader.define_singleton_method(:start_http) do |url, fallback_urls, headers|
21
21
  raise Datasets::Downloader::TooManyRedirects, "too many redirections: #{last_url}"
22
22
  end
23
23
 
@@ -4,47 +4,23 @@ class GeoloniaTest < Test::Unit::TestCase
4
4
  end
5
5
 
6
6
  test('#each') do
7
- records = @dataset.each.to_a
8
- assert_equal([
9
- 277616,
10
- {
11
- :prefecture_code => "01",
12
- :prefecture_name => "北海道",
13
- :prefecture_kana => "ホッカイドウ",
14
- :prefecture_romaji => "HOKKAIDO",
15
- :municipality_code => "01101",
16
- :municipality_name => "札幌市中央区",
17
- :municipality_kana => "サッポロシチュウオウク",
18
- :municipality_romaji => "SAPPORO SHI CHUO KU",
19
- :street_name => "旭ケ丘一丁目",
20
- :street_kana => "アサヒガオカ 1",
21
- :street_romaji => "ASAHIGAOKA 1",
22
- :alias => nil,
23
- :latitude => "43.04223",
24
- :longitude => "141.319722"
25
- },
26
- {
27
- :prefecture_code => "47",
28
- :prefecture_name => "沖縄県",
29
- :prefecture_kana => "オキナワケン",
30
- :prefecture_romaji => "OKINAWA KEN",
31
- :municipality_code => "47382",
32
- :municipality_name => "八重山郡与那国町",
33
- :municipality_kana => "ヤエヤマグンヨナグニチョウ",
34
- :municipality_romaji => "YAEYAMA GUN YONAGUNI CHO",
35
- :street_name => "字与那国",
36
- :street_kana => nil,
37
- :street_romaji => nil,
38
- :alias => nil,
39
- :latitude => "24.455925",
40
- :longitude => "122.987678",
41
- },
42
- ],
43
- [
44
- records.size,
45
- records[0].to_h,
46
- records[-1].to_h,
47
- ])
7
+ assert_equal({
8
+ :prefecture_code => "01",
9
+ :prefecture_name => "北海道",
10
+ :prefecture_kana => "ホッカイドウ",
11
+ :prefecture_romaji => "HOKKAIDO",
12
+ :municipality_code => "01101",
13
+ :municipality_name => "札幌市中央区",
14
+ :municipality_kana => "サッポロシチュウオウク",
15
+ :municipality_romaji => "SAPPORO SHI CHUO KU",
16
+ :street_name => "旭ケ丘一丁目",
17
+ :street_kana => "アサヒガオカ 1",
18
+ :street_romaji => "ASAHIGAOKA 1",
19
+ :alias => nil,
20
+ :latitude => "43.04223",
21
+ :longitude => "141.319722",
22
+ },
23
+ @dataset.each.next.to_h)
48
24
  end
49
25
 
50
26
  sub_test_case("#metadata") do