red-datasets 0.1.6 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. checksums.yaml +4 -4
  2. data/README.md +2 -0
  3. data/Rakefile +10 -0
  4. data/doc/text/news.md +36 -0
  5. data/lib/datasets/california-housing.rb +1 -1
  6. data/lib/datasets/dataset.rb +2 -2
  7. data/lib/datasets/downloader.rb +34 -16
  8. data/lib/datasets/fashion-mnist.rb +6 -2
  9. data/lib/datasets/ggplot2-dataset.rb +3 -3
  10. data/lib/datasets/house-of-councillor.rb +169 -0
  11. data/lib/datasets/house-of-representative.rb +107 -0
  12. data/lib/datasets/japanese-date-parser.rb +38 -0
  13. data/lib/datasets/kuzushiji-mnist.rb +6 -2
  14. data/lib/datasets/lazy.rb +2 -0
  15. data/lib/datasets/libsvm-dataset-list.rb +1 -1
  16. data/lib/datasets/mnist.rb +12 -6
  17. data/lib/datasets/nagoya-university-conversation-corpus.rb +6 -6
  18. data/lib/datasets/postal-code-japan.rb +3 -3
  19. data/lib/datasets/quora-duplicate-question-pair.rb +1 -1
  20. data/lib/datasets/version.rb +1 -1
  21. data/lib/datasets/wikipedia-kyoto-japanese-english.rb +2 -2
  22. data/lib/datasets/wikipedia.rb +2 -2
  23. data/test/japanese-date-parser-test.rb +27 -0
  24. data/test/test-adult.rb +36 -86
  25. data/test/test-aozora-bunko.rb +5 -5
  26. data/test/test-california-housing.rb +12 -31
  27. data/test/test-cldr-plurals.rb +1 -1
  28. data/test/test-diamonds.rb +13 -33
  29. data/test/test-downloader.rb +1 -1
  30. data/test/test-geolonia.rb +17 -41
  31. data/test/test-house-of-councillor.rb +223 -0
  32. data/test/test-house-of-representative.rb +54 -0
  33. data/test/test-nagoya-university-conversation-corpus.rb +17 -69
  34. data/test/test-postal-code-japan.rb +7 -0
  35. data/test/test-quora-duplicate-question-pair.rb +7 -21
  36. data/test/test-rdataset.rb +24 -22
  37. data/test/test-sudachi-synonym-dictionary.rb +12 -31
  38. data/test/test-wikipedia.rb +5 -5
  39. metadata +12 -6
@@ -41,7 +41,7 @@ module Datasets
41
41
  super()
42
42
  @reading = reading
43
43
  unless VALID_READINGS.include?(@reading)
44
- message = ":reading must be one of ["
44
+ message = +":reading must be one of ["
45
45
  message << VALID_READINGS.collect(&:inspect).join(", ")
46
46
  message << "]: #{@reading.inspect}"
47
47
  raise ArgumentError, message
@@ -104,14 +104,14 @@ module Datasets
104
104
 
105
105
  private
106
106
  def open_data
107
- data_url = "https://www.post.japanpost.jp/zipcode/dl"
107
+ data_url = +"https://www.post.japanpost.jp/zipcode/dl"
108
108
  case @reading
109
109
  when :lowercase
110
110
  data_url << "/kogaki/zip/ken_all.zip"
111
111
  when :uppercase
112
112
  data_url << "/oogaki/zip/ken_all.zip"
113
113
  when :romaji
114
- data_url << "/roman/ken_all_rome.zip"
114
+ data_url << "/roman/KEN_ALL_ROME.zip"
115
115
  end
116
116
  data_path = cache_dir_path + "#{@reading}-ken-all.zip"
117
117
  download(data_path, data_url)
@@ -43,7 +43,7 @@ module Datasets
43
43
  data_path = cache_dir_path + "quora_duplicate_questions.tsv"
44
44
  data_url = "https://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
45
45
  download(data_path, data_url)
46
- CSV.open(data_path, col_sep: "\t", headers: true, converters: :all) do |csv|
46
+ CSV.open(data_path, col_sep: "\t", headers: true, converters: :integer) do |csv|
47
47
  yield(csv)
48
48
  end
49
49
  end
@@ -1,3 +1,3 @@
1
1
  module Datasets
2
- VERSION = "0.1.6"
2
+ VERSION = "0.1.8"
3
3
  end
@@ -126,7 +126,7 @@ articles (related to Kyoto) into English.
126
126
  @sentence = nil
127
127
  @text_container_stack = []
128
128
  @element_stack = []
129
- @text_stack = [""]
129
+ @text_stack = [+""]
130
130
  end
131
131
 
132
132
  def tag_start(name, attributes)
@@ -207,7 +207,7 @@ articles (related to Kyoto) into English.
207
207
  private
208
208
  def push_stacks(name, attributes)
209
209
  @element_stack.push({name: name, attributes: attributes})
210
- @text_stack.push("")
210
+ @text_stack.push(+"")
211
211
  end
212
212
 
213
213
  def pop_stacks
@@ -90,7 +90,7 @@ module Datasets
90
90
  @contributor = nil
91
91
  @current_tag = nil
92
92
  @tag_stack = []
93
- @text_stack = [""]
93
+ @text_stack = [+""]
94
94
  @first_page = true
95
95
  end
96
96
 
@@ -172,7 +172,7 @@ module Datasets
172
172
 
173
173
  def push_stacks(tag)
174
174
  @tag_stack << tag
175
- @text_stack << ""
175
+ @text_stack << +""
176
176
  end
177
177
 
178
178
  def pop_stacks
@@ -0,0 +1,27 @@
1
+ class JapaneseDateParserTest < Test::Unit::TestCase
2
+ def setup
3
+ @parser = Datasets::JapaneseDateParser.new
4
+ end
5
+
6
+ data("month and day with leading a space in Heisei", ["H10.01.01", "平成10年 1月 1日"])
7
+ data("month with leading a space in Heisei", ["H10.01.10", "平成10年 1月10日"])
8
+ data(" day with leading a space in Heisei", ["H10.10.01", "平成10年10月 1日"])
9
+ data(" without leading a space in Heisei", ["H10.10.10", "平成10年10月10日"])
10
+ data("year, month and day with leading a space in Reiwa", ["R02.01.01", "令和 2年 1月 1日"])
11
+ data("year, month with leading a space in Reiwa", ["R02.01.10", "令和 2年 1月10日"])
12
+ data("year, day with leading a space in Reiwa", ["R02.10.01", "令和 2年10月 1日"])
13
+ data("year, without leading a space in Reiwa", ["R02.10.10", "令和 2年10月10日"])
14
+ data("boundary within Heisei", ["H31.04.30", "平成31年 4月30日"])
15
+ data("boundary within Reiwa", ["R01.05.01", "令和元年 5月 1日"])
16
+ test("#parse") do
17
+ expected_jisx0301, japanese_date_string = data
18
+ assert_equal(expected_jisx0301, @parser.parse(japanese_date_string).jisx0301)
19
+ end
20
+
21
+ test("unsupported era initial range") do
22
+ expected_message = "era must be one of [平成, 令和]: 昭和"
23
+ assert_raise(Datasets::JapaneseDateParser::UnsupportedEraInitialRange.new(expected_message)) do
24
+ @parser.parse("昭和元年 1月 1日")
25
+ end
26
+ end
27
+ end
data/test/test-adult.rb CHANGED
@@ -9,49 +9,24 @@ class AdultTest < Test::Unit::TestCase
9
9
  end
10
10
 
11
11
  test("#each") do
12
- records = @dataset.each.to_a
13
- assert_equal([
14
- 32561,
15
- {
16
- :age => 39,
17
- :work_class => "State-gov",
18
- :final_weight => 77516,
19
- :education => "Bachelors",
20
- :n_education_years => 13,
21
- :marital_status => "Never-married",
22
- :occupation => "Adm-clerical",
23
- :relationship => "Not-in-family",
24
- :race => "White",
25
- :sex => "Male",
26
- :capital_gain => 2174,
27
- :capital_loss => 0,
28
- :hours_per_week => 40,
29
- :native_country => "United-States",
30
- :label => "<=50K"
31
- },
32
- {
33
- :age => 52,
34
- :work_class => "Self-emp-inc",
35
- :final_weight => 287927,
36
- :education => "HS-grad",
37
- :n_education_years => 9,
38
- :marital_status => "Married-civ-spouse",
39
- :occupation => "Exec-managerial",
40
- :relationship => "Wife",
41
- :race => "White",
42
- :sex => "Female",
43
- :capital_gain => 15024,
44
- :capital_loss => 0,
45
- :hours_per_week => 40,
46
- :native_country => "United-States",
47
- :label => ">50K"
48
- }
49
- ],
50
- [
51
- records.size,
52
- records[0].to_h,
53
- records[-1].to_h
54
- ])
12
+ assert_equal({
13
+ :age => 39,
14
+ :work_class => "State-gov",
15
+ :final_weight => 77516,
16
+ :education => "Bachelors",
17
+ :n_education_years => 13,
18
+ :marital_status => "Never-married",
19
+ :occupation => "Adm-clerical",
20
+ :relationship => "Not-in-family",
21
+ :race => "White",
22
+ :sex => "Male",
23
+ :capital_gain => 2174,
24
+ :capital_loss => 0,
25
+ :hours_per_week => 40,
26
+ :native_country => "United-States",
27
+ :label => "<=50K"
28
+ },
29
+ @dataset.each.next.to_h)
55
30
  end
56
31
  end
57
32
 
@@ -65,49 +40,24 @@ class AdultTest < Test::Unit::TestCase
65
40
  end
66
41
 
67
42
  test("#each") do
68
- records = @dataset.each.to_a
69
- assert_equal([
70
- 16281,
71
- {
72
- :age => 25,
73
- :work_class => "Private",
74
- :final_weight => 226802,
75
- :education => "11th",
76
- :n_education_years => 7,
77
- :marital_status => "Never-married",
78
- :occupation => "Machine-op-inspct",
79
- :relationship => "Own-child",
80
- :race => "Black",
81
- :sex => "Male",
82
- :capital_gain => 0,
83
- :capital_loss => 0,
84
- :hours_per_week => 40,
85
- :native_country => "United-States",
86
- :label => "<=50K."
87
- },
88
- {
89
- :age => 35,
90
- :work_class => "Self-emp-inc",
91
- :final_weight => 182148,
92
- :education => "Bachelors",
93
- :n_education_years => 13,
94
- :marital_status => "Married-civ-spouse",
95
- :occupation => "Exec-managerial",
96
- :relationship => "Husband",
97
- :race => "White",
98
- :sex => "Male",
99
- :capital_gain => 0,
100
- :capital_loss => 0,
101
- :hours_per_week => 60,
102
- :native_country => "United-States",
103
- :label => ">50K."
104
- }
105
- ],
106
- [
107
- records.size,
108
- records[0].to_h,
109
- records[-1].to_h
110
- ])
43
+ assert_equal({
44
+ :age => 25,
45
+ :work_class => "Private",
46
+ :final_weight => 226802,
47
+ :education => "11th",
48
+ :n_education_years => 7,
49
+ :marital_status => "Never-married",
50
+ :occupation => "Machine-op-inspct",
51
+ :relationship => "Own-child",
52
+ :race => "Black",
53
+ :sex => "Male",
54
+ :capital_gain => 0,
55
+ :capital_loss => 0,
56
+ :hours_per_week => 40,
57
+ :native_country => "United-States",
58
+ :label => "<=50K.",
59
+ },
60
+ @dataset.each.next.to_h)
111
61
  end
112
62
  end
113
63
 
@@ -114,13 +114,13 @@ class AozoraBunkoTest < Test::Unit::TestCase
114
114
  book = Datasets::AozoraBunko::Book.new
115
115
  book.cache_path = @cache_path
116
116
 
117
- book.title_id = '000750'
118
- book.person_id = '000146'
119
- book.html_file_url = 'http://www.lcv.ne.jp/~ibs52086/fire/'
117
+ book.title_id = '061551'
118
+ book.person_id = '002225'
119
+ book.html_file_url = 'http://minken.party/2019/03/09/yakunin/'
120
120
  book.html_file_character_encoding = 'UTF-8'
121
121
 
122
- assert_equal('<title>種田山頭火句集 | 『草木塔抄』他 FIRE ON THE MOUNTAIN</title>',
123
- book.html.split("\n")[7])
122
+ assert_equal('<title>自由民権現代研究会</title>',
123
+ book.html[/<title>.*?<\/title>/])
124
124
  end
125
125
  end
126
126
 
@@ -8,37 +8,18 @@ class CaliforniaHousingTest < Test::Unit::TestCase
8
8
  end
9
9
 
10
10
  test("#each") do
11
- records = @dataset.each.to_a
12
- assert_equal([
13
- 20640,
14
- {
15
- median_house_value: 452600.000000,
16
- median_income: 8.325200,
17
- housing_median_age: 41.000000,
18
- total_rooms: 880.000000,
19
- total_bedrooms: 129.000000,
20
- population: 322.000000,
21
- households: 126.000000,
22
- latitude: 37.880000,
23
- longitude: -122.230000
24
- },
25
- {
26
- median_house_value: 89400.000000,
27
- median_income: 2.388600,
28
- housing_median_age: 16.000000,
29
- total_rooms: 2785.000000,
30
- total_bedrooms: 616.000000,
31
- population: 1387.000000,
32
- households: 530.000000,
33
- latitude: 39.370000,
34
- longitude: -121.240000
35
- },
36
- ],
37
- [
38
- records.size,
39
- records[0].to_h,
40
- records[-1].to_h
41
- ])
11
+ assert_equal({
12
+ median_house_value: 452600.000000,
13
+ median_income: 8.325200,
14
+ housing_median_age: 41.000000,
15
+ total_rooms: 880.000000,
16
+ total_bedrooms: 129.000000,
17
+ population: 322.000000,
18
+ households: 126.000000,
19
+ latitude: 37.880000,
20
+ longitude: -122.230000,
21
+ },
22
+ @dataset.each.next.to_h)
42
23
  end
43
24
 
44
25
  sub_test_case("#metadata") do
@@ -14,7 +14,7 @@ class CLDRPluralsTest < Test::Unit::TestCase
14
14
  test("#each") do
15
15
  locales = @dataset.each.to_a
16
16
  assert_equal([
17
- 219,
17
+ 222,
18
18
  locale("bm",
19
19
  [
20
20
  rule("other",
@@ -8,39 +8,19 @@ class DiamondsTest < Test::Unit::TestCase
8
8
  end
9
9
 
10
10
  test("#each") do
11
- records = @dataset.each.to_a
12
- assert_equal([
13
- 53940,
14
- {
15
- carat: 0.23,
16
- clarity: "SI2",
17
- color: "E",
18
- cut: "Ideal",
19
- depth: 61.5,
20
- price: 326,
21
- table: 55.0,
22
- x: 3.95,
23
- y: 3.98,
24
- z: 2.43,
25
- },
26
- {
27
- carat: 0.75,
28
- clarity: "SI2",
29
- color: "D",
30
- cut: "Ideal",
31
- depth: 62.2,
32
- price: 2757,
33
- table: 55.0,
34
- x: 5.83,
35
- y: 5.87,
36
- z: 3.64,
37
- },
38
- ],
39
- [
40
- records.size,
41
- records[0].to_h,
42
- records[-1].to_h
43
- ])
11
+ assert_equal({
12
+ carat: 0.23,
13
+ clarity: "SI2",
14
+ color: "E",
15
+ cut: "Ideal",
16
+ depth: 61.5,
17
+ price: 326,
18
+ table: 55.0,
19
+ x: 3.95,
20
+ y: 3.98,
21
+ z: 2.43,
22
+ },
23
+ @dataset.each.next.to_h)
44
24
  end
45
25
 
46
26
  sub_test_case("#metadata") do
@@ -17,7 +17,7 @@ class DownloaderTest < Test::Unit::TestCase
17
17
  output_path = @tmp_dir + "file"
18
18
  downloader = Datasets::Downloader.new(first_url)
19
19
 
20
- downloader.define_singleton_method(:start_http) do |url, headers|
20
+ downloader.define_singleton_method(:start_http) do |url, fallback_urls, headers|
21
21
  raise Datasets::Downloader::TooManyRedirects, "too many redirections: #{last_url}"
22
22
  end
23
23
 
@@ -4,47 +4,23 @@ class GeoloniaTest < Test::Unit::TestCase
4
4
  end
5
5
 
6
6
  test('#each') do
7
- records = @dataset.each.to_a
8
- assert_equal([
9
- 277616,
10
- {
11
- :prefecture_code => "01",
12
- :prefecture_name => "北海道",
13
- :prefecture_kana => "ホッカイドウ",
14
- :prefecture_romaji => "HOKKAIDO",
15
- :municipality_code => "01101",
16
- :municipality_name => "札幌市中央区",
17
- :municipality_kana => "サッポロシチュウオウク",
18
- :municipality_romaji => "SAPPORO SHI CHUO KU",
19
- :street_name => "旭ケ丘一丁目",
20
- :street_kana => "アサヒガオカ 1",
21
- :street_romaji => "ASAHIGAOKA 1",
22
- :alias => nil,
23
- :latitude => "43.04223",
24
- :longitude => "141.319722"
25
- },
26
- {
27
- :prefecture_code => "47",
28
- :prefecture_name => "沖縄県",
29
- :prefecture_kana => "オキナワケン",
30
- :prefecture_romaji => "OKINAWA KEN",
31
- :municipality_code => "47382",
32
- :municipality_name => "八重山郡与那国町",
33
- :municipality_kana => "ヤエヤマグンヨナグニチョウ",
34
- :municipality_romaji => "YAEYAMA GUN YONAGUNI CHO",
35
- :street_name => "字与那国",
36
- :street_kana => nil,
37
- :street_romaji => nil,
38
- :alias => nil,
39
- :latitude => "24.455925",
40
- :longitude => "122.987678",
41
- },
42
- ],
43
- [
44
- records.size,
45
- records[0].to_h,
46
- records[-1].to_h,
47
- ])
7
+ assert_equal({
8
+ :prefecture_code => "01",
9
+ :prefecture_name => "北海道",
10
+ :prefecture_kana => "ホッカイドウ",
11
+ :prefecture_romaji => "HOKKAIDO",
12
+ :municipality_code => "01101",
13
+ :municipality_name => "札幌市中央区",
14
+ :municipality_kana => "サッポロシチュウオウク",
15
+ :municipality_romaji => "SAPPORO SHI CHUO KU",
16
+ :street_name => "旭ケ丘一丁目",
17
+ :street_kana => "アサヒガオカ 1",
18
+ :street_romaji => "ASAHIGAOKA 1",
19
+ :alias => nil,
20
+ :latitude => "43.04223",
21
+ :longitude => "141.319722",
22
+ },
23
+ @dataset.each.next.to_h)
48
24
  end
49
25
 
50
26
  sub_test_case("#metadata") do