red-datasets 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. checksums.yaml +4 -4
  2. data/README.md +2 -0
  3. data/Rakefile +10 -0
  4. data/doc/text/news.md +36 -0
  5. data/lib/datasets/california-housing.rb +1 -1
  6. data/lib/datasets/dataset.rb +2 -2
  7. data/lib/datasets/downloader.rb +51 -17
  8. data/lib/datasets/fashion-mnist.rb +6 -2
  9. data/lib/datasets/ggplot2-dataset.rb +3 -3
  10. data/lib/datasets/house-of-councillor.rb +169 -0
  11. data/lib/datasets/house-of-representative.rb +107 -0
  12. data/lib/datasets/japanese-date-parser.rb +38 -0
  13. data/lib/datasets/kuzushiji-mnist.rb +6 -2
  14. data/lib/datasets/lazy.rb +2 -0
  15. data/lib/datasets/libsvm-dataset-list.rb +1 -1
  16. data/lib/datasets/mnist.rb +12 -6
  17. data/lib/datasets/nagoya-university-conversation-corpus.rb +2 -2
  18. data/lib/datasets/penguins.rb +28 -5
  19. data/lib/datasets/postal-code-japan.rb +3 -3
  20. data/lib/datasets/quora-duplicate-question-pair.rb +1 -1
  21. data/lib/datasets/version.rb +1 -1
  22. data/lib/datasets/wikipedia-kyoto-japanese-english.rb +2 -2
  23. data/lib/datasets/wikipedia.rb +2 -2
  24. data/test/japanese-date-parser-test.rb +27 -0
  25. data/test/test-adult.rb +36 -86
  26. data/test/test-aozora-bunko.rb +5 -5
  27. data/test/test-california-housing.rb +12 -31
  28. data/test/test-cldr-plurals.rb +1 -1
  29. data/test/test-diamonds.rb +13 -33
  30. data/test/test-downloader.rb +1 -1
  31. data/test/test-geolonia.rb +17 -41
  32. data/test/test-house-of-councillor.rb +223 -0
  33. data/test/test-house-of-representative.rb +54 -0
  34. data/test/test-nagoya-university-conversation-corpus.rb +17 -69
  35. data/test/test-postal-code-japan.rb +7 -0
  36. data/test/test-quora-duplicate-question-pair.rb +7 -21
  37. data/test/test-rdataset.rb +24 -22
  38. data/test/test-sudachi-synonym-dictionary.rb +12 -31
  39. data/test/test-wikipedia.rb +5 -5
  40. metadata +12 -6
@@ -28,8 +28,8 @@ module Datasets
28
28
 
29
29
  def initialize
30
30
  super()
31
- @metadata.id = 'nagoya-university-conversation-curpus'
32
- @metadata.name = 'Nagoya University Conversation Curpus'
31
+ @metadata.id = 'nagoya-university-conversation-corpus'
32
+ @metadata.name = 'Nagoya University Conversation Corpus'
33
33
  @metadata.url = 'https://mmsrv.ninjal.ac.jp/nucc/'
34
34
  @metadata.licenses = ['CC-BY-NC-ND-4.0']
35
35
  @metadata.description = <<~DESCRIPTION
@@ -26,7 +26,9 @@ module Datasets
26
26
  super
27
27
  species = self.class.name.split("::").last.downcase
28
28
  @metadata.id = "palmerpenguins-#{species}"
29
- @metadata.url = self.class::URL
29
+ package_id = http_parameters["packageid"]
30
+ @metadata.url = "https://portal.edirepository.org/nis/mapbrowse" +
31
+ "?packageid=#{package_id}"
30
32
  @metadata.licenses = ["CC0-1.0"]
31
33
  @data_path = cache_dir_path + "#{species}.csv"
32
34
  end
@@ -46,7 +48,10 @@ module Datasets
46
48
  end
47
49
 
48
50
  private def open_data
49
- download(data_path, metadata.url)
51
+ download(data_path,
52
+ "https://portal.edirepository.org/nis/dataviewer",
53
+ http_method: :post,
54
+ http_parameters: http_parameters)
50
55
  CSV.open(data_path, headers: :first_row, converters: :all) do |csv|
51
56
  yield csv
52
57
  end
@@ -56,19 +61,37 @@ module Datasets
56
61
  # Adelie penguin data from: https://doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86
57
62
  class Adelie < SpeciesBase
58
63
  DOI = "doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86".freeze
59
- URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.219.3&entityid=002f3893385f710df69eeebe893144ff".freeze
64
+
65
+ private def http_parameters
66
+ {
67
+ "packageid" => "knb-lter-pal.219.3",
68
+ "entityid" => "002f3893385f710df69eeebe893144ff",
69
+ }
70
+ end
60
71
  end
61
72
 
62
73
  # Chinstrap penguin data from: https://doi.org/10.6073/pasta/409c808f8fc9899d02401bdb04580af7
63
74
  class Chinstrap < SpeciesBase
64
75
  DOI = "doi.org/10.6073/pasta/409c808f8fc9899d02401bdb04580af7".freeze
65
- URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.221.2&entityid=fe853aa8f7a59aa84cdd3197619ef462".freeze
76
+
77
+ private def http_parameters
78
+ {
79
+ "packageid" => "knb-lter-pal.221.2",
80
+ "entityid" => "fe853aa8f7a59aa84cdd3197619ef462",
81
+ }
82
+ end
66
83
  end
67
84
 
68
85
  # Gentoo penguin data from: https://doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce
69
86
  class Gentoo < SpeciesBase
70
87
  DOI = "doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce".freeze
71
- URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.220.3&entityid=e03b43c924f226486f2f0ab6709d2381".freeze
88
+
89
+ private def http_parameters
90
+ {
91
+ "packageid" => "knb-lter-pal.220.3",
92
+ "entityid" => "e03b43c924f226486f2f0ab6709d2381",
93
+ }
94
+ end
72
95
  end
73
96
  end
74
97
 
@@ -41,7 +41,7 @@ module Datasets
41
41
  super()
42
42
  @reading = reading
43
43
  unless VALID_READINGS.include?(@reading)
44
- message = ":reading must be one of ["
44
+ message = +":reading must be one of ["
45
45
  message << VALID_READINGS.collect(&:inspect).join(", ")
46
46
  message << "]: #{@reading.inspect}"
47
47
  raise ArgumentError, message
@@ -104,14 +104,14 @@ module Datasets
104
104
 
105
105
  private
106
106
  def open_data
107
- data_url = "https://www.post.japanpost.jp/zipcode/dl"
107
+ data_url = +"https://www.post.japanpost.jp/zipcode/dl"
108
108
  case @reading
109
109
  when :lowercase
110
110
  data_url << "/kogaki/zip/ken_all.zip"
111
111
  when :uppercase
112
112
  data_url << "/oogaki/zip/ken_all.zip"
113
113
  when :romaji
114
- data_url << "/roman/ken_all_rome.zip"
114
+ data_url << "/roman/KEN_ALL_ROME.zip"
115
115
  end
116
116
  data_path = cache_dir_path + "#{@reading}-ken-all.zip"
117
117
  download(data_path, data_url)
@@ -43,7 +43,7 @@ module Datasets
43
43
  data_path = cache_dir_path + "quora_duplicate_questions.tsv"
44
44
  data_url = "https://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
45
45
  download(data_path, data_url)
46
- CSV.open(data_path, col_sep: "\t", headers: true, converters: :all) do |csv|
46
+ CSV.open(data_path, col_sep: "\t", headers: true, converters: :integer) do |csv|
47
47
  yield(csv)
48
48
  end
49
49
  end
@@ -1,3 +1,3 @@
1
1
  module Datasets
2
- VERSION = "0.1.7"
2
+ VERSION = "0.1.9"
3
3
  end
@@ -126,7 +126,7 @@ articles (related to Kyoto) into English.
126
126
  @sentence = nil
127
127
  @text_container_stack = []
128
128
  @element_stack = []
129
- @text_stack = [""]
129
+ @text_stack = [+""]
130
130
  end
131
131
 
132
132
  def tag_start(name, attributes)
@@ -207,7 +207,7 @@ articles (related to Kyoto) into English.
207
207
  private
208
208
  def push_stacks(name, attributes)
209
209
  @element_stack.push({name: name, attributes: attributes})
210
- @text_stack.push("")
210
+ @text_stack.push(+"")
211
211
  end
212
212
 
213
213
  def pop_stacks
@@ -90,7 +90,7 @@ module Datasets
90
90
  @contributor = nil
91
91
  @current_tag = nil
92
92
  @tag_stack = []
93
- @text_stack = [""]
93
+ @text_stack = [+""]
94
94
  @first_page = true
95
95
  end
96
96
 
@@ -172,7 +172,7 @@ module Datasets
172
172
 
173
173
  def push_stacks(tag)
174
174
  @tag_stack << tag
175
- @text_stack << ""
175
+ @text_stack << +""
176
176
  end
177
177
 
178
178
  def pop_stacks
@@ -0,0 +1,27 @@
1
+ class JapaneseDateParserTest < Test::Unit::TestCase
2
+ def setup
3
+ @parser = Datasets::JapaneseDateParser.new
4
+ end
5
+
6
+ data("month and day with leading a space in Heisei", ["H10.01.01", "平成10年 1月 1日"])
7
+ data("month with leading a space in Heisei", ["H10.01.10", "平成10年 1月10日"])
8
+ data(" day with leading a space in Heisei", ["H10.10.01", "平成10年10月 1日"])
9
+ data(" without leading a space in Heisei", ["H10.10.10", "平成10年10月10日"])
10
+ data("year, month and day with leading a space in Reiwa", ["R02.01.01", "令和 2年 1月 1日"])
11
+ data("year, month with leading a space in Reiwa", ["R02.01.10", "令和 2年 1月10日"])
12
+ data("year, day with leading a space in Reiwa", ["R02.10.01", "令和 2年10月 1日"])
13
+ data("year, without leading a space in Reiwa", ["R02.10.10", "令和 2年10月10日"])
14
+ data("boundary within Heisei", ["H31.04.30", "平成31年 4月30日"])
15
+ data("boundary within Reiwa", ["R01.05.01", "令和元年 5月 1日"])
16
+ test("#parse") do
17
+ expected_jisx0301, japanese_date_string = data
18
+ assert_equal(expected_jisx0301, @parser.parse(japanese_date_string).jisx0301)
19
+ end
20
+
21
+ test("unsupported era initial range") do
22
+ expected_message = "era must be one of [平成, 令和]: 昭和"
23
+ assert_raise(Datasets::JapaneseDateParser::UnsupportedEraInitialRange.new(expected_message)) do
24
+ @parser.parse("昭和元年 1月 1日")
25
+ end
26
+ end
27
+ end
data/test/test-adult.rb CHANGED
@@ -9,49 +9,24 @@ class AdultTest < Test::Unit::TestCase
9
9
  end
10
10
 
11
11
  test("#each") do
12
- records = @dataset.each.to_a
13
- assert_equal([
14
- 32561,
15
- {
16
- :age => 39,
17
- :work_class => "State-gov",
18
- :final_weight => 77516,
19
- :education => "Bachelors",
20
- :n_education_years => 13,
21
- :marital_status => "Never-married",
22
- :occupation => "Adm-clerical",
23
- :relationship => "Not-in-family",
24
- :race => "White",
25
- :sex => "Male",
26
- :capital_gain => 2174,
27
- :capital_loss => 0,
28
- :hours_per_week => 40,
29
- :native_country => "United-States",
30
- :label => "<=50K"
31
- },
32
- {
33
- :age => 52,
34
- :work_class => "Self-emp-inc",
35
- :final_weight => 287927,
36
- :education => "HS-grad",
37
- :n_education_years => 9,
38
- :marital_status => "Married-civ-spouse",
39
- :occupation => "Exec-managerial",
40
- :relationship => "Wife",
41
- :race => "White",
42
- :sex => "Female",
43
- :capital_gain => 15024,
44
- :capital_loss => 0,
45
- :hours_per_week => 40,
46
- :native_country => "United-States",
47
- :label => ">50K"
48
- }
49
- ],
50
- [
51
- records.size,
52
- records[0].to_h,
53
- records[-1].to_h
54
- ])
12
+ assert_equal({
13
+ :age => 39,
14
+ :work_class => "State-gov",
15
+ :final_weight => 77516,
16
+ :education => "Bachelors",
17
+ :n_education_years => 13,
18
+ :marital_status => "Never-married",
19
+ :occupation => "Adm-clerical",
20
+ :relationship => "Not-in-family",
21
+ :race => "White",
22
+ :sex => "Male",
23
+ :capital_gain => 2174,
24
+ :capital_loss => 0,
25
+ :hours_per_week => 40,
26
+ :native_country => "United-States",
27
+ :label => "<=50K"
28
+ },
29
+ @dataset.each.next.to_h)
55
30
  end
56
31
  end
57
32
 
@@ -65,49 +40,24 @@ class AdultTest < Test::Unit::TestCase
65
40
  end
66
41
 
67
42
  test("#each") do
68
- records = @dataset.each.to_a
69
- assert_equal([
70
- 16281,
71
- {
72
- :age => 25,
73
- :work_class => "Private",
74
- :final_weight => 226802,
75
- :education => "11th",
76
- :n_education_years => 7,
77
- :marital_status => "Never-married",
78
- :occupation => "Machine-op-inspct",
79
- :relationship => "Own-child",
80
- :race => "Black",
81
- :sex => "Male",
82
- :capital_gain => 0,
83
- :capital_loss => 0,
84
- :hours_per_week => 40,
85
- :native_country => "United-States",
86
- :label => "<=50K."
87
- },
88
- {
89
- :age => 35,
90
- :work_class => "Self-emp-inc",
91
- :final_weight => 182148,
92
- :education => "Bachelors",
93
- :n_education_years => 13,
94
- :marital_status => "Married-civ-spouse",
95
- :occupation => "Exec-managerial",
96
- :relationship => "Husband",
97
- :race => "White",
98
- :sex => "Male",
99
- :capital_gain => 0,
100
- :capital_loss => 0,
101
- :hours_per_week => 60,
102
- :native_country => "United-States",
103
- :label => ">50K."
104
- }
105
- ],
106
- [
107
- records.size,
108
- records[0].to_h,
109
- records[-1].to_h
110
- ])
43
+ assert_equal({
44
+ :age => 25,
45
+ :work_class => "Private",
46
+ :final_weight => 226802,
47
+ :education => "11th",
48
+ :n_education_years => 7,
49
+ :marital_status => "Never-married",
50
+ :occupation => "Machine-op-inspct",
51
+ :relationship => "Own-child",
52
+ :race => "Black",
53
+ :sex => "Male",
54
+ :capital_gain => 0,
55
+ :capital_loss => 0,
56
+ :hours_per_week => 40,
57
+ :native_country => "United-States",
58
+ :label => "<=50K.",
59
+ },
60
+ @dataset.each.next.to_h)
111
61
  end
112
62
  end
113
63
 
@@ -114,13 +114,13 @@ class AozoraBunkoTest < Test::Unit::TestCase
114
114
  book = Datasets::AozoraBunko::Book.new
115
115
  book.cache_path = @cache_path
116
116
 
117
- book.title_id = '000750'
118
- book.person_id = '000146'
119
- book.html_file_url = 'http://www.lcv.ne.jp/~ibs52086/fire/'
117
+ book.title_id = '061551'
118
+ book.person_id = '002225'
119
+ book.html_file_url = 'http://minken.party/2019/03/09/yakunin/'
120
120
  book.html_file_character_encoding = 'UTF-8'
121
121
 
122
- assert_equal('<title>種田山頭火句集 | 『草木塔抄』他 FIRE ON THE MOUNTAIN</title>',
123
- book.html.split("\n")[7])
122
+ assert_equal('<title>自由民権現代研究会</title>',
123
+ book.html[/<title>.*?<\/title>/])
124
124
  end
125
125
  end
126
126
 
@@ -8,37 +8,18 @@ class CaliforniaHousingTest < Test::Unit::TestCase
8
8
  end
9
9
 
10
10
  test("#each") do
11
- records = @dataset.each.to_a
12
- assert_equal([
13
- 20640,
14
- {
15
- median_house_value: 452600.000000,
16
- median_income: 8.325200,
17
- housing_median_age: 41.000000,
18
- total_rooms: 880.000000,
19
- total_bedrooms: 129.000000,
20
- population: 322.000000,
21
- households: 126.000000,
22
- latitude: 37.880000,
23
- longitude: -122.230000
24
- },
25
- {
26
- median_house_value: 89400.000000,
27
- median_income: 2.388600,
28
- housing_median_age: 16.000000,
29
- total_rooms: 2785.000000,
30
- total_bedrooms: 616.000000,
31
- population: 1387.000000,
32
- households: 530.000000,
33
- latitude: 39.370000,
34
- longitude: -121.240000
35
- },
36
- ],
37
- [
38
- records.size,
39
- records[0].to_h,
40
- records[-1].to_h
41
- ])
11
+ assert_equal({
12
+ median_house_value: 452600.000000,
13
+ median_income: 8.325200,
14
+ housing_median_age: 41.000000,
15
+ total_rooms: 880.000000,
16
+ total_bedrooms: 129.000000,
17
+ population: 322.000000,
18
+ households: 126.000000,
19
+ latitude: 37.880000,
20
+ longitude: -122.230000,
21
+ },
22
+ @dataset.each.next.to_h)
42
23
  end
43
24
 
44
25
  sub_test_case("#metadata") do
@@ -14,7 +14,7 @@ class CLDRPluralsTest < Test::Unit::TestCase
14
14
  test("#each") do
15
15
  locales = @dataset.each.to_a
16
16
  assert_equal([
17
- 219,
17
+ 220,
18
18
  locale("bm",
19
19
  [
20
20
  rule("other",
@@ -8,39 +8,19 @@ class DiamondsTest < Test::Unit::TestCase
8
8
  end
9
9
 
10
10
  test("#each") do
11
- records = @dataset.each.to_a
12
- assert_equal([
13
- 53940,
14
- {
15
- carat: 0.23,
16
- clarity: "SI2",
17
- color: "E",
18
- cut: "Ideal",
19
- depth: 61.5,
20
- price: 326,
21
- table: 55.0,
22
- x: 3.95,
23
- y: 3.98,
24
- z: 2.43,
25
- },
26
- {
27
- carat: 0.75,
28
- clarity: "SI2",
29
- color: "D",
30
- cut: "Ideal",
31
- depth: 62.2,
32
- price: 2757,
33
- table: 55.0,
34
- x: 5.83,
35
- y: 5.87,
36
- z: 3.64,
37
- },
38
- ],
39
- [
40
- records.size,
41
- records[0].to_h,
42
- records[-1].to_h
43
- ])
11
+ assert_equal({
12
+ carat: 0.23,
13
+ clarity: "SI2",
14
+ color: "E",
15
+ cut: "Ideal",
16
+ depth: 61.5,
17
+ price: 326,
18
+ table: 55.0,
19
+ x: 3.95,
20
+ y: 3.98,
21
+ z: 2.43,
22
+ },
23
+ @dataset.each.next.to_h)
44
24
  end
45
25
 
46
26
  sub_test_case("#metadata") do
@@ -17,7 +17,7 @@ class DownloaderTest < Test::Unit::TestCase
17
17
  output_path = @tmp_dir + "file"
18
18
  downloader = Datasets::Downloader.new(first_url)
19
19
 
20
- downloader.define_singleton_method(:start_http) do |url, headers|
20
+ downloader.define_singleton_method(:start_http) do |url, fallback_urls, headers|
21
21
  raise Datasets::Downloader::TooManyRedirects, "too many redirections: #{last_url}"
22
22
  end
23
23
 
@@ -4,47 +4,23 @@ class GeoloniaTest < Test::Unit::TestCase
4
4
  end
5
5
 
6
6
  test('#each') do
7
- records = @dataset.each.to_a
8
- assert_equal([
9
- 277616,
10
- {
11
- :prefecture_code => "01",
12
- :prefecture_name => "北海道",
13
- :prefecture_kana => "ホッカイドウ",
14
- :prefecture_romaji => "HOKKAIDO",
15
- :municipality_code => "01101",
16
- :municipality_name => "札幌市中央区",
17
- :municipality_kana => "サッポロシチュウオウク",
18
- :municipality_romaji => "SAPPORO SHI CHUO KU",
19
- :street_name => "旭ケ丘一丁目",
20
- :street_kana => "アサヒガオカ 1",
21
- :street_romaji => "ASAHIGAOKA 1",
22
- :alias => nil,
23
- :latitude => "43.04223",
24
- :longitude => "141.319722"
25
- },
26
- {
27
- :prefecture_code => "47",
28
- :prefecture_name => "沖縄県",
29
- :prefecture_kana => "オキナワケン",
30
- :prefecture_romaji => "OKINAWA KEN",
31
- :municipality_code => "47382",
32
- :municipality_name => "八重山郡与那国町",
33
- :municipality_kana => "ヤエヤマグンヨナグニチョウ",
34
- :municipality_romaji => "YAEYAMA GUN YONAGUNI CHO",
35
- :street_name => "字与那国",
36
- :street_kana => nil,
37
- :street_romaji => nil,
38
- :alias => nil,
39
- :latitude => "24.455925",
40
- :longitude => "122.987678",
41
- },
42
- ],
43
- [
44
- records.size,
45
- records[0].to_h,
46
- records[-1].to_h,
47
- ])
7
+ assert_equal({
8
+ :prefecture_code => "01",
9
+ :prefecture_name => "北海道",
10
+ :prefecture_kana => "ホッカイドウ",
11
+ :prefecture_romaji => "HOKKAIDO",
12
+ :municipality_code => "01101",
13
+ :municipality_name => "札幌市中央区",
14
+ :municipality_kana => "サッポロシチュウオウク",
15
+ :municipality_romaji => "SAPPORO SHI CHUO KU",
16
+ :street_name => "旭ケ丘一丁目",
17
+ :street_kana => "アサヒガオカ 1",
18
+ :street_romaji => "ASAHIGAOKA 1",
19
+ :alias => nil,
20
+ :latitude => "43.04223",
21
+ :longitude => "141.319722",
22
+ },
23
+ @dataset.each.next.to_h)
48
24
  end
49
25
 
50
26
  sub_test_case("#metadata") do