red-datasets 0.1.4 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. checksums.yaml +4 -4
  2. data/README.md +23 -2
  3. data/doc/text/news.md +86 -0
  4. data/lib/datasets/adult.rb +6 -9
  5. data/lib/datasets/afinn.rb +48 -0
  6. data/lib/datasets/aozora-bunko.rb +196 -0
  7. data/lib/datasets/cache-path.rb +28 -0
  8. data/lib/datasets/california-housing.rb +60 -0
  9. data/lib/datasets/cifar.rb +2 -4
  10. data/lib/datasets/cldr-plurals.rb +2 -4
  11. data/lib/datasets/communities.rb +5 -8
  12. data/lib/datasets/dataset.rb +8 -12
  13. data/lib/datasets/diamonds.rb +26 -0
  14. data/lib/datasets/downloader.rb +6 -1
  15. data/lib/datasets/e-stat-japan.rb +2 -1
  16. data/lib/datasets/fashion-mnist.rb +4 -0
  17. data/lib/datasets/fuel-economy.rb +35 -0
  18. data/lib/datasets/geolonia.rb +67 -0
  19. data/lib/datasets/ggplot2-dataset.rb +79 -0
  20. data/lib/datasets/hepatitis.rb +5 -8
  21. data/lib/datasets/iris.rb +5 -8
  22. data/lib/datasets/ita-corpus.rb +57 -0
  23. data/lib/datasets/kuzushiji-mnist.rb +16 -0
  24. data/lib/datasets/libsvm-dataset-list.rb +5 -8
  25. data/lib/datasets/libsvm.rb +3 -4
  26. data/lib/datasets/license.rb +26 -0
  27. data/lib/datasets/livedoor-news.rb +80 -0
  28. data/lib/datasets/metadata.rb +14 -0
  29. data/lib/datasets/mnist.rb +7 -7
  30. data/lib/datasets/mushroom.rb +5 -8
  31. data/lib/datasets/penguins.rb +4 -8
  32. data/lib/datasets/penn-treebank.rb +2 -4
  33. data/lib/datasets/pmjt-dataset-list.rb +67 -0
  34. data/lib/datasets/postal-code-japan.rb +2 -6
  35. data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
  36. data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
  37. data/lib/datasets/seaborn.rb +90 -0
  38. data/lib/datasets/sudachi-synonym-dictionary.rb +5 -11
  39. data/lib/datasets/version.rb +1 -1
  40. data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
  41. data/lib/datasets/wikipedia.rb +4 -5
  42. data/lib/datasets/wine.rb +6 -9
  43. data/lib/datasets/zip-extractor.rb +36 -0
  44. data/lib/datasets.rb +14 -2
  45. data/red-datasets.gemspec +1 -1
  46. data/test/helper.rb +21 -0
  47. data/test/test-afinn.rb +60 -0
  48. data/test/test-aozora-bunko.rb +190 -0
  49. data/test/test-california-housing.rb +56 -0
  50. data/test/test-cldr-plurals.rb +1 -1
  51. data/test/test-dataset.rb +15 -7
  52. data/test/test-diamonds.rb +71 -0
  53. data/test/test-fuel-economy.rb +75 -0
  54. data/test/test-geolonia.rb +64 -0
  55. data/test/test-ita-corpus.rb +69 -0
  56. data/test/test-kuzushiji-mnist.rb +137 -0
  57. data/test/test-license.rb +24 -0
  58. data/test/test-livedoor-news.rb +351 -0
  59. data/test/test-metadata.rb +36 -0
  60. data/test/test-penguins.rb +1 -1
  61. data/test/test-pmjt-dataset-list.rb +50 -0
  62. data/test/test-quora-duplicate-question-pair.rb +33 -0
  63. data/test/test-rdataset.rb +246 -0
  64. data/test/{test-seaborn-data.rb → test-seaborn.rb} +70 -4
  65. data/test/test-sudachi-synonym-dictionary.rb +5 -5
  66. data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
  67. metadata +58 -14
  68. data/lib/datasets/seaborn-data.rb +0 -49
  69. data/test/test-rdatasets.rb +0 -136
@@ -0,0 +1,35 @@
1
+ require_relative "ggplot2-dataset"
2
+
3
+ module Datasets
4
+ class FuelEconomy < Ggplot2Dataset
5
+ Record = Struct.new(:manufacturer,
6
+ :model,
7
+ :displacement,
8
+ :year,
9
+ :n_cylinders,
10
+ :transmission,
11
+ :drive_train,
12
+ :city_mpg,
13
+ :highway_mpg,
14
+ :fuel,
15
+ :type)
16
+
17
+ def initialize
18
+ super("mpg")
19
+ @metadata.id = "fuel-economy"
20
+ @metadata.name = "Fuel economy"
21
+ @metadata.licenses = ["CC0-1.0"]
22
+ end
23
+
24
+ COLUMN_NAME_MAPPING = {
25
+ "displ" => "displacement",
26
+ "cyl" => "n_cylinders",
27
+ "trans" => "transmissions",
28
+ "drv" => "drive_train",
29
+ "cty" => "city_mpg",
30
+ "hwy" => "highway_mpg",
31
+ "fl" => "fuel",
32
+ "class" => "type",
33
+ }
34
+ end
35
+ end
@@ -0,0 +1,67 @@
1
+ require 'csv'
2
+
3
+ require_relative 'dataset'
4
+
5
+ module Datasets
6
+ class Geolonia < Dataset
7
+ Record = Struct.new(:prefecture_code,
8
+ :prefecture_name,
9
+ :prefecture_kana,
10
+ :prefecture_romaji,
11
+ :municipality_code,
12
+ :municipality_name,
13
+ :municipality_kana,
14
+ :municipality_romaji,
15
+ :street_name,
16
+ :street_kana,
17
+ :street_romaji,
18
+ :alias,
19
+ :latitude,
20
+ :longitude)
21
+
22
+ def initialize
23
+ super
24
+ @metadata.id = 'geolonia'
25
+ @metadata.name = 'Geolonia'
26
+ @metadata.url = 'https://github.com/geolonia/japanese-addresses'
27
+ @metadata.licenses = ["CC-BY-4.0"]
28
+ @metadata.description = lambda do
29
+ fetch_readme
30
+ end
31
+ end
32
+
33
+ def each
34
+ return to_enum(__method__) unless block_given?
35
+
36
+ open_data do |csv|
37
+ csv.readline
38
+ csv.each do |row|
39
+ record = Record.new(*row)
40
+ yield(record)
41
+ end
42
+ end
43
+ end
44
+
45
+ private
46
+ def download_base_url
47
+ "https://raw.githubusercontent.com/geolonia/japanese-addresses/master"
48
+ end
49
+
50
+ def open_data
51
+ data_path = cache_dir_path + 'latest.csv'
52
+ data_url = "#{download_base_url}/data/latest.csv"
53
+ download(data_path, data_url)
54
+ CSV.open(data_path) do |csv|
55
+ yield(csv)
56
+ end
57
+ end
58
+
59
+ def fetch_readme
60
+ readme_base_name = "README.md"
61
+ readme_path = cache_dir_path + readme_base_name
62
+ readme_url = "#{download_base_url}/#{readme_base_name}"
63
+ download(readme_path, readme_url)
64
+ readme_path.read.split(/^## API/, 2)[0].strip
65
+ end
66
+ end
67
+ end
@@ -0,0 +1,79 @@
1
+ module Datasets
2
+ class Ggplot2Dataset < Dataset
3
+ def initialize(ggplot2_dataset_name)
4
+ super()
5
+ @ggplot2_dataset_name = ggplot2_dataset_name
6
+ @metadata.url =
7
+ "https://ggplot2.tidyverse.org/reference/#{@ggplot2_dataset_name}.html"
8
+ @metadata.description = lambda do
9
+ fetch_description
10
+ end
11
+ end
12
+
13
+ def each
14
+ return to_enum(__method__) unless block_given?
15
+
16
+ data_base_name = "#{@ggplot2_dataset_name}.csv"
17
+ data_path = cache_dir_path + data_base_name
18
+ data_url = "#{download_base_url}/data-raw/#{data_base_name}"
19
+ download(data_path, data_url)
20
+ CSV.open(data_path, headers: :first_row, converters: :all) do |csv|
21
+ record_class = self.class::Record
22
+ csv.each do |row|
23
+ record = record_class.new(*row.fields)
24
+ yield record
25
+ end
26
+ end
27
+ end
28
+
29
+ private
30
+ def download_base_url
31
+ "https://raw.githubusercontent.com/tidyverse/ggplot2/main"
32
+ end
33
+
34
+ def fetch_description
35
+ data_r_base_name = "data.R"
36
+ data_r_path = cache_dir_path + data_r_base_name
37
+ data_r_url = "#{download_base_url}/R/#{data_r_base_name}"
38
+ download(data_r_path, data_r_url)
39
+ descriptions = {}
40
+ comment = ""
41
+ File.open(data_r_path) do |data_r|
42
+ data_r.each_line do |line|
43
+ case line.chomp
44
+ when /\A#'/
45
+ comment_content = Regexp.last_match.post_match
46
+ unless comment_content.empty?
47
+ comment_content = comment_content[1..-1]
48
+ end
49
+ comment << comment_content
50
+ comment << "\n"
51
+ when /\A"(.+)"\z/
52
+ name = Regexp.last_match[1]
53
+ descriptions[name] = parse_roxygen(comment.rstrip)
54
+ comment = ""
55
+ end
56
+ end
57
+ descriptions[@ggplot2_dataset_name]
58
+ end
59
+ end
60
+
61
+ def parse_roxygen(roxygen)
62
+ column_name_mapping = self.class::COLUMN_NAME_MAPPING
63
+ roxygen
64
+ .gsub(/\\url\{(.*?)\}/, "\\1")
65
+ .gsub(/^@format /, "")
66
+ .gsub(/\\describe\{(.*)\}/m) do
67
+ content = $1
68
+ content.gsub(/\\item\{(.*?)\}\{(.*?)\}/m) do
69
+ column_name = $1
70
+ description = $2
71
+ column_name = column_name_mapping[column_name] || column_name
72
+ description = description
73
+ .gsub(/\\\$/, "$")
74
+ "* #{column_name}: #{description}"
75
+ end
76
+ end
77
+ end
78
+ end
79
+ end
@@ -163,6 +163,7 @@ module Datasets
163
163
  @metadata.id = "hepatitis"
164
164
  @metadata.name = "Hepatitis"
165
165
  @metadata.url = "https://archive.ics.uci.edu/ml/datasets/hepatitis"
166
+ @metadata.licenses = ["CC-BY-4.0"]
166
167
  @metadata.description = lambda do
167
168
  read_names
168
169
  end
@@ -186,10 +187,8 @@ module Datasets
186
187
 
187
188
  def open_data
188
189
  data_path = cache_dir_path + "hepatitis.csv"
189
- unless data_path.exist?
190
- data_url = "#{base_url}/hepatitis.data"
191
- download(data_path, data_url)
192
- end
190
+ data_url = "#{base_url}/hepatitis.data"
191
+ download(data_path, data_url)
193
192
  CSV.open(data_path) do |csv|
194
193
  yield(csv)
195
194
  end
@@ -197,10 +196,8 @@ module Datasets
197
196
 
198
197
  def read_names
199
198
  names_path = cache_dir_path + "hepatitis.names"
200
- unless names_path.exist?
201
- names_url = "#{base_url}/hepatitis.names"
202
- download(names_path, names_url)
203
- end
199
+ names_url = "#{base_url}/hepatitis.names"
200
+ download(names_path, names_url)
204
201
  names_path.read
205
202
  end
206
203
  end
data/lib/datasets/iris.rb CHANGED
@@ -15,6 +15,7 @@ module Datasets
15
15
  @metadata.id = "iris"
16
16
  @metadata.name = "Iris"
17
17
  @metadata.url = "https://archive.ics.uci.edu/ml/datasets/Iris"
18
+ @metadata.licenses = ["CC-BY-4.0"]
18
19
  @metadata.description = lambda do
19
20
  read_names
20
21
  end
@@ -35,10 +36,8 @@ module Datasets
35
36
  private
36
37
  def open_data
37
38
  data_path = cache_dir_path + "iris.csv"
38
- unless data_path.exist?
39
- data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
40
- download(data_path, data_url)
41
- end
39
+ data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
40
+ download(data_path, data_url)
42
41
  CSV.open(data_path, converters: [:numeric]) do |csv|
43
42
  yield(csv)
44
43
  end
@@ -46,10 +45,8 @@ module Datasets
46
45
 
47
46
  def read_names
48
47
  names_path = cache_dir_path + "iris.names"
49
- unless names_path.exist?
50
- names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.names"
51
- download(names_path, names_url)
52
- end
48
+ names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.names"
49
+ download(names_path, names_url)
53
50
  names_path.read
54
51
  end
55
52
  end
@@ -0,0 +1,57 @@
1
+ require_relative 'dataset'
2
+
3
+ module Datasets
4
+ class ITACorpus < Dataset
5
+ Record = Struct.new(:id,
6
+ :sentence)
7
+
8
+ def initialize(type: :emotion)
9
+ unless [:emotion, :recitation].include?(type)
10
+ raise ArgumentError, "Please set type :emotion or :recitation: #{type.inspect}"
11
+ end
12
+
13
+ super()
14
+ @type = type
15
+ @metadata.id = 'ita-corpus'
16
+ @metadata.name = 'ITA-corpus'
17
+ @metadata.url = 'https://github.com/mmorise/ita-corpus'
18
+ @metadata.licenses = ['Unlicense']
19
+ @metadata.description = lambda do
20
+ fetch_readme
21
+ end
22
+ end
23
+
24
+ def each(&block)
25
+ return to_enum(__method__) unless block_given?
26
+
27
+ data_path = cache_dir_path + "#{@type}_transcript_utf8.txt"
28
+ data_url = "#{download_base_url}/#{@type}_transcript_utf8.txt"
29
+ download(data_path, data_url)
30
+
31
+ parse_data(data_path, &block)
32
+ end
33
+
34
+ private
35
+ def fetch_readme
36
+ readme_base_name = "README.md"
37
+ readme_path = cache_dir_path + readme_base_name
38
+ readme_url = "#{download_base_url}/#{readme_base_name}"
39
+ download(readme_path, readme_url)
40
+ readme_path.read.split(/^## ファイル構成/, 2)[0].strip
41
+ end
42
+
43
+ def download_base_url
44
+ "https://raw.githubusercontent.com/mmorise/ita-corpus/main"
45
+ end
46
+
47
+ def parse_data(data_path)
48
+ File.open(data_path) do |f|
49
+ f.each_line(chomp: true) do |line|
50
+ id, sentence = line.split(':', 2)
51
+ record = Record.new(id , sentence)
52
+ yield(record)
53
+ end
54
+ end
55
+ end
56
+ end
57
+ end
@@ -0,0 +1,16 @@
1
+ require_relative 'mnist'
2
+
3
+ module Datasets
4
+ class KuzushijiMNIST < MNIST
5
+ BASE_URL = "http://codh.rois.ac.jp/kmnist/dataset/kmnist/"
6
+
7
+ private
8
+ def dataset_name
9
+ "Kuzushiji-MNIST"
10
+ end
11
+
12
+ def licenses
13
+ ["CC-BY-SA-4.0"]
14
+ end
15
+ end
16
+ end
@@ -28,6 +28,7 @@ module Datasets
28
28
  @metadata.id = "libsvm-dataset-list"
29
29
  @metadata.name = "LIBSVM dataset list"
30
30
  @metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
31
+ @metadata.licenses = ["BSD-3-Clause"]
31
32
  @metadata.description = lambda do
32
33
  extract_description
33
34
  end
@@ -51,10 +52,8 @@ module Datasets
51
52
  private
52
53
  def open_data
53
54
  data_path = cache_dir_path + "index.html"
54
- unless data_path.exist?
55
- download(data_path, @metadata.url)
56
- end
57
- ::File.open(data_path) do |input|
55
+ download(data_path, @metadata.url)
56
+ data_path.open do |input|
58
57
  yield(input)
59
58
  end
60
59
  end
@@ -78,10 +77,8 @@ module Datasets
78
77
 
79
78
  def open_detail(detail)
80
79
  data_path = cache_dir_path + detail
81
- unless data_path.exist?
82
- download(data_path, @metadata.url + detail)
83
- end
84
- ::File.open(data_path) do |input|
80
+ download(data_path, @metadata.url + detail)
81
+ data_path.open do |input|
85
82
  yield(input)
86
83
  end
87
84
  end
@@ -41,6 +41,7 @@ module Datasets
41
41
  @metadata.id = "libsvm-#{normalize_name(name)}"
42
42
  @metadata.name = "LIBSVM dataset: #{name}"
43
43
  @metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
44
+ @metadata.licenses = ["BSD-3-Clause"]
44
45
  end
45
46
 
46
47
  def each
@@ -99,13 +100,11 @@ module Datasets
99
100
 
100
101
  def open_data(&block)
101
102
  data_path = cache_dir_path + @file.name
102
- unless data_path.exist?
103
- download(data_path, @file.url)
104
- end
103
+ download(data_path, @file.url)
105
104
  if data_path.extname == ".bz2"
106
105
  extract_bz2(data_path, &block)
107
106
  else
108
- File.open(data_path, &block)
107
+ data_path.open(&block)
109
108
  end
110
109
  end
111
110
 
@@ -0,0 +1,26 @@
1
+ module Datasets
2
+ class License < Struct.new(:spdx_id,
3
+ :name,
4
+ :url)
5
+ class << self
6
+ def try_convert(value)
7
+ case value
8
+ when self
9
+ value
10
+ when String
11
+ license = new
12
+ license.spdx_id = value
13
+ license
14
+ when Hash
15
+ license = new
16
+ license.spdx_id = value[:spdx_id]
17
+ license.name = value[:name]
18
+ license.url = value[:url]
19
+ license
20
+ else
21
+ nil
22
+ end
23
+ end
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,80 @@
1
+ require_relative "dataset"
2
+ require_relative "tar-gz-readable"
3
+
4
+ module Datasets
5
+ class LivedoorNews < Dataset
6
+ include TarGzReadable
7
+ Record = Struct.new(:url,
8
+ :timestamp,
9
+ :sentence)
10
+
11
+ def initialize(type: :topic_news)
12
+ news_list = [
13
+ :topic_news,
14
+ :sports_watch,
15
+ :it_life_hack,
16
+ :kaden_channel,
17
+ :movie_enter,
18
+ :dokujo_tsushin,
19
+ :smax,
20
+ :livedoor_homme,
21
+ :peachy
22
+ ]
23
+ unless news_list.include?(type)
24
+ valid_type_labels = news_list.collect(&:inspect).join(", ")
25
+ message = ":type must be one of [#{valid_type_labels}]: #{type.inspect}"
26
+ raise ArgumentError, message
27
+ end
28
+
29
+ super()
30
+ @type = type
31
+ @metadata.id = 'livedoor-news'
32
+ @metadata.name = 'livedoor-news'
33
+ @metadata.url = 'https://www.rondhuit.com/download.html#ldcc'
34
+ @metadata.licenses = ['CC-BY-ND-2.1-JP']
35
+ @metadata.description = lambda do
36
+ fetch_readme
37
+ end
38
+ end
39
+
40
+ def each(&block)
41
+ return to_enum(__method__) unless block_given?
42
+
43
+ data_path = download_tar_gz
44
+ parse_data(data_path, &block)
45
+ end
46
+
47
+ private
48
+ def download_tar_gz
49
+ data_path = cache_dir_path + "livedoor-news.tar.gz"
50
+ data_url = "https://www.rondhuit.com/download/ldcc-20140209.tar.gz"
51
+ download(data_path, data_url)
52
+ data_path
53
+ end
54
+
55
+ def fetch_readme
56
+ data_path = download_tar_gz
57
+ target_file_name = 'text/README.txt'
58
+ open_tar_gz(data_path) do |tar|
59
+ tar.seek(target_file_name) do |entry|
60
+ return entry.read.force_encoding("UTF-8")
61
+ end
62
+ end
63
+ end
64
+
65
+ def parse_data(data_path, &block)
66
+ target_directory_name = "text/#{@type.to_s.gsub(/_/, '-')}"
67
+ open_tar_gz(data_path) do |tar|
68
+ tar.each do |entry|
69
+ next unless entry.file?
70
+ directory_name, base_name = File.split(entry.full_name)
71
+ next unless directory_name == target_directory_name
72
+ next if base_name == "LICENSE.txt"
73
+ url, timestamp, sentence = entry.read.force_encoding("UTF-8").split("\n", 3)
74
+ record = Record.new(url, Time.iso8601(timestamp), sentence)
75
+ yield(record)
76
+ end
77
+ end
78
+ end
79
+ end
80
+ end
@@ -1,9 +1,23 @@
1
+ require_relative "license"
2
+
1
3
  module Datasets
2
4
  class Metadata < Struct.new(:id,
3
5
  :name,
4
6
  :url,
5
7
  :licenses,
6
8
  :description)
9
+ def licenses=(licenses)
10
+ licenses = [licenses] unless licenses.is_a?(Array)
11
+ licenses = licenses.collect do |license|
12
+ l = License.try_convert(license)
13
+ if l.nil?
14
+ raise ArgumentError.new("invalid license: #{license.inspect}")
15
+ end
16
+ l
17
+ end
18
+ super(licenses)
19
+ end
20
+
7
21
  def description
8
22
  description_raw = super
9
23
  if description_raw.respond_to?(:call)
@@ -28,6 +28,7 @@ module Datasets
28
28
  @metadata.id = "#{dataset_name.downcase}-#{type}"
29
29
  @metadata.name = "#{dataset_name}: #{type}"
30
30
  @metadata.url = self.class::BASE_URL
31
+ @metadata.licenses = licenses
31
32
  @type = type
32
33
 
33
34
  case type
@@ -45,18 +46,17 @@ module Datasets
45
46
  label_path = cache_dir_path + target_file(:label)
46
47
  base_url = self.class::BASE_URL
47
48
 
48
- unless image_path.exist?
49
- download(image_path, base_url + target_file(:image))
50
- end
51
-
52
- unless label_path.exist?
53
- download(label_path, base_url + target_file(:label))
54
- end
49
+ download(image_path, base_url + target_file(:image))
50
+ download(label_path, base_url + target_file(:label))
55
51
 
56
52
  open_data(image_path, label_path, &block)
57
53
  end
58
54
 
59
55
  private
56
+ def licenses
57
+ []
58
+ end
59
+
60
60
  def open_data(image_path, label_path, &block)
61
61
  labels = parse_labels(label_path)
62
62
 
@@ -35,6 +35,7 @@ module Datasets
35
35
  @metadata.id = "mushroom"
36
36
  @metadata.name = "Mushroom"
37
37
  @metadata.url = "https://archive.ics.uci.edu/ml/datasets/mushroom"
38
+ @metadata.licenses = ["CC-BY-4.0"]
38
39
  @metadata.description = lambda do
39
40
  read_names
40
41
  end
@@ -58,10 +59,8 @@ module Datasets
58
59
  private
59
60
  def open_data
60
61
  data_path = cache_dir_path + "agaricus-lepiota.data"
61
- unless data_path.exist?
62
- data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
63
- download(data_path, data_url)
64
- end
62
+ data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
63
+ download(data_path, data_url)
65
64
  CSV.open(data_path) do |csv|
66
65
  yield(csv)
67
66
  end
@@ -69,10 +68,8 @@ module Datasets
69
68
 
70
69
  def read_names
71
70
  names_path = cache_dir_path + "agaricus-lepiota.names"
72
- unless names_path.exist?
73
- names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases//mushroom/agaricus-lepiota.names"
74
- download(names_path, names_url)
75
- end
71
+ names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases//mushroom/agaricus-lepiota.names"
72
+ download(names_path, names_url)
76
73
  names_path.read
77
74
  end
78
75
 
@@ -23,10 +23,10 @@ module Datasets
23
23
  def initialize
24
24
  super
25
25
  species = self.class.name.split("::").last.downcase
26
- @metadata.id = "palmerpenguins-raw-#{species}"
26
+ @metadata.id = "palmerpenguins-#{species}"
27
27
  @metadata.url = self.class::URL
28
- @metadata.licenses = ["CC0"]
29
- @data_path = cache_dir_path + "penguins" + (species + ".csv")
28
+ @metadata.licenses = ["CC0-1.0"]
29
+ @data_path = cache_dir_path + "#{species}.csv"
30
30
  end
31
31
 
32
32
  attr_reader :data_path
@@ -44,15 +44,11 @@ module Datasets
44
44
  end
45
45
 
46
46
  private def open_data
47
- download unless data_path.exist?
47
+ download(data_path, metadata.url)
48
48
  CSV.open(data_path, headers: :first_row, converters: :all) do |csv|
49
49
  yield csv
50
50
  end
51
51
  end
52
-
53
- private def download
54
- super(data_path, metadata.url)
55
- end
56
52
  end
57
53
 
58
54
  # Adelie penguin data from: https://doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86
@@ -36,10 +36,8 @@ module Datasets
36
36
 
37
37
  base_name = "ptb.#{@type}.txt"
38
38
  data_path = cache_dir_path + base_name
39
- unless data_path.exist?
40
- base_url = "https://raw.githubusercontent.com/wojzaremba/lstm/master/data"
41
- download(data_path, "#{base_url}/#{base_name}")
42
- end
39
+ base_url = "https://raw.githubusercontent.com/wojzaremba/lstm/master/data"
40
+ download(data_path, "#{base_url}/#{base_name}")
43
41
 
44
42
  parse_data(data_path, &block)
45
43
  end