red-datasets 0.1.4 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (69) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +23 -2
  3. data/doc/text/news.md +86 -0
  4. data/lib/datasets/adult.rb +6 -9
  5. data/lib/datasets/afinn.rb +48 -0
  6. data/lib/datasets/aozora-bunko.rb +196 -0
  7. data/lib/datasets/cache-path.rb +28 -0
  8. data/lib/datasets/california-housing.rb +60 -0
  9. data/lib/datasets/cifar.rb +2 -4
  10. data/lib/datasets/cldr-plurals.rb +2 -4
  11. data/lib/datasets/communities.rb +5 -8
  12. data/lib/datasets/dataset.rb +8 -12
  13. data/lib/datasets/diamonds.rb +26 -0
  14. data/lib/datasets/downloader.rb +6 -1
  15. data/lib/datasets/e-stat-japan.rb +2 -1
  16. data/lib/datasets/fashion-mnist.rb +4 -0
  17. data/lib/datasets/fuel-economy.rb +35 -0
  18. data/lib/datasets/geolonia.rb +67 -0
  19. data/lib/datasets/ggplot2-dataset.rb +79 -0
  20. data/lib/datasets/hepatitis.rb +5 -8
  21. data/lib/datasets/iris.rb +5 -8
  22. data/lib/datasets/ita-corpus.rb +57 -0
  23. data/lib/datasets/kuzushiji-mnist.rb +16 -0
  24. data/lib/datasets/libsvm-dataset-list.rb +5 -8
  25. data/lib/datasets/libsvm.rb +3 -4
  26. data/lib/datasets/license.rb +26 -0
  27. data/lib/datasets/livedoor-news.rb +80 -0
  28. data/lib/datasets/metadata.rb +14 -0
  29. data/lib/datasets/mnist.rb +7 -7
  30. data/lib/datasets/mushroom.rb +5 -8
  31. data/lib/datasets/penguins.rb +4 -8
  32. data/lib/datasets/penn-treebank.rb +2 -4
  33. data/lib/datasets/pmjt-dataset-list.rb +67 -0
  34. data/lib/datasets/postal-code-japan.rb +2 -6
  35. data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
  36. data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
  37. data/lib/datasets/seaborn.rb +90 -0
  38. data/lib/datasets/sudachi-synonym-dictionary.rb +5 -11
  39. data/lib/datasets/version.rb +1 -1
  40. data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
  41. data/lib/datasets/wikipedia.rb +4 -5
  42. data/lib/datasets/wine.rb +6 -9
  43. data/lib/datasets/zip-extractor.rb +36 -0
  44. data/lib/datasets.rb +14 -2
  45. data/red-datasets.gemspec +1 -1
  46. data/test/helper.rb +21 -0
  47. data/test/test-afinn.rb +60 -0
  48. data/test/test-aozora-bunko.rb +190 -0
  49. data/test/test-california-housing.rb +56 -0
  50. data/test/test-cldr-plurals.rb +1 -1
  51. data/test/test-dataset.rb +15 -7
  52. data/test/test-diamonds.rb +71 -0
  53. data/test/test-fuel-economy.rb +75 -0
  54. data/test/test-geolonia.rb +64 -0
  55. data/test/test-ita-corpus.rb +69 -0
  56. data/test/test-kuzushiji-mnist.rb +137 -0
  57. data/test/test-license.rb +24 -0
  58. data/test/test-livedoor-news.rb +351 -0
  59. data/test/test-metadata.rb +36 -0
  60. data/test/test-penguins.rb +1 -1
  61. data/test/test-pmjt-dataset-list.rb +50 -0
  62. data/test/test-quora-duplicate-question-pair.rb +33 -0
  63. data/test/test-rdataset.rb +246 -0
  64. data/test/{test-seaborn-data.rb → test-seaborn.rb} +70 -4
  65. data/test/test-sudachi-synonym-dictionary.rb +5 -5
  66. data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
  67. metadata +58 -14
  68. data/lib/datasets/seaborn-data.rb +0 -49
  69. data/test/test-rdatasets.rb +0 -136
@@ -0,0 +1,35 @@
1
require_relative "ggplot2-dataset"

module Datasets
  # Fuel economy data for popular car models (1999 and 2008),
  # published as the "mpg" dataset bundled with the ggplot2 R package.
  class FuelEconomy < Ggplot2Dataset
    Record = Struct.new(:manufacturer,
                        :model,
                        :displacement,
                        :year,
                        :n_cylinders,
                        :transmission,
                        :drive_train,
                        :city_mpg,
                        :highway_mpg,
                        :fuel,
                        :type)

    def initialize
      super("mpg")
      @metadata.id = "fuel-economy"
      @metadata.name = "Fuel economy"
      @metadata.licenses = ["CC0-1.0"]
    end

    # Maps upstream ggplot2 column names to the Record member names above;
    # used by Ggplot2Dataset#parse_roxygen when building the description.
    COLUMN_NAME_MAPPING = {
      "displ" => "displacement",
      "cyl" => "n_cylinders",
      # Fix: the Record member is :transmission (singular); the previous
      # value "transmissions" did not match any Record attribute, so the
      # generated description referenced a non-existent column name.
      "trans" => "transmission",
      "drv" => "drive_train",
      "cty" => "city_mpg",
      "hwy" => "highway_mpg",
      "fl" => "fuel",
      "class" => "type",
    }
  end
end
@@ -0,0 +1,67 @@
1
require 'csv'

require_relative 'dataset'

module Datasets
  # Japanese address data published by Geolonia
  # (https://github.com/geolonia/japanese-addresses).
  class Geolonia < Dataset
    Record = Struct.new(:prefecture_code,
                        :prefecture_name,
                        :prefecture_kana,
                        :prefecture_romaji,
                        :municipality_code,
                        :municipality_name,
                        :municipality_kana,
                        :municipality_romaji,
                        :street_name,
                        :street_kana,
                        :street_romaji,
                        :alias,
                        :latitude,
                        :longitude)

    def initialize
      super
      @metadata.id = 'geolonia'
      @metadata.name = 'Geolonia'
      @metadata.url = 'https://github.com/geolonia/japanese-addresses'
      @metadata.licenses = ["CC-BY-4.0"]
      # Description is fetched lazily: the README is only downloaded
      # when the description is actually requested.
      @metadata.description = lambda do
        fetch_readme
      end
    end

    # Yields one Record per address row; returns an Enumerator when
    # no block is given.
    def each
      return to_enum(__method__) unless block_given?

      open_data do |csv|
        csv.readline # drop the header row
        csv.each do |row|
          yield(Record.new(*row))
        end
      end
    end

    private
    def download_base_url
      "https://raw.githubusercontent.com/geolonia/japanese-addresses/master"
    end

    # Downloads (if not yet cached) and opens the latest CSV dump.
    def open_data
      path = cache_dir_path + 'latest.csv'
      download(path, "#{download_base_url}/data/latest.csv")
      CSV.open(path) do |csv|
        yield(csv)
      end
    end

    # Returns the upstream README up to (but excluding) its "## API" section.
    def fetch_readme
      base_name = "README.md"
      path = cache_dir_path + base_name
      download(path, "#{download_base_url}/#{base_name}")
      path.read.split(/^## API/, 2)[0].strip
    end
  end
end
@@ -0,0 +1,79 @@
1
module Datasets
  # Abstract base class for datasets bundled with the ggplot2 R package.
  # Subclasses must define two constants:
  #   * Record - a Struct whose members match the CSV columns in order
  #   * COLUMN_NAME_MAPPING - Hash mapping upstream column names to
  #     Record member names (used only when rendering the description)
  class Ggplot2Dataset < Dataset
    # @param ggplot2_dataset_name [String] the ggplot2 dataset name,
    #   e.g. "mpg" or "diamonds"; used to build URLs and file names.
    def initialize(ggplot2_dataset_name)
      super()
      @ggplot2_dataset_name = ggplot2_dataset_name
      @metadata.url =
        "https://ggplot2.tidyverse.org/reference/#{@ggplot2_dataset_name}.html"
      # Lazy: the description is only downloaded and parsed on demand.
      @metadata.description = lambda do
        fetch_description
      end
    end

    # Yields one subclass Record per CSV row; returns an Enumerator
    # when no block is given. Downloads the raw CSV on first use.
    def each
      return to_enum(__method__) unless block_given?

      data_base_name = "#{@ggplot2_dataset_name}.csv"
      data_path = cache_dir_path + data_base_name
      data_url = "#{download_base_url}/data-raw/#{data_base_name}"
      download(data_path, data_url)
      # :all converters turn numeric-looking fields into Integer/Float.
      CSV.open(data_path, headers: :first_row, converters: :all) do |csv|
        # Resolved via self.class so each subclass supplies its own Record.
        record_class = self.class::Record
        csv.each do |row|
          record = record_class.new(*row.fields)
          yield record
        end
      end
    end

    private
    def download_base_url
      "https://raw.githubusercontent.com/tidyverse/ggplot2/main"
    end

    # Extracts this dataset's documentation from ggplot2's R/data.R.
    # data.R contains roxygen comment runs ("#'" lines) each followed by
    # a bare quoted dataset name; we accumulate each comment run and file
    # it under the name that terminates it.
    def fetch_description
      data_r_base_name = "data.R"
      data_r_path = cache_dir_path + data_r_base_name
      data_r_url = "#{download_base_url}/R/#{data_r_base_name}"
      download(data_r_path, data_r_url)
      descriptions = {}
      comment = ""
      File.open(data_r_path) do |data_r|
        data_r.each_line do |line|
          case line.chomp
          when /\A#'/
            # Text after the "#'" marker; strip the single leading space
            # that separates the marker from the comment text.
            comment_content = Regexp.last_match.post_match
            unless comment_content.empty?
              comment_content = comment_content[1..-1]
            end
            comment << comment_content
            comment << "\n"
          when /\A"(.+)"\z/
            # A bare quoted name closes the preceding roxygen run.
            name = Regexp.last_match[1]
            descriptions[name] = parse_roxygen(comment.rstrip)
            comment = ""
          end
        end
        descriptions[@ggplot2_dataset_name]
      end
    end

    # Converts a roxygen comment block into plain text:
    # unwraps \url{...}, drops the @format tag, and rewrites each
    # \describe{\item{col}{text}...} into "* col: text" bullet lines,
    # renaming columns via the subclass's COLUMN_NAME_MAPPING.
    def parse_roxygen(roxygen)
      column_name_mapping = self.class::COLUMN_NAME_MAPPING
      roxygen
        .gsub(/\\url\{(.*?)\}/, "\\1")
        .gsub(/^@format /, "")
        .gsub(/\\describe\{(.*)\}/m) do
          content = $1
          content.gsub(/\\item\{(.*?)\}\{(.*?)\}/m) do
            column_name = $1
            description = $2
            column_name = column_name_mapping[column_name] || column_name
            # Unescape TeX-style "\$" in the item text.
            description = description
              .gsub(/\\\$/, "$")
            "* #{column_name}: #{description}"
          end
        end
    end
  end
end
@@ -163,6 +163,7 @@ module Datasets
163
163
  @metadata.id = "hepatitis"
164
164
  @metadata.name = "Hepatitis"
165
165
  @metadata.url = "https://archive.ics.uci.edu/ml/datasets/hepatitis"
166
+ @metadata.licenses = ["CC-BY-4.0"]
166
167
  @metadata.description = lambda do
167
168
  read_names
168
169
  end
@@ -186,10 +187,8 @@ module Datasets
186
187
 
187
188
  def open_data
188
189
  data_path = cache_dir_path + "hepatitis.csv"
189
- unless data_path.exist?
190
- data_url = "#{base_url}/hepatitis.data"
191
- download(data_path, data_url)
192
- end
190
+ data_url = "#{base_url}/hepatitis.data"
191
+ download(data_path, data_url)
193
192
  CSV.open(data_path) do |csv|
194
193
  yield(csv)
195
194
  end
@@ -197,10 +196,8 @@ module Datasets
197
196
 
198
197
  def read_names
199
198
  names_path = cache_dir_path + "hepatitis.names"
200
- unless names_path.exist?
201
- names_url = "#{base_url}/hepatitis.names"
202
- download(names_path, names_url)
203
- end
199
+ names_url = "#{base_url}/hepatitis.names"
200
+ download(names_path, names_url)
204
201
  names_path.read
205
202
  end
206
203
  end
data/lib/datasets/iris.rb CHANGED
@@ -15,6 +15,7 @@ module Datasets
15
15
  @metadata.id = "iris"
16
16
  @metadata.name = "Iris"
17
17
  @metadata.url = "https://archive.ics.uci.edu/ml/datasets/Iris"
18
+ @metadata.licenses = ["CC-BY-4.0"]
18
19
  @metadata.description = lambda do
19
20
  read_names
20
21
  end
@@ -35,10 +36,8 @@ module Datasets
35
36
  private
36
37
  def open_data
37
38
  data_path = cache_dir_path + "iris.csv"
38
- unless data_path.exist?
39
- data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
40
- download(data_path, data_url)
41
- end
39
+ data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
40
+ download(data_path, data_url)
42
41
  CSV.open(data_path, converters: [:numeric]) do |csv|
43
42
  yield(csv)
44
43
  end
@@ -46,10 +45,8 @@ module Datasets
46
45
 
47
46
  def read_names
48
47
  names_path = cache_dir_path + "iris.names"
49
- unless names_path.exist?
50
- names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.names"
51
- download(names_path, names_url)
52
- end
48
+ names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.names"
49
+ download(names_path, names_url)
53
50
  names_path.read
54
51
  end
55
52
  end
@@ -0,0 +1,57 @@
1
require_relative 'dataset'

module Datasets
  # ITA corpus: Japanese sentences for speech synthesis, hosted at
  # https://github.com/mmorise/ita-corpus. Two transcript sets are
  # available: :emotion and :recitation.
  class ITACorpus < Dataset
    Record = Struct.new(:id,
                        :sentence)

    # @param type [Symbol] :emotion or :recitation
    def initialize(type: :emotion)
      unless [:emotion, :recitation].include?(type)
        raise ArgumentError, "Please set type :emotion or :recitation: #{type.inspect}"
      end

      super()
      @type = type
      @metadata.id = 'ita-corpus'
      @metadata.name = 'ITA-corpus'
      @metadata.url = 'https://github.com/mmorise/ita-corpus'
      @metadata.licenses = ['Unlicense']
      @metadata.description = lambda do
        fetch_readme
      end
    end

    # Yields a Record per transcript line; returns an Enumerator when
    # no block is given. Downloads the transcript file on first use.
    def each(&block)
      return to_enum(__method__) unless block_given?

      transcript_name = "#{@type}_transcript_utf8.txt"
      transcript_path = cache_dir_path + transcript_name
      download(transcript_path, "#{download_base_url}/#{transcript_name}")

      parse_data(transcript_path, &block)
    end

    private
    # Returns the upstream README up to (but excluding) the
    # "## ファイル構成" (file layout) section.
    def fetch_readme
      readme_base_name = "README.md"
      readme_path = cache_dir_path + readme_base_name
      download(readme_path, "#{download_base_url}/#{readme_base_name}")
      readme_path.read.split(/^## ファイル構成/, 2)[0].strip
    end

    def download_base_url
      "https://raw.githubusercontent.com/mmorise/ita-corpus/main"
    end

    # Each transcript line is "ID:sentence"; split only on the first colon
    # so sentences containing colons stay intact.
    def parse_data(data_path)
      File.foreach(data_path, chomp: true) do |line|
        id, sentence = line.split(':', 2)
        yield(Record.new(id, sentence))
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
require_relative 'mnist'

module Datasets
  # Kuzushiji-MNIST: a drop-in replacement for MNIST consisting of
  # cursive Japanese (kuzushiji) characters, distributed by CODH.
  # All download/parsing behavior is inherited from MNIST; this class
  # only overrides the source URL, display name, and license.
  class KuzushijiMNIST < MNIST
    BASE_URL = "http://codh.rois.ac.jp/kmnist/dataset/kmnist/"

    private
    # Used by MNIST#initialize to build metadata id/name.
    def dataset_name
      "Kuzushiji-MNIST"
    end

    # Used by MNIST#initialize to set metadata licenses.
    def licenses
      ["CC-BY-SA-4.0"]
    end
  end
end
@@ -28,6 +28,7 @@ module Datasets
28
28
  @metadata.id = "libsvm-dataset-list"
29
29
  @metadata.name = "LIBSVM dataset list"
30
30
  @metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
31
+ @metadata.licenses = ["BSD-3-Clause"]
31
32
  @metadata.description = lambda do
32
33
  extract_description
33
34
  end
@@ -51,10 +52,8 @@ module Datasets
51
52
  private
52
53
  def open_data
53
54
  data_path = cache_dir_path + "index.html"
54
- unless data_path.exist?
55
- download(data_path, @metadata.url)
56
- end
57
- ::File.open(data_path) do |input|
55
+ download(data_path, @metadata.url)
56
+ data_path.open do |input|
58
57
  yield(input)
59
58
  end
60
59
  end
@@ -78,10 +77,8 @@ module Datasets
78
77
 
79
78
  def open_detail(detail)
80
79
  data_path = cache_dir_path + detail
81
- unless data_path.exist?
82
- download(data_path, @metadata.url + detail)
83
- end
84
- ::File.open(data_path) do |input|
80
+ download(data_path, @metadata.url + detail)
81
+ data_path.open do |input|
85
82
  yield(input)
86
83
  end
87
84
  end
@@ -41,6 +41,7 @@ module Datasets
41
41
  @metadata.id = "libsvm-#{normalize_name(name)}"
42
42
  @metadata.name = "LIBSVM dataset: #{name}"
43
43
  @metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
44
+ @metadata.licenses = ["BSD-3-Clause"]
44
45
  end
45
46
 
46
47
  def each
@@ -99,13 +100,11 @@ module Datasets
99
100
 
100
101
  def open_data(&block)
101
102
  data_path = cache_dir_path + @file.name
102
- unless data_path.exist?
103
- download(data_path, @file.url)
104
- end
103
+ download(data_path, @file.url)
105
104
  if data_path.extname == ".bz2"
106
105
  extract_bz2(data_path, &block)
107
106
  else
108
- File.open(data_path, &block)
107
+ data_path.open(&block)
109
108
  end
110
109
  end
111
110
 
@@ -0,0 +1,26 @@
1
module Datasets
  # A dataset license. `spdx_id` holds an SPDX license identifier;
  # `name` and `url` are optional human-readable details.
  class License < Struct.new(:spdx_id,
                             :name,
                             :url)
    class << self
      # Coerces `value` into a License when possible.
      #
      # Accepts a License (returned as-is), a String (treated as an
      # SPDX ID), or a Hash with :spdx_id, :name and :url keys.
      # Returns nil for anything else.
      def try_convert(value)
        case value
        when self
          value
        when String
          new(value)
        when Hash
          new(value[:spdx_id], value[:name], value[:url])
        end
      end
    end
  end
end
@@ -0,0 +1,80 @@
1
# Time.iso8601 is defined by the "time" standard library, not Ruby core;
# require it explicitly instead of relying on another file to load it.
require "time"

require_relative "dataset"
require_relative "tar-gz-readable"

module Datasets
  # The livedoor news corpus: Japanese news articles grouped into nine
  # categories, distributed by RONDHUIT
  # (https://www.rondhuit.com/download.html#ldcc).
  class LivedoorNews < Dataset
    include TarGzReadable
    # url: source article URL; timestamp: publication time (Time);
    # sentence: the remainder of the article file (title and body).
    Record = Struct.new(:url,
                        :timestamp,
                        :sentence)

    # @param type [Symbol] the news category to iterate over; must be
    #   one of the nine symbols listed in `news_list` below.
    def initialize(type: :topic_news)
      news_list = [
        :topic_news,
        :sports_watch,
        :it_life_hack,
        :kaden_channel,
        :movie_enter,
        :dokujo_tsushin,
        :smax,
        :livedoor_homme,
        :peachy
      ]
      unless news_list.include?(type)
        valid_type_labels = news_list.collect(&:inspect).join(", ")
        message = ":type must be one of [#{valid_type_labels}]: #{type.inspect}"
        raise ArgumentError, message
      end

      super()
      @type = type
      @metadata.id = 'livedoor-news'
      @metadata.name = 'livedoor-news'
      @metadata.url = 'https://www.rondhuit.com/download.html#ldcc'
      @metadata.licenses = ['CC-BY-ND-2.1-JP']
      @metadata.description = lambda do
        fetch_readme
      end
    end

    # Yields a Record per article in the selected category; returns an
    # Enumerator when no block is given.
    def each(&block)
      return to_enum(__method__) unless block_given?

      data_path = download_tar_gz
      parse_data(data_path, &block)
    end

    private
    # Downloads the corpus archive into the cache directory and returns
    # its local path (download is a no-op when already cached).
    def download_tar_gz
      data_path = cache_dir_path + "livedoor-news.tar.gz"
      data_url = "https://www.rondhuit.com/download/ldcc-20140209.tar.gz"
      download(data_path, data_url)
      data_path
    end

    # Reads text/README.txt out of the archive for the lazy description.
    def fetch_readme
      data_path = download_tar_gz
      target_file_name = 'text/README.txt'
      open_tar_gz(data_path) do |tar|
        tar.seek(target_file_name) do |entry|
          return entry.read.force_encoding("UTF-8")
        end
      end
    end

    # Walks the archive and yields a Record for every article file in the
    # selected category's directory. Each article file starts with a URL
    # line and an ISO 8601 timestamp line, followed by the article text;
    # per-category LICENSE.txt files are skipped.
    def parse_data(data_path, &block)
      # Category symbols use underscores; archive directories use hyphens.
      target_directory_name = "text/#{@type.to_s.gsub(/_/, '-')}"
      open_tar_gz(data_path) do |tar|
        tar.each do |entry|
          next unless entry.file?
          directory_name, base_name = File.split(entry.full_name)
          next unless directory_name == target_directory_name
          next if base_name == "LICENSE.txt"
          url, timestamp, sentence = entry.read.force_encoding("UTF-8").split("\n", 3)
          record = Record.new(url, Time.iso8601(timestamp), sentence)
          yield(record)
        end
      end
    end
  end
end
@@ -1,9 +1,23 @@
1
+ require_relative "license"
2
+
1
3
  module Datasets
2
4
  class Metadata < Struct.new(:id,
3
5
  :name,
4
6
  :url,
5
7
  :licenses,
6
8
  :description)
9
# Normalizes the assigned value to an array of License objects.
# Accepts a single license or an array of them; each element may be a
# License, a String (SPDX ID), or a Hash with :spdx_id/:name/:url keys.
# Raises ArgumentError when an element cannot be converted.
# NOTE: a non-Array value is wrapped as [value] on purpose — Array()
# would mishandle nil and Hash inputs.
def licenses=(licenses)
  licenses = [licenses] unless licenses.is_a?(Array)
  converted_licenses = licenses.collect do |license|
    converted = License.try_convert(license)
    if converted.nil?
      raise ArgumentError.new("invalid license: #{license.inspect}")
    end
    converted
  end
  super(converted_licenses)
end
20
+
7
21
  def description
8
22
  description_raw = super
9
23
  if description_raw.respond_to?(:call)
@@ -28,6 +28,7 @@ module Datasets
28
28
  @metadata.id = "#{dataset_name.downcase}-#{type}"
29
29
  @metadata.name = "#{dataset_name}: #{type}"
30
30
  @metadata.url = self.class::BASE_URL
31
+ @metadata.licenses = licenses
31
32
  @type = type
32
33
 
33
34
  case type
@@ -45,18 +46,17 @@ module Datasets
45
46
  label_path = cache_dir_path + target_file(:label)
46
47
  base_url = self.class::BASE_URL
47
48
 
48
- unless image_path.exist?
49
- download(image_path, base_url + target_file(:image))
50
- end
51
-
52
- unless label_path.exist?
53
- download(label_path, base_url + target_file(:label))
54
- end
49
+ download(image_path, base_url + target_file(:image))
50
+ download(label_path, base_url + target_file(:label))
55
51
 
56
52
  open_data(image_path, label_path, &block)
57
53
  end
58
54
 
59
55
  private
56
+ def licenses
57
+ []
58
+ end
59
+
60
60
  def open_data(image_path, label_path, &block)
61
61
  labels = parse_labels(label_path)
62
62
 
@@ -35,6 +35,7 @@ module Datasets
35
35
  @metadata.id = "mushroom"
36
36
  @metadata.name = "Mushroom"
37
37
  @metadata.url = "https://archive.ics.uci.edu/ml/datasets/mushroom"
38
+ @metadata.licenses = ["CC-BY-4.0"]
38
39
  @metadata.description = lambda do
39
40
  read_names
40
41
  end
@@ -58,10 +59,8 @@ module Datasets
58
59
  private
59
60
  def open_data
60
61
  data_path = cache_dir_path + "agaricus-lepiota.data"
61
- unless data_path.exist?
62
- data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
63
- download(data_path, data_url)
64
- end
62
+ data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
63
+ download(data_path, data_url)
65
64
  CSV.open(data_path) do |csv|
66
65
  yield(csv)
67
66
  end
@@ -69,10 +68,8 @@ module Datasets
69
68
 
70
69
  def read_names
71
70
  names_path = cache_dir_path + "agaricus-lepiota.names"
72
- unless names_path.exist?
73
- names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases//mushroom/agaricus-lepiota.names"
74
- download(names_path, names_url)
75
- end
71
+ names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases//mushroom/agaricus-lepiota.names"
72
+ download(names_path, names_url)
76
73
  names_path.read
77
74
  end
78
75
 
@@ -23,10 +23,10 @@ module Datasets
23
23
  def initialize
24
24
  super
25
25
  species = self.class.name.split("::").last.downcase
26
- @metadata.id = "palmerpenguins-raw-#{species}"
26
+ @metadata.id = "palmerpenguins-#{species}"
27
27
  @metadata.url = self.class::URL
28
- @metadata.licenses = ["CC0"]
29
- @data_path = cache_dir_path + "penguins" + (species + ".csv")
28
+ @metadata.licenses = ["CC0-1.0"]
29
+ @data_path = cache_dir_path + "#{species}.csv"
30
30
  end
31
31
 
32
32
  attr_reader :data_path
@@ -44,15 +44,11 @@ module Datasets
44
44
  end
45
45
 
46
46
  private def open_data
47
- download unless data_path.exist?
47
+ download(data_path, metadata.url)
48
48
  CSV.open(data_path, headers: :first_row, converters: :all) do |csv|
49
49
  yield csv
50
50
  end
51
51
  end
52
-
53
- private def download
54
- super(data_path, metadata.url)
55
- end
56
52
  end
57
53
 
58
54
  # Adelie penguin data from: https://doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86
@@ -36,10 +36,8 @@ module Datasets
36
36
 
37
37
  base_name = "ptb.#{@type}.txt"
38
38
  data_path = cache_dir_path + base_name
39
- unless data_path.exist?
40
- base_url = "https://raw.githubusercontent.com/wojzaremba/lstm/master/data"
41
- download(data_path, "#{base_url}/#{base_name}")
42
- end
39
+ base_url = "https://raw.githubusercontent.com/wojzaremba/lstm/master/data"
40
+ download(data_path, "#{base_url}/#{base_name}")
43
41
 
44
42
  parse_data(data_path, &block)
45
43
  end