red-datasets 0.1.4 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +23 -2
- data/doc/text/news.md +86 -0
- data/lib/datasets/adult.rb +6 -9
- data/lib/datasets/afinn.rb +48 -0
- data/lib/datasets/aozora-bunko.rb +196 -0
- data/lib/datasets/cache-path.rb +28 -0
- data/lib/datasets/california-housing.rb +60 -0
- data/lib/datasets/cifar.rb +2 -4
- data/lib/datasets/cldr-plurals.rb +2 -4
- data/lib/datasets/communities.rb +5 -8
- data/lib/datasets/dataset.rb +8 -12
- data/lib/datasets/diamonds.rb +26 -0
- data/lib/datasets/downloader.rb +6 -1
- data/lib/datasets/e-stat-japan.rb +2 -1
- data/lib/datasets/fashion-mnist.rb +4 -0
- data/lib/datasets/fuel-economy.rb +35 -0
- data/lib/datasets/geolonia.rb +67 -0
- data/lib/datasets/ggplot2-dataset.rb +79 -0
- data/lib/datasets/hepatitis.rb +5 -8
- data/lib/datasets/iris.rb +5 -8
- data/lib/datasets/ita-corpus.rb +57 -0
- data/lib/datasets/kuzushiji-mnist.rb +16 -0
- data/lib/datasets/libsvm-dataset-list.rb +5 -8
- data/lib/datasets/libsvm.rb +3 -4
- data/lib/datasets/license.rb +26 -0
- data/lib/datasets/livedoor-news.rb +80 -0
- data/lib/datasets/metadata.rb +14 -0
- data/lib/datasets/mnist.rb +7 -7
- data/lib/datasets/mushroom.rb +5 -8
- data/lib/datasets/penguins.rb +4 -8
- data/lib/datasets/penn-treebank.rb +2 -4
- data/lib/datasets/pmjt-dataset-list.rb +67 -0
- data/lib/datasets/postal-code-japan.rb +2 -6
- data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
- data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
- data/lib/datasets/seaborn.rb +90 -0
- data/lib/datasets/sudachi-synonym-dictionary.rb +5 -11
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
- data/lib/datasets/wikipedia.rb +4 -5
- data/lib/datasets/wine.rb +6 -9
- data/lib/datasets/zip-extractor.rb +36 -0
- data/lib/datasets.rb +14 -2
- data/red-datasets.gemspec +1 -1
- data/test/helper.rb +21 -0
- data/test/test-afinn.rb +60 -0
- data/test/test-aozora-bunko.rb +190 -0
- data/test/test-california-housing.rb +56 -0
- data/test/test-cldr-plurals.rb +1 -1
- data/test/test-dataset.rb +15 -7
- data/test/test-diamonds.rb +71 -0
- data/test/test-fuel-economy.rb +75 -0
- data/test/test-geolonia.rb +64 -0
- data/test/test-ita-corpus.rb +69 -0
- data/test/test-kuzushiji-mnist.rb +137 -0
- data/test/test-license.rb +24 -0
- data/test/test-livedoor-news.rb +351 -0
- data/test/test-metadata.rb +36 -0
- data/test/test-penguins.rb +1 -1
- data/test/test-pmjt-dataset-list.rb +50 -0
- data/test/test-quora-duplicate-question-pair.rb +33 -0
- data/test/test-rdataset.rb +246 -0
- data/test/{test-seaborn-data.rb → test-seaborn.rb} +70 -4
- data/test/test-sudachi-synonym-dictionary.rb +5 -5
- data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
- metadata +58 -14
- data/lib/datasets/seaborn-data.rb +0 -49
- data/test/test-rdatasets.rb +0 -136
@@ -0,0 +1,35 @@
|
|
1
|
+
require_relative "ggplot2-dataset"
|
2
|
+
|
3
|
+
module Datasets
  # Fuel economy data, published by ggplot2 under the dataset name "mpg".
  #
  # Downloading, CSV parsing and description extraction are inherited from
  # Ggplot2Dataset; this class only supplies the record layout, the metadata
  # and the column-name translation table.
  class FuelEconomy < Ggplot2Dataset
    # One row of the dataset. Ggplot2Dataset#each fills records positionally
    # from the CSV fields, so the member order must follow the CSV columns.
    Record = Struct.new(:manufacturer,
                        :model,
                        :displacement,
                        :year,
                        :n_cylinders,
                        :transmission,
                        :drive_train,
                        :city_mpg,
                        :highway_mpg,
                        :fuel,
                        :type)

    def initialize
      super("mpg")
      @metadata.id = "fuel-economy"
      @metadata.name = "Fuel economy"
      @metadata.licenses = ["CC0-1.0"]
    end

    # Translates ggplot2's abbreviated column names into the Record member
    # names above. Used by Ggplot2Dataset#parse_roxygen when it rewrites the
    # upstream column documentation, so every value has to name a real
    # Record member ("trans" previously mapped to the non-existent
    # "transmissions").
    COLUMN_NAME_MAPPING = {
      "displ" => "displacement",
      "cyl" => "n_cylinders",
      "trans" => "transmission",
      "drv" => "drive_train",
      "cty" => "city_mpg",
      "hwy" => "highway_mpg",
      "fl" => "fuel",
      "class" => "type",
    }
  end
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
3
|
+
require_relative 'dataset'
|
4
|
+
|
5
|
+
module Datasets
  # Japanese address data from the geolonia/japanese-addresses repository.
  #
  # Each record is one row of data/latest.csv. No CSV converters are
  # configured, so field values are handed over as the CSV parser returns
  # them.
  class Geolonia < Dataset
    Record = Struct.new(
      :prefecture_code,
      :prefecture_name,
      :prefecture_kana,
      :prefecture_romaji,
      :municipality_code,
      :municipality_name,
      :municipality_kana,
      :municipality_romaji,
      :street_name,
      :street_kana,
      :street_romaji,
      :alias,
      :latitude,
      :longitude
    )

    def initialize
      super
      @metadata.id = 'geolonia'
      @metadata.name = 'Geolonia'
      @metadata.url = 'https://github.com/geolonia/japanese-addresses'
      @metadata.licenses = ["CC-BY-4.0"]
      # Stored as a callable so the README is only fetched when the
      # description is actually evaluated.
      @metadata.description = -> { fetch_readme }
    end

    # Yields one Record per CSV data row; returns an Enumerator when called
    # without a block.
    def each
      return to_enum(__method__) unless block_given?

      open_data do |csv|
        # The first line is read and discarded before iterating the rows.
        csv.readline
        csv.each do |fields|
          yield(Record.new(*fields))
        end
      end
    end

    private
    # Root of the raw files in the upstream repository.
    def download_base_url
      "https://raw.githubusercontent.com/geolonia/japanese-addresses/master"
    end

    # Passes data/latest.csv through #download and yields an opened CSV
    # reader for the cached file.
    def open_data
      csv_path = cache_dir_path + 'latest.csv'
      csv_url = "#{download_base_url}/data/latest.csv"
      download(csv_path, csv_url)
      CSV.open(csv_path) do |csv|
        yield(csv)
      end
    end

    # Returns the upstream README text that precedes the "## API" heading,
    # with surrounding whitespace removed.
    def fetch_readme
      base_name = "README.md"
      local_path = cache_dir_path + base_name
      remote_url = "#{download_base_url}/#{base_name}"
      download(local_path, remote_url)
      introduction = local_path.read.split(/^## API/, 2)[0]
      introduction.strip
    end
  end
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
# CSV is used by #each below; require it here instead of relying on some
# other dataset file having loaded it first.
require "csv"

module Datasets
  # Base class for datasets that are distributed with the ggplot2 R package.
  #
  # A subclass passes the ggplot2 dataset name to #initialize and must define
  # two constants, both looked up through self.class:
  #
  # * Record: a Struct filled positionally from the CSV fields.
  # * COLUMN_NAME_MAPPING: a Hash from ggplot2 column names to the names
  #   that should be shown in the description.
  class Ggplot2Dataset < Dataset
    # @param ggplot2_dataset_name [String] name used to build the reference
    #   URL, the CSV file name and the description lookup key.
    def initialize(ggplot2_dataset_name)
      super()
      @ggplot2_dataset_name = ggplot2_dataset_name
      @metadata.url =
        "https://ggplot2.tidyverse.org/reference/#{@ggplot2_dataset_name}.html"
      # Stored as a callable so data.R is only fetched when the description
      # is actually evaluated.
      @metadata.description = lambda do
        fetch_description
      end
    end

    # Yields one Record per CSV row (the first row is treated as headers and
    # all built-in CSV converters are applied). Returns an Enumerator when
    # called without a block.
    def each
      return to_enum(__method__) unless block_given?

      data_base_name = "#{@ggplot2_dataset_name}.csv"
      data_path = cache_dir_path + data_base_name
      data_url = "#{download_base_url}/data-raw/#{data_base_name}"
      download(data_path, data_url)
      record_class = self.class::Record
      CSV.open(data_path, headers: :first_row, converters: :all) do |csv|
        csv.each do |row|
          yield record_class.new(*row.fields)
        end
      end
    end

    private
    # Root of the raw files in the ggplot2 repository.
    def download_base_url
      "https://raw.githubusercontent.com/tidyverse/ggplot2/main"
    end

    # Extracts this dataset's description from ggplot2's R/data.R.
    #
    # Lines starting with "#'" are accumulated as a comment; a line that
    # consists solely of a double-quoted name closes the accumulated comment
    # and stores it under that name.
    #
    # @return [String, nil] the converted description, or nil when data.R has
    #   no entry for this dataset name.
    def fetch_description
      data_r_base_name = "data.R"
      data_r_path = cache_dir_path + data_r_base_name
      data_r_url = "#{download_base_url}/R/#{data_r_base_name}"
      download(data_r_path, data_r_url)
      descriptions = {}
      comment = ""
      File.open(data_r_path) do |data_r|
        data_r.each_line do |line|
          case line.chomp
          when /\A#'/
            comment_content = Regexp.last_match.post_match
            # Drop the single character that follows "#'" (the separator
            # before the text); bare "#'" lines become empty lines.
            comment_content = comment_content[1..-1] unless comment_content.empty?
            comment << comment_content << "\n"
          when /\A"(.+)"\z/
            name = Regexp.last_match(1)
            descriptions[name] = parse_roxygen(comment.rstrip)
            comment = ""
          end
        end
      end
      descriptions[@ggplot2_dataset_name]
    end

    # Converts a roxygen comment into plain text:
    #
    # * \url{...} is replaced by its content,
    # * a leading "@format " tag is removed,
    # * \describe{...} is replaced by a "* name: description" list built
    #   from its \item{name}{description} entries, with column names
    #   translated through COLUMN_NAME_MAPPING and "\$" unescaped to "$".
    def parse_roxygen(roxygen)
      column_name_mapping = self.class::COLUMN_NAME_MAPPING
      roxygen
        .gsub(/\\url\{(.*?)\}/, "\\1")
        .gsub(/^@format /, "")
        .gsub(/\\describe\{(.*)\}/m) do
        content = Regexp.last_match(1)
        content.gsub(/\\item\{(.*?)\}\{(.*?)\}/m) do
          column_name = Regexp.last_match(1)
          description = Regexp.last_match(2)
          column_name = column_name_mapping[column_name] || column_name
          description = description.gsub(/\\\$/, "$")
          "* #{column_name}: #{description}"
        end
      end
    end
  end
end
|
data/lib/datasets/hepatitis.rb
CHANGED
@@ -163,6 +163,7 @@ module Datasets
|
|
163
163
|
@metadata.id = "hepatitis"
|
164
164
|
@metadata.name = "Hepatitis"
|
165
165
|
@metadata.url = "https://archive.ics.uci.edu/ml/datasets/hepatitis"
|
166
|
+
@metadata.licenses = ["CC-BY-4.0"]
|
166
167
|
@metadata.description = lambda do
|
167
168
|
read_names
|
168
169
|
end
|
@@ -186,10 +187,8 @@ module Datasets
|
|
186
187
|
|
187
188
|
def open_data
|
188
189
|
data_path = cache_dir_path + "hepatitis.csv"
|
189
|
-
|
190
|
-
|
191
|
-
download(data_path, data_url)
|
192
|
-
end
|
190
|
+
data_url = "#{base_url}/hepatitis.data"
|
191
|
+
download(data_path, data_url)
|
193
192
|
CSV.open(data_path) do |csv|
|
194
193
|
yield(csv)
|
195
194
|
end
|
@@ -197,10 +196,8 @@ module Datasets
|
|
197
196
|
|
198
197
|
def read_names
|
199
198
|
names_path = cache_dir_path + "hepatitis.names"
|
200
|
-
|
201
|
-
|
202
|
-
download(names_path, names_url)
|
203
|
-
end
|
199
|
+
names_url = "#{base_url}/hepatitis.names"
|
200
|
+
download(names_path, names_url)
|
204
201
|
names_path.read
|
205
202
|
end
|
206
203
|
end
|
data/lib/datasets/iris.rb
CHANGED
@@ -15,6 +15,7 @@ module Datasets
|
|
15
15
|
@metadata.id = "iris"
|
16
16
|
@metadata.name = "Iris"
|
17
17
|
@metadata.url = "https://archive.ics.uci.edu/ml/datasets/Iris"
|
18
|
+
@metadata.licenses = ["CC-BY-4.0"]
|
18
19
|
@metadata.description = lambda do
|
19
20
|
read_names
|
20
21
|
end
|
@@ -35,10 +36,8 @@ module Datasets
|
|
35
36
|
private
|
36
37
|
def open_data
|
37
38
|
data_path = cache_dir_path + "iris.csv"
|
38
|
-
|
39
|
-
|
40
|
-
download(data_path, data_url)
|
41
|
-
end
|
39
|
+
data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
|
40
|
+
download(data_path, data_url)
|
42
41
|
CSV.open(data_path, converters: [:numeric]) do |csv|
|
43
42
|
yield(csv)
|
44
43
|
end
|
@@ -46,10 +45,8 @@ module Datasets
|
|
46
45
|
|
47
46
|
def read_names
|
48
47
|
names_path = cache_dir_path + "iris.names"
|
49
|
-
|
50
|
-
|
51
|
-
download(names_path, names_url)
|
52
|
-
end
|
48
|
+
names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.names"
|
49
|
+
download(names_path, names_url)
|
53
50
|
names_path.read
|
54
51
|
end
|
55
52
|
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
require_relative 'dataset'
|
2
|
+
|
3
|
+
module Datasets
  # Sentences from the mmorise/ita-corpus repository.
  #
  # The corpus is split into two transcripts, selected with the +type+
  # keyword: :emotion (default) or :recitation.
  class ITACorpus < Dataset
    Record = Struct.new(:id, :sentence)

    # @param type [Symbol] :emotion or :recitation
    # @raise [ArgumentError] for any other value
    def initialize(type: :emotion)
      case type
      when :emotion, :recitation
        # supported transcript
      else
        raise ArgumentError, "Please set type :emotion or :recitation: #{type.inspect}"
      end

      super()
      @type = type
      @metadata.id = 'ita-corpus'
      @metadata.name = 'ITA-corpus'
      @metadata.url = 'https://github.com/mmorise/ita-corpus'
      @metadata.licenses = ['Unlicense']
      # Stored as a callable so the README is only fetched when the
      # description is actually evaluated.
      @metadata.description = -> { fetch_readme }
    end

    # Yields one Record per transcript line; returns an Enumerator when
    # called without a block.
    def each(&block)
      return to_enum(__method__) unless block_given?

      transcript_name = "#{@type}_transcript_utf8.txt"
      transcript_path = cache_dir_path + transcript_name
      transcript_url = "#{download_base_url}/#{transcript_name}"
      download(transcript_path, transcript_url)

      parse_data(transcript_path, &block)
    end

    private
    # Returns the upstream README text that precedes the file-layout
    # heading, with surrounding whitespace removed.
    def fetch_readme
      base_name = "README.md"
      local_path = cache_dir_path + base_name
      remote_url = "#{download_base_url}/#{base_name}"
      download(local_path, remote_url)
      overview = local_path.read.split(/^## ファイル構成/, 2)[0]
      overview.strip
    end

    # Root of the raw files in the upstream repository.
    def download_base_url
      "https://raw.githubusercontent.com/mmorise/ita-corpus/main"
    end

    # Splits every line at its first ':' into id and sentence and yields the
    # resulting Record.
    def parse_data(data_path)
      File.open(data_path) do |input|
        input.each_line(chomp: true) do |line|
          id, sentence = line.split(':', 2)
          yield(Record.new(id, sentence))
        end
      end
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require_relative 'mnist'
|
2
|
+
|
3
|
+
module Datasets
  # Kuzushiji-MNIST dataset.
  #
  # Reuses the MNIST implementation unchanged and overrides only the
  # download location, the display name and the license list.
  class KuzushijiMNIST < MNIST
    # Read by MNIST through self.class::BASE_URL.
    BASE_URL = "http://codh.rois.ac.jp/kmnist/dataset/kmnist/"

    # Name that MNIST interpolates into the metadata id and name.
    private def dataset_name
      "Kuzushiji-MNIST"
    end

    # SPDX identifiers assigned to the metadata by MNIST#initialize.
    private def licenses
      ["CC-BY-SA-4.0"]
    end
  end
end
|
@@ -28,6 +28,7 @@ module Datasets
|
|
28
28
|
@metadata.id = "libsvm-dataset-list"
|
29
29
|
@metadata.name = "LIBSVM dataset list"
|
30
30
|
@metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
|
31
|
+
@metadata.licenses = ["BSD-3-Clause"]
|
31
32
|
@metadata.description = lambda do
|
32
33
|
extract_description
|
33
34
|
end
|
@@ -51,10 +52,8 @@ module Datasets
|
|
51
52
|
private
|
52
53
|
def open_data
|
53
54
|
data_path = cache_dir_path + "index.html"
|
54
|
-
|
55
|
-
|
56
|
-
end
|
57
|
-
::File.open(data_path) do |input|
|
55
|
+
download(data_path, @metadata.url)
|
56
|
+
data_path.open do |input|
|
58
57
|
yield(input)
|
59
58
|
end
|
60
59
|
end
|
@@ -78,10 +77,8 @@ module Datasets
|
|
78
77
|
|
79
78
|
def open_detail(detail)
|
80
79
|
data_path = cache_dir_path + detail
|
81
|
-
|
82
|
-
|
83
|
-
end
|
84
|
-
::File.open(data_path) do |input|
|
80
|
+
download(data_path, @metadata.url + detail)
|
81
|
+
data_path.open do |input|
|
85
82
|
yield(input)
|
86
83
|
end
|
87
84
|
end
|
data/lib/datasets/libsvm.rb
CHANGED
@@ -41,6 +41,7 @@ module Datasets
|
|
41
41
|
@metadata.id = "libsvm-#{normalize_name(name)}"
|
42
42
|
@metadata.name = "LIBSVM dataset: #{name}"
|
43
43
|
@metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
|
44
|
+
@metadata.licenses = ["BSD-3-Clause"]
|
44
45
|
end
|
45
46
|
|
46
47
|
def each
|
@@ -99,13 +100,11 @@ module Datasets
|
|
99
100
|
|
100
101
|
def open_data(&block)
|
101
102
|
data_path = cache_dir_path + @file.name
|
102
|
-
|
103
|
-
download(data_path, @file.url)
|
104
|
-
end
|
103
|
+
download(data_path, @file.url)
|
105
104
|
if data_path.extname == ".bz2"
|
106
105
|
extract_bz2(data_path, &block)
|
107
106
|
else
|
108
|
-
|
107
|
+
data_path.open(&block)
|
109
108
|
end
|
110
109
|
end
|
111
110
|
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module Datasets
  # License information attached to a dataset's metadata.
  #
  # Members:
  # * spdx_id: SPDX license identifier such as "CC-BY-4.0".
  # * name: license name.
  # * url: location of the license text.
  class License < Struct.new(:spdx_id,
                             :name,
                             :url)
    class << self
      # Converts +value+ into a License when possible.
      #
      # * A License (or subclass instance) is returned as is.
      # * A String becomes a License whose spdx_id is that string.
      # * A Hash becomes a License filled from its :spdx_id, :name and :url
      #   entries. String keys ("spdx_id", ...) are accepted as a fallback so
      #   hashes loaded from YAML/JSON are not silently turned into an empty
      #   License.
      #
      # @return [License, nil] nil when +value+ is of any other type.
      def try_convert(value)
        case value
        when self
          value
        when String
          new(value)
        when Hash
          new(*members.map { |member| hash_entry(value, member) })
        end
      end

      private
      # Looks +member+ up by symbol first (so existing symbol-keyed hashes,
      # including ones with a default value, behave exactly as before) and
      # falls back to the string form of the key only when that yields nil.
      def hash_entry(hash, member)
        entry = hash[member]
        entry.nil? ? hash[member.to_s] : entry
      end
    end
  end
end
|
@@ -0,0 +1,80 @@
|
|
1
|
+
require "time"

require_relative "dataset"
require_relative "tar-gz-readable"
|
3
|
+
|
4
|
+
module Datasets
  # Livedoor news corpus (ldcc-20140209.tar.gz from rondhuit.com).
  #
  # The archive holds one directory per news site under "text/"; the +type+
  # keyword selects which directory is read.
  #
  # Timestamps are parsed with Time.iso8601, which is provided by the "time"
  # standard library and therefore has to be required by this file.
  class LivedoorNews < Dataset
    include TarGzReadable
    Record = Struct.new(:url,
                        :timestamp,
                        :sentence)

    # @param type [Symbol] news site to read; one of the symbols listed in
    #   +news_list+ below.
    # @raise [ArgumentError] when +type+ is not a known news site.
    def initialize(type: :topic_news)
      news_list = [
        :topic_news,
        :sports_watch,
        :it_life_hack,
        :kaden_channel,
        :movie_enter,
        :dokujo_tsushin,
        :smax,
        :livedoor_homme,
        :peachy
      ]
      unless news_list.include?(type)
        valid_type_labels = news_list.map(&:inspect).join(", ")
        message = ":type must be one of [#{valid_type_labels}]: #{type.inspect}"
        raise ArgumentError, message
      end

      super()
      @type = type
      @metadata.id = 'livedoor-news'
      @metadata.name = 'livedoor-news'
      @metadata.url = 'https://www.rondhuit.com/download.html#ldcc'
      @metadata.licenses = ['CC-BY-ND-2.1-JP']
      # Stored as a callable so the archive is only opened for the README
      # when the description is actually evaluated.
      @metadata.description = lambda do
        fetch_readme
      end
    end

    # Yields one Record per article of the selected news site; returns an
    # Enumerator when called without a block.
    def each(&block)
      return to_enum(__method__) unless block_given?

      data_path = download_tar_gz
      parse_data(data_path, &block)
    end

    private
    # Passes the corpus archive through #download and returns the cache path.
    def download_tar_gz
      data_path = cache_dir_path + "livedoor-news.tar.gz"
      data_url = "https://www.rondhuit.com/download/ldcc-20140209.tar.gz"
      download(data_path, data_url)
      data_path
    end

    # Returns the content of text/README.txt from the archive as UTF-8.
    # NOTE(review): open_tar_gz is presumably supplied by TarGzReadable; when
    # the README entry is missing the result is whatever open_tar_gz returns.
    def fetch_readme
      data_path = download_tar_gz
      target_file_name = 'text/README.txt'
      open_tar_gz(data_path) do |tar|
        tar.seek(target_file_name) do |entry|
          return entry.read.force_encoding("UTF-8")
        end
      end
    end

    # Walks the archive and yields a Record for every regular file directly
    # inside the directory of the selected news site, except LICENSE.txt.
    #
    # Each article file is split into at most three parts at its first two
    # newlines: URL, ISO 8601 timestamp, and the remaining text.
    def parse_data(data_path, &block)
      # Directory names use hyphens where the type symbols use underscores.
      target_directory_name = "text/#{@type.to_s.tr('_', '-')}"
      open_tar_gz(data_path) do |tar|
        tar.each do |entry|
          next unless entry.file?
          directory_name, base_name = File.split(entry.full_name)
          next unless directory_name == target_directory_name
          next if base_name == "LICENSE.txt"
          url, timestamp, sentence = entry.read.force_encoding("UTF-8").split("\n", 3)
          record = Record.new(url, Time.iso8601(timestamp), sentence)
          yield(record)
        end
      end
    end
  end
end
|
data/lib/datasets/metadata.rb
CHANGED
@@ -1,9 +1,23 @@
|
|
1
|
+
require_relative "license"
|
2
|
+
|
1
3
|
module Datasets
|
2
4
|
class Metadata < Struct.new(:id,
|
3
5
|
:name,
|
4
6
|
:url,
|
5
7
|
:licenses,
|
6
8
|
:description)
|
9
|
+
# Normalizes the assigned value to an Array of License objects.
#
# A value that is not an Array is wrapped in a one-element Array first;
# every element is then converted with License.try_convert.
#
# Raises ArgumentError when an element cannot be converted.
def licenses=(licenses)
  candidates = licenses.is_a?(Array) ? licenses : [licenses]
  converted = candidates.map do |candidate|
    license = License.try_convert(candidate)
    raise ArgumentError.new("invalid license: #{candidate.inspect}") if license.nil?
    license
  end
  super(converted)
end
|
20
|
+
|
7
21
|
def description
|
8
22
|
description_raw = super
|
9
23
|
if description_raw.respond_to?(:call)
|
data/lib/datasets/mnist.rb
CHANGED
@@ -28,6 +28,7 @@ module Datasets
|
|
28
28
|
@metadata.id = "#{dataset_name.downcase}-#{type}"
|
29
29
|
@metadata.name = "#{dataset_name}: #{type}"
|
30
30
|
@metadata.url = self.class::BASE_URL
|
31
|
+
@metadata.licenses = licenses
|
31
32
|
@type = type
|
32
33
|
|
33
34
|
case type
|
@@ -45,18 +46,17 @@ module Datasets
|
|
45
46
|
label_path = cache_dir_path + target_file(:label)
|
46
47
|
base_url = self.class::BASE_URL
|
47
48
|
|
48
|
-
|
49
|
-
|
50
|
-
end
|
51
|
-
|
52
|
-
unless label_path.exist?
|
53
|
-
download(label_path, base_url + target_file(:label))
|
54
|
-
end
|
49
|
+
download(image_path, base_url + target_file(:image))
|
50
|
+
download(label_path, base_url + target_file(:label))
|
55
51
|
|
56
52
|
open_data(image_path, label_path, &block)
|
57
53
|
end
|
58
54
|
|
59
55
|
private
|
56
|
+
def licenses
|
57
|
+
[]
|
58
|
+
end
|
59
|
+
|
60
60
|
def open_data(image_path, label_path, &block)
|
61
61
|
labels = parse_labels(label_path)
|
62
62
|
|
data/lib/datasets/mushroom.rb
CHANGED
@@ -35,6 +35,7 @@ module Datasets
|
|
35
35
|
@metadata.id = "mushroom"
|
36
36
|
@metadata.name = "Mushroom"
|
37
37
|
@metadata.url = "https://archive.ics.uci.edu/ml/datasets/mushroom"
|
38
|
+
@metadata.licenses = ["CC-BY-4.0"]
|
38
39
|
@metadata.description = lambda do
|
39
40
|
read_names
|
40
41
|
end
|
@@ -58,10 +59,8 @@ module Datasets
|
|
58
59
|
private
|
59
60
|
def open_data
|
60
61
|
data_path = cache_dir_path + "agaricus-lepiota.data"
|
61
|
-
|
62
|
-
|
63
|
-
download(data_path, data_url)
|
64
|
-
end
|
62
|
+
data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
|
63
|
+
download(data_path, data_url)
|
65
64
|
CSV.open(data_path) do |csv|
|
66
65
|
yield(csv)
|
67
66
|
end
|
@@ -69,10 +68,8 @@ module Datasets
|
|
69
68
|
|
70
69
|
# Passes the UCI "agaricus-lepiota.names" file through #download and
# returns the content of the cached copy.
def read_names
  names_path = cache_dir_path + "agaricus-lepiota.names"
  # The path previously contained an accidental double slash
  # ("machine-learning-databases//mushroom").
  names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.names"
  download(names_path, names_url)
  names_path.read
end
|
78
75
|
|
data/lib/datasets/penguins.rb
CHANGED
@@ -23,10 +23,10 @@ module Datasets
|
|
23
23
|
def initialize
|
24
24
|
super
|
25
25
|
species = self.class.name.split("::").last.downcase
|
26
|
-
@metadata.id = "palmerpenguins
|
26
|
+
@metadata.id = "palmerpenguins-#{species}"
|
27
27
|
@metadata.url = self.class::URL
|
28
|
-
@metadata.licenses = ["CC0"]
|
29
|
-
@data_path = cache_dir_path + "
|
28
|
+
@metadata.licenses = ["CC0-1.0"]
|
29
|
+
@data_path = cache_dir_path + "#{species}.csv"
|
30
30
|
end
|
31
31
|
|
32
32
|
attr_reader :data_path
|
@@ -44,15 +44,11 @@ module Datasets
|
|
44
44
|
end
|
45
45
|
|
46
46
|
private def open_data
|
47
|
-
download
|
47
|
+
download(data_path, metadata.url)
|
48
48
|
CSV.open(data_path, headers: :first_row, converters: :all) do |csv|
|
49
49
|
yield csv
|
50
50
|
end
|
51
51
|
end
|
52
|
-
|
53
|
-
private def download
|
54
|
-
super(data_path, metadata.url)
|
55
|
-
end
|
56
52
|
end
|
57
53
|
|
58
54
|
# Adelie penguin data from: https://doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86
|
@@ -36,10 +36,8 @@ module Datasets
|
|
36
36
|
|
37
37
|
base_name = "ptb.#{@type}.txt"
|
38
38
|
data_path = cache_dir_path + base_name
|
39
|
-
|
40
|
-
|
41
|
-
download(data_path, "#{base_url}/#{base_name}")
|
42
|
-
end
|
39
|
+
base_url = "https://raw.githubusercontent.com/wojzaremba/lstm/master/data"
|
40
|
+
download(data_path, "#{base_url}/#{base_name}")
|
43
41
|
|
44
42
|
parse_data(data_path, &block)
|
45
43
|
end
|