red-datasets 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +23 -2
- data/doc/text/news.md +86 -0
- data/lib/datasets/adult.rb +6 -9
- data/lib/datasets/afinn.rb +48 -0
- data/lib/datasets/aozora-bunko.rb +196 -0
- data/lib/datasets/cache-path.rb +28 -0
- data/lib/datasets/california-housing.rb +60 -0
- data/lib/datasets/cifar.rb +2 -4
- data/lib/datasets/cldr-plurals.rb +2 -4
- data/lib/datasets/communities.rb +5 -8
- data/lib/datasets/dataset.rb +8 -12
- data/lib/datasets/diamonds.rb +26 -0
- data/lib/datasets/downloader.rb +6 -1
- data/lib/datasets/e-stat-japan.rb +2 -1
- data/lib/datasets/fashion-mnist.rb +4 -0
- data/lib/datasets/fuel-economy.rb +35 -0
- data/lib/datasets/geolonia.rb +67 -0
- data/lib/datasets/ggplot2-dataset.rb +79 -0
- data/lib/datasets/hepatitis.rb +5 -8
- data/lib/datasets/iris.rb +5 -8
- data/lib/datasets/ita-corpus.rb +57 -0
- data/lib/datasets/kuzushiji-mnist.rb +16 -0
- data/lib/datasets/libsvm-dataset-list.rb +5 -8
- data/lib/datasets/libsvm.rb +3 -4
- data/lib/datasets/license.rb +26 -0
- data/lib/datasets/livedoor-news.rb +80 -0
- data/lib/datasets/metadata.rb +14 -0
- data/lib/datasets/mnist.rb +7 -7
- data/lib/datasets/mushroom.rb +5 -8
- data/lib/datasets/penguins.rb +4 -8
- data/lib/datasets/penn-treebank.rb +2 -4
- data/lib/datasets/pmjt-dataset-list.rb +67 -0
- data/lib/datasets/postal-code-japan.rb +2 -6
- data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
- data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
- data/lib/datasets/seaborn.rb +90 -0
- data/lib/datasets/sudachi-synonym-dictionary.rb +5 -11
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
- data/lib/datasets/wikipedia.rb +4 -5
- data/lib/datasets/wine.rb +6 -9
- data/lib/datasets/zip-extractor.rb +36 -0
- data/lib/datasets.rb +14 -2
- data/red-datasets.gemspec +1 -1
- data/test/helper.rb +21 -0
- data/test/test-afinn.rb +60 -0
- data/test/test-aozora-bunko.rb +190 -0
- data/test/test-california-housing.rb +56 -0
- data/test/test-cldr-plurals.rb +1 -1
- data/test/test-dataset.rb +15 -7
- data/test/test-diamonds.rb +71 -0
- data/test/test-fuel-economy.rb +75 -0
- data/test/test-geolonia.rb +64 -0
- data/test/test-ita-corpus.rb +69 -0
- data/test/test-kuzushiji-mnist.rb +137 -0
- data/test/test-license.rb +24 -0
- data/test/test-livedoor-news.rb +351 -0
- data/test/test-metadata.rb +36 -0
- data/test/test-penguins.rb +1 -1
- data/test/test-pmjt-dataset-list.rb +50 -0
- data/test/test-quora-duplicate-question-pair.rb +33 -0
- data/test/test-rdataset.rb +246 -0
- data/test/{test-seaborn-data.rb → test-seaborn.rb} +70 -4
- data/test/test-sudachi-synonym-dictionary.rb +5 -5
- data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
- metadata +58 -14
- data/lib/datasets/seaborn-data.rb +0 -49
- data/test/test-rdatasets.rb +0 -136
require_relative "ggplot2-dataset"

module Datasets
  # Fuel economy data (ggplot2's "mpg" dataset): EPA mileage figures
  # for popular car models.  The CSV itself is fetched and parsed by
  # the Ggplot2Dataset base class; this subclass only declares the
  # record layout and metadata.
  class FuelEconomy < Ggplot2Dataset
    Record = Struct.new(:manufacturer,
                        :model,
                        :displacement,
                        :year,
                        :n_cylinders,
                        :transmission,
                        :drive_train,
                        :city_mpg,
                        :highway_mpg,
                        :fuel,
                        :type)

    def initialize
      super("mpg")
      @metadata.id = "fuel-economy"
      @metadata.name = "Fuel economy"
      @metadata.licenses = ["CC0-1.0"]
    end

    # Maps ggplot2's terse column names to this dataset's Record
    # member names; used by Ggplot2Dataset when rendering the
    # roxygen-based description.
    COLUMN_NAME_MAPPING = {
      "displ" => "displacement",
      "cyl" => "n_cylinders",
      # Fixed: was "transmissions", which matches no Record member
      # (the accessor is :transmission).
      "trans" => "transmission",
      "drv" => "drive_train",
      "cty" => "city_mpg",
      "hwy" => "highway_mpg",
      "fl" => "fuel",
      "class" => "type",
    }
  end
end
|
require 'csv'

require_relative 'dataset'

module Datasets
  # Japanese address data published by the Geolonia
  # japanese-addresses project on GitHub.
  class Geolonia < Dataset
    Record = Struct.new(:prefecture_code,
                        :prefecture_name,
                        :prefecture_kana,
                        :prefecture_romaji,
                        :municipality_code,
                        :municipality_name,
                        :municipality_kana,
                        :municipality_romaji,
                        :street_name,
                        :street_kana,
                        :street_romaji,
                        :alias,
                        :latitude,
                        :longitude)

    def initialize
      super
      @metadata.id = 'geolonia'
      @metadata.name = 'Geolonia'
      @metadata.url = 'https://github.com/geolonia/japanese-addresses'
      @metadata.licenses = ["CC-BY-4.0"]
      @metadata.description = lambda do
        fetch_readme
      end
    end

    # Yields one Record per address row.  Returns an Enumerator when
    # no block is given.
    def each
      return to_enum(__method__) unless block_given?

      open_data do |rows|
        rows.readline # drop the header row
        rows.each { |row| yield(Record.new(*row)) }
      end
    end

    private
    def download_base_url
      "https://raw.githubusercontent.com/geolonia/japanese-addresses/master"
    end

    # Downloads (if needed) and opens the latest address CSV,
    # yielding the open CSV object.
    def open_data
      path = cache_dir_path + 'latest.csv'
      download(path, "#{download_base_url}/data/latest.csv")
      CSV.open(path) do |csv|
        yield(csv)
      end
    end

    # Fetches the upstream README and returns everything before its
    # "## API" section as the dataset description.
    def fetch_readme
      base_name = "README.md"
      path = cache_dir_path + base_name
      download(path, "#{download_base_url}/#{base_name}")
      path.read.split(/^## API/, 2)[0].strip
    end
  end
end
module Datasets
  # Base class for datasets bundled with R's ggplot2 package.
  # A subclass declares a Record struct and a COLUMN_NAME_MAPPING
  # constant, and passes the ggplot2 dataset name to +super+.
  class Ggplot2Dataset < Dataset
    def initialize(ggplot2_dataset_name)
      super()
      @ggplot2_dataset_name = ggplot2_dataset_name
      @metadata.url =
        "https://ggplot2.tidyverse.org/reference/#{@ggplot2_dataset_name}.html"
      @metadata.description = lambda do
        fetch_description
      end
    end

    # Yields one Record per CSV row.  Returns an Enumerator when no
    # block is given.
    def each
      return to_enum(__method__) unless block_given?

      base_name = "#{@ggplot2_dataset_name}.csv"
      csv_path = cache_dir_path + base_name
      download(csv_path, "#{download_base_url}/data-raw/#{base_name}")
      record_class = self.class::Record
      CSV.open(csv_path, headers: :first_row, converters: :all) do |csv|
        csv.each do |row|
          yield record_class.new(*row.fields)
        end
      end
    end

    private
    def download_base_url
      "https://raw.githubusercontent.com/tidyverse/ggplot2/main"
    end

    # Extracts this dataset's documentation from ggplot2's R/data.R:
    # accumulates each "#'" roxygen comment block and associates it
    # with the quoted dataset name line that follows it.
    def fetch_description
      base_name = "data.R"
      data_r_path = cache_dir_path + base_name
      download(data_r_path, "#{download_base_url}/R/#{base_name}")
      descriptions = {}
      buffer = ""
      File.open(data_r_path) do |data_r|
        data_r.each_line do |line|
          case line.chomp
          when /\A#'/
            content = Regexp.last_match.post_match
            # Drop the single space that separates "#'" from the text.
            content = content[1..-1] unless content.empty?
            buffer << content << "\n"
          when /\A"(.+)"\z/
            descriptions[Regexp.last_match[1]] = parse_roxygen(buffer.rstrip)
            buffer = ""
          end
        end
      end
      descriptions[@ggplot2_dataset_name]
    end

    # Converts a roxygen comment block into plain text, unwrapping
    # \url{} / \describe{} / \item{}{} markup and renaming columns
    # via the subclass's COLUMN_NAME_MAPPING.
    def parse_roxygen(roxygen)
      column_name_mapping = self.class::COLUMN_NAME_MAPPING
      roxygen
        .gsub(/\\url\{(.*?)\}/, "\\1")
        .gsub(/^@format /, "")
        .gsub(/\\describe\{(.*)\}/m) do
          content = $1
          content.gsub(/\\item\{(.*?)\}\{(.*?)\}/m) do
            column_name = $1
            description = $2
            column_name = column_name_mapping[column_name] || column_name
            "* #{column_name}: #{description.gsub(/\\\$/, "$")}"
          end
        end
    end
  end
end
data/lib/datasets/hepatitis.rb
CHANGED
@@ -163,6 +163,7 @@ module Datasets
|
|
163
163
|
@metadata.id = "hepatitis"
|
164
164
|
@metadata.name = "Hepatitis"
|
165
165
|
@metadata.url = "https://archive.ics.uci.edu/ml/datasets/hepatitis"
|
166
|
+
@metadata.licenses = ["CC-BY-4.0"]
|
166
167
|
@metadata.description = lambda do
|
167
168
|
read_names
|
168
169
|
end
|
@@ -186,10 +187,8 @@ module Datasets
|
|
186
187
|
|
187
188
|
def open_data
|
188
189
|
data_path = cache_dir_path + "hepatitis.csv"
|
189
|
-
|
190
|
-
|
191
|
-
download(data_path, data_url)
|
192
|
-
end
|
190
|
+
data_url = "#{base_url}/hepatitis.data"
|
191
|
+
download(data_path, data_url)
|
193
192
|
CSV.open(data_path) do |csv|
|
194
193
|
yield(csv)
|
195
194
|
end
|
@@ -197,10 +196,8 @@ module Datasets
|
|
197
196
|
|
198
197
|
def read_names
|
199
198
|
names_path = cache_dir_path + "hepatitis.names"
|
200
|
-
|
201
|
-
|
202
|
-
download(names_path, names_url)
|
203
|
-
end
|
199
|
+
names_url = "#{base_url}/hepatitis.names"
|
200
|
+
download(names_path, names_url)
|
204
201
|
names_path.read
|
205
202
|
end
|
206
203
|
end
|
data/lib/datasets/iris.rb
CHANGED
@@ -15,6 +15,7 @@ module Datasets
|
|
15
15
|
@metadata.id = "iris"
|
16
16
|
@metadata.name = "Iris"
|
17
17
|
@metadata.url = "https://archive.ics.uci.edu/ml/datasets/Iris"
|
18
|
+
@metadata.licenses = ["CC-BY-4.0"]
|
18
19
|
@metadata.description = lambda do
|
19
20
|
read_names
|
20
21
|
end
|
@@ -35,10 +36,8 @@ module Datasets
|
|
35
36
|
private
|
36
37
|
def open_data
|
37
38
|
data_path = cache_dir_path + "iris.csv"
|
38
|
-
|
39
|
-
|
40
|
-
download(data_path, data_url)
|
41
|
-
end
|
39
|
+
data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
|
40
|
+
download(data_path, data_url)
|
42
41
|
CSV.open(data_path, converters: [:numeric]) do |csv|
|
43
42
|
yield(csv)
|
44
43
|
end
|
@@ -46,10 +45,8 @@ module Datasets
|
|
46
45
|
|
47
46
|
def read_names
|
48
47
|
names_path = cache_dir_path + "iris.names"
|
49
|
-
|
50
|
-
|
51
|
-
download(names_path, names_url)
|
52
|
-
end
|
48
|
+
names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.names"
|
49
|
+
download(names_path, names_url)
|
53
50
|
names_path.read
|
54
51
|
end
|
55
52
|
end
|
require_relative 'dataset'

module Datasets
  # ITA corpus: a public-domain set of Japanese sentences for speech
  # recording, with an :emotion and a :recitation transcript.
  class ITACorpus < Dataset
    Record = Struct.new(:id,
                        :sentence)

    def initialize(type: :emotion)
      unless [:emotion, :recitation].include?(type)
        raise ArgumentError, "Please set type :emotion or :recitation: #{type.inspect}"
      end

      super()
      @type = type
      @metadata.id = 'ita-corpus'
      @metadata.name = 'ITA-corpus'
      @metadata.url = 'https://github.com/mmorise/ita-corpus'
      @metadata.licenses = ['Unlicense']
      @metadata.description = lambda do
        fetch_readme
      end
    end

    # Yields one Record per transcript line.  Returns an Enumerator
    # when no block is given.
    def each(&block)
      return to_enum(__method__) unless block_given?

      base_name = "#{@type}_transcript_utf8.txt"
      transcript_path = cache_dir_path + base_name
      download(transcript_path, "#{download_base_url}/#{base_name}")

      parse_data(transcript_path, &block)
    end

    private
    # Fetches the upstream README and returns everything before its
    # "## ファイル構成" section as the dataset description.
    def fetch_readme
      readme_base_name = "README.md"
      readme_path = cache_dir_path + readme_base_name
      download(readme_path, "#{download_base_url}/#{readme_base_name}")
      readme_path.read.split(/^## ファイル構成/, 2)[0].strip
    end

    def download_base_url
      "https://raw.githubusercontent.com/mmorise/ita-corpus/main"
    end

    # Each transcript line is "<id>:<sentence>"; split only on the
    # first colon so sentences may themselves contain colons.
    def parse_data(data_path)
      File.open(data_path) do |input|
        input.each_line(chomp: true) do |line|
          id, sentence = line.split(':', 2)
          yield(Record.new(id, sentence))
        end
      end
    end
  end
end
require_relative 'mnist'

module Datasets
  # Kuzushiji-MNIST: an MNIST-format dataset of cursive Japanese
  # (kuzushiji) characters.  All download/parse logic is inherited
  # from MNIST; only the base URL, display name, and license differ.
  class KuzushijiMNIST < MNIST
    BASE_URL = "http://codh.rois.ac.jp/kmnist/dataset/kmnist/"

    private
    # License hook consumed by MNIST#initialize.
    def licenses
      ["CC-BY-SA-4.0"]
    end

    # Display-name hook used for metadata id/name.
    def dataset_name
      "Kuzushiji-MNIST"
    end
  end
end
@@ -28,6 +28,7 @@ module Datasets
|
|
28
28
|
@metadata.id = "libsvm-dataset-list"
|
29
29
|
@metadata.name = "LIBSVM dataset list"
|
30
30
|
@metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
|
31
|
+
@metadata.licenses = ["BSD-3-Clause"]
|
31
32
|
@metadata.description = lambda do
|
32
33
|
extract_description
|
33
34
|
end
|
@@ -51,10 +52,8 @@ module Datasets
|
|
51
52
|
private
|
52
53
|
def open_data
|
53
54
|
data_path = cache_dir_path + "index.html"
|
54
|
-
|
55
|
-
|
56
|
-
end
|
57
|
-
::File.open(data_path) do |input|
|
55
|
+
download(data_path, @metadata.url)
|
56
|
+
data_path.open do |input|
|
58
57
|
yield(input)
|
59
58
|
end
|
60
59
|
end
|
@@ -78,10 +77,8 @@ module Datasets
|
|
78
77
|
|
79
78
|
def open_detail(detail)
|
80
79
|
data_path = cache_dir_path + detail
|
81
|
-
|
82
|
-
|
83
|
-
end
|
84
|
-
::File.open(data_path) do |input|
|
80
|
+
download(data_path, @metadata.url + detail)
|
81
|
+
data_path.open do |input|
|
85
82
|
yield(input)
|
86
83
|
end
|
87
84
|
end
|
data/lib/datasets/libsvm.rb
CHANGED
@@ -41,6 +41,7 @@ module Datasets
|
|
41
41
|
@metadata.id = "libsvm-#{normalize_name(name)}"
|
42
42
|
@metadata.name = "LIBSVM dataset: #{name}"
|
43
43
|
@metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
|
44
|
+
@metadata.licenses = ["BSD-3-Clause"]
|
44
45
|
end
|
45
46
|
|
46
47
|
def each
|
@@ -99,13 +100,11 @@ module Datasets
|
|
99
100
|
|
100
101
|
def open_data(&block)
|
101
102
|
data_path = cache_dir_path + @file.name
|
102
|
-
|
103
|
-
download(data_path, @file.url)
|
104
|
-
end
|
103
|
+
download(data_path, @file.url)
|
105
104
|
if data_path.extname == ".bz2"
|
106
105
|
extract_bz2(data_path, &block)
|
107
106
|
else
|
108
|
-
|
107
|
+
data_path.open(&block)
|
109
108
|
end
|
110
109
|
end
|
111
110
|
|
module Datasets
  # A dataset license identified by an SPDX ID, optionally with a
  # human-readable name and URL.
  class License < Struct.new(:spdx_id,
                             :name,
                             :url)
    class << self
      # Coerces +value+ into a License.
      #
      # Accepts a License (returned as-is), a String (treated as an
      # SPDX ID), or a Hash with :spdx_id/:name/:url keys.  Returns
      # nil for anything else, mirroring Ruby's try_convert protocol.
      def try_convert(value)
        case value
        when self
          value
        when String
          # Struct positional order: spdx_id first; name/url stay nil.
          new(value)
        when Hash
          new(value[:spdx_id], value[:name], value[:url])
        end
      end
    end
  end
end
require_relative "dataset"
require_relative "tar-gz-readable"

# Time.iso8601 is defined by the "time" stdlib extension, not core;
# require it explicitly instead of relying on another file having
# loaded it first.
require "time"

module Datasets
  # Livedoor news corpus: Japanese news articles grouped by outlet,
  # distributed as a tar.gz archive by RONDHUIT.
  class LivedoorNews < Dataset
    include TarGzReadable
    Record = Struct.new(:url,
                        :timestamp,
                        :sentence)

    # type: one of the nine outlet symbols listed in news_list below.
    def initialize(type: :topic_news)
      news_list = [
        :topic_news,
        :sports_watch,
        :it_life_hack,
        :kaden_channel,
        :movie_enter,
        :dokujo_tsushin,
        :smax,
        :livedoor_homme,
        :peachy
      ]
      unless news_list.include?(type)
        valid_type_labels = news_list.collect(&:inspect).join(", ")
        message = ":type must be one of [#{valid_type_labels}]: #{type.inspect}"
        raise ArgumentError, message
      end

      super()
      @type = type
      @metadata.id = 'livedoor-news'
      @metadata.name = 'livedoor-news'
      @metadata.url = 'https://www.rondhuit.com/download.html#ldcc'
      @metadata.licenses = ['CC-BY-ND-2.1-JP']
      @metadata.description = lambda do
        fetch_readme
      end
    end

    # Yields one Record per article in the selected outlet.  Returns
    # an Enumerator when no block is given.
    def each(&block)
      return to_enum(__method__) unless block_given?

      data_path = download_tar_gz
      parse_data(data_path, &block)
    end

    private
    # Downloads the archive into the cache (if absent) and returns
    # its local path.
    def download_tar_gz
      data_path = cache_dir_path + "livedoor-news.tar.gz"
      data_url = "https://www.rondhuit.com/download/ldcc-20140209.tar.gz"
      download(data_path, data_url)
      data_path
    end

    # Reads text/README.txt out of the archive as the description.
    def fetch_readme
      data_path = download_tar_gz
      target_file_name = 'text/README.txt'
      open_tar_gz(data_path) do |tar|
        tar.seek(target_file_name) do |entry|
          return entry.read.force_encoding("UTF-8")
        end
      end
    end

    # Each article file is "<url>\n<ISO8601 timestamp>\n<body>";
    # outlet directories use "-" where the type symbol uses "_".
    def parse_data(data_path, &block)
      target_directory_name = "text/#{@type.to_s.gsub(/_/, '-')}"
      open_tar_gz(data_path) do |tar|
        tar.each do |entry|
          next unless entry.file?
          directory_name, base_name = File.split(entry.full_name)
          next unless directory_name == target_directory_name
          next if base_name == "LICENSE.txt"
          url, timestamp, sentence = entry.read.force_encoding("UTF-8").split("\n", 3)
          record = Record.new(url, Time.iso8601(timestamp), sentence)
          yield(record)
        end
      end
    end
  end
end
data/lib/datasets/metadata.rb
CHANGED
@@ -1,9 +1,23 @@
|
|
1
|
+
require_relative "license"
|
2
|
+
|
1
3
|
module Datasets
|
2
4
|
class Metadata < Struct.new(:id,
|
3
5
|
:name,
|
4
6
|
:url,
|
5
7
|
:licenses,
|
6
8
|
:description)
|
9
|
+
def licenses=(licenses)
|
10
|
+
licenses = [licenses] unless licenses.is_a?(Array)
|
11
|
+
licenses = licenses.collect do |license|
|
12
|
+
l = License.try_convert(license)
|
13
|
+
if l.nil?
|
14
|
+
raise ArgumentError.new("invalid license: #{license.inspect}")
|
15
|
+
end
|
16
|
+
l
|
17
|
+
end
|
18
|
+
super(licenses)
|
19
|
+
end
|
20
|
+
|
7
21
|
def description
|
8
22
|
description_raw = super
|
9
23
|
if description_raw.respond_to?(:call)
|
data/lib/datasets/mnist.rb
CHANGED
@@ -28,6 +28,7 @@ module Datasets
|
|
28
28
|
@metadata.id = "#{dataset_name.downcase}-#{type}"
|
29
29
|
@metadata.name = "#{dataset_name}: #{type}"
|
30
30
|
@metadata.url = self.class::BASE_URL
|
31
|
+
@metadata.licenses = licenses
|
31
32
|
@type = type
|
32
33
|
|
33
34
|
case type
|
@@ -45,18 +46,17 @@ module Datasets
|
|
45
46
|
label_path = cache_dir_path + target_file(:label)
|
46
47
|
base_url = self.class::BASE_URL
|
47
48
|
|
48
|
-
|
49
|
-
|
50
|
-
end
|
51
|
-
|
52
|
-
unless label_path.exist?
|
53
|
-
download(label_path, base_url + target_file(:label))
|
54
|
-
end
|
49
|
+
download(image_path, base_url + target_file(:image))
|
50
|
+
download(label_path, base_url + target_file(:label))
|
55
51
|
|
56
52
|
open_data(image_path, label_path, &block)
|
57
53
|
end
|
58
54
|
|
59
55
|
private
|
56
|
+
def licenses
|
57
|
+
[]
|
58
|
+
end
|
59
|
+
|
60
60
|
def open_data(image_path, label_path, &block)
|
61
61
|
labels = parse_labels(label_path)
|
62
62
|
|
data/lib/datasets/mushroom.rb
CHANGED
@@ -35,6 +35,7 @@ module Datasets
|
|
35
35
|
@metadata.id = "mushroom"
|
36
36
|
@metadata.name = "Mushroom"
|
37
37
|
@metadata.url = "https://archive.ics.uci.edu/ml/datasets/mushroom"
|
38
|
+
@metadata.licenses = ["CC-BY-4.0"]
|
38
39
|
@metadata.description = lambda do
|
39
40
|
read_names
|
40
41
|
end
|
@@ -58,10 +59,8 @@ module Datasets
|
|
58
59
|
private
|
59
60
|
def open_data
|
60
61
|
data_path = cache_dir_path + "agaricus-lepiota.data"
|
61
|
-
|
62
|
-
|
63
|
-
download(data_path, data_url)
|
64
|
-
end
|
62
|
+
data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
|
63
|
+
download(data_path, data_url)
|
65
64
|
CSV.open(data_path) do |csv|
|
66
65
|
yield(csv)
|
67
66
|
end
|
@@ -69,10 +68,8 @@ module Datasets
|
|
69
68
|
|
70
69
|
def read_names
|
71
70
|
names_path = cache_dir_path + "agaricus-lepiota.names"
|
72
|
-
|
73
|
-
|
74
|
-
download(names_path, names_url)
|
75
|
-
end
|
71
|
+
names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases//mushroom/agaricus-lepiota.names"
|
72
|
+
download(names_path, names_url)
|
76
73
|
names_path.read
|
77
74
|
end
|
78
75
|
|
data/lib/datasets/penguins.rb
CHANGED
@@ -23,10 +23,10 @@ module Datasets
|
|
23
23
|
def initialize
|
24
24
|
super
|
25
25
|
species = self.class.name.split("::").last.downcase
|
26
|
-
@metadata.id = "palmerpenguins
|
26
|
+
@metadata.id = "palmerpenguins-#{species}"
|
27
27
|
@metadata.url = self.class::URL
|
28
|
-
@metadata.licenses = ["CC0"]
|
29
|
-
@data_path = cache_dir_path + "
|
28
|
+
@metadata.licenses = ["CC0-1.0"]
|
29
|
+
@data_path = cache_dir_path + "#{species}.csv"
|
30
30
|
end
|
31
31
|
|
32
32
|
attr_reader :data_path
|
@@ -44,15 +44,11 @@ module Datasets
|
|
44
44
|
end
|
45
45
|
|
46
46
|
private def open_data
|
47
|
-
download
|
47
|
+
download(data_path, metadata.url)
|
48
48
|
CSV.open(data_path, headers: :first_row, converters: :all) do |csv|
|
49
49
|
yield csv
|
50
50
|
end
|
51
51
|
end
|
52
|
-
|
53
|
-
private def download
|
54
|
-
super(data_path, metadata.url)
|
55
|
-
end
|
56
52
|
end
|
57
53
|
|
58
54
|
# Adelie penguin data from: https://doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86
|
@@ -36,10 +36,8 @@ module Datasets
|
|
36
36
|
|
37
37
|
base_name = "ptb.#{@type}.txt"
|
38
38
|
data_path = cache_dir_path + base_name
|
39
|
-
|
40
|
-
|
41
|
-
download(data_path, "#{base_url}/#{base_name}")
|
42
|
-
end
|
39
|
+
base_url = "https://raw.githubusercontent.com/wojzaremba/lstm/master/data"
|
40
|
+
download(data_path, "#{base_url}/#{base_name}")
|
43
41
|
|
44
42
|
parse_data(data_path, &block)
|
45
43
|
end
|