red-datasets 0.1.4 → 0.1.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +23 -3
- data/Rakefile +56 -1
- data/doc/text/news.md +102 -0
- data/lib/datasets/adult.rb +6 -9
- data/lib/datasets/afinn.rb +48 -0
- data/lib/datasets/aozora-bunko.rb +196 -0
- data/lib/datasets/cache-path.rb +28 -0
- data/lib/datasets/california-housing.rb +60 -0
- data/lib/datasets/cifar.rb +2 -4
- data/lib/datasets/cldr-plurals.rb +2 -4
- data/lib/datasets/communities.rb +5 -8
- data/lib/datasets/dataset.rb +58 -23
- data/lib/datasets/diamonds.rb +26 -0
- data/lib/datasets/downloader.rb +110 -30
- data/lib/datasets/e-stat-japan.rb +2 -1
- data/lib/datasets/fashion-mnist.rb +4 -0
- data/lib/datasets/fuel-economy.rb +35 -0
- data/lib/datasets/geolonia.rb +67 -0
- data/lib/datasets/ggplot2-dataset.rb +79 -0
- data/lib/datasets/hepatitis.rb +5 -8
- data/lib/datasets/iris.rb +5 -8
- data/lib/datasets/ita-corpus.rb +57 -0
- data/lib/datasets/kuzushiji-mnist.rb +16 -0
- data/lib/datasets/lazy.rb +90 -0
- data/lib/datasets/libsvm-dataset-list.rb +5 -8
- data/lib/datasets/libsvm.rb +3 -4
- data/lib/datasets/license.rb +26 -0
- data/lib/datasets/livedoor-news.rb +80 -0
- data/lib/datasets/metadata.rb +14 -0
- data/lib/datasets/mnist.rb +7 -7
- data/lib/datasets/mushroom.rb +5 -8
- data/lib/datasets/nagoya-university-conversation-corpus.rb +109 -0
- data/lib/datasets/penguins.rb +6 -8
- data/lib/datasets/penn-treebank.rb +2 -4
- data/lib/datasets/pmjt-dataset-list.rb +67 -0
- data/lib/datasets/postal-code-japan.rb +2 -6
- data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
- data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
- data/lib/datasets/seaborn.rb +90 -0
- data/lib/datasets/sudachi-synonym-dictionary.rb +5 -11
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
- data/lib/datasets/wikipedia.rb +16 -8
- data/lib/datasets/wine.rb +6 -9
- data/lib/datasets/zip-extractor.rb +48 -0
- data/lib/datasets.rb +2 -22
- data/red-datasets.gemspec +1 -1
- data/test/helper.rb +21 -0
- data/test/test-afinn.rb +60 -0
- data/test/test-aozora-bunko.rb +190 -0
- data/test/test-california-housing.rb +56 -0
- data/test/test-cldr-plurals.rb +1 -1
- data/test/test-dataset.rb +15 -7
- data/test/test-diamonds.rb +71 -0
- data/test/test-fuel-economy.rb +75 -0
- data/test/test-geolonia.rb +65 -0
- data/test/test-ita-corpus.rb +69 -0
- data/test/test-kuzushiji-mnist.rb +137 -0
- data/test/test-license.rb +24 -0
- data/test/test-livedoor-news.rb +351 -0
- data/test/test-metadata.rb +36 -0
- data/test/test-nagoya-university-conversation-corpus.rb +132 -0
- data/test/test-penguins.rb +1 -1
- data/test/test-pmjt-dataset-list.rb +50 -0
- data/test/test-quora-duplicate-question-pair.rb +33 -0
- data/test/test-rdataset.rb +246 -0
- data/test/{test-seaborn-data.rb → test-seaborn.rb} +71 -4
- data/test/test-sudachi-synonym-dictionary.rb +5 -5
- data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
- data/test/test-wikipedia.rb +25 -71
- metadata +62 -14
- data/lib/datasets/seaborn-data.rb +0 -49
- data/test/test-rdatasets.rb +0 -136
data/lib/datasets/dataset.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require "pathname"
|
2
2
|
|
3
|
+
require_relative "cache-path"
|
3
4
|
require_relative "downloader"
|
4
5
|
require_relative "error"
|
5
6
|
require_relative "metadata"
|
@@ -19,38 +20,72 @@ module Datasets
|
|
19
20
|
end
|
20
21
|
|
21
22
|
# Removes everything cached for this dataset.
#
# The work is delegated to the object returned by #cache_path.
# Returns whatever that object's #remove returns.
def clear_cache!
  cache_path.remove
end
|
26
25
|
|
27
26
|
private
|
27
|
+
|
28
28
|
# Directory in which this dataset keeps its downloaded files.
#
# Returns the #base_dir of the object returned by #cache_path.
def cache_dir_path
  cache_path.base_dir
end
|
39
31
|
|
40
|
-
def
|
32
|
+
# Returns the CachePath for this dataset, built from @metadata.id.
#
# The object is created on first use and reused afterwards.
def cache_path
  return @cache_path if @cache_path

  @cache_path = CachePath.new(@metadata.id)
end
|
35
|
+
|
36
|
+
# Downloads url to output_path with a Downloader.
#
# An optional block is forwarded to Downloader#download unchanged.
# Returns whatever Downloader#download returns.
def download(output_path, url, &block)
  Downloader.new(url).download(output_path, &block)
end
|
44
40
|
|
45
|
-
def extract_bz2(
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
41
|
+
# Decompresses bzip2 data with the external "bzcat" command and yields
# an IO from which the decompressed bytes can be read.
#
# bz2 - a Pathname or String naming a bzip2 file, or any other object
#       whose #each yields compressed chunks.
#
# Returns the value of the given block.
def extract_bz2(bz2)
  case bz2
  when Pathname, String
    # bzcat opens the file itself; we only read its standard output.
    IO.pipe do |reader, writer|
      bzcat_pid = spawn("bzcat", bz2.to_s, {out: writer})
      begin
        # Close our copy of the write end before handing out the reader.
        writer.close
        yield(reader)
      ensure
        reader.close
        Process.waitpid(bzcat_pid)
      end
    end
  else
    # Chunked input: a background thread feeds the chunks to bzcat's
    # standard input while the caller reads bzcat's standard output.
    IO.pipe do |compressed_reader, compressed_writer|
      IO.pipe do |plain_reader, plain_writer|
        stop_feeding = false
        feeder = Thread.new do
          begin
            bz2.each do |chunk|
              compressed_writer.write(chunk)
              compressed_writer.flush
              break if stop_feeding
            end
          rescue => error
            # Report the failure on standard error and let the thread end.
            message = "Failed to read bzcat input: " +
                      "#{error.class}: #{error.message}"
            $stderr.puts(message)
          ensure
            compressed_writer.close
          end
        end
        begin
          bzcat_pid = spawn("bzcat",
                            {in: compressed_reader, out: plain_writer})
          begin
            # bzcat now owns these ends; drop our copies.
            compressed_reader.close
            plain_writer.close
            yield(plain_reader)
          ensure
            plain_reader.close
            Process.waitpid(bzcat_pid)
          end
        ensure
          # Ask the feeder to stop after its current chunk, then wait.
          stop_feeding = true
          feeder.join
        end
      end
    end
  end
end
|
56
91
|
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require_relative "ggplot2-dataset"
|
2
|
+
|
3
|
+
module Datasets
  # The ggplot2 "diamonds" data set.
  class Diamonds < Ggplot2Dataset
    # One row of the data. Ggplot2Dataset#each fills the members
    # positionally from the fields of each CSV row.
    Record = Struct.new(
      :carat, :cut, :color, :clarity, :depth,
      :table, :price, :x, :y, :z
    )

    # ggplot2 column name => Record member name. Empty: no entry is
    # needed for this data set.
    COLUMN_NAME_MAPPING = {}

    def initialize
      super("diamonds")
      @metadata.id = "diamonds"
      @metadata.name = "Diamonds"
      @metadata.licenses = ["CC0-1.0"]
    end
  end
end
|
data/lib/datasets/downloader.rb
CHANGED
@@ -22,45 +22,115 @@ module Datasets
|
|
22
22
|
end
|
23
23
|
end
|
24
24
|
|
25
|
-
def download(output_path)
|
26
|
-
output_path.
|
25
|
+
# Downloads @url to output_path.
#
# When output_path already exists nothing is downloaded. Otherwise the
# body is written to "#{output_path}.partial" (resuming with a Range
# request when that file already exists) and moved to output_path once
# complete. The whole download runs inside #synchronize.
#
# When a block is given it receives the data chunk by chunk: from the
# existing file(s) first, then from the network as it arrives.
#
# Raises Net::ReadTimeout after 5 timed-out attempts and
# TooManyRedirects (with the original and the last URL in the message)
# when the redirect limit is hit.
def download(output_path, &block)
  if output_path.exist?
    yield_chunks(output_path, &block) if block_given?
    return
  end

  partial_output_path = Pathname.new("#{output_path}.partial")
  synchronize(output_path, partial_output_path) do
    # Another process may have finished the download while we were
    # waiting for the lock; don't fetch the file a second time.
    if output_path.exist?
      yield_chunks(output_path, &block) if block_given?
      next
    end

    output_path.parent.mkpath

    n_retries = 0
    n_max_retries = 5
    begin
      headers = {
        "Accept-Encoding" => "identity",
        "User-Agent" => "Red Datasets/#{VERSION}",
      }
      start = nil
      if partial_output_path.exist?
        start = partial_output_path.size
        headers["Range"] = "bytes=#{start}-"
      end

      start_http(@url, headers) do |response|
        if response.is_a?(Net::HTTPPartialContent)
          mode = "ab"
        else
          # The server ignored the Range request: start over.
          start = nil
          mode = "wb"
        end

        base_name = @url.path.split("/").last
        size_current = 0
        # nil when the response has no Content-Length header.
        size_max = response.content_length
        if start
          size_current += start
          size_max += start if size_max
          # Replay the already downloaded part only on the first
          # attempt; on a retry the block has seen those bytes.
          if block_given? && n_retries.zero?
            yield_chunks(partial_output_path, &block)
          end
        end
        progress_reporter = ProgressReporter.new(base_name, size_max)
        partial_output_path.open(mode) do |output|
          response.read_body do |chunk|
            size_current += chunk.bytesize
            progress_reporter.report(size_current)
            output.write(chunk)
            yield(chunk) if block_given?
          end
        end
      end
      FileUtils.mv(partial_output_path, output_path)
    rescue Net::ReadTimeout
      n_retries += 1
      retry if n_retries < n_max_retries
      raise
    rescue TooManyRedirects => error
      last_url = error.message[/\Atoo many redirections: (.+)\z/, 1]
      raise TooManyRedirects, "too many redirections: #{@url} .. #{last_url}"
    end
  end
end
|
87
|
+
|
88
|
+
# Runs the given block while holding "#{output_path}.lock", so that only
# one process at a time works on output_path.
#
# The lock file is created exclusively and holds the owner's process
# ID. A waiter treats the lock as stale, and removes it, when
#   * the recorded process no longer exists, or
#   * the file holds no process ID and is older than 10 seconds (its
#     owner exited before recording its ID).
# Otherwise the waiter sleeps 1-10 seconds and tries again.
#
# On platforms where Process.getpgid is not implemented the block is
# run without any locking.
private def synchronize(output_path, partial_output_path)
  begin
    Process.getpgid(Process.pid)
  rescue NotImplementedError
    return yield
  end

  lock_path = Pathname("#{output_path}.lock")
  loop do
    lock_path.parent.mkpath
    begin
      lock = lock_path.open(File::RDWR | File::CREAT | File::EXCL)
    rescue SystemCallError
      valid_lock_path = true
      begin
        pid = Integer(lock_path.read.chomp, 10)
      rescue ArgumentError
        # The process that acquired the lock will be exited before
        # it stores its process ID. Give a fresh lock 10 seconds for
        # its owner to record the ID; after that it is stale.
        # (Compare the age in seconds: a Time can't be compared with
        # an Integer.)
        lock_age = Time.now - lock_path.mtime
        valid_lock_path = (lock_age <= 10)
      else
        begin
          Process.getpgid(pid)
        rescue SystemCallError
          # Process that acquired the lock doesn't exist
          valid_lock_path = false
        end
      end
      if valid_lock_path
        sleep(1 + rand(10))
      else
        lock_path.delete
      end
      retry
    else
      begin
        lock.puts(Process.pid.to_s)
        lock.flush
        yield
      ensure
        lock.close
        lock_path.delete
      end
      break
    end
  end
end
|
65
135
|
|
66
136
|
private def start_http(url, headers, limit = 10, &block)
|
@@ -94,6 +164,16 @@ module Datasets
|
|
94
164
|
end
|
95
165
|
end
|
96
166
|
|
167
|
+
# Reads the file at path in binary mode and yields its content in
# chunks of at most 1 MiB.
#
# The same String buffer is refilled and yielded for every chunk.
# Returns nil.
private def yield_chunks(path)
  path.open("rb") do |input|
    buffer = ""
    max_bytes = 1024 * 1024
    loop do
      # IO#read returns nil once the end of the file is reached.
      break if input.read(max_bytes, buffer).nil?
      yield(buffer)
    end
  end
end
|
176
|
+
|
97
177
|
class ProgressReporter
|
98
178
|
def initialize(base_name, size_max)
|
99
179
|
@base_name = base_name
|
@@ -74,6 +74,7 @@ module Datasets
|
|
74
74
|
@metadata.id = "e-stat-japan-#{@api_version}"
|
75
75
|
@metadata.name = "e-Stat API #{@api_version}"
|
76
76
|
@metadata.url = @base_url
|
77
|
+
@metadata.licenses = ["CC-BY-4.0"]
|
77
78
|
@metadata.description = "e-Stat API #{@api_version}"
|
78
79
|
|
79
80
|
@id = id
|
@@ -214,7 +215,7 @@ module Datasets
|
|
214
215
|
# even if error happens dispite of its error mapping.
|
215
216
|
# So we can't avoid caching retrieved response from the api.
|
216
217
|
# ref: https://www.e-stat.go.jp/api/api-info/e-stat-manual3-0
|
217
|
-
download(@data_path, @url.to_s)
|
218
|
+
download(@data_path, @url.to_s)
|
218
219
|
end
|
219
220
|
|
220
221
|
def index_data
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require_relative "ggplot2-dataset"
|
2
|
+
|
3
|
+
module Datasets
  # The ggplot2 "mpg" (fuel economy) data set.
  class FuelEconomy < Ggplot2Dataset
    # One row of the data. Ggplot2Dataset#each fills the members
    # positionally from the fields of each CSV row.
    Record = Struct.new(:manufacturer,
                        :model,
                        :displacement,
                        :year,
                        :n_cylinders,
                        :transmission,
                        :drive_train,
                        :city_mpg,
                        :highway_mpg,
                        :fuel,
                        :type)

    def initialize
      super("mpg")
      @metadata.id = "fuel-economy"
      @metadata.name = "Fuel economy"
      @metadata.licenses = ["CC0-1.0"]
    end

    # ggplot2 column name => Record member name. Used by
    # Ggplot2Dataset#parse_roxygen to rename the columns listed in the
    # description, so every value must match a Record member.
    COLUMN_NAME_MAPPING = {
      "displ" => "displacement",
      "cyl" => "n_cylinders",
      "trans" => "transmission",
      "drv" => "drive_train",
      "cty" => "city_mpg",
      "hwy" => "highway_mpg",
      "fl" => "fuel",
      "class" => "type",
    }
  end
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
3
|
+
require_relative 'dataset'
|
4
|
+
|
5
|
+
module Datasets
  # Data from the geolonia/japanese-addresses repository on GitHub.
  class Geolonia < Dataset
    # One CSV row; members are filled positionally from the row.
    Record = Struct.new(
      :prefecture_code, :prefecture_name, :prefecture_kana, :prefecture_romaji,
      :municipality_code, :municipality_name, :municipality_kana, :municipality_romaji,
      :street_name, :street_kana, :street_romaji,
      :alias,
      :latitude, :longitude
    )

    def initialize
      super
      @metadata.id = 'geolonia'
      @metadata.name = 'Geolonia'
      @metadata.url = 'https://github.com/geolonia/japanese-addresses'
      @metadata.licenses = ["CC-BY-4.0"]
      # The description comes from the README and is only fetched when
      # the lambda is called.
      @metadata.description = lambda { fetch_readme }
    end

    # Yields a Record for every data row of the CSV. Without a block
    # an Enumerator is returned.
    def each
      return to_enum(__method__) unless block_given?

      open_data do |csv|
        # Skip the first line; it is not a data row.
        csv.readline
        csv.each { |row| yield(Record.new(*row)) }
      end
    end

    private def download_base_url
      "https://raw.githubusercontent.com/geolonia/japanese-addresses/master"
    end

    # Downloads latest.csv into the cache directory and yields it
    # opened as a CSV.
    private def open_data
      csv_path = cache_dir_path + 'latest.csv'
      download(csv_path, "#{download_base_url}/data/latest.csv")
      CSV.open(csv_path) do |csv|
        yield(csv)
      end
    end

    # Downloads README.md and returns the text before its "## API"
    # heading, stripped.
    private def fetch_readme
      file_name = "README.md"
      local_path = cache_dir_path + file_name
      download(local_path, "#{download_base_url}/#{file_name}")
      overview = local_path.read.split(/^## API/, 2)[0]
      overview.strip
    end
  end
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
# This file uses CSV and Datasets::Dataset; load them here so the file
# (and the data sets that require it) can be required on its own.
require "csv"

require_relative "dataset"

module Datasets
  # Base class for data sets shipped with ggplot2.
  #
  # A subclass passes the ggplot2 data set name to #initialize and
  # defines two constants:
  #   * Record - a Struct filled positionally from each CSV row
  #   * COLUMN_NAME_MAPPING - ggplot2 column name => name used in the
  #     description
  class Ggplot2Dataset < Dataset
    def initialize(ggplot2_dataset_name)
      super()
      @ggplot2_dataset_name = ggplot2_dataset_name
      @metadata.url =
        "https://ggplot2.tidyverse.org/reference/#{@ggplot2_dataset_name}.html"
      # Fetched from ggplot2's R/data.R only when the lambda is called.
      @metadata.description = lambda do
        fetch_description
      end
    end

    # Downloads data-raw/<name>.csv from the ggplot2 repository and
    # yields one Record per row. Without a block an Enumerator is
    # returned.
    def each
      return to_enum(__method__) unless block_given?

      data_base_name = "#{@ggplot2_dataset_name}.csv"
      data_path = cache_dir_path + data_base_name
      data_url = "#{download_base_url}/data-raw/#{data_base_name}"
      download(data_path, data_url)
      CSV.open(data_path, headers: :first_row, converters: :all) do |csv|
        record_class = self.class::Record
        csv.each do |row|
          record = record_class.new(*row.fields)
          yield record
        end
      end
    end

    private
    def download_base_url
      "https://raw.githubusercontent.com/tidyverse/ggplot2/main"
    end

    # Returns the description of this data set taken from the roxygen
    # comments in ggplot2's R/data.R, or nil when data.R has no entry
    # for it.
    #
    # data.R documents each data set as a run of "#'" comment lines
    # followed by a line holding the quoted data set name.
    def fetch_description
      data_r_base_name = "data.R"
      data_r_path = cache_dir_path + data_r_base_name
      data_r_url = "#{download_base_url}/R/#{data_r_base_name}"
      download(data_r_path, data_r_url)
      description = nil
      comment = ""
      File.open(data_r_path) do |data_r|
        data_r.each_line do |line|
          case line.chomp
          when /\A#'/
            comment_content = Regexp.last_match.post_match
            unless comment_content.empty?
              # Drop the character that follows "#'".
              comment_content = comment_content[1..-1]
            end
            comment << comment_content
            comment << "\n"
          when /\A"(.+)"\z/
            name = Regexp.last_match[1]
            # Only the requested data set is converted; the comments
            # of the other data sets are discarded.
            if name == @ggplot2_dataset_name
              description = parse_roxygen(comment.rstrip)
            end
            comment = ""
          end
        end
      end
      description
    end

    # Converts a roxygen comment to plain text: \url{...} is unwrapped,
    # a leading "@format " is dropped and each \item{column}{text}
    # inside \describe{...} becomes "* column: text" with the column
    # renamed through the subclass's COLUMN_NAME_MAPPING.
    def parse_roxygen(roxygen)
      column_name_mapping = self.class::COLUMN_NAME_MAPPING
      roxygen
        .gsub(/\\url\{(.*?)\}/, "\\1")
        .gsub(/^@format /, "")
        .gsub(/\\describe\{(.*)\}/m) do
        content = Regexp.last_match(1)
        content.gsub(/\\item\{(.*?)\}\{(.*?)\}/m) do
          column_name = Regexp.last_match(1)
          description = Regexp.last_match(2)
          column_name = column_name_mapping[column_name] || column_name
          description = description.gsub(/\\\$/, "$")
          "* #{column_name}: #{description}"
        end
      end
    end
  end
end
|
data/lib/datasets/hepatitis.rb
CHANGED
@@ -163,6 +163,7 @@ module Datasets
|
|
163
163
|
@metadata.id = "hepatitis"
|
164
164
|
@metadata.name = "Hepatitis"
|
165
165
|
@metadata.url = "https://archive.ics.uci.edu/ml/datasets/hepatitis"
|
166
|
+
@metadata.licenses = ["CC-BY-4.0"]
|
166
167
|
@metadata.description = lambda do
|
167
168
|
read_names
|
168
169
|
end
|
@@ -186,10 +187,8 @@ module Datasets
|
|
186
187
|
|
187
188
|
def open_data
|
188
189
|
data_path = cache_dir_path + "hepatitis.csv"
|
189
|
-
|
190
|
-
|
191
|
-
download(data_path, data_url)
|
192
|
-
end
|
190
|
+
data_url = "#{base_url}/hepatitis.data"
|
191
|
+
download(data_path, data_url)
|
193
192
|
CSV.open(data_path) do |csv|
|
194
193
|
yield(csv)
|
195
194
|
end
|
@@ -197,10 +196,8 @@ module Datasets
|
|
197
196
|
|
198
197
|
# Downloads hepatitis.names into the cache directory and returns its
# content.
def read_names
  local_path = cache_dir_path + "hepatitis.names"
  download(local_path, "#{base_url}/hepatitis.names")
  local_path.read
end
|
206
203
|
end
|
data/lib/datasets/iris.rb
CHANGED
@@ -15,6 +15,7 @@ module Datasets
|
|
15
15
|
@metadata.id = "iris"
|
16
16
|
@metadata.name = "Iris"
|
17
17
|
@metadata.url = "https://archive.ics.uci.edu/ml/datasets/Iris"
|
18
|
+
@metadata.licenses = ["CC-BY-4.0"]
|
18
19
|
@metadata.description = lambda do
|
19
20
|
read_names
|
20
21
|
end
|
@@ -35,10 +36,8 @@ module Datasets
|
|
35
36
|
private
|
36
37
|
def open_data
|
37
38
|
data_path = cache_dir_path + "iris.csv"
|
38
|
-
|
39
|
-
|
40
|
-
download(data_path, data_url)
|
41
|
-
end
|
39
|
+
data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
|
40
|
+
download(data_path, data_url)
|
42
41
|
CSV.open(data_path, converters: [:numeric]) do |csv|
|
43
42
|
yield(csv)
|
44
43
|
end
|
@@ -46,10 +45,8 @@ module Datasets
|
|
46
45
|
|
47
46
|
# Downloads iris.names into the cache directory and returns its
# content.
def read_names
  local_path = cache_dir_path + "iris.names"
  download(local_path,
           "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.names")
  local_path.read
end
|
55
52
|
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
require_relative 'dataset'
|
2
|
+
|
3
|
+
module Datasets
  # Transcripts from the mmorise/ita-corpus repository on GitHub.
  class ITACorpus < Dataset
    # One transcript line: the text before the first ":" and the rest.
    Record = Struct.new(:id, :sentence)

    # type selects which transcript file is read; it must be :emotion
    # or :recitation, otherwise ArgumentError is raised.
    def initialize(type: :emotion)
      supported = [:emotion, :recitation].include?(type)
      raise ArgumentError, "Please set type :emotion or :recitation: #{type.inspect}" unless supported

      super()
      @type = type
      @metadata.id = 'ita-corpus'
      @metadata.name = 'ITA-corpus'
      @metadata.url = 'https://github.com/mmorise/ita-corpus'
      @metadata.licenses = ['Unlicense']
      # The description comes from the README and is only fetched when
      # the lambda is called.
      @metadata.description = lambda { fetch_readme }
    end

    # Downloads the transcript for the selected type and yields a
    # Record per line. Without a block an Enumerator is returned.
    def each(&block)
      return to_enum(__method__) unless block_given?

      file_name = "#{@type}_transcript_utf8.txt"
      local_path = cache_dir_path + file_name
      download(local_path, "#{download_base_url}/#{file_name}")

      parse_data(local_path, &block)
    end

    # Downloads README.md and returns the text before its
    # "## ファイル構成" heading, stripped.
    private def fetch_readme
      file_name = "README.md"
      local_path = cache_dir_path + file_name
      download(local_path, "#{download_base_url}/#{file_name}")
      overview = local_path.read.split(/^## ファイル構成/, 2)[0]
      overview.strip
    end

    private def download_base_url
      "https://raw.githubusercontent.com/mmorise/ita-corpus/main"
    end

    # Yields a Record for each line of data_path, split at the first
    # ":" into id and sentence.
    private def parse_data(data_path)
      File.open(data_path) do |file|
        file.each_line(chomp: true) do |line|
          yield(Record.new(*line.split(':', 2)))
        end
      end
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require_relative 'mnist'
|
2
|
+
|
3
|
+
module Datasets
  # Kuzushiji-MNIST: an MNIST subclass that overrides the download
  # location, the dataset name and the license list.
  class KuzushijiMNIST < MNIST
    BASE_URL = "http://codh.rois.ac.jp/kmnist/dataset/kmnist/"

    private def dataset_name
      "Kuzushiji-MNIST"
    end

    private def licenses
      ["CC-BY-SA-4.0"]
    end
  end
end
|