red-datasets 0.1.4 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +23 -3
- data/Rakefile +56 -1
- data/doc/text/news.md +102 -0
- data/lib/datasets/adult.rb +6 -9
- data/lib/datasets/afinn.rb +48 -0
- data/lib/datasets/aozora-bunko.rb +196 -0
- data/lib/datasets/cache-path.rb +28 -0
- data/lib/datasets/california-housing.rb +60 -0
- data/lib/datasets/cifar.rb +2 -4
- data/lib/datasets/cldr-plurals.rb +2 -4
- data/lib/datasets/communities.rb +5 -8
- data/lib/datasets/dataset.rb +58 -23
- data/lib/datasets/diamonds.rb +26 -0
- data/lib/datasets/downloader.rb +110 -30
- data/lib/datasets/e-stat-japan.rb +2 -1
- data/lib/datasets/fashion-mnist.rb +4 -0
- data/lib/datasets/fuel-economy.rb +35 -0
- data/lib/datasets/geolonia.rb +67 -0
- data/lib/datasets/ggplot2-dataset.rb +79 -0
- data/lib/datasets/hepatitis.rb +5 -8
- data/lib/datasets/iris.rb +5 -8
- data/lib/datasets/ita-corpus.rb +57 -0
- data/lib/datasets/kuzushiji-mnist.rb +16 -0
- data/lib/datasets/lazy.rb +90 -0
- data/lib/datasets/libsvm-dataset-list.rb +5 -8
- data/lib/datasets/libsvm.rb +3 -4
- data/lib/datasets/license.rb +26 -0
- data/lib/datasets/livedoor-news.rb +80 -0
- data/lib/datasets/metadata.rb +14 -0
- data/lib/datasets/mnist.rb +7 -7
- data/lib/datasets/mushroom.rb +5 -8
- data/lib/datasets/nagoya-university-conversation-corpus.rb +109 -0
- data/lib/datasets/penguins.rb +6 -8
- data/lib/datasets/penn-treebank.rb +2 -4
- data/lib/datasets/pmjt-dataset-list.rb +67 -0
- data/lib/datasets/postal-code-japan.rb +2 -6
- data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
- data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
- data/lib/datasets/seaborn.rb +90 -0
- data/lib/datasets/sudachi-synonym-dictionary.rb +5 -11
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
- data/lib/datasets/wikipedia.rb +16 -8
- data/lib/datasets/wine.rb +6 -9
- data/lib/datasets/zip-extractor.rb +48 -0
- data/lib/datasets.rb +2 -22
- data/red-datasets.gemspec +1 -1
- data/test/helper.rb +21 -0
- data/test/test-afinn.rb +60 -0
- data/test/test-aozora-bunko.rb +190 -0
- data/test/test-california-housing.rb +56 -0
- data/test/test-cldr-plurals.rb +1 -1
- data/test/test-dataset.rb +15 -7
- data/test/test-diamonds.rb +71 -0
- data/test/test-fuel-economy.rb +75 -0
- data/test/test-geolonia.rb +65 -0
- data/test/test-ita-corpus.rb +69 -0
- data/test/test-kuzushiji-mnist.rb +137 -0
- data/test/test-license.rb +24 -0
- data/test/test-livedoor-news.rb +351 -0
- data/test/test-metadata.rb +36 -0
- data/test/test-nagoya-university-conversation-corpus.rb +132 -0
- data/test/test-penguins.rb +1 -1
- data/test/test-pmjt-dataset-list.rb +50 -0
- data/test/test-quora-duplicate-question-pair.rb +33 -0
- data/test/test-rdataset.rb +246 -0
- data/test/{test-seaborn-data.rb → test-seaborn.rb} +71 -4
- data/test/test-sudachi-synonym-dictionary.rb +5 -5
- data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
- data/test/test-wikipedia.rb +25 -71
- metadata +62 -14
- data/lib/datasets/seaborn-data.rb +0 -49
- data/test/test-rdatasets.rb +0 -136
data/lib/datasets/dataset.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require "pathname"
|
2
2
|
|
3
|
+
require_relative "cache-path"
|
3
4
|
require_relative "downloader"
|
4
5
|
require_relative "error"
|
5
6
|
require_relative "metadata"
|
@@ -19,38 +20,72 @@ module Datasets
|
|
19
20
|
end
|
20
21
|
|
21
22
|
# Clears this dataset's cache by delegating to the CachePath object
# returned by #cache_path.
def clear_cache!
  cache_path.remove
end
|
26
25
|
|
27
26
|
private
|
27
|
+
|
28
28
|
# Returns the base directory reported by this dataset's CachePath.
# Dataset subclasses join file names onto this path before downloading.
def cache_dir_path
  cache_path.base_dir
end
|
39
31
|
|
40
|
-
def
|
32
|
+
# Returns the CachePath for this dataset, keyed by the metadata ID.
# The object is created on first use and memoized in @cache_path.
def cache_path
  @cache_path ||= CachePath.new(@metadata.id)
end
|
35
|
+
|
36
|
+
# Downloads +url+ to +output_path+ through a Downloader.
#
# The optional block is forwarded to Downloader#download unchanged.
def download(output_path, url, &block)
  Downloader.new(url).download(output_path, &block)
end
|
44
40
|
|
45
|
-
def extract_bz2(
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
41
|
+
# Decompresses bzip2 data with an external `bzcat` process and yields an
# IO from which the decompressed bytes can be read.
#
# bz2 - either a Pathname/String naming a compressed file, or any other
#       object whose #each yields chunks of compressed data.
#
# The yielded IO is closed and the `bzcat` process is waited for when the
# block finishes, whether it returns normally or raises.
def extract_bz2(bz2)
  case bz2
  when Pathname, String
    # bzcat reads the file itself; we only consume its standard output.
    IO.pipe do |reader, writer|
      pid = spawn("bzcat", bz2.to_s, {out: writer})
      begin
        # The child holds its own copy of the write end; drop ours.
        writer.close
        yield(reader)
      ensure
        reader.close
        Process.waitpid(pid)
      end
    end
  else
    # Two pipes: compressed chunks go into bzcat's stdin from a feeder
    # thread, decompressed bytes come back out of its stdout.
    IO.pipe do |compressed_reader, compressed_writer|
      IO.pipe do |plain_reader, plain_writer|
        stop_feeding = false
        feeder = Thread.new do
          begin
            bz2.each do |chunk|
              compressed_writer.write(chunk)
              compressed_writer.flush
              break if stop_feeding
            end
          rescue => error
            # Feeding errors are reported on stderr, not re-raised.
            message = "Failed to read bzcat input: #{error.class}: #{error.message}"
            $stderr.puts(message)
          ensure
            compressed_writer.close
          end
        end
        begin
          pid = spawn("bzcat", {in: compressed_reader, out: plain_writer})
          begin
            # The child owns these ends now; close the parent's copies.
            compressed_reader.close
            plain_writer.close
            yield(plain_reader)
          ensure
            plain_reader.close
            Process.waitpid(pid)
          end
        ensure
          # Ask the feeder to stop after its current chunk and wait for it.
          stop_feeding = true
          feeder.join
        end
      end
    end
  end
end
|
56
91
|
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require_relative "ggplot2-dataset"
|
2
|
+
|
3
|
+
module Datasets
  # The "diamonds" dataset distributed with ggplot2.
  #
  # Rows are loaded by Ggplot2Dataset#each, which passes each CSV row's
  # fields positionally to Record.
  class Diamonds < Ggplot2Dataset
    Record = Struct.new(
      :carat,
      :cut,
      :color,
      :clarity,
      :depth,
      :table,
      :price,
      :x,
      :y,
      :z
    )

    def initialize
      super("diamonds")
      @metadata.id = "diamonds"
      @metadata.name = "Diamonds"
      @metadata.licenses = ["CC0-1.0"]
    end

    # No column is renamed in the generated description.
    COLUMN_NAME_MAPPING = {}
  end
end
|
data/lib/datasets/downloader.rb
CHANGED
@@ -22,45 +22,115 @@ module Datasets
|
|
22
22
|
end
|
23
23
|
end
|
24
24
|
|
25
|
-
def download(output_path)
|
26
|
-
output_path.
|
25
|
+
# Downloads @url to +output_path+, resuming from "<output_path>.partial"
# when such a file exists.
#
# output_path - Pathname of the final file.
# block       - optional; receives the file content chunk by chunk. If
#               the file is already present, its content is replayed to
#               the block from disk and nothing is downloaded.
#
# A Net::ReadTimeout is retried until n_max_retries attempts have been
# made and then re-raised. TooManyRedirects is re-raised with both the
# original URL and the last URL in the message.
def download(output_path, &block)
  if output_path.exist?
    yield_chunks(output_path, &block) if block_given?
    return
  end

  partial_output_path = Pathname.new("#{output_path}.partial")
  synchronize(output_path, partial_output_path) do
    if output_path.exist?
      # Another process completed the download while we were waiting
      # for the lock; reuse its result instead of downloading again.
      yield_chunks(output_path, &block) if block_given?
      next
    end

    output_path.parent.mkpath

    n_retries = 0
    n_max_retries = 5
    # True once the caller's block has received any data during this
    # call. The already-downloaded part must be replayed exactly once,
    # even when the first attempt timed out before any data arrived.
    yielded = false
    begin
      headers = {
        "Accept-Encoding" => "identity",
        "User-Agent" => "Red Datasets/#{VERSION}",
      }
      start = nil
      if partial_output_path.exist?
        start = partial_output_path.size
        headers["Range"] = "bytes=#{start}-"
      end

      start_http(@url, headers) do |response|
        if response.is_a?(Net::HTTPPartialContent)
          mode = "ab"
        else
          # The Range request was not honored: start over from scratch.
          # NOTE(review): if the block already received data on an
          # earlier attempt, it will see that data again here.
          start = nil
          mode = "wb"
        end

        base_name = @url.path.split("/").last
        size_current = 0
        size_max = response.content_length
        if start
          size_current += start
          # content_length is nil when the server sends no Content-Length.
          size_max += start if size_max
          if block_given? && !yielded
            yield_chunks(partial_output_path, &block)
            yielded = true
          end
        end
        progress_reporter = ProgressReporter.new(base_name, size_max)
        partial_output_path.open(mode) do |output|
          response.read_body do |chunk|
            size_current += chunk.bytesize
            progress_reporter.report(size_current)
            output.write(chunk)
            if block_given?
              yielded = true
              yield(chunk)
            end
          end
        end
      end
      FileUtils.mv(partial_output_path, output_path)
    rescue Net::ReadTimeout
      n_retries += 1
      retry if n_retries < n_max_retries
      raise
    rescue TooManyRedirects => error
      last_url = error.message[/\Atoo many redirections: (.+)\z/, 1]
      raise TooManyRedirects, "too many redirections: #{@url} .. #{last_url}"
    end
  end
end
|
87
|
+
|
88
|
+
# Runs the given block while holding an inter-process lock for
# +output_path+.
#
# The lock is the file "<output_path>.lock", created exclusively and
# filled with the owner's process ID. When the lock already exists:
#
# * if it names a process that no longer exists, it is deleted;
# * if it holds no process ID yet, it is trusted only while it is less
#   than 10 seconds old (its owner may have exited before writing the
#   ID), and deleted otherwise;
# * otherwise this process sleeps for 1 to 10 seconds and tries again.
#
# On platforms where Process.getpgid is not implemented the block runs
# without any locking. +partial_output_path+ is accepted but unused.
private def synchronize(output_path, partial_output_path)
  begin
    Process.getpgid(Process.pid)
  rescue NotImplementedError
    # Lock owners cannot be probed here, so stale locks could never be
    # detected; run unlocked.
    return yield
  end

  lock_path = Pathname("#{output_path}.lock")
  loop do
    lock_path.parent.mkpath
    begin
      lock = lock_path.open(File::RDWR | File::CREAT | File::EXCL)
    rescue Errno::EEXIST
      begin
        begin
          pid = Integer(lock_path.read.chomp, 10)
        rescue ArgumentError
          # The process that acquired the lock may have exited before
          # it stored its process ID, so only trust a fresh lock.
          valid_lock_path = (Time.now - lock_path.mtime) < 10
        else
          begin
            Process.getpgid(pid)
            valid_lock_path = true
          rescue SystemCallError
            # Process that acquired the lock doesn't exist
            valid_lock_path = false
          end
        end
        if valid_lock_path
          sleep(1 + rand(10))
        else
          lock_path.delete
        end
      rescue Errno::ENOENT
        # The lock was released while we were inspecting it; just retry.
      end
      retry
    else
      begin
        lock.puts(Process.pid.to_s)
        lock.flush
        yield
      ensure
        lock.close
        lock_path.delete
      end
      break
    end
  end
end
|
65
135
|
|
66
136
|
private def start_http(url, headers, limit = 10, &block)
|
@@ -94,6 +164,16 @@ module Datasets
|
|
94
164
|
end
|
95
165
|
end
|
96
166
|
|
167
|
+
# Reads the file at +path+ in binary mode and yields it to the block in
# chunks of at most 1 MiB.
#
# The same String object is reused as the read buffer for every chunk,
# so a caller that wants to keep a chunk must copy it (e.g. chunk.dup).
private def yield_chunks(path)
  chunk_size = 1024 * 1024
  path.open("rb") do |input|
    # An explicitly allocated buffer: a string literal here would be
    # mutated by IO#read and fails when literals are frozen.
    buffer = String.new
    while input.read(chunk_size, buffer)
      yield(buffer)
    end
  end
end
|
176
|
+
|
97
177
|
class ProgressReporter
|
98
178
|
def initialize(base_name, size_max)
|
99
179
|
@base_name = base_name
|
@@ -74,6 +74,7 @@ module Datasets
|
|
74
74
|
@metadata.id = "e-stat-japan-#{@api_version}"
|
75
75
|
@metadata.name = "e-Stat API #{@api_version}"
|
76
76
|
@metadata.url = @base_url
|
77
|
+
@metadata.licenses = ["CC-BY-4.0"]
|
77
78
|
@metadata.description = "e-Stat API #{@api_version}"
|
78
79
|
|
79
80
|
@id = id
|
@@ -214,7 +215,7 @@ module Datasets
|
|
214
215
|
# even if an error happens despite its error mapping.
|
215
216
|
# So we can't avoid caching retrieved response from the api.
|
216
217
|
# ref: https://www.e-stat.go.jp/api/api-info/e-stat-manual3-0
|
217
|
-
download(@data_path, @url.to_s)
|
218
|
+
download(@data_path, @url.to_s)
|
218
219
|
end
|
219
220
|
|
220
221
|
def index_data
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require_relative "ggplot2-dataset"
|
2
|
+
|
3
|
+
module Datasets
  # The "mpg" dataset distributed with ggplot2, exposed with longer
  # column names.
  #
  # Rows are loaded by Ggplot2Dataset#each, which passes each CSV row's
  # fields positionally to Record.
  class FuelEconomy < Ggplot2Dataset
    Record = Struct.new(:manufacturer,
                        :model,
                        :displacement,
                        :year,
                        :n_cylinders,
                        :transmission,
                        :drive_train,
                        :city_mpg,
                        :highway_mpg,
                        :fuel,
                        :type)

    def initialize
      super("mpg")
      @metadata.id = "fuel-economy"
      @metadata.name = "Fuel economy"
      @metadata.licenses = ["CC0-1.0"]
    end

    # ggplot2 column name => Record member name. Used when the column
    # list in the generated description is rewritten, so every value
    # must match a Record member.
    COLUMN_NAME_MAPPING = {
      "displ" => "displacement",
      "cyl" => "n_cylinders",
      "trans" => "transmission",
      "drv" => "drive_train",
      "cty" => "city_mpg",
      "hwy" => "highway_mpg",
      "fl" => "fuel",
      "class" => "type",
    }
  end
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
3
|
+
require_relative 'dataset'
|
4
|
+
|
5
|
+
module Datasets
  # Address records read from data/latest.csv of the
  # geolonia/japanese-addresses repository.
  class Geolonia < Dataset
    Record = Struct.new(
      :prefecture_code,
      :prefecture_name,
      :prefecture_kana,
      :prefecture_romaji,
      :municipality_code,
      :municipality_name,
      :municipality_kana,
      :municipality_romaji,
      :street_name,
      :street_kana,
      :street_romaji,
      :alias,
      :latitude,
      :longitude
    )

    def initialize
      super
      @metadata.id = 'geolonia'
      @metadata.name = 'Geolonia'
      @metadata.url = 'https://github.com/geolonia/japanese-addresses'
      @metadata.licenses = ["CC-BY-4.0"]
      # The description is fetched only when it is first requested.
      @metadata.description = -> { fetch_readme }
    end

    # Yields one Record per CSV row after the first row; returns an
    # Enumerator when called without a block.
    def each
      return to_enum(__method__) unless block_given?

      open_data do |csv|
        # Discard the first row (presumably the column header).
        csv.readline
        csv.each { |fields| yield(Record.new(*fields)) }
      end
    end

    private

    def download_base_url
      "https://raw.githubusercontent.com/geolonia/japanese-addresses/master"
    end

    # Downloads latest.csv into the cache directory and yields it as an
    # open CSV reader.
    def open_data
      data_path = cache_dir_path + 'latest.csv'
      download(data_path, "#{download_base_url}/data/latest.csv")
      CSV.open(data_path) { |csv| yield(csv) }
    end

    # Downloads README.md and returns the text before its "## API"
    # heading, stripped of surrounding whitespace.
    def fetch_readme
      readme_base_name = "README.md"
      readme_path = cache_dir_path + readme_base_name
      download(readme_path, "#{download_base_url}/#{readme_base_name}")
      introduction = readme_path.read.split(/^## API/, 2).first
      introduction.strip
    end
  end
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
require "csv"

require_relative "dataset"

module Datasets
  # Base class for datasets taken from the tidyverse/ggplot2 repository.
  #
  # A subclass must define:
  #
  # * Record: a Struct whose members match the CSV columns in order.
  # * COLUMN_NAME_MAPPING: a Hash from ggplot2 column names to the names
  #   shown in the description (an empty Hash when nothing is renamed).
  class Ggplot2Dataset < Dataset
    # ggplot2_dataset_name - the dataset's name in ggplot2 (e.g. "mpg").
    #                        It selects the CSV file, the reference URL
    #                        and the entry in R/data.R.
    def initialize(ggplot2_dataset_name)
      super()
      @ggplot2_dataset_name = ggplot2_dataset_name
      @metadata.url =
        "https://ggplot2.tidyverse.org/reference/#{@ggplot2_dataset_name}.html"
      @metadata.description = lambda do
        fetch_description
      end
    end

    # Downloads data-raw/<name>.csv if needed and yields one Record per
    # row. Returns an Enumerator when called without a block.
    def each
      return to_enum(__method__) unless block_given?

      data_base_name = "#{@ggplot2_dataset_name}.csv"
      data_path = cache_dir_path + data_base_name
      data_url = "#{download_base_url}/data-raw/#{data_base_name}"
      download(data_path, data_url)
      CSV.open(data_path, headers: :first_row, converters: :all) do |csv|
        record_class = self.class::Record
        csv.each do |row|
          record = record_class.new(*row.fields)
          yield record
        end
      end
    end

    private
    def download_base_url
      "https://raw.githubusercontent.com/tidyverse/ggplot2/main"
    end

    # Downloads R/data.R and returns the description of this dataset, or
    # nil when data.R has no entry for it.
    #
    # Lines starting with "#'" are collected as a roxygen comment. A line
    # that consists only of a double-quoted name ends the comment and
    # assigns it to that name.
    def fetch_description
      data_r_base_name = "data.R"
      data_r_path = cache_dir_path + data_r_base_name
      data_r_url = "#{download_base_url}/R/#{data_r_base_name}"
      download(data_r_path, data_r_url)
      descriptions = {}
      # Appended to below, so it must not be a frozen literal.
      comment = "".dup
      File.open(data_r_path) do |data_r|
        data_r.each_line do |line|
          case line.chomp
          when /\A#'/
            comment_content = Regexp.last_match.post_match
            unless comment_content.empty?
              # Drop the character that follows "#'" (presumably the
              # separating space).
              comment_content = comment_content[1..-1]
            end
            comment << comment_content
            comment << "\n"
          when /\A"(.+)"\z/
            name = Regexp.last_match[1]
            descriptions[name] = parse_roxygen(comment.rstrip)
            comment = "".dup
          end
        end
        descriptions[@ggplot2_dataset_name]
      end
    end

    # Converts roxygen markup to plain text:
    #
    # * \url{X} becomes X.
    # * A leading "@format " is removed from lines.
    # * \describe{...} is replaced by its content, in which every
    #   \item{name}{text} becomes "* name: text". The name is translated
    #   through COLUMN_NAME_MAPPING and "\$" in the text becomes "$".
    def parse_roxygen(roxygen)
      column_name_mapping = self.class::COLUMN_NAME_MAPPING
      roxygen
        .gsub(/\\url\{(.*?)\}/, "\\1")
        .gsub(/^@format /, "")
        .gsub(/\\describe\{(.*)\}/m) do
        # Capture before the nested gsub overwrites $1.
        content = $1
        content.gsub(/\\item\{(.*?)\}\{(.*?)\}/m) do
          column_name = $1
          description = $2
          column_name = column_name_mapping[column_name] || column_name
          description = description
                          .gsub(/\\\$/, "$")
          "* #{column_name}: #{description}"
        end
      end
    end
  end
end
|
data/lib/datasets/hepatitis.rb
CHANGED
@@ -163,6 +163,7 @@ module Datasets
|
|
163
163
|
@metadata.id = "hepatitis"
|
164
164
|
@metadata.name = "Hepatitis"
|
165
165
|
@metadata.url = "https://archive.ics.uci.edu/ml/datasets/hepatitis"
|
166
|
+
@metadata.licenses = ["CC-BY-4.0"]
|
166
167
|
@metadata.description = lambda do
|
167
168
|
read_names
|
168
169
|
end
|
@@ -186,10 +187,8 @@ module Datasets
|
|
186
187
|
|
187
188
|
def open_data
|
188
189
|
data_path = cache_dir_path + "hepatitis.csv"
|
189
|
-
|
190
|
-
|
191
|
-
download(data_path, data_url)
|
192
|
-
end
|
190
|
+
data_url = "#{base_url}/hepatitis.data"
|
191
|
+
download(data_path, data_url)
|
193
192
|
CSV.open(data_path) do |csv|
|
194
193
|
yield(csv)
|
195
194
|
end
|
@@ -197,10 +196,8 @@ module Datasets
|
|
197
196
|
|
198
197
|
# Downloads hepatitis.names into the cache directory if needed and
# returns its content.
def read_names
  file_name = "hepatitis.names"
  local_path = cache_dir_path + file_name
  download(local_path, "#{base_url}/hepatitis.names")
  local_path.read
end
|
206
203
|
end
|
data/lib/datasets/iris.rb
CHANGED
@@ -15,6 +15,7 @@ module Datasets
|
|
15
15
|
@metadata.id = "iris"
|
16
16
|
@metadata.name = "Iris"
|
17
17
|
@metadata.url = "https://archive.ics.uci.edu/ml/datasets/Iris"
|
18
|
+
@metadata.licenses = ["CC-BY-4.0"]
|
18
19
|
@metadata.description = lambda do
|
19
20
|
read_names
|
20
21
|
end
|
@@ -35,10 +36,8 @@ module Datasets
|
|
35
36
|
private
|
36
37
|
def open_data
|
37
38
|
data_path = cache_dir_path + "iris.csv"
|
38
|
-
|
39
|
-
|
40
|
-
download(data_path, data_url)
|
41
|
-
end
|
39
|
+
data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
|
40
|
+
download(data_path, data_url)
|
42
41
|
CSV.open(data_path, converters: [:numeric]) do |csv|
|
43
42
|
yield(csv)
|
44
43
|
end
|
@@ -46,10 +45,8 @@ module Datasets
|
|
46
45
|
|
47
46
|
# Downloads iris.names into the cache directory if needed and returns
# its content.
def read_names
  local_path = cache_dir_path + "iris.names"
  download(local_path,
           "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.names")
  local_path.read
end
|
55
52
|
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
require_relative 'dataset'
|
2
|
+
|
3
|
+
module Datasets
  # Sentences of the ITA corpus (mmorise/ita-corpus).
  #
  # Each line of a transcript file is split at its first ":" into an ID
  # and a sentence.
  class ITACorpus < Dataset
    Record = Struct.new(:id,
                        :sentence)

    # type - which transcript to read: :emotion (default) or :recitation.
    #
    # Raises ArgumentError for any other value.
    def initialize(type: :emotion)
      unless [:emotion, :recitation].include?(type)
        raise ArgumentError, "Please set type :emotion or :recitation: #{type.inspect}"
      end

      super()
      @type = type
      @metadata.id = 'ita-corpus'
      @metadata.name = 'ITA-corpus'
      @metadata.url = 'https://github.com/mmorise/ita-corpus'
      @metadata.licenses = ['Unlicense']
      @metadata.description = lambda do
        fetch_readme
      end
    end

    # Downloads the transcript for @type if needed and yields one Record
    # per line. Returns an Enumerator when called without a block.
    def each(&block)
      return to_enum(__method__) unless block_given?

      data_path = cache_dir_path + "#{@type}_transcript_utf8.txt"
      data_url = "#{download_base_url}/#{@type}_transcript_utf8.txt"
      download(data_path, data_url)

      parse_data(data_path, &block)
    end

    private
    # Downloads README.md and returns the text before its
    # "## ファイル構成" heading, stripped of surrounding whitespace.
    def fetch_readme
      readme_base_name = "README.md"
      readme_path = cache_dir_path + readme_base_name
      readme_url = "#{download_base_url}/#{readme_base_name}"
      download(readme_path, readme_url)
      # Read as UTF-8 explicitly: the heading regexp below is UTF-8 and
      # must not depend on the locale's default external encoding.
      readme_path.read(encoding: "UTF-8").split(/^## ファイル構成/, 2)[0].strip
    end

    def download_base_url
      "https://raw.githubusercontent.com/mmorise/ita-corpus/main"
    end

    # Yields a Record for every line of +data_path+. The file is read as
    # UTF-8, as its "_utf8" name states.
    def parse_data(data_path)
      File.open(data_path, encoding: "UTF-8") do |f|
        f.each_line(chomp: true) do |line|
          id, sentence = line.split(':', 2)
          record = Record.new(id, sentence)
          yield(record)
        end
      end
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require_relative 'mnist'
|
2
|
+
|
3
|
+
module Datasets
  # Kuzushiji-MNIST, implemented on top of the MNIST dataset class: it
  # overrides only the download location, the dataset name and the
  # license list.
  class KuzushijiMNIST < MNIST
    BASE_URL = "http://codh.rois.ac.jp/kmnist/dataset/kmnist/"

    private

    # Name of this dataset variant.
    def dataset_name
      "Kuzushiji-MNIST"
    end

    # License identifiers for this dataset variant.
    def licenses
      ["CC-BY-SA-4.0"]
    end
  end
end
|