red-datasets 0.1.0 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +6 -0
- data/doc/text/news.md +48 -0
- data/lib/datasets.rb +7 -0
- data/lib/datasets/cifar.rb +4 -12
- data/lib/datasets/cldr-plurals.rb +385 -0
- data/lib/datasets/communities.rb +198 -0
- data/lib/datasets/dataset.rb +8 -1
- data/lib/datasets/downloader.rb +53 -27
- data/lib/datasets/e-stat-japan.rb +320 -0
- data/lib/datasets/error.rb +4 -0
- data/lib/datasets/mnist.rb +6 -4
- data/lib/datasets/penguins.rb +146 -0
- data/lib/datasets/rdatasets.rb +95 -0
- data/lib/datasets/seaborn-data.rb +49 -0
- data/lib/datasets/sudachi-synonym-dictionary.rb +171 -0
- data/lib/datasets/tar-gz-readable.rb +14 -0
- data/lib/datasets/version.rb +1 -1
- data/red-datasets.gemspec +1 -0
- data/test/run-test.rb +2 -0
- data/test/test-cldr-plurals.rb +180 -0
- data/test/test-communities.rb +290 -0
- data/test/test-dataset.rb +27 -0
- data/test/test-downloader.rb +29 -0
- data/test/test-e-stat-japan.rb +383 -0
- data/test/test-penguins.rb +251 -0
- data/test/test-rdatasets.rb +136 -0
- data/test/test-seaborn-data.rb +97 -0
- data/test/test-sudachi-synonym-dictionary.rb +48 -0
- metadata +55 -15
data/lib/datasets/mnist.rb
CHANGED
@@ -2,8 +2,6 @@ require 'zlib'
|
|
2
2
|
|
3
3
|
require_relative "dataset"
|
4
4
|
|
5
|
-
class SetTypeError < StandardError; end
|
6
|
-
|
7
5
|
module Datasets
|
8
6
|
class MNIST < Dataset
|
9
7
|
BASE_URL = "http://yann.lecun.com/exdb/mnist/"
|
@@ -67,7 +65,9 @@ module Datasets
|
|
67
65
|
n_bytes = n_uint32s * 4
|
68
66
|
mnist_magic_number = 2051
|
69
67
|
magic, n_images, n_rows, n_cols = f.read(n_bytes).unpack("N*")
|
70
|
-
|
68
|
+
if magic != mnist_magic_number
|
69
|
+
raise Error, "This is not #{dataset_name} image file"
|
70
|
+
end
|
71
71
|
n_images.times do |i|
|
72
72
|
data = f.read(n_rows * n_cols)
|
73
73
|
label = labels[i]
|
@@ -101,7 +101,9 @@ module Datasets
|
|
101
101
|
n_bytes = n_uint32s * 2
|
102
102
|
mnist_magic_number = 2049
|
103
103
|
magic, n_labels = f.read(n_bytes).unpack('N2')
|
104
|
-
|
104
|
+
if magic != mnist_magic_number
|
105
|
+
raise Error, "This is not #{dataset_name} label file"
|
106
|
+
end
|
105
107
|
f.read(n_labels).unpack('C*')
|
106
108
|
end
|
107
109
|
end
|
@@ -0,0 +1,146 @@
|
|
1
|
+
require_relative "dataset"
|
2
|
+
|
3
|
+
module Datasets
|
4
|
+
# Raw, per-species Palmer penguins tables as published on the EDI data
# portal. Each species class downloads one CSV and yields its rows as
# Record structs.
module PenguinsRawData
  # One row of a raw species CSV; members follow the CSV column order.
  Record = Struct.new(*%i[
    study_name
    sample_number
    species
    region
    island
    stage
    individual_id
    clutch_completion
    date_egg
    culmen_length_mm
    culmen_depth_mm
    flipper_length_mm
    body_mass_g
    sex
    delta_15_n_permil
    delta_13_c_permil
    comments
  ])

  # Shared implementation for the species datasets. Subclasses only
  # supply the URL (and DOI) constants.
  class SpeciesBase < Dataset
    # Path of the cached CSV file for this species.
    attr_reader :data_path

    def initialize
      super
      # The species key is the downcased, unqualified class name
      # (e.g. Datasets::PenguinsRawData::Adelie -> "adelie").
      species = self.class.name.split("::")[-1].downcase
      @metadata.id = "palmerpenguins-raw-#{species}"
      @metadata.url = self.class::URL
      @metadata.licenses = ["CC0"]
      @data_path = cache_dir_path + "penguins" + "#{species}.csv"
    end

    # Yields one Record per CSV row, skipping rows whose first field is
    # nil. Returns an Enumerator when called without a block.
    def each
      return enum_for(__method__) unless block_given?

      open_data do |csv|
        csv.each do |row|
          yield Record.new(*row.fields) unless row[0].nil?
        end
      end
    end

    # Downloads the CSV if it is not cached yet, then yields the opened
    # CSV (first row as headers, all converters enabled).
    private def open_data
      download unless data_path.exist?
      CSV.open(data_path, headers: :first_row, converters: :all) { |csv| yield csv }
    end

    # Fetches this species' CSV from its metadata URL into data_path.
    private def download
      super(data_path, metadata.url)
    end
  end

  # Adelie penguin data from: https://doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86
  class Adelie < SpeciesBase
    DOI = "doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86".freeze
    URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.219.3&entityid=002f3893385f710df69eeebe893144ff".freeze
  end

  # Chinstrap penguin data from: https://doi.org/10.6073/pasta/409c808f8fc9899d02401bdb04580af7
  class Chinstrap < SpeciesBase
    DOI = "doi.org/10.6073/pasta/409c808f8fc9899d02401bdb04580af7".freeze
    URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.221.2&entityid=fe853aa8f7a59aa84cdd3197619ef462".freeze
  end

  # Gentoo penguin data from: https://doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce
  class Gentoo < SpeciesBase
    DOI = "doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce".freeze
    URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.220.3&entityid=e03b43c924f226486f2f0ab6709d2381".freeze
  end
end
|
76
|
+
|
77
|
+
# This dataset provides the same dataset as https://github.com/allisonhorst/palmerpenguins
#
# It is assembled from the three raw species datasets in PenguinsRawData,
# reduced to the eight columns of Record.
class Penguins < Dataset
  # One cleaned-up penguin observation.
  Record = Struct.new(*%i[
    species
    island
    bill_length_mm
    bill_depth_mm
    flipper_length_mm
    body_mass_g
    sex
    year
  ])

  def initialize
    super
    @metadata.id = "palmerpenguins"
    @metadata.name = "palmerpenguins"
    @metadata.url = "https://allisonhorst.github.io/palmerpenguins/"
    @metadata.licenses = ["CC0"]
    @metadata.description = "A great dataset for data exploration & visualization, as an alternative to iris"
  end

  # Yields a Record for every raw record of Adelie, Chinstrap and Gentoo,
  # in that order. Returns an Enumerator when called without a block.
  def each(&block)
    return enum_for(__method__) unless block_given?

    [
      PenguinsRawData::Adelie,
      PenguinsRawData::Chinstrap,
      PenguinsRawData::Gentoo,
    ].each do |raw_dataset_class|
      raw_dataset_class.new.each { |raw| yield convert_record(raw) }
    end
  end

  # Builds a Record from a raw species record.
  private def convert_record(raw_record)
    fields = cleanse_fields(raw_record)
    Record.new(*fields)
  end

  # Returns the Record fields, in member order, derived from a raw record.
  private def cleanse_fields(raw_record)
    [
      raw_record.species.split(" ").first,  # keep only the first word
      raw_record.island,
      raw_record.culmen_length_mm,          # exposed as bill_length_mm
      raw_record.culmen_depth_mm,           # exposed as bill_depth_mm
      raw_record.flipper_length_mm&.to_i,
      raw_record.body_mass_g&.to_i,
      normalize_sex(raw_record.sex),
      raw_record.date_egg&.year,
    ]
  end

  # Downcases the value and keeps it only when it is "female" or "male";
  # anything else (including nil) becomes nil.
  private def normalize_sex(val)
    lowered = val&.downcase
    %w[female male].include?(lowered) ? lowered : nil
  end
end
|
146
|
+
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
require_relative "dataset"
|
2
|
+
require_relative "tar-gz-readable"
|
3
|
+
|
4
|
+
module Datasets
|
5
|
+
# Index of the datasets offered by the Rdatasets collection, read from
# its datasets.csv.
class RdatasetsList < Dataset
  # One index row; members follow the CSV column order.
  Record = Struct.new(*%i[
    package
    dataset
    title
    rows
    cols
    n_binary
    n_character
    n_factor
    n_logical
    n_numeric
    csv
    doc
  ])

  def initialize
    super
    @metadata.id = "rdatasets"
    @metadata.name = "Rdatasets"
    @metadata.url = "https://vincentarelbundock.github.io/Rdatasets/"
    @metadata.licenses = ["GPL-3"]
    @data_url = "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/datasets.csv"
    @data_path = cache_dir_path + "datasets.csv"
  end

  # Yields a Record for every index row whose "Package" / "Item" columns
  # equal the given values. A falsy argument places no restriction on
  # its column. Returns an Enumerator when called without a block.
  def filter(package: nil, dataset: nil)
    return to_enum(__method__, package: package, dataset: dataset) unless block_given?

    # Keep only the conditions that were actually supplied. With no
    # conditions, all? is trivially true and every row is yielded.
    wanted = {
      "Package" => package,
      "Item" => dataset,
    }.select { |_column, value| value }

    each_row do |row|
      next unless wanted.all? { |column, value| row[column] == value }

      yield Record.new(*row.fields)
    end
  end

  # Iterates over every index row; same as filter without conditions.
  def each(&block)
    filter(&block)
  end

  # Downloads the index if it is not cached yet and passes each CSV row
  # (first row as headers, all converters enabled) to the block.
  private def each_row(&block)
    download(@data_path, @data_url) unless @data_path.exist?
    CSV.open(@data_path, headers: :first_row, converters: :all) { |csv| csv.each(&block) }
  end
end
|
57
|
+
|
58
|
+
# A single dataset from the Rdatasets collection, located through
# RdatasetsList by package name and dataset name.
class Rdatasets < Dataset
  # Raises ArgumentError when the package/dataset pair is not in the index.
  def initialize(package_name, dataset_name)
    info = RdatasetsList.new.filter(package: package_name,
                                    dataset: dataset_name).first
    raise ArgumentError, "Unable to locate dataset #{package_name}/#{dataset_name}" unless info

    super()
    @metadata.id = "rdatasets-#{package_name}-#{dataset_name}"
    @metadata.name = "Rdatasets: #{package_name}: #{dataset_name}"
    @metadata.url = info.csv
    @metadata.licenses = ["GPL-3"]
    @metadata.description = info.title

    # Follow the original directory structure in the cache directory
    @data_path = cache_dir_path + (dataset_name + ".csv")

    @package_name = package_name
    @dataset_name = dataset_name
  end

  # Yields each CSV row as a Hash with Symbol keys, without the column
  # whose header is the empty string. Returns an Enumerator when called
  # without a block.
  def each(&block)
    return enum_for(__method__) unless block_given?

    download(@data_path, @metadata.url) unless @data_path.exist?
    CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
      csv.each do |row|
        named_columns = row.to_h.reject { |header, _value| header == "" }
        yield named_columns.transform_keys(&:to_sym)
      end
    end
  end
end
|
95
|
+
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
module Datasets
|
2
|
+
# A named CSV dataset from the mwaskom/seaborn-data repository.
class SeabornData < Dataset
  URL_FORMAT = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/%{name}.csv".freeze

  def initialize(name)
    super()
    @metadata.id = "seaborn-data-#{name}"
    @metadata.name = "SeabornData: #{name}"
    @metadata.url = format(URL_FORMAT, name: name)

    @data_path = cache_dir_path + (name + ".csv")
    @name = name
  end

  # Yields each CSV row as a Hash with Symbol keys, after any
  # dataset-specific preprocessing. Returns an Enumerator when called
  # without a block.
  def each(&block)
    return enum_for(__method__) unless block_given?

    download(@data_path, @metadata.url) unless @data_path.exist?
    CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
      csv.each { |row| yield prepare_record(row) }
    end
  end

  # Converts a CSV row to a Symbol-keyed Hash and runs the private
  # preprocess_<name>_record hook when one exists for this dataset.
  private def prepare_record(csv_row)
    record = csv_row.to_h.transform_keys(&:to_sym)

    # Perform the same preprocessing as seaborn's load_dataset function
    hook = :"preprocess_#{@name}_record"
    __send__(hook, record) if respond_to?(hook, true)

    record
  end

  # The same preprocessing as seaborn.load_dataset
  # Keeps only the first three characters of :month when it is set.
  private def preprocess_flights_record(record)
    month = record[:month]
    record[:month] = month[0, 3] if month
  end

  # The same preprocessing as seaborn.load_dataset
  # Capitalizes :sex when it is set.
  private def preprocess_penguins_record(record)
    sex = record[:sex]
    record[:sex] = sex.capitalize if sex
  end
end
|
49
|
+
end
|
@@ -0,0 +1,171 @@
|
|
1
|
+
require "csv"
|
2
|
+
|
3
|
+
require_relative "dataset"
|
4
|
+
|
5
|
+
module Datasets
|
6
|
+
# Reader for the Sudachi synonym dictionary (synonyms.txt from the
# WorksApplications/SudachiDict repository). Each data row is yielded as
# a Synonym whose coded columns are translated into Symbols.
class SudachiSynonymDictionary < Dataset
  # One dictionary entry. Members follow the first nine columns of
  # synonyms.txt; `noun?` is an alias for `is_noun`.
  class Synonym < Struct.new(:group_id,
                             :is_noun,
                             :expansion_type,
                             :lexeme_id,
                             :form_type,
                             :acronym_type,
                             :variant_type,
                             :categories,
                             :notation)
    alias_method :noun?, :is_noun
  end

  def initialize
    super()
    @metadata.id = "sudachi-synonym-dictionary"
    @metadata.name = "Sudachi synonym dictionary"
    @metadata.url = "https://github.com/WorksApplications/SudachiDict/blob/develop/docs/synonyms.md"
    @metadata.licenses = [
      "Apache-2.0",
    ]
    # The description is wrapped in a lambda so the download only happens
    # when the lambda is called.
    # NOTE(review): presumably the metadata object calls it lazily — confirm.
    @metadata.description = lambda do
      download_description
    end
  end

  # Yields one Synonym per CSV row. Returns an Enumerator when called
  # without a block.
  #
  # Raises Error when a coded column holds an unknown value.
  def each
    return to_enum(__method__) unless block_given?

    # Tracks the current group and a per-group counter, used to number
    # entries that carry no lexeme ID of their own.
    lexeme_id_context = {}
    open_data do |csv|
      csv.each do |row|
        group_id = row[0]
        if group_id != lexeme_id_context[:group_id]
          # A new group starts: restart the implicit lexeme ID counter.
          lexeme_id_context[:group_id] = group_id
          lexeme_id_context[:counter] = 0
        end
        is_noun = (row[1] == "1")
        expansion_type = normalize_expansion_type(row[2])
        lexeme_id = normalize_lexeme_id(row[3], lexeme_id_context)
        form_type = normalize_form_type(row[4])
        acronym_type = normalize_acronym_type(row[5])
        variant_type = normalize_variant_type(row[6])
        categories = normalize_categories(row[7])
        notation = row[8]
        synonym = Synonym.new(group_id,
                              is_noun,
                              expansion_type,
                              lexeme_id,
                              form_type,
                              acronym_type,
                              variant_type,
                              categories,
                              notation)
        yield(synonym)
      end
    end
  end

  private

  # Downloads synonyms.txt into the cache directory if needed and yields
  # it opened as UTF-8 CSV with blank lines skipped.
  def open_data
    data_path = cache_dir_path + "synonyms.txt"
    unless data_path.exist?
      data_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/src/main/text/synonyms.txt"
      download(data_path, data_url)
    end
    CSV.open(data_path,
             encoding: "UTF-8",
             skip_blanks: true) do |csv|
      yield(csv)
    end
  end

  # Downloads synonyms.md into the cache directory if needed and returns
  # its content.
  def download_description
    description_path = cache_dir_path + "synonyms.md"
    unless description_path.exist?
      description_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/docs/synonyms.md"
      download(description_path, description_url)
    end
    description_path.read
  end

  # Maps the expansion column to :always ("0" or ""), :expanded ("1") or
  # :never ("2"). Raises Error for any other value.
  def normalize_expansion_type(type)
    case type
    when "0", ""
      :always
    when "1"
      :expanded
    when "2"
      :never
    else
      raise Error, "unknown expansion type: #{type.inspect}"
    end
  end

  # Returns the lexeme ID of a row as an Integer.
  #
  # id      - the raw lexeme ID column.
  # context - the Hash maintained by #each; its :counter entry is
  #           incremented and returned when the row has no lexeme ID.
  #
  # The counter is read through the `context` parameter: the caller's
  # local variable of the same purpose is not visible inside this method.
  # nil is treated like "" because CSV returns nil for an unquoted empty
  # field.
  def normalize_lexeme_id(id, context)
    case id
    when "", nil
      context[:counter] += 1
      context[:counter]
    else
      # Use only the first lexeme ID.
      # Example:
      #   000116,1,0,1/2,0,2,0,(IT/娯楽),ネットゲー,,
      #   000116,1,0,1/2,0,2,0,(IT/娯楽),ネトゲ,,
      Integer(id.split("/").first, 10)
    end
  end

  # Maps the form column to :typical ("0" or ""), :translation ("1"),
  # :alias ("2"), :old_name ("3") or :misnomer ("4"). Raises Error for
  # any other value.
  def normalize_form_type(type)
    case type
    when "0", ""
      :typical
    when "1"
      :translation
    when "2"
      :alias
    when "3"
      :old_name
    when "4"
      :misnomer
    else
      raise Error, "unknown form type: #{type.inspect}"
    end
  end

  # Maps the acronym column to :typical ("0" or ""), :alphabet ("1") or
  # :others ("2"). Raises Error for any other value.
  def normalize_acronym_type(type)
    case type
    when "0", ""
      :typical
    when "1"
      :alphabet
    when "2"
      :others
    else
      raise Error, "unknown acronym type: #{type.inspect}"
    end
  end

  # Maps the variant column to :typical ("0" or ""), :alphabet ("1"),
  # :general ("2") or :misspelled ("3"). Raises Error for any other value.
  def normalize_variant_type(type)
    case type
    when "0", ""
      :typical
    when "1"
      :alphabet
    when "2"
      :general
    when "3"
      :misspelled
    else
      raise Error, "unknown variant type: #{type.inspect}"
    end
  end

  # Parses the categories column: "" gives nil, "(a/b)" gives ["a", "b"].
  # Raises Error for any other shape.
  def normalize_categories(categories)
    case categories
    when ""
      nil
    when /\A\((.*)\)\z/
      $1.split("/")
    else
      raise Error, "invalid categories: #{categories.inspect}"
    end
  end
end
|
171
|
+
end
|