red-datasets 0.0.8 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +6 -0
- data/doc/text/news.md +93 -0
- data/lib/datasets.rb +9 -0
- data/lib/datasets/adult.rb +4 -3
- data/lib/datasets/cifar.rb +4 -12
- data/lib/datasets/cldr-plurals.rb +385 -0
- data/lib/datasets/communities.rb +198 -0
- data/lib/datasets/dataset.rb +20 -1
- data/lib/datasets/downloader.rb +54 -26
- data/lib/datasets/e-stat-japan.rb +320 -0
- data/lib/datasets/error.rb +4 -0
- data/lib/datasets/hepatitis.rb +207 -0
- data/lib/datasets/libsvm-dataset-list.rb +194 -54
- data/lib/datasets/libsvm.rb +1 -9
- data/lib/datasets/mnist.rb +6 -4
- data/lib/datasets/mushroom.rb +256 -0
- data/lib/datasets/penguins.rb +146 -0
- data/lib/datasets/rdatasets.rb +95 -0
- data/lib/datasets/seaborn-data.rb +49 -0
- data/lib/datasets/sudachi-synonym-dictionary.rb +169 -0
- data/lib/datasets/table.rb +83 -3
- data/lib/datasets/tar-gz-readable.rb +14 -0
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia.rb +2 -10
- data/red-datasets.gemspec +1 -0
- data/test/run-test.rb +2 -0
- data/test/test-cldr-plurals.rb +180 -0
- data/test/test-communities.rb +290 -0
- data/test/test-dataset.rb +27 -0
- data/test/test-downloader.rb +29 -0
- data/test/test-e-stat-japan.rb +383 -0
- data/test/test-hepatitis.rb +74 -0
- data/test/test-mushroom.rb +80 -0
- data/test/test-penguins.rb +251 -0
- data/test/test-rdatasets.rb +136 -0
- data/test/test-seaborn-data.rb +97 -0
- data/test/test-sudachi-synonym-dictionary.rb +48 -0
- data/test/test-table.rb +123 -18
- metadata +61 -15
@@ -0,0 +1,95 @@
|
|
1
|
+
require_relative "dataset"
require_relative "tar-gz-readable"

module Datasets
  # Catalog of every dataset published in the Rdatasets collection
  # (https://vincentarelbundock.github.io/Rdatasets/), backed by its
  # datasets.csv index file.
  class RdatasetsList < Dataset
    Record = Struct.new(:package,
                        :dataset,
                        :title,
                        :rows,
                        :cols,
                        :n_binary,
                        :n_character,
                        :n_factor,
                        :n_logical,
                        :n_numeric,
                        :csv,
                        :doc)

    def initialize
      super
      @metadata.id = "rdatasets"
      @metadata.name = "Rdatasets"
      @metadata.url = "https://vincentarelbundock.github.io/Rdatasets/"
      @metadata.licenses = ["GPL-3"]
      @data_url = "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/datasets.csv"
      @data_path = cache_dir_path + "datasets.csv"
    end

    # Yields a Record for each catalog entry, optionally restricted to
    # a package and/or dataset name. Returns an Enumerator when no
    # block is given.
    def filter(package: nil, dataset: nil)
      return to_enum(__method__, package: package, dataset: dataset) unless block_given?

      # Map the keyword filters onto their CSV columns, keeping only
      # the ones that were actually supplied. An empty condition set
      # matches every row.
      conditions = {
        "Package" => package,
        "Item" => dataset,
      }.select {|_column, value| value }
      each_row do |row|
        next unless conditions.all? {|column, value| row[column] == value }
        yield Record.new(*row.fields)
      end
    end

    def each(&block)
      filter(&block)
    end

    # Streams the raw CSV rows of the catalog, downloading the index
    # file on first use.
    private def each_row(&block)
      unless @data_path.exist?
        download(@data_path, @data_url)
      end
      CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
        csv.each(&block)
      end
    end
  end

  # A single dataset from the Rdatasets collection, identified by its
  # package name and dataset name. Raises ArgumentError when the pair
  # is not present in the catalog.
  class Rdatasets < Dataset
    def initialize(package_name, dataset_name)
      list = RdatasetsList.new

      info = list.filter(package: package_name, dataset: dataset_name).first
      unless info
        raise ArgumentError, "Unable to locate dataset #{package_name}/#{dataset_name}"
      end

      super()
      @metadata.id = "rdatasets-#{package_name}-#{dataset_name}"
      @metadata.name = "Rdatasets: #{package_name}: #{dataset_name}"
      @metadata.url = info.csv
      @metadata.licenses = ["GPL-3"]
      @metadata.description = info.title

      # Follow the original directory structure in the cache directory
      @data_path = cache_dir_path + (dataset_name + ".csv")

      @package_name = package_name
      @dataset_name = dataset_name
    end

    # Yields each row of the dataset as a Hash keyed by Symbol column
    # names. Returns an Enumerator when no block is given.
    def each
      return to_enum(__method__) unless block_given?

      download(@data_path, @metadata.url) unless @data_path.exist?
      CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
        csv.each do |row|
          attributes = row.to_h
          # Drop the column whose header is empty ("").
          attributes.delete("")
          yield attributes.map {|column, value| [column.to_sym, value] }.to_h
        end
      end
    end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
module Datasets
  # Datasets from the seaborn-data repository
  # (https://github.com/mwaskom/seaborn-data), applying the same
  # per-dataset preprocessing as seaborn's load_dataset function.
  class SeabornData < Dataset
    URL_FORMAT = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/%{name}.csv".freeze

    def initialize(name)
      super()
      @metadata.id = "seaborn-data-#{name}"
      @metadata.name = "SeabornData: #{name}"
      @metadata.url = URL_FORMAT % {name: name}

      @data_path = cache_dir_path + (name + ".csv")
      @name = name
    end

    # Yields each row as a Hash keyed by Symbol column names.
    # Returns an Enumerator when no block is given.
    def each
      return to_enum(__method__) unless block_given?

      download(@data_path, @metadata.url) unless @data_path.exist?
      CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
        csv.each do |row|
          yield(prepare_record(row))
        end
      end
    end

    private
    # Converts a CSV::Row into a Symbol-keyed Hash and applies the
    # dataset-specific preprocessor, if one is defined below.
    def prepare_record(csv_row)
      record = {}
      csv_row.each do |column, value|
        record[column.to_sym] = value
      end

      # Perform the same preprocessing as seaborn's load_dataset function
      preprocessor = :"preprocess_#{@name}_record"
      if respond_to?(preprocessor, true)
        __send__(preprocessor, record)
      end

      record
    end

    # The same preprocessing as seaborn.load_dataset
    def preprocess_flights_record(record)
      month = record[:month]
      record[:month] = month[0, 3] if month
    end

    # The same preprocessing as seaborn.load_dataset
    def preprocess_penguins_record(record)
      sex = record[:sex]
      record[:sex] = sex.capitalize if sex
    end
  end
end
|
@@ -0,0 +1,169 @@
|
|
1
|
+
require "csv"

require_relative "dataset"

module Datasets
  # The synonym dictionary distributed with SudachiDict
  # (https://github.com/WorksApplications/SudachiDict).
  class SudachiSynonymDictionary < Dataset
    class Synonym < Struct.new(:group_id,
                               :is_noun,
                               :expansion_type,
                               :lexeme_id,
                               :form_type,
                               :acronym_type,
                               :variant_type,
                               :categories,
                               :notation)
      alias_method :noun?, :is_noun
    end

    def initialize
      super()
      @metadata.id = "sudachi-synonym-dictionary"
      @metadata.name = "Sudachi synonym dictionary"
      @metadata.url = "https://github.com/WorksApplications/SudachiDict/blob/develop/docs/synonyms.md"
      @metadata.licenses = [
        "Apache-2.0",
      ]
      # Resolved lazily: fetching the description downloads synonyms.md.
      @metadata.description = lambda do
        download_description
      end
    end

    # Yields a Synonym for every entry in synonyms.txt.
    # Returns an Enumerator when no block is given.
    def each
      return to_enum(__method__) unless block_given?

      # Tracks the current group ID and a counter so entries without an
      # explicit lexeme ID are numbered sequentially within their group.
      lexeme_id_context = {}
      open_data do |csv|
        csv.each do |row|
          group_id = row[0]
          if group_id != lexeme_id_context[:group_id]
            # New group: restart the implicit lexeme ID counter.
            lexeme_id_context[:group_id] = group_id
            lexeme_id_context[:counter] = 0
          end
          is_noun = (row[1] == "1")
          expansion_type = normalize_expansion_type(row[2])
          lexeme_id = normalize_lexeme_id(row[3], lexeme_id_context)
          form_type = normalize_form_type(row[4])
          acronym_type = normalize_acronym_type(row[5])
          variant_type = normalize_variant_type(row[6])
          categories = normalize_categories(row[7])
          notation = row[8]
          synonym = Synonym.new(group_id,
                                is_noun,
                                expansion_type,
                                lexeme_id,
                                form_type,
                                acronym_type,
                                variant_type,
                                categories,
                                notation)
          yield(synonym)
        end
      end
    end

    private
    # Opens synonyms.txt as a CSV, downloading it on first use.
    def open_data
      data_path = cache_dir_path + "synonyms.txt"
      unless data_path.exist?
        data_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/src/main/text/synonyms.txt"
        download(data_path, data_url)
      end
      CSV.open(data_path, skip_blanks: true) do |csv|
        yield(csv)
      end
    end

    # Returns the contents of synonyms.md, downloading it on first use.
    def download_description
      description_path = cache_dir_path + "synonyms.md"
      unless description_path.exist?
        description_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/docs/synonyms.md"
        download(description_path, description_url)
      end
      description_path.read
    end

    # Maps the raw expansion-type field to a Symbol; an empty field
    # means the default ("0").
    def normalize_expansion_type(type)
      case type
      when "0", ""
        :always
      when "1"
        :expanded
      when "2"
        :never
      else
        raise Error, "unknown expansion type: #{type.inspect}"
      end
    end

    # Normalizes a raw lexeme ID field. An empty field means "the next
    # implicit ID within the current group"; context carries the
    # per-group counter maintained by #each.
    #
    # BUG FIX: this method previously referenced the undefined local
    # variable `lexeme_id_context` instead of its `context` parameter,
    # raising NameError for every entry with an empty lexeme ID field.
    def normalize_lexeme_id(id, context)
      case id
      when ""
        context[:counter] += 1
        context[:counter]
      else
        # Use only the first lexeme ID.
        # Example:
        #   000116,1,0,1/2,0,2,0,(IT/娯楽),ネットゲー,,
        #   000116,1,0,1/2,0,2,0,(IT/娯楽),ネトゲ,,
        Integer(id.split("/").first, 10)
      end
    end

    # Maps the raw form-type field to a Symbol; an empty field means
    # the default ("0").
    def normalize_form_type(type)
      case type
      when "0", ""
        :typical
      when "1"
        :translation
      when "2"
        :alias
      when "3"
        :old_name
      when "4"
        :misnomer
      else
        raise Error, "unknown form type: #{type.inspect}"
      end
    end

    # Maps the raw acronym-type field to a Symbol; an empty field means
    # the default ("0").
    def normalize_acronym_type(type)
      case type
      when "0", ""
        :typical
      when "1"
        :alphabet
      when "2"
        :others
      else
        raise Error, "unknown acronym type: #{type.inspect}"
      end
    end

    # Maps the raw variant-type field to a Symbol; an empty field means
    # the default ("0").
    def normalize_variant_type(type)
      case type
      when "0", ""
        :typical
      when "1"
        :alphabet
      when "2"
        :general
      when "3"
        :misspelled
      else
        raise Error, "unknown variant type: #{type.inspect}"
      end
    end

    # Parses a "(a/b/c)"-style categories field into an Array of
    # Strings; an empty field yields nil.
    def normalize_categories(categories)
      case categories
      when ""
        nil
      when /\A\((.*)\)\z/
        $1.split("/")
      else
        raise Error, "invalid categories: #{categories.inspect}"
      end
    end
  end
end
|
data/lib/datasets/table.rb
CHANGED
@@ -2,19 +2,99 @@ require "datasets/dictionary"
|
|
2
2
|
|
3
3
|
module Datasets
|
4
4
|
class Table
|
5
|
+
class Record
|
6
|
+
include Enumerable
|
7
|
+
|
8
|
+
def initialize(table, index)
|
9
|
+
@table = table
|
10
|
+
@index = index
|
11
|
+
end
|
12
|
+
|
13
|
+
def [](column_name_or_column_index)
|
14
|
+
@table[column_name_or_column_index][@index]
|
15
|
+
end
|
16
|
+
|
17
|
+
def each
|
18
|
+
return to_enum(__method__) unless block_given?
|
19
|
+
@table.each_column.each do |column_name, column_values|
|
20
|
+
yield(column_name, column_values[@index])
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
def values
|
25
|
+
@table.each_column.collect do |_column_name, column_values|
|
26
|
+
column_values[@index]
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
def to_h
|
31
|
+
hash = {}
|
32
|
+
each do |column_name, column_value|
|
33
|
+
hash[column_name] = column_value
|
34
|
+
end
|
35
|
+
hash
|
36
|
+
end
|
37
|
+
|
38
|
+
def inspect
|
39
|
+
"#<#{self.class.name} #{@table.dataset.metadata.name}[#{@index}] #{to_h.inspect}>"
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
5
43
|
include Enumerable
|
6
44
|
|
45
|
+
attr_reader :dataset
|
7
46
|
def initialize(dataset)
|
8
47
|
@dataset = dataset
|
9
48
|
@dictionaries = {}
|
10
49
|
end
|
11
50
|
|
12
|
-
def
|
51
|
+
def n_columns
|
52
|
+
columner_data.size
|
53
|
+
end
|
54
|
+
alias_method :size, :n_columns
|
55
|
+
alias_method :length, :n_columns
|
56
|
+
|
57
|
+
def n_rows
|
58
|
+
first_column = columner_data.first
|
59
|
+
return 0 if first_column.nil?
|
60
|
+
first_column[1].size
|
61
|
+
end
|
62
|
+
|
63
|
+
def column_names
|
64
|
+
columner_data.keys
|
65
|
+
end
|
66
|
+
|
67
|
+
def each_column(&block)
|
13
68
|
columner_data.each(&block)
|
14
69
|
end
|
70
|
+
alias_method :each, :each_column
|
15
71
|
|
16
|
-
def
|
17
|
-
|
72
|
+
def each_record
|
73
|
+
return to_enum(__method__) unless block_given?
|
74
|
+
n_rows.times do |i|
|
75
|
+
yield(Record.new(self, i))
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
def find_record(row)
|
80
|
+
row += n_rows if row < 0
|
81
|
+
return nil if row < 0
|
82
|
+
return nil if row >= n_rows
|
83
|
+
Record.new(self, row)
|
84
|
+
end
|
85
|
+
|
86
|
+
def [](name_or_index)
|
87
|
+
case name_or_index
|
88
|
+
when Integer
|
89
|
+
index = name_or_index
|
90
|
+
columner_data.each_with_index do |(_name, values), i|
|
91
|
+
return values if i == index
|
92
|
+
end
|
93
|
+
nil
|
94
|
+
else
|
95
|
+
name = name_or_index
|
96
|
+
columner_data[normalize_name(name)]
|
97
|
+
end
|
18
98
|
end
|
19
99
|
|
20
100
|
def dictionary_encode(name)
|
data/lib/datasets/version.rb
CHANGED
data/lib/datasets/wikipedia.rb
CHANGED
@@ -52,7 +52,7 @@ module Datasets
|
|
52
52
|
end
|
53
53
|
|
54
54
|
private
|
55
|
-
def open_data
|
55
|
+
def open_data(&block)
|
56
56
|
base_name = "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
|
57
57
|
data_path = cache_dir_path + base_name
|
58
58
|
unless data_path.exist?
|
@@ -60,15 +60,7 @@ module Datasets
|
|
60
60
|
download(data_path, data_url)
|
61
61
|
end
|
62
62
|
|
63
|
-
|
64
|
-
pid = spawn("bzcat", data_path.to_s, {:out => output})
|
65
|
-
begin
|
66
|
-
output.close
|
67
|
-
yield(input)
|
68
|
-
ensure
|
69
|
-
input.close
|
70
|
-
Process.waitpid(pid)
|
71
|
-
end
|
63
|
+
extract_bz2(data_path, &block)
|
72
64
|
end
|
73
65
|
|
74
66
|
def type_in_path
|