red-datasets 0.1.2 → 0.1.3
- checksums.yaml +4 -4
- data/doc/text/news.md +8 -0
- data/lib/datasets.rb +2 -0
- data/lib/datasets/cifar.rb +1 -1
- data/lib/datasets/cldr-plurals.rb +4 -4
- data/lib/datasets/mnist.rb +6 -2
- data/lib/datasets/rdatasets.rb +1 -1
- data/lib/datasets/seaborn-data.rb +49 -0
- data/lib/datasets/sudachi-synonym-dictionary.rb +169 -0
- data/lib/datasets/{tar_gz_readable.rb → tar-gz-readable.rb} +0 -0
- data/lib/datasets/version.rb +1 -1
- data/test/test-cldr-plurals.rb +1 -1
- data/test/test-rdatasets.rb +1 -1
- data/test/test-seaborn-data.rb +97 -0
- data/test/test-sudachi-synonym-dictionary.rb +48 -0
- metadata +9 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 07c55b47d31b30ceaf4cdd3ea22da5c737d81884a494c6a11abc6fda6fbea22b
+  data.tar.gz: a28d34b5d28cb57349a81112ffc2db8fe9f94939beb21477af4d9d0c9d5b59ab
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4871ac4ec167cb78b3fce8f9c5de9f6cff6a4089b8e4fd87fe7bb3265865cfcbd86935e8f2fa0bc5e40fde8a471e1655390fcdf8dcc0a5197342143e0cb855e5
+  data.tar.gz: 66d31943cb857632518a90166972bfa9ebe4b8ec355eca8291da40183c260e3c175d5866220efc0e4174f780c8449b45004db425a8fc7453268236f9b7dcbc1d
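The new checksums above can be checked against a downloaded copy of the gem. A minimal sketch, not part of the release: it assumes the archive was fetched with `gem fetch red-datasets -v 0.1.3` and unpacked (a `.gem` file is a tar archive, e.g. `tar xf red-datasets-0.1.3.gem`), so that `metadata.gz` and `data.tar.gz` sit in the current directory.

```ruby
# Recompute the SHA256 digests of the extracted archive members and compare
# them with the values recorded in checksums.yaml above.
require "digest"

{
  "metadata.gz" => "07c55b47d31b30ceaf4cdd3ea22da5c737d81884a494c6a11abc6fda6fbea22b",
  "data.tar.gz" => "a28d34b5d28cb57349a81112ffc2db8fe9f94939beb21477af4d9d0c9d5b59ab",
}.each do |name, expected|
  actual = Digest::SHA256.file(name).hexdigest
  puts "#{name}: #{actual == expected ? 'OK' : 'MISMATCH'}"
end
```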
data/doc/text/news.md
CHANGED
data/lib/datasets.rb
CHANGED
@@ -16,5 +16,7 @@ require_relative "datasets/penguins"
 require_relative "datasets/penn-treebank"
 require_relative "datasets/postal-code-japan"
 require_relative "datasets/rdatasets"
+require_relative "datasets/seaborn-data"
+require_relative "datasets/sudachi-synonym-dictionary"
 require_relative "datasets/wikipedia"
 require_relative "datasets/wine"
data/lib/datasets/cifar.rb
CHANGED
data/lib/datasets/cldr-plurals.rb
CHANGED
@@ -183,7 +183,7 @@ module Datasets
       end
       value = parse_value
       if value.nil?
-        raise Error
+        raise Error, "no value for #{operator}: #{@scanner.inspect}"
       end
       [operator, expr, value]
     end
@@ -267,7 +267,7 @@ module Datasets
       if operator
         value = parse_value
         if value.nil?
-          raise Error
+          raise Error, "no value for #{operator}: #{@scanner.inspect}"
         end
         [operator, operand, value]
       else
@@ -336,7 +336,7 @@ module Datasets
         skip_whitespaces
         # U+2026 HORIZONTAL ELLIPSIS
         unless @scanner.scan(/\u2026|\.\.\./)
-          raise "no ellipsis: #{@scanner.inspect}"
+          raise Error, "no ellipsis: #{@scanner.inspect}"
         end
         samples << :elipsis
       end
@@ -362,7 +362,7 @@ module Datasets
       skip_whitespaces
       decimal = @scanner.scan(/[0-9]+/)
       if decimal.nil?
-        raise "no decimal: #{@scanner.inspect}"
+        raise Error, "no decimal: #{@scanner.inspect}"
       end
       value += Float("0.#{decimal}")
       skip_whitespaces
data/lib/datasets/mnist.rb
CHANGED
@@ -65,7 +65,9 @@ module Datasets
       n_bytes = n_uint32s * 4
       mnist_magic_number = 2051
       magic, n_images, n_rows, n_cols = f.read(n_bytes).unpack("N*")
-
+      if magic != mnist_magic_number
+        raise Error, "This is not #{dataset_name} image file"
+      end
       n_images.times do |i|
         data = f.read(n_rows * n_cols)
         label = labels[i]
@@ -99,7 +101,9 @@ module Datasets
       n_bytes = n_uint32s * 2
       mnist_magic_number = 2049
       magic, n_labels = f.read(n_bytes).unpack('N2')
-
+      if magic != mnist_magic_number
+        raise Error, "This is not #{dataset_name} label file"
+      end
       f.read(n_labels).unpack('C*')
     end
   end
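The two guards added above reject files whose leading big-endian magic number does not match the expected MNIST values (2051 for image files, 2049 for label files). A standalone sketch of the same check, with an assumed file name and a plain RuntimeError in place of the gem's Error class:

```ruby
# Illustrative only: read an IDX image header the same way as the diff above
# (four big-endian uint32s) and validate the magic number before trusting it.
require "zlib"

def read_mnist_image_header(path)
  Zlib::GzipReader.open(path) do |f|
    magic, n_images, n_rows, n_cols = f.read(4 * 4).unpack("N*")
    raise "This is not an MNIST image file: magic=#{magic}" unless magic == 2051
    {n_images: n_images, n_rows: n_rows, n_cols: n_cols}
  end
end

# p read_mnist_image_header("train-images-idx3-ubyte.gz")  # path is an assumption
```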
data/lib/datasets/rdatasets.rb
CHANGED
data/lib/datasets/seaborn-data.rb
ADDED
@@ -0,0 +1,49 @@
+module Datasets
+  class SeabornData < Dataset
+    URL_FORMAT = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/%{name}.csv".freeze
+
+    def initialize(name)
+      super()
+      @metadata.id = "seaborn-data-#{name}"
+      @metadata.name = "SeabornData: #{name}"
+      @metadata.url = URL_FORMAT % {name: name}
+
+      @data_path = cache_dir_path + (name + ".csv")
+      @name = name
+    end
+
+    def each(&block)
+      return to_enum(__method__) unless block_given?
+
+      download(@data_path, @metadata.url) unless @data_path.exist?
+      CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
+        csv.each do |row|
+          record = prepare_record(row)
+          yield record
+        end
+      end
+    end
+
+    private
+    def prepare_record(csv_row)
+      record = csv_row.to_h
+      record.transform_keys!(&:to_sym)
+
+      # Perform the same preprocessing as seaborn's load_dataset function
+      preprocessor = :"preprocess_#{@name}_record"
+      __send__(preprocessor, record) if respond_to?(preprocessor, true)
+
+      record
+    end
+
+    # The same preprocessing as seaborn.load_dataset
+    def preprocess_flights_record(record)
+      record[:month] &&= record[:month][0,3]
+    end
+
+    # The same preprocessing as seaborn.load_dataset
+    def preprocess_penguins_record(record)
+      record[:sex] &&= record[:sex].capitalize
+    end
+  end
+end
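A short usage sketch for the new class (not part of the diff; it assumes red-datasets 0.1.3 is installed and that the CSV may be downloaded into the cache directory on first use):

```ruby
require "datasets"

# Datasets::SeabornData downloads <name>.csv from the seaborn-data repository
# and yields each CSV row as a Hash with symbol keys.
penguins = Datasets::SeabornData.new("penguins")
penguins.each.first(3).each do |record|
  p record  # e.g. {species: "Adelie", island: "Torgersen", ..., sex: "Male"}
end
```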
data/lib/datasets/sudachi-synonym-dictionary.rb
ADDED
@@ -0,0 +1,169 @@
+require "csv"
+
+require_relative "dataset"
+
+module Datasets
+  class SudachiSynonymDictionary < Dataset
+    class Synonym < Struct.new(:group_id,
+                               :is_noun,
+                               :expansion_type,
+                               :lexeme_id,
+                               :form_type,
+                               :acronym_type,
+                               :variant_type,
+                               :categories,
+                               :notation)
+      alias_method :noun?, :is_noun
+    end
+
+    def initialize
+      super()
+      @metadata.id = "sudachi-synonym-dictionary"
+      @metadata.name = "Sudachi synonym dictionary"
+      @metadata.url = "https://github.com/WorksApplications/SudachiDict/blob/develop/docs/synonyms.md"
+      @metadata.licenses = [
+        "Apache-2.0",
+      ]
+      @metadata.description = lambda do
+        download_description
+      end
+    end
+
+    def each
+      return to_enum(__method__) unless block_given?
+
+      lexeme_id_context = {}
+      open_data do |csv|
+        csv.each do |row|
+          group_id = row[0]
+          if group_id != lexeme_id_context[:group_id]
+            lexeme_id_context[:group_id] = group_id
+            lexeme_id_context[:counter] = 0
+          end
+          is_noun = (row[1] == "1")
+          expansion_type = normalize_expansion_type(row[2])
+          lexeme_id = normalize_lexeme_id(row[3], lexeme_id_context)
+          form_type = normalize_form_type(row[4])
+          acronym_type = normalize_acronym_type(row[5])
+          variant_type = normalize_variant_type(row[6])
+          categories = normalize_categories(row[7])
+          notation = row[8]
+          synonym = Synonym.new(group_id,
+                                is_noun,
+                                expansion_type,
+                                lexeme_id,
+                                form_type,
+                                acronym_type,
+                                variant_type,
+                                categories,
+                                notation)
+          yield(synonym)
+        end
+      end
+    end
+
+    private
+    def open_data
+      data_path = cache_dir_path + "synonyms.txt"
+      unless data_path.exist?
+        data_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/src/main/text/synonyms.txt"
+        download(data_path, data_url)
+      end
+      CSV.open(data_path, skip_blanks: true) do |csv|
+        yield(csv)
+      end
+    end
+
+    def download_description
+      description_path = cache_dir_path + "synonyms.md"
+      unless description_path.exist?
+        description_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/docs/synonyms.md"
+        download(description_path, description_url)
+      end
+      description_path.read
+    end
+
+    def normalize_expansion_type(type)
+      case type
+      when "0", ""
+        :always
+      when "1"
+        :expanded
+      when "2"
+        :never
+      else
+        raise Error, "unknown expansion type: #{type.inspect}"
+      end
+    end
+
+    def normalize_lexeme_id(id, context)
+      case id
+      when ""
+        lexeme_id_context[:counter] += 1
+        lexeme_id_context[:counter]
+      else
+        # Use only the first lexeme ID.
+        # Example:
+        #   000116,1,0,1/2,0,2,0,(IT/娯楽),ネットゲー,,
+        #   000116,1,0,1/2,0,2,0,(IT/娯楽),ネトゲ,,
+        Integer(id.split("/").first, 10)
+      end
+    end
+
+    def normalize_form_type(type)
+      case type
+      when "0", ""
+        :typical
+      when "1"
+        :translation
+      when "2"
+        :alias
+      when "3"
+        :old_name
+      when "4"
+        :misnomer
+      else
+        raise Error, "unknown form type: #{type.inspect}"
+      end
+    end
+
+    def normalize_acronym_type(type)
+      case type
+      when "0", ""
+        :typical
+      when "1"
+        :alphabet
+      when "2"
+        :others
+      else
+        raise Error, "unknown acronym type: #{type.inspect}"
+      end
+    end
+
+    def normalize_variant_type(type)
+      case type
+      when "0", ""
+        :typical
+      when "1"
+        :alphabet
+      when "2"
+        :general
+      when "3"
+        :misspelled
+      else
+        raise Error, "unknown variant type: #{type.inspect}"
+      end
+    end
+
+    def normalize_categories(categories)
+      case categories
+      when ""
+        nil
+      when /\A\((.*)\)\z/
+        $1.split("/")
+      else
+        raise Error, "invalid categories: #{categories.inspect}"
+      end
+    end
+  end
+end
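A short usage sketch for the new class (not part of the diff; same assumptions as above — red-datasets 0.1.3 installed, and the first call is allowed to download synonyms.txt into the cache directory):

```ruby
require "datasets"

# Each record is a SudachiSynonymDictionary::Synonym struct with the members
# defined above (group_id, notation, categories, ...).
dictionary = Datasets::SudachiSynonymDictionary.new
dictionary.each.first(5).each do |synonym|
  categories = (synonym.categories || []).join("/")
  puts "#{synonym.group_id}\t#{synonym.notation}\t#{categories}"
end
```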
data/lib/datasets/{tar_gz_readable.rb → tar-gz-readable.rb}
RENAMED
File without changes
data/lib/datasets/version.rb
CHANGED
data/test/test-cldr-plurals.rb
CHANGED
data/test/test-rdatasets.rb
CHANGED
data/test/test-seaborn-data.rb
ADDED
@@ -0,0 +1,97 @@
+class SeabornDataTest < Test::Unit::TestCase
+  sub_test_case("fmri") do
+    def setup
+      @dataset = Datasets::SeabornData.new("fmri")
+    end
+
+    def test_each
+      records = @dataset.each.to_a
+      assert_equal([
+                     1064,
+                     {
+                       subject: "s5",
+                       timepoint: 14,
+                       event: "stim",
+                       region: "parietal",
+                       signal: -0.0808829319505
+                     },
+                     {
+                       subject: "s0",
+                       timepoint: 0,
+                       event: "cue",
+                       region: "parietal",
+                       signal: -0.00689923478092
+                     }
+                   ],
+                   [
+                     records.size,
+                     records[1].to_h,
+                     records[-1].to_h
+                   ])
+    end
+  end
+
+  sub_test_case("flights") do
+    def setup
+      @dataset = Datasets::SeabornData.new("flights")
+    end
+
+    def test_each
+      records = @dataset.each.to_a
+      assert_equal([
+                     144,
+                     {
+                       year: 1949,
+                       month: "Feb",
+                       passengers: 118
+                     },
+                     {
+                       year: 1960,
+                       month: "Dec",
+                       passengers: 432
+                     }
+                   ],
+                   [
+                     records.size,
+                     records[1].to_h,
+                     records[-1].to_h
+                   ])
+    end
+  end
+
+  sub_test_case("penguins") do
+    def setup
+      @dataset = Datasets::SeabornData.new("penguins")
+    end
+
+    def test_each
+      records = @dataset.each.to_a
+      assert_equal([
+                     344,
+                     {
+                       species: "Adelie",
+                       island: "Torgersen",
+                       bill_length_mm: 39.5,
+                       bill_depth_mm: 17.4,
+                       flipper_length_mm: 186,
+                       body_mass_g: 3800,
+                       sex: "Female"
+                     },
+                     {
+                       species: "Gentoo",
+                       island: "Biscoe",
+                       bill_length_mm: 49.9,
+                       bill_depth_mm: 16.1,
+                       flipper_length_mm: 213,
+                       body_mass_g: 5400,
+                       sex: "Male"
+                     }
+                   ],
+                   [
+                     records.size,
+                     records[1].to_h,
+                     records[-1].to_h
+                   ])
+    end
+  end
+end
data/test/test-sudachi-synonym-dictionary.rb
ADDED
@@ -0,0 +1,48 @@
+class SudachiSynonymDictionaryTest < Test::Unit::TestCase
+  def setup
+    @dataset = Datasets::SudachiSynonymDictionary.new
+  end
+
+  test('#each') do
+    records = @dataset.each.to_a
+    assert_equal([
+                   61335,
+                   {
+                     group_id: "000001",
+                     is_noun: true,
+                     expansion_type: :always,
+                     lexeme_id: 1,
+                     form_type: :typical,
+                     acronym_type: :typical,
+                     variant_type: :typical,
+                     categories: [],
+                     notation: "曖昧",
+                   },
+                   {
+                     group_id: "023705",
+                     is_noun: true,
+                     expansion_type: :always,
+                     lexeme_id: 1,
+                     form_type: :typical,
+                     acronym_type: :alphabet,
+                     variant_type: :typical,
+                     categories: ["単位"],
+                     notation: "GB",
+                   },
+                 ],
+                 [
+                   records.size,
+                   records[0].to_h,
+                   records[-1].to_h,
+                 ])
+  end
+
+  sub_test_case('#metadata') do
+    test('#description') do
+      description = @dataset.metadata.description
+      assert do
+        description.start_with?('# Sudachi 同義語辞書')
+      end
+    end
+  end
+end
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: red-datasets
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.1.3
 platform: ruby
 authors:
 - tomisuker
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2021-
+date: 2021-07-09 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: csv
@@ -162,8 +162,10 @@ files:
 - lib/datasets/penn-treebank.rb
 - lib/datasets/postal-code-japan.rb
 - lib/datasets/rdatasets.rb
+- lib/datasets/seaborn-data.rb
+- lib/datasets/sudachi-synonym-dictionary.rb
 - lib/datasets/table.rb
-- lib/datasets/tar_gz_readable.rb
+- lib/datasets/tar-gz-readable.rb
 - lib/datasets/version.rb
 - lib/datasets/wikipedia.rb
 - lib/datasets/wine.rb
@@ -189,6 +191,8 @@ files:
 - test/test-penn-treebank.rb
 - test/test-postal-code-japan.rb
 - test/test-rdatasets.rb
+- test/test-seaborn-data.rb
+- test/test-sudachi-synonym-dictionary.rb
 - test/test-table.rb
 - test/test-wikipedia.rb
 - test/test-wine.rb
@@ -237,6 +241,8 @@ test_files:
 - test/test-penn-treebank.rb
 - test/test-postal-code-japan.rb
 - test/test-rdatasets.rb
+- test/test-seaborn-data.rb
+- test/test-sudachi-synonym-dictionary.rb
 - test/test-table.rb
 - test/test-wikipedia.rb
 - test/test-wine.rb