red-datasets 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/doc/text/news.md +8 -0
- data/lib/datasets.rb +2 -0
- data/lib/datasets/cifar.rb +1 -1
- data/lib/datasets/cldr-plurals.rb +4 -4
- data/lib/datasets/mnist.rb +6 -2
- data/lib/datasets/rdatasets.rb +1 -1
- data/lib/datasets/seaborn-data.rb +49 -0
- data/lib/datasets/sudachi-synonym-dictionary.rb +169 -0
- data/lib/datasets/{tar_gz_readable.rb → tar-gz-readable.rb} +0 -0
- data/lib/datasets/version.rb +1 -1
- data/test/test-cldr-plurals.rb +1 -1
- data/test/test-rdatasets.rb +1 -1
- data/test/test-seaborn-data.rb +97 -0
- data/test/test-sudachi-synonym-dictionary.rb +48 -0
- metadata +9 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 07c55b47d31b30ceaf4cdd3ea22da5c737d81884a494c6a11abc6fda6fbea22b
|
4
|
+
data.tar.gz: a28d34b5d28cb57349a81112ffc2db8fe9f94939beb21477af4d9d0c9d5b59ab
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4871ac4ec167cb78b3fce8f9c5de9f6cff6a4089b8e4fd87fe7bb3265865cfcbd86935e8f2fa0bc5e40fde8a471e1655390fcdf8dcc0a5197342143e0cb855e5
|
7
|
+
data.tar.gz: 66d31943cb857632518a90166972bfa9ebe4b8ec355eca8291da40183c260e3c175d5866220efc0e4174f780c8449b45004db425a8fc7453268236f9b7dcbc1d
|
data/doc/text/news.md
CHANGED
data/lib/datasets.rb
CHANGED
@@ -16,5 +16,7 @@ require_relative "datasets/penguins"
|
|
16
16
|
require_relative "datasets/penn-treebank"
|
17
17
|
require_relative "datasets/postal-code-japan"
|
18
18
|
require_relative "datasets/rdatasets"
|
19
|
+
require_relative "datasets/seaborn-data"
|
20
|
+
require_relative "datasets/sudachi-synonym-dictionary"
|
19
21
|
require_relative "datasets/wikipedia"
|
20
22
|
require_relative "datasets/wine"
|
data/lib/datasets/cifar.rb
CHANGED
data/lib/datasets/cldr-plurals.rb
CHANGED
@@ -183,7 +183,7 @@ module Datasets
|
|
183
183
|
end
|
184
184
|
value = parse_value
|
185
185
|
if value.nil?
|
186
|
-
raise Error
|
186
|
+
raise Error, "no value for #{operator}: #{@scanner.inspect}"
|
187
187
|
end
|
188
188
|
[operator, expr, value]
|
189
189
|
end
|
@@ -267,7 +267,7 @@ module Datasets
|
|
267
267
|
if operator
|
268
268
|
value = parse_value
|
269
269
|
if value.nil?
|
270
|
-
raise Error
|
270
|
+
raise Error, "no value for #{operator}: #{@scanner.inspect}"
|
271
271
|
end
|
272
272
|
[operator, operand, value]
|
273
273
|
else
|
@@ -336,7 +336,7 @@ module Datasets
|
|
336
336
|
skip_whitespaces
|
337
337
|
# U+2026 HORIZONTAL ELLIPSIS
|
338
338
|
unless @scanner.scan(/\u2026|\.\.\./)
|
339
|
-
raise "no ellipsis: #{@scanner.inspect}"
|
339
|
+
raise Error, "no ellipsis: #{@scanner.inspect}"
|
340
340
|
end
|
341
341
|
samples << :elipsis
|
342
342
|
end
|
@@ -362,7 +362,7 @@ module Datasets
|
|
362
362
|
skip_whitespaces
|
363
363
|
decimal = @scanner.scan(/[0-9]+/)
|
364
364
|
if decimal.nil?
|
365
|
-
raise "no decimal: #{@scanner.inspect}"
|
365
|
+
raise Error, "no decimal: #{@scanner.inspect}"
|
366
366
|
end
|
367
367
|
value += Float("0.#{decimal}")
|
368
368
|
skip_whitespaces
|
data/lib/datasets/mnist.rb
CHANGED
@@ -65,7 +65,9 @@ module Datasets
|
|
65
65
|
n_bytes = n_uint32s * 4
|
66
66
|
mnist_magic_number = 2051
|
67
67
|
magic, n_images, n_rows, n_cols = f.read(n_bytes).unpack("N*")
|
68
|
-
|
68
|
+
if magic != mnist_magic_number
|
69
|
+
raise Error, "This is not #{dataset_name} image file"
|
70
|
+
end
|
69
71
|
n_images.times do |i|
|
70
72
|
data = f.read(n_rows * n_cols)
|
71
73
|
label = labels[i]
|
@@ -99,7 +101,9 @@ module Datasets
|
|
99
101
|
n_bytes = n_uint32s * 2
|
100
102
|
mnist_magic_number = 2049
|
101
103
|
magic, n_labels = f.read(n_bytes).unpack('N2')
|
102
|
-
|
104
|
+
if magic != mnist_magic_number
|
105
|
+
raise Error, "This is not #{dataset_name} label file"
|
106
|
+
end
|
103
107
|
f.read(n_labels).unpack('C*')
|
104
108
|
end
|
105
109
|
end
|
data/lib/datasets/rdatasets.rb
CHANGED
data/lib/datasets/seaborn-data.rb
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
module Datasets
|
2
|
+
class SeabornData < Dataset
|
3
|
+
URL_FORMAT = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/%{name}.csv".freeze
|
4
|
+
|
5
|
+
def initialize(name)
|
6
|
+
super()
|
7
|
+
@metadata.id = "seaborn-data-#{name}"
|
8
|
+
@metadata.name = "SeabornData: #{name}"
|
9
|
+
@metadata.url = URL_FORMAT % {name: name}
|
10
|
+
|
11
|
+
@data_path = cache_dir_path + (name + ".csv")
|
12
|
+
@name = name
|
13
|
+
end
|
14
|
+
|
15
|
+
def each(&block)
|
16
|
+
return to_enum(__method__) unless block_given?
|
17
|
+
|
18
|
+
download(@data_path, @metadata.url) unless @data_path.exist?
|
19
|
+
CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
|
20
|
+
csv.each do |row|
|
21
|
+
record = prepare_record(row)
|
22
|
+
yield record
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
private
|
28
|
+
def prepare_record(csv_row)
|
29
|
+
record = csv_row.to_h
|
30
|
+
record.transform_keys!(&:to_sym)
|
31
|
+
|
32
|
+
# Perform the same preprocessing as seaborn's load_dataset function
|
33
|
+
preprocessor = :"preprocess_#{@name}_record"
|
34
|
+
__send__(preprocessor, record) if respond_to?(preprocessor, true)
|
35
|
+
|
36
|
+
record
|
37
|
+
end
|
38
|
+
|
39
|
+
# The same preprocessing as seaborn.load_dataset
|
40
|
+
def preprocess_flights_record(record)
|
41
|
+
record[:month] &&= record[:month][0,3]
|
42
|
+
end
|
43
|
+
|
44
|
+
# The same preprocessing as seaborn.load_dataset
|
45
|
+
def preprocess_penguins_record(record)
|
46
|
+
record[:sex] &&= record[:sex].capitalize
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
data/lib/datasets/sudachi-synonym-dictionary.rb
ADDED
@@ -0,0 +1,169 @@
|
|
1
|
+
require "csv"
|
2
|
+
|
3
|
+
require_relative "dataset"
|
4
|
+
|
5
|
+
module Datasets
|
6
|
+
class SudachiSynonymDictionary < Dataset
|
7
|
+
class Synonym < Struct.new(:group_id,
|
8
|
+
:is_noun,
|
9
|
+
:expansion_type,
|
10
|
+
:lexeme_id,
|
11
|
+
:form_type,
|
12
|
+
:acronym_type,
|
13
|
+
:variant_type,
|
14
|
+
:categories,
|
15
|
+
:notation)
|
16
|
+
alias_method :noun?, :is_noun
|
17
|
+
end
|
18
|
+
|
19
|
+
def initialize
|
20
|
+
super()
|
21
|
+
@metadata.id = "sudachi-synonym-dictionary"
|
22
|
+
@metadata.name = "Sudachi synonym dictionary"
|
23
|
+
@metadata.url = "https://github.com/WorksApplications/SudachiDict/blob/develop/docs/synonyms.md"
|
24
|
+
@metadata.licenses = [
|
25
|
+
"Apache-2.0",
|
26
|
+
]
|
27
|
+
@metadata.description = lambda do
|
28
|
+
download_description
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
def each
|
33
|
+
return to_enum(__method__) unless block_given?
|
34
|
+
|
35
|
+
lexeme_id_context = {}
|
36
|
+
open_data do |csv|
|
37
|
+
csv.each do |row|
|
38
|
+
group_id = row[0]
|
39
|
+
if group_id != lexeme_id_context[:group_id]
|
40
|
+
lexeme_id_context[:group_id] = group_id
|
41
|
+
lexeme_id_context[:counter] = 0
|
42
|
+
end
|
43
|
+
is_noun = (row[1] == "1")
|
44
|
+
expansion_type = normalize_expansion_type(row[2])
|
45
|
+
lexeme_id = normalize_lexeme_id(row[3], lexeme_id_context)
|
46
|
+
form_type = normalize_form_type(row[4])
|
47
|
+
acronym_type = normalize_acronym_type(row[5])
|
48
|
+
variant_type = normalize_variant_type(row[6])
|
49
|
+
categories = normalize_categories(row[7])
|
50
|
+
notation = row[8]
|
51
|
+
synonym = Synonym.new(group_id,
|
52
|
+
is_noun,
|
53
|
+
expansion_type,
|
54
|
+
lexeme_id,
|
55
|
+
form_type,
|
56
|
+
acronym_type,
|
57
|
+
variant_type,
|
58
|
+
categories,
|
59
|
+
notation)
|
60
|
+
yield(synonym)
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
private
|
66
|
+
def open_data
|
67
|
+
data_path = cache_dir_path + "synonyms.txt"
|
68
|
+
unless data_path.exist?
|
69
|
+
data_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/src/main/text/synonyms.txt"
|
70
|
+
download(data_path, data_url)
|
71
|
+
end
|
72
|
+
CSV.open(data_path, skip_blanks: true) do |csv|
|
73
|
+
yield(csv)
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
def download_description
|
78
|
+
description_path = cache_dir_path + "synonyms.md"
|
79
|
+
unless description_path.exist?
|
80
|
+
description_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/docs/synonyms.md"
|
81
|
+
download(description_path, description_url)
|
82
|
+
end
|
83
|
+
description_path.read
|
84
|
+
end
|
85
|
+
|
86
|
+
def normalize_expansion_type(type)
|
87
|
+
case type
|
88
|
+
when "0", ""
|
89
|
+
:always
|
90
|
+
when "1"
|
91
|
+
:expanded
|
92
|
+
when "2"
|
93
|
+
:never
|
94
|
+
else
|
95
|
+
raise Error, "unknown expansion type: #{type.inspect}"
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
def normalize_lexeme_id(id, context)
|
100
|
+
case id
|
101
|
+
when ""
|
102
|
+
lexeme_id_context[:counter] += 1
|
103
|
+
lexeme_id_context[:counter]
|
104
|
+
else
|
105
|
+
# Use only the first lexeme ID.
|
106
|
+
# Example:
|
107
|
+
# 000116,1,0,1/2,0,2,0,(IT/娯楽),ネットゲー,,
|
108
|
+
# 000116,1,0,1/2,0,2,0,(IT/娯楽),ネトゲ,,
|
109
|
+
Integer(id.split("/").first, 10)
|
110
|
+
end
|
111
|
+
end
|
112
|
+
|
113
|
+
def normalize_form_type(type)
|
114
|
+
case type
|
115
|
+
when "0", ""
|
116
|
+
:typical
|
117
|
+
when "1"
|
118
|
+
:translation
|
119
|
+
when "2"
|
120
|
+
:alias
|
121
|
+
when "3"
|
122
|
+
:old_name
|
123
|
+
when "4"
|
124
|
+
:misnomer
|
125
|
+
else
|
126
|
+
raise Error, "unknown form type: #{type.inspect}"
|
127
|
+
end
|
128
|
+
end
|
129
|
+
|
130
|
+
def normalize_acronym_type(type)
|
131
|
+
case type
|
132
|
+
when "0", ""
|
133
|
+
:typical
|
134
|
+
when "1"
|
135
|
+
:alphabet
|
136
|
+
when "2"
|
137
|
+
:others
|
138
|
+
else
|
139
|
+
raise Error, "unknown acronym type: #{type.inspect}"
|
140
|
+
end
|
141
|
+
end
|
142
|
+
|
143
|
+
def normalize_variant_type(type)
|
144
|
+
case type
|
145
|
+
when "0", ""
|
146
|
+
:typical
|
147
|
+
when "1"
|
148
|
+
:alphabet
|
149
|
+
when "2"
|
150
|
+
:general
|
151
|
+
when "3"
|
152
|
+
:misspelled
|
153
|
+
else
|
154
|
+
raise Error, "unknown variant type: #{type.inspect}"
|
155
|
+
end
|
156
|
+
end
|
157
|
+
|
158
|
+
def normalize_categories(categories)
|
159
|
+
case categories
|
160
|
+
when ""
|
161
|
+
nil
|
162
|
+
when /\A\((.*)\)\z/
|
163
|
+
$1.split("/")
|
164
|
+
else
|
165
|
+
raise Error, "invalid categories: #{categories.inspect}"
|
166
|
+
end
|
167
|
+
end
|
168
|
+
end
|
169
|
+
end
|
data/lib/datasets/{tar_gz_readable.rb → tar-gz-readable.rb}
RENAMED
File without changes
|
data/lib/datasets/version.rb
CHANGED
data/test/test-cldr-plurals.rb
CHANGED
data/test/test-rdatasets.rb
CHANGED
data/test/test-seaborn-data.rb
ADDED
@@ -0,0 +1,97 @@
|
|
1
|
+
class SeabornDataTest < Test::Unit::TestCase
|
2
|
+
sub_test_case("fmri") do
|
3
|
+
def setup
|
4
|
+
@dataset = Datasets::SeabornData.new("fmri")
|
5
|
+
end
|
6
|
+
|
7
|
+
def test_each
|
8
|
+
records = @dataset.each.to_a
|
9
|
+
assert_equal([
|
10
|
+
1064,
|
11
|
+
{
|
12
|
+
subject: "s5",
|
13
|
+
timepoint: 14,
|
14
|
+
event: "stim",
|
15
|
+
region: "parietal",
|
16
|
+
signal: -0.0808829319505
|
17
|
+
},
|
18
|
+
{
|
19
|
+
subject: "s0",
|
20
|
+
timepoint: 0,
|
21
|
+
event: "cue",
|
22
|
+
region: "parietal",
|
23
|
+
signal: -0.00689923478092
|
24
|
+
}
|
25
|
+
],
|
26
|
+
[
|
27
|
+
records.size,
|
28
|
+
records[1].to_h,
|
29
|
+
records[-1].to_h
|
30
|
+
])
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
sub_test_case("flights") do
|
35
|
+
def setup
|
36
|
+
@dataset = Datasets::SeabornData.new("flights")
|
37
|
+
end
|
38
|
+
|
39
|
+
def test_each
|
40
|
+
records = @dataset.each.to_a
|
41
|
+
assert_equal([
|
42
|
+
144,
|
43
|
+
{
|
44
|
+
year: 1949,
|
45
|
+
month: "Feb",
|
46
|
+
passengers: 118
|
47
|
+
},
|
48
|
+
{
|
49
|
+
year: 1960,
|
50
|
+
month: "Dec",
|
51
|
+
passengers: 432
|
52
|
+
}
|
53
|
+
],
|
54
|
+
[
|
55
|
+
records.size,
|
56
|
+
records[1].to_h,
|
57
|
+
records[-1].to_h
|
58
|
+
])
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
sub_test_case("penguins") do
|
63
|
+
def setup
|
64
|
+
@dataset = Datasets::SeabornData.new("penguins")
|
65
|
+
end
|
66
|
+
|
67
|
+
def test_each
|
68
|
+
records = @dataset.each.to_a
|
69
|
+
assert_equal([
|
70
|
+
344,
|
71
|
+
{
|
72
|
+
species: "Adelie",
|
73
|
+
island: "Torgersen",
|
74
|
+
bill_length_mm: 39.5,
|
75
|
+
bill_depth_mm: 17.4,
|
76
|
+
flipper_length_mm: 186,
|
77
|
+
body_mass_g: 3800,
|
78
|
+
sex: "Female"
|
79
|
+
},
|
80
|
+
{
|
81
|
+
species: "Gentoo",
|
82
|
+
island: "Biscoe",
|
83
|
+
bill_length_mm: 49.9,
|
84
|
+
bill_depth_mm: 16.1,
|
85
|
+
flipper_length_mm: 213,
|
86
|
+
body_mass_g: 5400,
|
87
|
+
sex: "Male"
|
88
|
+
}
|
89
|
+
],
|
90
|
+
[
|
91
|
+
records.size,
|
92
|
+
records[1].to_h,
|
93
|
+
records[-1].to_h
|
94
|
+
])
|
95
|
+
end
|
96
|
+
end
|
97
|
+
end
|
data/test/test-sudachi-synonym-dictionary.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
class SudachiSynonymDictionaryTest < Test::Unit::TestCase
|
2
|
+
def setup
|
3
|
+
@dataset = Datasets::SudachiSynonymDictionary.new
|
4
|
+
end
|
5
|
+
|
6
|
+
test('#each') do
|
7
|
+
records = @dataset.each.to_a
|
8
|
+
assert_equal([
|
9
|
+
61335,
|
10
|
+
{
|
11
|
+
group_id: "000001",
|
12
|
+
is_noun: true,
|
13
|
+
expansion_type: :always,
|
14
|
+
lexeme_id: 1,
|
15
|
+
form_type: :typical,
|
16
|
+
acronym_type: :typical,
|
17
|
+
variant_type: :typical,
|
18
|
+
categories: [],
|
19
|
+
notation: "曖昧",
|
20
|
+
},
|
21
|
+
{
|
22
|
+
group_id: "023705",
|
23
|
+
is_noun: true,
|
24
|
+
expansion_type: :always,
|
25
|
+
lexeme_id: 1,
|
26
|
+
form_type: :typical,
|
27
|
+
acronym_type: :alphabet,
|
28
|
+
variant_type: :typical,
|
29
|
+
categories: ["単位"],
|
30
|
+
notation: "GB",
|
31
|
+
},
|
32
|
+
],
|
33
|
+
[
|
34
|
+
records.size,
|
35
|
+
records[0].to_h,
|
36
|
+
records[-1].to_h,
|
37
|
+
])
|
38
|
+
end
|
39
|
+
|
40
|
+
sub_test_case('#metadata') do
|
41
|
+
test('#description') do
|
42
|
+
description = @dataset.metadata.description
|
43
|
+
assert do
|
44
|
+
description.start_with?('# Sudachi 同義語辞書')
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: red-datasets
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- tomisuker
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2021-
|
12
|
+
date: 2021-07-09 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: csv
|
@@ -162,8 +162,10 @@ files:
|
|
162
162
|
- lib/datasets/penn-treebank.rb
|
163
163
|
- lib/datasets/postal-code-japan.rb
|
164
164
|
- lib/datasets/rdatasets.rb
|
165
|
+
- lib/datasets/seaborn-data.rb
|
166
|
+
- lib/datasets/sudachi-synonym-dictionary.rb
|
165
167
|
- lib/datasets/table.rb
|
166
|
-
- lib/datasets/tar_gz_readable.rb
|
168
|
+
- lib/datasets/tar-gz-readable.rb
|
167
169
|
- lib/datasets/version.rb
|
168
170
|
- lib/datasets/wikipedia.rb
|
169
171
|
- lib/datasets/wine.rb
|
@@ -189,6 +191,8 @@ files:
|
|
189
191
|
- test/test-penn-treebank.rb
|
190
192
|
- test/test-postal-code-japan.rb
|
191
193
|
- test/test-rdatasets.rb
|
194
|
+
- test/test-seaborn-data.rb
|
195
|
+
- test/test-sudachi-synonym-dictionary.rb
|
192
196
|
- test/test-table.rb
|
193
197
|
- test/test-wikipedia.rb
|
194
198
|
- test/test-wine.rb
|
@@ -237,6 +241,8 @@ test_files:
|
|
237
241
|
- test/test-penn-treebank.rb
|
238
242
|
- test/test-postal-code-japan.rb
|
239
243
|
- test/test-rdatasets.rb
|
244
|
+
- test/test-seaborn-data.rb
|
245
|
+
- test/test-sudachi-synonym-dictionary.rb
|
240
246
|
- test/test-table.rb
|
241
247
|
- test/test-wikipedia.rb
|
242
248
|
- test/test-wine.rb
|