red-datasets 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6fbd4d11063f89ba2e09250b751886086c953ec8bc92c75a6a351c31a36da0c4
4
- data.tar.gz: acc6ff31f0f4ae3a6c6565fe569233c01615718c01300b0838ff744571edc34d
3
+ metadata.gz: 07c55b47d31b30ceaf4cdd3ea22da5c737d81884a494c6a11abc6fda6fbea22b
4
+ data.tar.gz: a28d34b5d28cb57349a81112ffc2db8fe9f94939beb21477af4d9d0c9d5b59ab
5
5
  SHA512:
6
- metadata.gz: 26361511155b447ffed56a79b2336a9a1db96494bf856b23e7b39cc6a8b6a2039e7ed27564140761bdb2daaae7ee563b3695c464a7a7b21ff93b0636f6b8338d
7
- data.tar.gz: 40446f90e410e0d86abeec186a1d7adcc5375e29c19dc934f823befb26a87d904458ef5ea18c9d64055493d29ed305dba53d6e4d86bd7d84488baf3745ebd792
6
+ metadata.gz: 4871ac4ec167cb78b3fce8f9c5de9f6cff6a4089b8e4fd87fe7bb3265865cfcbd86935e8f2fa0bc5e40fde8a471e1655390fcdf8dcc0a5197342143e0cb855e5
7
+ data.tar.gz: 66d31943cb857632518a90166972bfa9ebe4b8ec355eca8291da40183c260e3c175d5866220efc0e4174f780c8449b45004db425a8fc7453268236f9b7dcbc1d
data/doc/text/news.md CHANGED
@@ -1,5 +1,13 @@
1
1
  # News
2
2
 
3
+ ## 0.1.3 - 2021-07-09
4
+
5
+ ### Improvements
6
+
7
+ * `Datasets::SeabornData`: Added.
8
+
9
+ * `Datasets::SudachiSynonymDictionary`: Added.
10
+
3
11
  ## 0.1.2 - 2021-06-03
4
12
 
5
13
  ### Improvements
data/lib/datasets.rb CHANGED
@@ -16,5 +16,7 @@ require_relative "datasets/penguins"
16
16
  require_relative "datasets/penn-treebank"
17
17
  require_relative "datasets/postal-code-japan"
18
18
  require_relative "datasets/rdatasets"
19
+ require_relative "datasets/seaborn-data"
20
+ require_relative "datasets/sudachi-synonym-dictionary"
19
21
  require_relative "datasets/wikipedia"
20
22
  require_relative "datasets/wine"
@@ -1,4 +1,4 @@
1
- require_relative "tar_gz_readable"
1
+ require_relative "tar-gz-readable"
2
2
  require_relative "dataset"
3
3
 
4
4
  module Datasets
@@ -183,7 +183,7 @@ module Datasets
183
183
  end
184
184
  value = parse_value
185
185
  if value.nil?
186
- raise Error.new("no value for #{operator}: #{@scanner.inspect}")
186
+ raise Error, "no value for #{operator}: #{@scanner.inspect}"
187
187
  end
188
188
  [operator, expr, value]
189
189
  end
@@ -267,7 +267,7 @@ module Datasets
267
267
  if operator
268
268
  value = parse_value
269
269
  if value.nil?
270
- raise Error.new("no value for #{operator}: #{@scanner.inspect}")
270
+ raise Error, "no value for #{operator}: #{@scanner.inspect}"
271
271
  end
272
272
  [operator, operand, value]
273
273
  else
@@ -336,7 +336,7 @@ module Datasets
336
336
  skip_whitespaces
337
337
  # U+2026 HORIZONTAL ELLIPSIS
338
338
  unless @scanner.scan(/\u2026|\.\.\./)
339
- raise "no ellipsis: #{@scanner.inspect}"
339
+ raise Error, "no ellipsis: #{@scanner.inspect}"
340
340
  end
341
341
  samples << :elipsis
342
342
  end
@@ -362,7 +362,7 @@ module Datasets
362
362
  skip_whitespaces
363
363
  decimal = @scanner.scan(/[0-9]+/)
364
364
  if decimal.nil?
365
- raise "no decimal: #{@scanner.inspect}"
365
+ raise Error, "no decimal: #{@scanner.inspect}"
366
366
  end
367
367
  value += Float("0.#{decimal}")
368
368
  skip_whitespaces
@@ -65,7 +65,9 @@ module Datasets
65
65
  n_bytes = n_uint32s * 4
66
66
  mnist_magic_number = 2051
67
67
  magic, n_images, n_rows, n_cols = f.read(n_bytes).unpack("N*")
68
- raise "This is not #{dataset_name} image file" if magic != mnist_magic_number
68
+ if magic != mnist_magic_number
69
+ raise Error, "This is not #{dataset_name} image file"
70
+ end
69
71
  n_images.times do |i|
70
72
  data = f.read(n_rows * n_cols)
71
73
  label = labels[i]
@@ -99,7 +101,9 @@ module Datasets
99
101
  n_bytes = n_uint32s * 2
100
102
  mnist_magic_number = 2049
101
103
  magic, n_labels = f.read(n_bytes).unpack('N2')
102
- raise "This is not #{dataset_name} label file" if magic != mnist_magic_number
104
+ if magic != mnist_magic_number
105
+ raise Error, "This is not #{dataset_name} label file"
106
+ end
103
107
  f.read(n_labels).unpack('C*')
104
108
  end
105
109
  end
@@ -1,5 +1,5 @@
1
1
  require_relative "dataset"
2
- require_relative "tar_gz_readable"
2
+ require_relative "tar-gz-readable"
3
3
 
4
4
  module Datasets
5
5
  class RdatasetsList < Dataset
@@ -0,0 +1,49 @@
1
+ module Datasets
2
+ class SeabornData < Dataset
3
+ URL_FORMAT = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/%{name}.csv".freeze
4
+
5
+ def initialize(name)
6
+ super()
7
+ @metadata.id = "seaborn-data-#{name}"
8
+ @metadata.name = "SeabornData: #{name}"
9
+ @metadata.url = URL_FORMAT % {name: name}
10
+
11
+ @data_path = cache_dir_path + (name + ".csv")
12
+ @name = name
13
+ end
14
+
15
+ def each(&block)
16
+ return to_enum(__method__) unless block_given?
17
+
18
+ download(@data_path, @metadata.url) unless @data_path.exist?
19
+ CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
20
+ csv.each do |row|
21
+ record = prepare_record(row)
22
+ yield record
23
+ end
24
+ end
25
+ end
26
+
27
+ private
28
+ def prepare_record(csv_row)
29
+ record = csv_row.to_h
30
+ record.transform_keys!(&:to_sym)
31
+
32
+ # Perform the same preprocessing as seaborn's load_dataset function
33
+ preprocessor = :"preprocess_#{@name}_record"
34
+ __send__(preprocessor, record) if respond_to?(preprocessor, true)
35
+
36
+ record
37
+ end
38
+
39
+ # The same preprocessing as seaborn.load_dataset
40
+ def preprocess_flights_record(record)
41
+ record[:month] &&= record[:month][0,3]
42
+ end
43
+
44
+ # The same preprocessing as seaborn.load_dataset
45
+ def preprocess_penguins_record(record)
46
+ record[:sex] &&= record[:sex].capitalize
47
+ end
48
+ end
49
+ end
@@ -0,0 +1,169 @@
1
+ require "csv"
2
+
3
+ require_relative "dataset"
4
+
5
+ module Datasets
6
+ class SudachiSynonymDictionary < Dataset
7
+ class Synonym < Struct.new(:group_id,
8
+ :is_noun,
9
+ :expansion_type,
10
+ :lexeme_id,
11
+ :form_type,
12
+ :acronym_type,
13
+ :variant_type,
14
+ :categories,
15
+ :notation)
16
+ alias_method :noun?, :is_noun
17
+ end
18
+
19
+ def initialize
20
+ super()
21
+ @metadata.id = "sudachi-synonym-dictionary"
22
+ @metadata.name = "Sudachi synonym dictionary"
23
+ @metadata.url = "https://github.com/WorksApplications/SudachiDict/blob/develop/docs/synonyms.md"
24
+ @metadata.licenses = [
25
+ "Apache-2.0",
26
+ ]
27
+ @metadata.description = lambda do
28
+ download_description
29
+ end
30
+ end
31
+
32
+ def each
33
+ return to_enum(__method__) unless block_given?
34
+
35
+ lexeme_id_context = {}
36
+ open_data do |csv|
37
+ csv.each do |row|
38
+ group_id = row[0]
39
+ if group_id != lexeme_id_context[:group_id]
40
+ lexeme_id_context[:group_id] = group_id
41
+ lexeme_id_context[:counter] = 0
42
+ end
43
+ is_noun = (row[1] == "1")
44
+ expansion_type = normalize_expansion_type(row[2])
45
+ lexeme_id = normalize_lexeme_id(row[3], lexeme_id_context)
46
+ form_type = normalize_form_type(row[4])
47
+ acronym_type = normalize_acronym_type(row[5])
48
+ variant_type = normalize_variant_type(row[6])
49
+ categories = normalize_categories(row[7])
50
+ notation = row[8]
51
+ synonym = Synonym.new(group_id,
52
+ is_noun,
53
+ expansion_type,
54
+ lexeme_id,
55
+ form_type,
56
+ acronym_type,
57
+ variant_type,
58
+ categories,
59
+ notation)
60
+ yield(synonym)
61
+ end
62
+ end
63
+ end
64
+
65
+ private
66
+ def open_data
67
+ data_path = cache_dir_path + "synonyms.txt"
68
+ unless data_path.exist?
69
+ data_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/src/main/text/synonyms.txt"
70
+ download(data_path, data_url)
71
+ end
72
+ CSV.open(data_path, skip_blanks: true) do |csv|
73
+ yield(csv)
74
+ end
75
+ end
76
+
77
+ def download_description
78
+ description_path = cache_dir_path + "synonyms.md"
79
+ unless description_path.exist?
80
+ description_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/docs/synonyms.md"
81
+ download(description_path, description_url)
82
+ end
83
+ description_path.read
84
+ end
85
+
86
+ def normalize_expansion_type(type)
87
+ case type
88
+ when "0", ""
89
+ :always
90
+ when "1"
91
+ :expanded
92
+ when "2"
93
+ :never
94
+ else
95
+ raise Error, "unknown expansion type: #{type.inspect}"
96
+ end
97
+ end
98
+
99
+ def normalize_lexeme_id(id, context)
100
+ case id
101
+ when ""
102
+ context[:counter] += 1
103
+ context[:counter]
104
+ else
105
+ # Use only the first lexeme ID.
106
+ # Example:
107
+ # 000116,1,0,1/2,0,2,0,(IT/娯楽),ネットゲー,,
108
+ # 000116,1,0,1/2,0,2,0,(IT/娯楽),ネトゲ,,
109
+ Integer(id.split("/").first, 10)
110
+ end
111
+ end
112
+
113
+ def normalize_form_type(type)
114
+ case type
115
+ when "0", ""
116
+ :typical
117
+ when "1"
118
+ :translation
119
+ when "2"
120
+ :alias
121
+ when "3"
122
+ :old_name
123
+ when "4"
124
+ :misnomer
125
+ else
126
+ raise Error, "unknown form type: #{type.inspect}"
127
+ end
128
+ end
129
+
130
+ def normalize_acronym_type(type)
131
+ case type
132
+ when "0", ""
133
+ :typical
134
+ when "1"
135
+ :alphabet
136
+ when "2"
137
+ :others
138
+ else
139
+ raise Error, "unknown acronym type: #{type.inspect}"
140
+ end
141
+ end
142
+
143
+ def normalize_variant_type(type)
144
+ case type
145
+ when "0", ""
146
+ :typical
147
+ when "1"
148
+ :alphabet
149
+ when "2"
150
+ :general
151
+ when "3"
152
+ :misspelled
153
+ else
154
+ raise Error, "unknown variant type: #{type.inspect}"
155
+ end
156
+ end
157
+
158
+ def normalize_categories(categories)
159
+ case categories
160
+ when ""
161
+ nil
162
+ when /\A\((.*)\)\z/
163
+ $1.split("/")
164
+ else
165
+ raise Error, "invalid categories: #{categories.inspect}"
166
+ end
167
+ end
168
+ end
169
+ end
@@ -1,3 +1,3 @@
1
1
  module Datasets
2
- VERSION = "0.1.2"
2
+ VERSION = "0.1.3"
3
3
  end
@@ -14,7 +14,7 @@ class CLDRPluralsTest < Test::Unit::TestCase
14
14
  test("#each") do
15
15
  locales = @dataset.each.to_a
16
16
  assert_equal([
17
- 215,
17
+ 218,
18
18
  locale("bm",
19
19
  [
20
20
  rule("other",
@@ -48,7 +48,7 @@ class RdatasetsTest < Test::Unit::TestCase
48
48
  test("without package_name") do
49
49
  records = @dataset.each.to_a
50
50
  assert_equal([
51
- 1478,
51
+ 1714,
52
52
  {
53
53
  package: "AER",
54
54
  dataset: "Affairs",
@@ -0,0 +1,97 @@
1
+ class SeabornDataTest < Test::Unit::TestCase
2
+ sub_test_case("fmri") do
3
+ def setup
4
+ @dataset = Datasets::SeabornData.new("fmri")
5
+ end
6
+
7
+ def test_each
8
+ records = @dataset.each.to_a
9
+ assert_equal([
10
+ 1064,
11
+ {
12
+ subject: "s5",
13
+ timepoint: 14,
14
+ event: "stim",
15
+ region: "parietal",
16
+ signal: -0.0808829319505
17
+ },
18
+ {
19
+ subject: "s0",
20
+ timepoint: 0,
21
+ event: "cue",
22
+ region: "parietal",
23
+ signal: -0.00689923478092
24
+ }
25
+ ],
26
+ [
27
+ records.size,
28
+ records[1].to_h,
29
+ records[-1].to_h
30
+ ])
31
+ end
32
+ end
33
+
34
+ sub_test_case("flights") do
35
+ def setup
36
+ @dataset = Datasets::SeabornData.new("flights")
37
+ end
38
+
39
+ def test_each
40
+ records = @dataset.each.to_a
41
+ assert_equal([
42
+ 144,
43
+ {
44
+ year: 1949,
45
+ month: "Feb",
46
+ passengers: 118
47
+ },
48
+ {
49
+ year: 1960,
50
+ month: "Dec",
51
+ passengers: 432
52
+ }
53
+ ],
54
+ [
55
+ records.size,
56
+ records[1].to_h,
57
+ records[-1].to_h
58
+ ])
59
+ end
60
+ end
61
+
62
+ sub_test_case("penguins") do
63
+ def setup
64
+ @dataset = Datasets::SeabornData.new("penguins")
65
+ end
66
+
67
+ def test_each
68
+ records = @dataset.each.to_a
69
+ assert_equal([
70
+ 344,
71
+ {
72
+ species: "Adelie",
73
+ island: "Torgersen",
74
+ bill_length_mm: 39.5,
75
+ bill_depth_mm: 17.4,
76
+ flipper_length_mm: 186,
77
+ body_mass_g: 3800,
78
+ sex: "Female"
79
+ },
80
+ {
81
+ species: "Gentoo",
82
+ island: "Biscoe",
83
+ bill_length_mm: 49.9,
84
+ bill_depth_mm: 16.1,
85
+ flipper_length_mm: 213,
86
+ body_mass_g: 5400,
87
+ sex: "Male"
88
+ }
89
+ ],
90
+ [
91
+ records.size,
92
+ records[1].to_h,
93
+ records[-1].to_h
94
+ ])
95
+ end
96
+ end
97
+ end
@@ -0,0 +1,48 @@
1
+ class SudachiSynonymDictionaryTest < Test::Unit::TestCase
2
+ def setup
3
+ @dataset = Datasets::SudachiSynonymDictionary.new
4
+ end
5
+
6
+ test('#each') do
7
+ records = @dataset.each.to_a
8
+ assert_equal([
9
+ 61335,
10
+ {
11
+ group_id: "000001",
12
+ is_noun: true,
13
+ expansion_type: :always,
14
+ lexeme_id: 1,
15
+ form_type: :typical,
16
+ acronym_type: :typical,
17
+ variant_type: :typical,
18
+ categories: [],
19
+ notation: "曖昧",
20
+ },
21
+ {
22
+ group_id: "023705",
23
+ is_noun: true,
24
+ expansion_type: :always,
25
+ lexeme_id: 1,
26
+ form_type: :typical,
27
+ acronym_type: :alphabet,
28
+ variant_type: :typical,
29
+ categories: ["単位"],
30
+ notation: "GB",
31
+ },
32
+ ],
33
+ [
34
+ records.size,
35
+ records[0].to_h,
36
+ records[-1].to_h,
37
+ ])
38
+ end
39
+
40
+ sub_test_case('#metadata') do
41
+ test('#description') do
42
+ description = @dataset.metadata.description
43
+ assert do
44
+ description.start_with?('# Sudachi 同義語辞書')
45
+ end
46
+ end
47
+ end
48
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: red-datasets
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - tomisuker
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2021-06-03 00:00:00.000000000 Z
12
+ date: 2021-07-09 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: csv
@@ -162,8 +162,10 @@ files:
162
162
  - lib/datasets/penn-treebank.rb
163
163
  - lib/datasets/postal-code-japan.rb
164
164
  - lib/datasets/rdatasets.rb
165
+ - lib/datasets/seaborn-data.rb
166
+ - lib/datasets/sudachi-synonym-dictionary.rb
165
167
  - lib/datasets/table.rb
166
- - lib/datasets/tar_gz_readable.rb
168
+ - lib/datasets/tar-gz-readable.rb
167
169
  - lib/datasets/version.rb
168
170
  - lib/datasets/wikipedia.rb
169
171
  - lib/datasets/wine.rb
@@ -189,6 +191,8 @@ files:
189
191
  - test/test-penn-treebank.rb
190
192
  - test/test-postal-code-japan.rb
191
193
  - test/test-rdatasets.rb
194
+ - test/test-seaborn-data.rb
195
+ - test/test-sudachi-synonym-dictionary.rb
192
196
  - test/test-table.rb
193
197
  - test/test-wikipedia.rb
194
198
  - test/test-wine.rb
@@ -237,6 +241,8 @@ test_files:
237
241
  - test/test-penn-treebank.rb
238
242
  - test/test-postal-code-japan.rb
239
243
  - test/test-rdatasets.rb
244
+ - test/test-seaborn-data.rb
245
+ - test/test-sudachi-synonym-dictionary.rb
240
246
  - test/test-table.rb
241
247
  - test/test-wikipedia.rb
242
248
  - test/test-wine.rb