red-datasets 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6fbd4d11063f89ba2e09250b751886086c953ec8bc92c75a6a351c31a36da0c4
4
- data.tar.gz: acc6ff31f0f4ae3a6c6565fe569233c01615718c01300b0838ff744571edc34d
3
+ metadata.gz: 07c55b47d31b30ceaf4cdd3ea22da5c737d81884a494c6a11abc6fda6fbea22b
4
+ data.tar.gz: a28d34b5d28cb57349a81112ffc2db8fe9f94939beb21477af4d9d0c9d5b59ab
5
5
  SHA512:
6
- metadata.gz: 26361511155b447ffed56a79b2336a9a1db96494bf856b23e7b39cc6a8b6a2039e7ed27564140761bdb2daaae7ee563b3695c464a7a7b21ff93b0636f6b8338d
7
- data.tar.gz: 40446f90e410e0d86abeec186a1d7adcc5375e29c19dc934f823befb26a87d904458ef5ea18c9d64055493d29ed305dba53d6e4d86bd7d84488baf3745ebd792
6
+ metadata.gz: 4871ac4ec167cb78b3fce8f9c5de9f6cff6a4089b8e4fd87fe7bb3265865cfcbd86935e8f2fa0bc5e40fde8a471e1655390fcdf8dcc0a5197342143e0cb855e5
7
+ data.tar.gz: 66d31943cb857632518a90166972bfa9ebe4b8ec355eca8291da40183c260e3c175d5866220efc0e4174f780c8449b45004db425a8fc7453268236f9b7dcbc1d
data/doc/text/news.md CHANGED
@@ -1,5 +1,13 @@
1
1
  # News
2
2
 
3
+ ## 0.1.3 - 2021-07-09
4
+
5
+ ### Improvements
6
+
7
+ * `Datasets::SeabornData`: Added.
8
+
9
+ * `Datasets::SudachiSynonymDictionary`: Added.
10
+
3
11
  ## 0.1.2 - 2021-06-03
4
12
 
5
13
  ### Improvements
data/lib/datasets.rb CHANGED
@@ -16,5 +16,7 @@ require_relative "datasets/penguins"
16
16
  require_relative "datasets/penn-treebank"
17
17
  require_relative "datasets/postal-code-japan"
18
18
  require_relative "datasets/rdatasets"
19
+ require_relative "datasets/seaborn-data"
20
+ require_relative "datasets/sudachi-synonym-dictionary"
19
21
  require_relative "datasets/wikipedia"
20
22
  require_relative "datasets/wine"
@@ -1,4 +1,4 @@
1
- require_relative "tar_gz_readable"
1
+ require_relative "tar-gz-readable"
2
2
  require_relative "dataset"
3
3
 
4
4
  module Datasets
@@ -183,7 +183,7 @@ module Datasets
183
183
  end
184
184
  value = parse_value
185
185
  if value.nil?
186
- raise Error.new("no value for #{operator}: #{@scanner.inspect}")
186
+ raise Error, "no value for #{operator}: #{@scanner.inspect}"
187
187
  end
188
188
  [operator, expr, value]
189
189
  end
@@ -267,7 +267,7 @@ module Datasets
267
267
  if operator
268
268
  value = parse_value
269
269
  if value.nil?
270
- raise Error.new("no value for #{operator}: #{@scanner.inspect}")
270
+ raise Error, "no value for #{operator}: #{@scanner.inspect}"
271
271
  end
272
272
  [operator, operand, value]
273
273
  else
@@ -336,7 +336,7 @@ module Datasets
336
336
  skip_whitespaces
337
337
  # U+2026 HORIZONTAL ELLIPSIS
338
338
  unless @scanner.scan(/\u2026|\.\.\./)
339
- raise "no ellipsis: #{@scanner.inspect}"
339
+ raise Error, "no ellipsis: #{@scanner.inspect}"
340
340
  end
341
341
  samples << :elipsis
342
342
  end
@@ -362,7 +362,7 @@ module Datasets
362
362
  skip_whitespaces
363
363
  decimal = @scanner.scan(/[0-9]+/)
364
364
  if decimal.nil?
365
- raise "no decimal: #{@scanner.inspect}"
365
+ raise Error, "no decimal: #{@scanner.inspect}"
366
366
  end
367
367
  value += Float("0.#{decimal}")
368
368
  skip_whitespaces
@@ -65,7 +65,9 @@ module Datasets
65
65
  n_bytes = n_uint32s * 4
66
66
  mnist_magic_number = 2051
67
67
  magic, n_images, n_rows, n_cols = f.read(n_bytes).unpack("N*")
68
- raise "This is not #{dataset_name} image file" if magic != mnist_magic_number
68
+ if magic != mnist_magic_number
69
+ raise Error, "This is not #{dataset_name} image file"
70
+ end
69
71
  n_images.times do |i|
70
72
  data = f.read(n_rows * n_cols)
71
73
  label = labels[i]
@@ -99,7 +101,9 @@ module Datasets
99
101
  n_bytes = n_uint32s * 2
100
102
  mnist_magic_number = 2049
101
103
  magic, n_labels = f.read(n_bytes).unpack('N2')
102
- raise "This is not #{dataset_name} label file" if magic != mnist_magic_number
104
+ if magic != mnist_magic_number
105
+ raise Error, "This is not #{dataset_name} label file"
106
+ end
103
107
  f.read(n_labels).unpack('C*')
104
108
  end
105
109
  end
@@ -1,5 +1,5 @@
1
1
  require_relative "dataset"
2
- require_relative "tar_gz_readable"
2
+ require_relative "tar-gz-readable"
3
3
 
4
4
  module Datasets
5
5
  class RdatasetsList < Dataset
@@ -0,0 +1,49 @@
1
+ module Datasets
2
+ class SeabornData < Dataset
3
+ URL_FORMAT = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/%{name}.csv".freeze
4
+
# Prepares metadata and the local cache path for one seaborn-data CSV file.
#
# @param name [String] base name of the dataset; it is interpolated into
#   URL_FORMAT and used as "<name>.csv" inside the cache directory
def initialize(name)
  super()

  # NOTE(review): the metadata is filled in before cache_dir_path is
  # called; presumably the cache directory depends on the metadata id --
  # confirm in Dataset before reordering.
  @metadata.id = "seaborn-data-#{name}"
  @metadata.name = "SeabornData: #{name}"
  @metadata.url = format(URL_FORMAT, name: name)

  csv_file_name = name + ".csv"
  @data_path = cache_dir_path + csv_file_name
  # Remembered so that prepare_record can look up a per-dataset hook.
  @name = name
end
14
+
# Yields every CSV row of the dataset as a prepared record.
#
# The file is downloaded to @data_path first when it is not there yet.
# Without a block an Enumerator over the same records is returned.
def each(&block)
  return to_enum(__method__) unless block_given?

  unless @data_path.exist?
    download(@data_path, @metadata.url)
  end

  # The first row supplies the headers; all built-in converters are applied.
  CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
    csv.each { |row| yield prepare_record(row) }
  end
end
26
+
27
+ private
# Turns one CSV row into a Hash with Symbol keys.
#
# When a private method named "preprocess_<name>_record" exists for the
# dataset name given to the constructor, it is called with the record so
# that it can adjust it in place (same preprocessing as seaborn's
# load_dataset function).
#
# @param csv_row [#to_h] a parsed CSV row
# @return [Hash] the prepared record
def prepare_record(csv_row)
  record = csv_row.to_h.transform_keys!(&:to_sym)

  hook = "preprocess_#{@name}_record".to_sym
  if respond_to?(hook, true)
    __send__(hook, record)
  end

  record
end
38
+
# Same preprocessing as seaborn.load_dataset: keeps only the first three
# characters of :month. A missing or nil :month is left untouched.
def preprocess_flights_record(record)
  month = record[:month]
  return month unless month

  record[:month] = month.slice(0, 3)
end
43
+
# Same preprocessing as seaborn.load_dataset: capitalizes :sex.
# A missing or nil :sex is left untouched.
def preprocess_penguins_record(record)
  sex = record[:sex]
  return sex unless sex

  record[:sex] = sex.capitalize
end
48
+ end
49
+ end
@@ -0,0 +1,169 @@
1
+ require "csv"
2
+
3
+ require_relative "dataset"
4
+
5
+ module Datasets
6
+ class SudachiSynonymDictionary < Dataset
# One entry of the synonym dictionary.
#
# The members are listed in the order in which #each fills them from a
# CSV row (columns 0 through 8).
#
# The struct is assigned to the constant instead of being subclassed:
# "class Synonym < Struct.new(...)" builds a new anonymous superclass on
# every evaluation, which fails with "superclass mismatch" when the file
# is loaded a second time.
Synonym = Struct.new(:group_id,
                     :is_noun,
                     :expansion_type,
                     :lexeme_id,
                     :form_type,
                     :acronym_type,
                     :variant_type,
                     :categories,
                     :notation) do
  # Predicate-style reader for is_noun.
  alias_method :noun?, :is_noun
end
18
+
# Fills in the static metadata of the dataset.
#
# The description is stored as a lambda that calls download_description,
# so nothing is fetched while the object is being constructed.
def initialize
  super()
  @metadata.id = "sudachi-synonym-dictionary"
  @metadata.name = "Sudachi synonym dictionary"
  @metadata.url = "https://github.com/WorksApplications/SudachiDict/blob/develop/docs/synonyms.md"
  @metadata.licenses = ["Apache-2.0"]
  @metadata.description = -> { download_description }
end
31
+
# Yields one Synonym per row of the synonym file.
#
# Without a block an Enumerator over the same objects is returned.
def each
  return to_enum(__method__) unless block_given?

  # Current group ID plus a per-group counter; handed to
  # normalize_lexeme_id as its context.
  context = {}
  open_data do |csv|
    csv.each do |row|
      group_id = row[0]
      unless group_id == context[:group_id]
        # First row of a new group: restart the counter.
        context[:group_id] = group_id
        context[:counter] = 0
      end

      yield(Synonym.new(group_id,
                        row[1] == "1",
                        normalize_expansion_type(row[2]),
                        normalize_lexeme_id(row[3], context),
                        normalize_form_type(row[4]),
                        normalize_acronym_type(row[5]),
                        normalize_variant_type(row[6]),
                        normalize_categories(row[7]),
                        row[8]))
    end
  end
end
64
+
65
+ private
# Opens the cached synonyms.txt as CSV and yields the reader.
#
# The file is downloaded into the cache directory on first use.
# Blank lines are skipped by the CSV reader (skip_blanks: true).
def open_data
  data_path = cache_dir_path + "synonyms.txt"
  download(data_path, "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/src/main/text/synonyms.txt") unless data_path.exist?

  CSV.open(data_path, skip_blanks: true) { |csv| yield(csv) }
end
76
+
# Returns the content of the cached synonyms.md, downloading it into the
# cache directory first when it is not there yet.
def download_description
  description_path = cache_dir_path + "synonyms.md"
  download(description_path, "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/docs/synonyms.md") unless description_path.exist?

  description_path.read
end
85
+
# Maps the raw expansion-type column to a Symbol.
#
# @param type [String, nil] raw field value. "0" and an omitted value map
#   to the default. Ruby's CSV parser returns nil (not "") for an empty
#   unquoted field, so nil is accepted as "omitted" as well.
# @return [Symbol] :always, :expanded or :never
# @raise [Error] when the value is not one of the known codes
def normalize_expansion_type(type)
  case type
  when "0", "", nil
    :always
  when "1"
    :expanded
  when "2"
    :never
  else
    raise Error, "unknown expansion type: #{type.inspect}"
  end
end
98
+
# Resolves the lexeme ID of a row to an Integer.
#
# @param id [String, nil] raw lexeme ID field. Ruby's CSV parser returns
#   nil (not "") for an empty unquoted field, so both mean "omitted".
# @param context [Hash] state shared by the rows of one group; its
#   :counter entry is incremented and returned when the ID is omitted
# @return [Integer] the parsed ID, or the next counter value
# @raise [ArgumentError] when the ID is not a base-10 integer
def normalize_lexeme_id(id, context)
  case id
  when "", nil
    # Use the context passed in by the caller. The previous code referred
    # to a local variable of the caller, which is not visible here and
    # raised NameError.
    context[:counter] += 1
  else
    # Use only the first lexeme ID.
    # Example:
    #   000116,1,0,1/2,0,2,0,(IT/娯楽),ネットゲー,,
    #   000116,1,0,1/2,0,2,0,(IT/娯楽),ネトゲ,,
    Integer(id.split("/").first, 10)
  end
end
112
+
# Maps the raw form-type column to a Symbol.
#
# @param type [String, nil] raw field value. "0" and an omitted value map
#   to the default. Ruby's CSV parser returns nil (not "") for an empty
#   unquoted field, so nil is accepted as "omitted" as well.
# @return [Symbol] :typical, :translation, :alias, :old_name or :misnomer
# @raise [Error] when the value is not one of the known codes
def normalize_form_type(type)
  case type
  when "0", "", nil
    :typical
  when "1"
    :translation
  when "2"
    :alias
  when "3"
    :old_name
  when "4"
    :misnomer
  else
    raise Error, "unknown form type: #{type.inspect}"
  end
end
129
+
# Maps the raw acronym-type column to a Symbol.
#
# @param type [String, nil] raw field value. "0" and an omitted value map
#   to the default. Ruby's CSV parser returns nil (not "") for an empty
#   unquoted field, so nil is accepted as "omitted" as well.
# @return [Symbol] :typical, :alphabet or :others
# @raise [Error] when the value is not one of the known codes
def normalize_acronym_type(type)
  case type
  when "0", "", nil
    :typical
  when "1"
    :alphabet
  when "2"
    :others
  else
    raise Error, "unknown acronym type: #{type.inspect}"
  end
end
142
+
# Maps the raw variant-type column to a Symbol.
#
# @param type [String, nil] raw field value. "0" and an omitted value map
#   to the default. Ruby's CSV parser returns nil (not "") for an empty
#   unquoted field, so nil is accepted as "omitted" as well.
# @return [Symbol] :typical, :alphabet, :general or :misspelled
# @raise [Error] when the value is not one of the known codes
def normalize_variant_type(type)
  case type
  when "0", "", nil
    :typical
  when "1"
    :alphabet
  when "2"
    :general
  when "3"
    :misspelled
  else
    raise Error, "unknown variant type: #{type.inspect}"
  end
end
157
+
# Splits the raw categories column, e.g. "(IT/娯楽)", into its parts.
#
# @param categories [String, nil] raw field value: either omitted or a
#   parenthesized, "/"-separated list. Ruby's CSV parser returns nil
#   (not "") for an empty unquoted field, so nil counts as omitted too.
# @return [Array<String>, nil] the category names ("()" gives an empty
#   array), or nil when the field is omitted
# @raise [Error] when the value is neither omitted nor parenthesized
def normalize_categories(categories)
  case categories
  when "", nil
    nil
  when /\A\((.*)\)\z/
    # The successful `when` match above populated the last-match data.
    Regexp.last_match(1).split("/")
  else
    raise Error, "invalid categories: #{categories.inspect}"
  end
end
168
+ end
169
+ end
@@ -1,3 +1,3 @@
1
1
  module Datasets
2
- VERSION = "0.1.2"
2
+ VERSION = "0.1.3"
3
3
  end
@@ -14,7 +14,7 @@ class CLDRPluralsTest < Test::Unit::TestCase
14
14
  test("#each") do
15
15
  locales = @dataset.each.to_a
16
16
  assert_equal([
17
- 215,
17
+ 218,
18
18
  locale("bm",
19
19
  [
20
20
  rule("other",
@@ -48,7 +48,7 @@ class RdatasetsTest < Test::Unit::TestCase
48
48
  test("without package_name") do
49
49
  records = @dataset.each.to_a
50
50
  assert_equal([
51
- 1478,
51
+ 1714,
52
52
  {
53
53
  package: "AER",
54
54
  dataset: "Affairs",
@@ -0,0 +1,97 @@
# Tests Datasets::SeabornData against three datasets. Each test compares
# the record count, the second record and the last record.
class SeabornDataTest < Test::Unit::TestCase
  sub_test_case("fmri") do
    def setup
      @dataset = Datasets::SeabornData.new("fmri")
    end

    def test_each
      records = @dataset.each.to_a
      second = {
        subject: "s5",
        timepoint: 14,
        event: "stim",
        region: "parietal",
        signal: -0.0808829319505
      }
      last = {
        subject: "s0",
        timepoint: 0,
        event: "cue",
        region: "parietal",
        signal: -0.00689923478092
      }
      assert_equal([1064, second, last],
                   [records.size, records[1].to_h, records[-1].to_h])
    end
  end

  sub_test_case("flights") do
    def setup
      @dataset = Datasets::SeabornData.new("flights")
    end

    # The month names are expected in their three-letter form.
    def test_each
      records = @dataset.each.to_a
      second = {
        year: 1949,
        month: "Feb",
        passengers: 118
      }
      last = {
        year: 1960,
        month: "Dec",
        passengers: 432
      }
      assert_equal([144, second, last],
                   [records.size, records[1].to_h, records[-1].to_h])
    end
  end

  sub_test_case("penguins") do
    def setup
      @dataset = Datasets::SeabornData.new("penguins")
    end

    # The sex values are expected in capitalized form.
    def test_each
      records = @dataset.each.to_a
      second = {
        species: "Adelie",
        island: "Torgersen",
        bill_length_mm: 39.5,
        bill_depth_mm: 17.4,
        flipper_length_mm: 186,
        body_mass_g: 3800,
        sex: "Female"
      }
      last = {
        species: "Gentoo",
        island: "Biscoe",
        bill_length_mm: 49.9,
        bill_depth_mm: 16.1,
        flipper_length_mm: 213,
        body_mass_g: 5400,
        sex: "Male"
      }
      assert_equal([344, second, last],
                   [records.size, records[1].to_h, records[-1].to_h])
    end
  end
end
@@ -0,0 +1,48 @@
# Tests Datasets::SudachiSynonymDictionary: the record stream and the
# description stored in the metadata.
class SudachiSynonymDictionaryTest < Test::Unit::TestCase
  def setup
    @dataset = Datasets::SudachiSynonymDictionary.new
  end

  # Compares the record count, the first record and the last record.
  test('#each') do
    records = @dataset.each.to_a
    first = {
      group_id: "000001",
      is_noun: true,
      expansion_type: :always,
      lexeme_id: 1,
      form_type: :typical,
      acronym_type: :typical,
      variant_type: :typical,
      categories: [],
      notation: "曖昧"
    }
    last = {
      group_id: "023705",
      is_noun: true,
      expansion_type: :always,
      lexeme_id: 1,
      form_type: :typical,
      acronym_type: :alphabet,
      variant_type: :typical,
      categories: ["単位"],
      notation: "GB"
    }
    assert_equal([61335, first, last],
                 [records.size, records[0].to_h, records[-1].to_h])
  end

  sub_test_case('#metadata') do
    test('#description') do
      description = @dataset.metadata.description
      assert { description.start_with?('# Sudachi 同義語辞書') }
    end
  end
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: red-datasets
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - tomisuker
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2021-06-03 00:00:00.000000000 Z
12
+ date: 2021-07-09 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: csv
@@ -162,8 +162,10 @@ files:
162
162
  - lib/datasets/penn-treebank.rb
163
163
  - lib/datasets/postal-code-japan.rb
164
164
  - lib/datasets/rdatasets.rb
165
+ - lib/datasets/seaborn-data.rb
166
+ - lib/datasets/sudachi-synonym-dictionary.rb
165
167
  - lib/datasets/table.rb
166
- - lib/datasets/tar_gz_readable.rb
168
+ - lib/datasets/tar-gz-readable.rb
167
169
  - lib/datasets/version.rb
168
170
  - lib/datasets/wikipedia.rb
169
171
  - lib/datasets/wine.rb
@@ -189,6 +191,8 @@ files:
189
191
  - test/test-penn-treebank.rb
190
192
  - test/test-postal-code-japan.rb
191
193
  - test/test-rdatasets.rb
194
+ - test/test-seaborn-data.rb
195
+ - test/test-sudachi-synonym-dictionary.rb
192
196
  - test/test-table.rb
193
197
  - test/test-wikipedia.rb
194
198
  - test/test-wine.rb
@@ -237,6 +241,8 @@ test_files:
237
241
  - test/test-penn-treebank.rb
238
242
  - test/test-postal-code-japan.rb
239
243
  - test/test-rdatasets.rb
244
+ - test/test-seaborn-data.rb
245
+ - test/test-sudachi-synonym-dictionary.rb
240
246
  - test/test-table.rb
241
247
  - test/test-wikipedia.rb
242
248
  - test/test-wine.rb