red-datasets 0.0.8 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c7a9199546e7a001c97e45c6fa28db15c0d96b748e527d9705dfee4e4b1db6fd
4
- data.tar.gz: c659f6ae1e658ad91210e4427be063463124d89ef90388d34ebfb73ceb49068a
3
+ metadata.gz: 07c55b47d31b30ceaf4cdd3ea22da5c737d81884a494c6a11abc6fda6fbea22b
4
+ data.tar.gz: a28d34b5d28cb57349a81112ffc2db8fe9f94939beb21477af4d9d0c9d5b59ab
5
5
  SHA512:
6
- metadata.gz: d8a23c4a165a596df22ce5bbe1f8f0cd5c0f002deecafbb26cd5e5f75abb3c0224c1013898162a67787159258d1b801395fc4d949c17939d95940664cffd5600
7
- data.tar.gz: f2fd4eb733e6205f138c4005627e815e3787040a8a4b6cce7eca9fd5d4adaa12263e17e8f5bd9394a851e5210f28736ee3c682c81e110da304ae17fb3f0bedba
6
+ metadata.gz: 4871ac4ec167cb78b3fce8f9c5de9f6cff6a4089b8e4fd87fe7bb3265865cfcbd86935e8f2fa0bc5e40fde8a471e1655390fcdf8dcc0a5197342143e0cb855e5
7
+ data.tar.gz: 66d31943cb857632518a90166972bfa9ebe4b8ec355eca8291da40183c260e3c175d5866220efc0e4174f780c8449b45004db425a8fc7453268236f9b7dcbc1d
data/README.md CHANGED
@@ -1,5 +1,8 @@
1
1
  # Red Datasets
2
2
 
3
+ [![Build Status](https://travis-ci.org/red-data-tools/red-datasets.svg?branch=master)](https://travis-ci.org/red-data-tools/red-datasets)
4
+ [![Gem Version](https://badge.fury.io/rb/red-datasets.svg)](https://badge.fury.io/rb/red-datasets)
5
+
3
6
  ## Description
4
7
 
5
8
  Red Datasets provides classes that provide common datasets such as iris dataset.
@@ -128,6 +131,9 @@ mnist.each do |record|
128
131
  end
129
132
  ```
130
133
 
134
+ ## NArray compatibility
135
+
136
+ * [red-datasets-numo-narray](https://github.com/red-data-tools/red-datasets-numo-narray)
131
137
 
132
138
  ## License
133
139
 
data/doc/text/news.md CHANGED
@@ -1,5 +1,98 @@
1
1
  # News
2
2
 
3
+ ## 0.1.3 - 2021-07-09
4
+
5
+ ### Improvements
6
+
7
+ * `Datasets::SeabornData`: Added.
8
+
9
+ * `Datasets::SudachiSynonymDictionary`: Added.
10
+
11
+ ## 0.1.2 - 2021-06-03
12
+
13
+ ### Improvements
14
+
15
+ * `Datasets::Rdatasets` and `Datasets::RdatasetsList`: Added.
16
+
17
+ * `Datasets::Penguins`: Changed for compatibility with seaborn's
18
+ penguins dataset.
19
+
20
+ ## 0.1.1 - 2021-04-11
21
+
22
+ ### Improvements
23
+
24
+ * Added support for Ruby 3.0.
25
+
26
+ * `Datasets::Communities`: Added.
27
+ [GitHub#64][Patch by Yasuo Honda]
28
+
29
+ * `Datasets::EStatJapan`: Added.
30
+ [GitHub#90][Patch by Kunihiko Miyoshi]
31
+
32
+ * `Datasets::Penguins`: Added.
33
+ [GitHub#100][Patch by Kenta Murata]
34
+
35
+ * `Datasets::CLDRPlurals`: Added.
36
+
37
+ ### Thanks
38
+
39
+ * Yasuo Honda
40
+
41
+ * Kunihiko Miyoshi
42
+
43
+ * Kenta Murata
44
+
45
+ ## 0.1.0 - 2020-02-04
46
+
47
+ ### Improvements
48
+
49
+ * Added support for Ruby 2.7.
50
+ [GitHub#82][GitHub#83][Patch by Yasuo Honda]
51
+
52
+ * `Datasets::Hepatitis`: Added.
53
+ [GitHub#70][Patch by KazuhiroYoshimoto]
54
+
55
+ * `Datasets::Downloader`: Added support for query.
56
+
57
+ ### Thanks
58
+
59
+ * Yasuo Honda
60
+
61
+ * KazuhiroYoshimoto
62
+
63
+ ## 0.0.9 - 2019-09-09
64
+
65
+ ### Improvements
66
+
67
+ * `Datasets::LIBSVMDatasetList`: Improved performance.
68
+
69
+ * `Datasets::Mushroom`: Added.
70
+ [GitHub#33][Patch by Yasuo Honda]
71
+
72
+ * `Datasets::Table#n_columns`: Added.
73
+
74
+ * `Datasets::Table#n_rows`: Added.
75
+
76
+ * `Datasets::Table#[]`: Added support for index access.
77
+
78
+ * `Datasets::Table#column_names`: Added.
79
+
80
+ * `Datasets::Table#size`: Added.
81
+
82
+ * `Datasets::Table#length`: Added.
83
+
84
+ * `Datasets::Table#each_column`: Added.
85
+
86
+ * `Datasets::Table#each_record`: Added.
87
+
88
+ * `Datasets::Table#find_record`: Added.
89
+
90
+ ### Thanks
91
+
92
+ * Yasuo Honda
93
+
94
+ ### Improvements
95
+
3
96
  ## 0.0.8 - 2019-03-24
4
97
 
5
98
  ### Improvements
data/lib/datasets.rb CHANGED
@@ -2,12 +2,21 @@ require_relative "datasets/version"
2
2
 
3
3
  require_relative "datasets/adult"
4
4
  require_relative "datasets/cifar"
5
+ require_relative "datasets/cldr-plurals"
6
+ require_relative "datasets/communities"
7
+ require_relative "datasets/e-stat-japan"
5
8
  require_relative "datasets/fashion-mnist"
9
+ require_relative "datasets/hepatitis"
6
10
  require_relative "datasets/iris"
7
11
  require_relative "datasets/libsvm"
8
12
  require_relative "datasets/libsvm-dataset-list"
9
13
  require_relative "datasets/mnist"
14
+ require_relative "datasets/mushroom"
15
+ require_relative "datasets/penguins"
10
16
  require_relative "datasets/penn-treebank"
11
17
  require_relative "datasets/postal-code-japan"
18
+ require_relative "datasets/rdatasets"
19
+ require_relative "datasets/seaborn-data"
20
+ require_relative "datasets/sudachi-synonym-dictionary"
12
21
  require_relative "datasets/wikipedia"
13
22
  require_relative "datasets/wine"
@@ -62,11 +62,12 @@ module Datasets
62
62
  data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.#{ext}"
63
63
  download(data_path, data_url)
64
64
  end
65
- CSV.open(data_path,
66
- {
65
+
66
+ options = {
67
67
  converters: [:numeric, lambda {|f| f.strip}],
68
68
  skip_lines: /\A\|/,
69
- }) do |csv|
69
+ }
70
+ CSV.open(data_path, **options) do |csv|
70
71
  yield(csv)
71
72
  end
72
73
  end
@@ -1,10 +1,10 @@
1
- require "rubygems/package"
2
- require "zlib"
3
-
1
+ require_relative "tar-gz-readable"
4
2
  require_relative "dataset"
5
3
 
6
4
  module Datasets
7
5
  class CIFAR < Dataset
6
+ include TarGzReadable
7
+
8
8
  module Pixelable
9
9
  def pixels
10
10
  data.unpack("C*")
@@ -61,7 +61,7 @@ module Datasets
61
61
  private
62
62
 
63
63
  def parse_data(data_path, &block)
64
- open_tar(data_path) do |tar|
64
+ open_tar_gz(data_path) do |tar|
65
65
  target_file_names.each do |target_file_name|
66
66
  tar.seek(target_file_name) do |entry|
67
67
  parse_entry(entry, &block)
@@ -124,14 +124,6 @@ module Datasets
124
124
  end
125
125
  end
126
126
  end
127
-
128
- def open_tar(data_path)
129
- Zlib::GzipReader.open(data_path) do |f|
130
- Gem::Package::TarReader.new(f) do |tar|
131
- yield(tar)
132
- end
133
- end
134
- end
135
127
  end
136
128
  end
137
129
 
@@ -0,0 +1,385 @@
1
+ require "rexml/streamlistener"
2
+ require "rexml/parsers/baseparser"
3
+ require "rexml/parsers/streamparser"
4
+ require "strscan"
5
+
6
+ require_relative "dataset"
7
+
8
+ module Datasets
9
+ class CLDRPlurals < Dataset
10
# A locale name together with the list of plural rules yielded for it.
Locale = Struct.new(:name, :rules)

# A single plural rule: the value of its "count" attribute, the parsed
# condition, and the parsed @integer / @decimal sample lists.
Rule = Struct.new(:count, :condition, :integer_samples, :decimal_samples)
17
+
18
# Runs the base initializer and then fills in this dataset's metadata:
# id, display name, source URL, license list and description.
def initialize
  super()
  @metadata.tap do |metadata|
    metadata.id = "cldr-plurals"
    metadata.name = "CLDR language plural rules"
    metadata.url = "https://raw.githubusercontent.com/unicode-org/cldr/master/common/supplemental/plurals.xml"
    metadata.licenses = ["Unicode-DFS-2016"]
    metadata.description = <<~DESCRIPTION
      Language plural rules in Unicode Common Locale Data Repository.
      See also: https://unicode-org.github.io/cldr-staging/charts/latest/supplemental/language_plural_rules.html
    DESCRIPTION
  end
end
29
+
30
# Streams the plurals XML through a REXML stream parser; the listener
# invokes the given block for the records it builds.
#
# Without a block, returns an Enumerator over this method instead.
def each(&block)
  if block.nil?
    return to_enum(:each)
  end

  open_data do |xml|
    # The catch tag is handed to the listener; the listener code visible
    # here only stores it.
    catch do |tag|
      REXML::Parsers::StreamParser.new(xml, Listener.new(tag, &block)).parse
    end
  end
end
41
+
42
+ private
43
# Yields an open File for "plurals.xml" in the cache directory,
# downloading it from the metadata URL first when it is not there yet.
def open_data
  data_path = cache_dir_path + "plurals.xml"
  download(data_path, @metadata.url) unless data_path.exist?
  ::File.open(data_path) { |input| yield(input) }
end
52
+
53
+ # Spec: https://unicode.org/reports/tr35/tr35-numbers.html#Language_Plural_Rules
54
# REXML stream listener that gathers the <pluralRule> elements found
# inside each <pluralRules> element and, when that element closes,
# calls the block once per locale name with a Locale holding the rules.
class Listener
  include REXML::StreamListener

  # abort_tag is stored as given; block receives each Locale.
  def initialize(abort_tag, &block)
    @abort_tag = abort_tag
    @block = block
    @tag_name_stack = []
  end

  # Tracks the open element and starts a new rule set or a new rule.
  def tag_start(name, attributes)
    @tag_name_stack.push(name)
    if name == "pluralRules"
      # "locales" is a whitespace-separated list of locale names.
      @locales = attributes["locales"].split
      @rules = []
    elsif name == "pluralRule"
      @rule = Rule.new(attributes["count"])
    end
  end

  # Finishes the current rule or emits the locales of a finished rule set.
  def tag_end(name)
    if name == "pluralRules"
      # Every locale of the group receives the same rules array.
      @locales.each { |locale_name| @block.call(Locale.new(locale_name, @rules)) }
    elsif name == "pluralRule"
      @rules << @rule
    end
    @tag_name_stack.pop
  end

  # Only text directly inside <pluralRule> is parsed; other text is ignored.
  def text(data)
    parse_plural_rule(data) if @tag_name_stack.last == "pluralRule"
  end

  private

  def parse_plural_rule(data)
    RuleParser.new(@rule, data).parse
  end
end
99
+ private_constant :Listener
100
+
101
+ # Syntax: http://unicode.org/reports/tr35/tr35-numbers.html#Plural_rules_syntax
102
# Recursive-descent parser for the text of one plural rule. It fills
# the condition and the @integer / @decimal sample lists of the rule
# object passed to it.
#
# Conditions are returned as nested arrays such as
# [:and, [:equal, "i", [1]], [:equal, "v", [0]]]; operands are the
# one-letter strings accepted by parse_operand and range lists mix
# Integers and Ranges.
class RuleParser
  # rule: object with condition=, integer_samples= and decimal_samples=.
  # data: the rule text to parse.
  def initialize(rule, data)
    @rule = rule
    @data = data
    @scanner = StringScanner.new(@data)
  end

  # Parses the whole rule text and stores the results in the rule.
  # The condition is nil when the text has no condition.
  def parse
    @rule.condition = parse_condition
    skip_whitespaces
    if @scanner.scan(/@integer/)
      @rule.integer_samples = parse_sample_list
    end
    skip_whitespaces
    if @scanner.scan(/@decimal/)
      @rule.decimal_samples = parse_sample_list
    end
  end

  private

  def skip_whitespaces
    @scanner.skip(/\p{Pattern_White_Space}+/)
  end

  # Skips leading whitespace and then tries to match pattern at the
  # current position. Returns the matched text or nil.
  def scan_token(pattern)
    skip_whitespaces
    @scanner.scan(pattern)
  end

  # condition = and_condition ("or" and_condition)*
  def parse_condition
    and_condition = parse_and_condition
    return nil if and_condition.nil?
    and_conditions = [and_condition]
    while parse_or
      and_conditions << parse_and_condition
    end
    if and_conditions.size == 1
      and_condition
    else
      [:or, *and_conditions]
    end
  end

  def parse_or
    scan_token(/or/)
  end

  # and_condition = relation ("and" relation)*
  def parse_and_condition
    skip_whitespaces
    relation = parse_relation
    return nil if relation.nil?
    relations = [relation]
    while parse_and
      relations << parse_relation
    end
    if relations.size == 1
      relation
    else
      [:and, *relations]
    end
  end

  def parse_and
    scan_token(/and/)
  end

  # Tries each relation form in turn; every form rewinds the scanner
  # itself when it does not match.
  def parse_relation
    parse_is_relation ||
      parse_in_relation ||
      parse_within_relation
  end

  # expr "is" ("not")? value
  def parse_is_relation
    position = @scanner.pos
    skip_whitespaces
    expr = parse_expr
    unless parse_is
      @scanner.pos = position
      return nil
    end
    operator = parse_not ? :is_not : :is
    value = parse_value
    if value.nil?
      raise Error, "no value for #{operator}: #{@scanner.inspect}"
    end
    [operator, expr, value]
  end

  def parse_is
    scan_token(/is/)
  end

  def parse_not
    scan_token(/not/)
  end

  # expr (("not")? "in" | "=" | "!=") range_list
  def parse_in_relation
    position = @scanner.pos
    skip_whitespaces
    expr = parse_expr
    if parse_not
      if parse_in
        operator = :not_in
      else
        # "not" was not followed by "in" (e.g. "not within"): rewind so
        # that the next relation form can be tried.
        @scanner.pos = position
        return nil
      end
    elsif parse_in
      operator = :in
    elsif parse_equal
      operator = :equal
    elsif parse_not_equal
      operator = :not_equal
    else
      @scanner.pos = position
      return nil
    end
    range_list = parse_range_list
    [operator, expr, range_list]
  end

  def parse_in
    scan_token(/in/)
  end

  def parse_equal
    scan_token(/=/)
  end

  def parse_not_equal
    scan_token(/!=/)
  end

  # expr ("not")? "within" range_list
  def parse_within_relation
    position = @scanner.pos
    skip_whitespaces
    expr = parse_expr
    have_not = parse_not
    unless parse_within
      @scanner.pos = position
      return nil
    end
    operator = have_not ? :not_within : :within
    range_list = parse_range_list
    [operator, expr, range_list]
  end

  def parse_within
    scan_token(/within/)
  end

  # expr = operand (("mod" | "%") value)?
  # Returns the bare operand, or [:mod, operand, value].
  def parse_expr
    operand = parse_operand
    operator = parse_expr_operator
    if operator
      value = parse_value
      if value.nil?
        raise Error, "no value for #{operator}: #{@scanner.inspect}"
      end
      [operator, operand, value]
    else
      operand
    end
  end

  def parse_operand
    scan_token(/[niftvwce]/)
  end

  def parse_expr_operator
    scan_token(/(?:mod|%)/) ? :mod : nil
  end

  # range_list = (range | value) ("," (range | value))*
  def parse_range_list
    ranges = [parse_range || parse_value]
    loop do
      skip_whitespaces
      break unless @scanner.scan(/,/)
      ranges << (parse_range || parse_value)
    end
    ranges
  end

  # range = value ".." value; rewinds and returns nil when there is no "..".
  def parse_range
    position = @scanner.pos
    range_start = parse_value
    skip_whitespaces
    unless @scanner.scan(/\.\./)
      @scanner.pos = position
      return nil
    end
    range_end = parse_value
    range_start..range_end
  end

  # A run of decimal digits as an Integer, or nil.
  def parse_value
    value = scan_token(/\d+/)
    return nil if value.nil?
    Integer(value, 10)
  end

  # Comma-separated sample ranges, optionally ended by an ellipsis,
  # which is reported as the symbol :elipsis (spelling kept because it
  # is part of the returned data).
  def parse_sample_list
    samples = [parse_sample_range]
    loop do
      position = @scanner.pos
      skip_whitespaces
      break unless @scanner.scan(/,/)
      sample_range = parse_sample_range
      unless sample_range
        # The comma belongs to the trailing ellipsis: give it back.
        @scanner.pos = position
        break
      end
      samples << sample_range
    end
    skip_whitespaces
    if @scanner.scan(/,/)
      skip_whitespaces
      # U+2026 HORIZONTAL ELLIPSIS
      unless @scanner.scan(/\u2026|\.\.\./)
        raise Error, "no ellipsis: #{@scanner.inspect}"
      end
      samples << :elipsis
    end
    samples
  end

  # sample_range = sample_value ("~" sample_value)?
  def parse_sample_range
    value = parse_sample_value
    return nil if value.nil?
    skip_whitespaces
    if @scanner.scan(/~/)
      range_end = parse_sample_value
      value..range_end
    else
      value
    end
  end

  # One sample number: an integer, optionally with a fraction (making
  # it a Float) and an optional c/e power-of-ten exponent.
  def parse_sample_value
    value = parse_value
    return nil if value.nil?
    if @scanner.scan(/\./)
      skip_whitespaces
      decimal = @scanner.scan(/[0-9]+/)
      if decimal.nil?
        raise Error, "no decimal: #{@scanner.inspect}"
      end
      value += Float("0.#{decimal}")
      skip_whitespaces
    end
    if @scanner.scan(/[ce]/)
      # Workaround for a spec bug. "e1" should be accepted.
      #
      # Spec:
      #   sampleValue = value ('.' digit+)? ([ce] digitPos digit+)?
      #   digit = [0-9]
      #   digitPos = [1-9]
      e = @scanner.scan(/[1-9][0-9]*/)
      if e.nil?
        raise Error, "no exponent: #{@scanner.inspect}"
      end
      # The suffix is a power-of-ten exponent: "1c6" means 1000000.
      value *= 10 ** Integer(e, 10)
    end
    value
  end
end
383
+ private_constant :RuleParser
384
+ end
385
+ end