red-datasets 0.1.0 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 120492172aae9cec1c4fc4f3b73575cb5349caf2f0b67d70676c8896324e1491
4
- data.tar.gz: e46eb3f2875cb407e86cc0976eff7d612beb62ca6b421a51435b5d5e1bfa6e03
3
+ metadata.gz: 8d18fa976f1b368a6a3f9cc85dc7a58a1785fd02901157672484f2a7d8b1fa88
4
+ data.tar.gz: c91d651a0d8de6722ee759ce29545f5f382d1e9f060c7e4ee5a0fcd557be4d21
5
5
  SHA512:
6
- metadata.gz: 360bbf78c131f20a67359ddc2055cd58502da1f4e95adf30475cd405d5eb50be6ba4fd9aa0a0857226dc803e14282cc4231de113843e96657a65e287c7500137
7
- data.tar.gz: f88ed1ae8c8f0dad9f4d8904a265c833ceee723ba92860c0e3bed4c193d56a901c31184abd4290058de47fbc089b12b4d3b1da064f138214e2954d45eee928da
6
+ metadata.gz: 0ff7694dd27e4293206de81fc2a7b5ccccb886579ed73eb7f97d390472692ce310993e2ece741cf85f5fbe265f1deb2a7ea326590383b4bdf0d3f77f10b1bbc1
7
+ data.tar.gz: 38ac6aa12d3e33ab0c26c0750273b60386d90fd4d916776a0d561c3f25a79fa2d7d216ac465842207cd65f62e2fcbd348389e65f905583187fe23c30908d92dc
data/README.md CHANGED
@@ -1,5 +1,8 @@
1
1
  # Red Datasets
2
2
 
3
+ [![Build Status](https://travis-ci.org/red-data-tools/red-datasets.svg?branch=master)](https://travis-ci.org/red-data-tools/red-datasets)
4
+ [![Gem Version](https://badge.fury.io/rb/red-datasets.svg)](https://badge.fury.io/rb/red-datasets)
5
+
3
6
  ## Description
4
7
 
5
8
  Red Datasets provides classes that give access to common datasets such as the iris dataset.
@@ -128,6 +131,9 @@ mnist.each do |record|
128
131
  end
129
132
  ```
130
133
 
134
+ ## NArray compatibility
135
+
136
+ * [red-datasets-numo-narray](https://github.com/red-data-tools/red-datasets-numo-narray)
131
137
 
132
138
  ## License
133
139
 
data/doc/text/news.md CHANGED
@@ -1,5 +1,53 @@
1
1
  # News
2
2
 
3
+ ## 0.1.4 - 2021-07-13
4
+
5
+ ### Improvements
6
+
7
+ * `Datasets::SudachiSynonymDictionary`: Stopped depending on `LANG`.
8
+
9
+ ## 0.1.3 - 2021-07-09
10
+
11
+ ### Improvements
12
+
13
+ * `Datasets::SeabornData`: Added.
14
+
15
+ * `Datasets::SudachiSynonymDictionary`: Added.
16
+
17
+ ## 0.1.2 - 2021-06-03
18
+
19
+ ### Improvements
20
+
21
+ * `Datasets::Rdatasets` and `Datasets::RdatasetsList`: Added.
22
+
23
+ * `Datasets::Penguins`: Changed for compatibility with seaborn's
24
+ penguins dataset.
25
+
26
+ ## 0.1.1 - 2021-04-11
27
+
28
+ ### Improvements
29
+
30
+ * Added support for Ruby 3.0.
31
+
32
+ * `Datasets::Communities`: Added.
33
+ [GitHub#64][Patch by Yasuo Honda]
34
+
35
+ * `Datasets::EStatJapan`: Added.
36
+ [GitHub#90][Patch by Kunihiko Miyoshi]
37
+
38
+ * `Datasets::Penguins`: Added.
39
+ [GitHub#100][Patch by Kenta Murata]
40
+
41
+ * `Datasets::CLDRPlurals`: Added.
42
+
43
+ ### Thanks
44
+
45
+ * Yasuo Honda
46
+
47
+ * Kunihiko Miyoshi
48
+
49
+ * Kenta Murata
50
+
3
51
  ## 0.1.0 - 2020-02-04
4
52
 
5
53
  ### Improvements
data/lib/datasets.rb CHANGED
@@ -2,6 +2,9 @@ require_relative "datasets/version"
2
2
 
3
3
  require_relative "datasets/adult"
4
4
  require_relative "datasets/cifar"
5
+ require_relative "datasets/cldr-plurals"
6
+ require_relative "datasets/communities"
7
+ require_relative "datasets/e-stat-japan"
5
8
  require_relative "datasets/fashion-mnist"
6
9
  require_relative "datasets/hepatitis"
7
10
  require_relative "datasets/iris"
@@ -9,7 +12,11 @@ require_relative "datasets/libsvm"
9
12
  require_relative "datasets/libsvm-dataset-list"
10
13
  require_relative "datasets/mnist"
11
14
  require_relative "datasets/mushroom"
15
+ require_relative "datasets/penguins"
12
16
  require_relative "datasets/penn-treebank"
13
17
  require_relative "datasets/postal-code-japan"
18
+ require_relative "datasets/rdatasets"
19
+ require_relative "datasets/seaborn-data"
20
+ require_relative "datasets/sudachi-synonym-dictionary"
14
21
  require_relative "datasets/wikipedia"
15
22
  require_relative "datasets/wine"
@@ -1,10 +1,10 @@
1
- require "rubygems/package"
2
- require "zlib"
3
-
1
+ require_relative "tar-gz-readable"
4
2
  require_relative "dataset"
5
3
 
6
4
  module Datasets
7
5
  class CIFAR < Dataset
6
+ include TarGzReadable
7
+
8
8
  module Pixelable
9
9
  def pixels
10
10
  data.unpack("C*")
@@ -61,7 +61,7 @@ module Datasets
61
61
  private
62
62
 
63
63
  def parse_data(data_path, &block)
64
- open_tar(data_path) do |tar|
64
+ open_tar_gz(data_path) do |tar|
65
65
  target_file_names.each do |target_file_name|
66
66
  tar.seek(target_file_name) do |entry|
67
67
  parse_entry(entry, &block)
@@ -124,14 +124,6 @@ module Datasets
124
124
  end
125
125
  end
126
126
  end
127
-
128
- def open_tar(data_path)
129
- Zlib::GzipReader.open(data_path) do |f|
130
- Gem::Package::TarReader.new(f) do |tar|
131
- yield(tar)
132
- end
133
- end
134
- end
135
127
  end
136
128
  end
137
129
 
@@ -0,0 +1,385 @@
1
+ require "rexml/streamlistener"
2
+ require "rexml/parsers/baseparser"
3
+ require "rexml/parsers/streamparser"
4
+ require "strscan"
5
+
6
+ require_relative "dataset"
7
+
8
+ module Datasets
9
+ class CLDRPlurals < Dataset
10
# A locale name together with the plural rules that apply to it.
Locale = Struct.new(:name, :rules)

# One plural rule: its `count` attribute, the parsed condition, and the
# integer/decimal sample lists taken from the rule text.
Rule = Struct.new(:count, :condition, :integer_samples, :decimal_samples)
17
+
18
# Fills in the dataset metadata: id, display name, source URL, license
# and a short description pointing at the CLDR chart.
def initialize
  super()
  metadata = @metadata
  metadata.id = "cldr-plurals"
  metadata.name = "CLDR language plural rules"
  metadata.url = "https://raw.githubusercontent.com/unicode-org/cldr/master/common/supplemental/plurals.xml"
  metadata.licenses = ["Unicode-DFS-2016"]
  metadata.description = <<~DESCRIPTION
    Language plural rules in Unicode Common Locale Data Repository.
    See also: https://unicode-org.github.io/cldr-staging/charts/latest/supplemental/language_plural_rules.html
  DESCRIPTION
end
29
+
30
# Streams the plurals XML through a REXML stream parser; the listener
# is handed the given block. Returns an Enumerator when called
# without a block.
def each(&block)
  return to_enum(__method__) unless block_given?

  open_data do |input|
    catch do |abort_tag|
      listener = Listener.new(abort_tag, &block)
      REXML::Parsers::StreamParser.new(input, listener).parse
    end
  end
end
41
+
42
+ private
43
# Downloads plurals.xml into the cache directory if it is not there
# yet, then yields the opened file to the caller's block.
def open_data
  data_path = cache_dir_path + "plurals.xml"
  download(data_path, @metadata.url) unless data_path.exist?
  ::File.open(data_path) { |input| yield(input) }
end
52
+
53
+ # Spec: https://unicode.org/reports/tr35/tr35-numbers.html#Language_Plural_Rules
54
# REXML stream listener that turns <pluralRules>/<pluralRule> elements
# into Locale objects and passes them to the block given at
# construction time.
class Listener
  include REXML::StreamListener

  def initialize(abort_tag, &block)
    @abort_tag = abort_tag
    @block = block
    @tag_name_stack = []
  end

  # Tracks the open element; starts a fresh rule list for each
  # <pluralRules> and a fresh Rule for each <pluralRule>.
  def tag_start(name, attributes)
    @tag_name_stack.push(name)
    if name == "pluralRules"
      @locales = attributes["locales"].split
      @rules = []
    elsif name == "pluralRule"
      @rule = Rule.new(attributes["count"])
    end
  end

  # On </pluralRule> the current rule is collected; on </pluralRules>
  # one Locale per listed locale name is passed to the block (all of
  # them referencing the same rules array).
  def tag_end(name)
    if name == "pluralRules"
      @locales.each do |locale_name|
        @block.call(Locale.new(locale_name, @rules))
      end
    elsif name == "pluralRule"
      @rules << @rule
    end
    @tag_name_stack.pop
  end

  # Only text directly inside <pluralRule> is parsed; everything else
  # is ignored.
  def text(data)
    return unless @tag_name_stack.last == "pluralRule"

    RuleParser.new(@rule, data).parse
  end
end
99
+ private_constant :Listener
100
+
101
+ # Syntax: http://unicode.org/reports/tr35/tr35-numbers.html#Plural_rules_syntax
102
# Recursive-descent parser for the text of one CLDR <pluralRule>
# element: an optional condition followed by optional "@integer" and
# "@decimal" sample lists. The parsed pieces are stored on the rule
# object passed to the constructor.
#
# Conditions are returned as nested arrays:
#   [:or, cond, ...], [:and, relation, ...],
#   [:is | :is_not, expr, value],
#   [:in | :not_in | :equal | :not_equal | :within | :not_within,
#    expr, [value_or_range, ...]]
# where expr is an operand string ("n", "i", ...) or
# [:mod, operand, value].
class RuleParser
  # rule: object receiving #condition=, #integer_samples= and
  #       #decimal_samples=.
  # data: the raw rule text.
  def initialize(rule, data)
    @rule = rule
    @data = data
    @scanner = StringScanner.new(@data)
  end

  # Parses the whole rule text and fills in the rule. A sample
  # attribute is left untouched when its "@integer"/"@decimal" section
  # is absent.
  def parse
    @rule.condition = parse_condition
    if scan_token(/@integer/)
      @rule.integer_samples = parse_sample_list
    end
    if scan_token(/@decimal/)
      @rule.decimal_samples = parse_sample_list
    end
  end

  private

  def skip_whitespaces
    @scanner.skip(/\p{Pattern_White_Space}+/)
  end

  # Skips leading white space, then scans pattern at the current
  # position. Returns the matched text or nil. The skipped white space
  # is NOT restored on a failed match; callers that need to backtrack
  # save and restore the scanner position themselves.
  def scan_token(pattern)
    skip_whitespaces
    @scanner.scan(pattern)
  end

  # condition = and_condition ('or' and_condition)*
  # Returns nil when the text has no condition at all.
  def parse_condition
    and_condition = parse_and_condition
    return nil if and_condition.nil?

    and_conditions = [and_condition]
    and_conditions << parse_and_condition while parse_or
    return and_condition if and_conditions.size == 1

    [:or, *and_conditions]
  end

  def parse_or
    scan_token(/or/)
  end

  # and_condition = relation ('and' relation)*
  def parse_and_condition
    skip_whitespaces
    relation = parse_relation
    return nil if relation.nil?

    relations = [relation]
    relations << parse_relation while parse_and
    return relation if relations.size == 1

    [:and, *relations]
  end

  def parse_and
    scan_token(/and/)
  end

  # Tries each relation form in turn; every alternative rewinds the
  # scanner itself when it does not match.
  def parse_relation
    parse_is_relation ||
      parse_in_relation ||
      parse_within_relation
  end

  # is_relation = expr 'is' ('not')? value
  def parse_is_relation
    position = @scanner.pos
    skip_whitespaces
    expr = parse_expr
    unless parse_is
      @scanner.pos = position
      return nil
    end
    operator = parse_not ? :is_not : :is
    value = parse_value
    if value.nil?
      raise Error, "no value for #{operator}: #{@scanner.inspect}"
    end
    [operator, expr, value]
  end

  def parse_is
    scan_token(/is/)
  end

  def parse_not
    scan_token(/not/)
  end

  # in_relation = expr (('not')? 'in' | '=' | '!=') range_list
  def parse_in_relation
    position = @scanner.pos
    skip_whitespaces
    expr = parse_expr
    operator =
      if parse_not
        # "not" without "in" may still be "not within": leave operator
        # nil so that we rewind and let parse_within_relation try.
        :not_in if parse_in
      elsif parse_in
        :in
      elsif parse_equal
        :equal
      elsif parse_not_equal
        :not_equal
      end
    if operator.nil?
      @scanner.pos = position
      return nil
    end
    range_list = parse_range_list
    [operator, expr, range_list]
  end

  def parse_in
    scan_token(/in/)
  end

  def parse_equal
    scan_token(/=/)
  end

  def parse_not_equal
    scan_token(/!=/)
  end

  # within_relation = expr ('not')? 'within' range_list
  def parse_within_relation
    position = @scanner.pos
    skip_whitespaces
    expr = parse_expr
    have_not = parse_not
    unless parse_within
      @scanner.pos = position
      return nil
    end
    operator = have_not ? :not_within : :within
    range_list = parse_range_list
    [operator, expr, range_list]
  end

  def parse_within
    scan_token(/within/)
  end

  # expr = operand (('mod' | '%') value)?
  # Returns the operand string, [:mod, operand, value], or nil when
  # there is no operand at the current position.
  def parse_expr
    operand = parse_operand
    operator = parse_expr_operator
    return operand unless operator

    value = parse_value
    if value.nil?
      raise Error, "no value for #{operator}: #{@scanner.inspect}"
    end
    [operator, operand, value]
  end

  def parse_operand
    scan_token(/[niftvwce]/)
  end

  def parse_expr_operator
    scan_token(/(?:mod|%)/) ? :mod : nil
  end

  # range_list = (range | value) (',' (range | value))*
  def parse_range_list
    ranges = [parse_range || parse_value]
    ranges << (parse_range || parse_value) while scan_token(/,/)
    ranges
  end

  # range = value '..' value; rewinds and returns nil when there is no
  # '..' after the first value.
  def parse_range
    position = @scanner.pos
    range_start = parse_value
    unless scan_token(/\.\./)
      @scanner.pos = position
      return nil
    end
    range_end = parse_value
    range_start..range_end
  end

  # Returns the next run of digits as an Integer, or nil.
  def parse_value
    value = scan_token(/\d+/)
    return nil if value.nil?

    Integer(value, 10)
  end

  # Parses a comma separated list of sample values/ranges. A trailing
  # ", ..." or ", U+2026" is recorded as the symbol :elipsis (spelling
  # kept as-is because it is part of the returned data).
  def parse_sample_list
    samples = [parse_sample_range]
    loop do
      position = @scanner.pos
      break unless scan_token(/,/)

      sample_range = parse_sample_range
      unless sample_range
        # The comma belongs to the trailing ellipsis: give it back.
        @scanner.pos = position
        break
      end
      samples << sample_range
    end
    if scan_token(/,/)
      # U+2026 HORIZONTAL ELLIPSIS
      unless scan_token(/\u2026|\.\.\./)
        raise Error, "no ellipsis: #{@scanner.inspect}"
      end
      samples << :elipsis
    end
    samples
  end

  # sample_range = sample_value ('~' sample_value)?
  def parse_sample_range
    value = parse_sample_value
    return nil if value.nil?

    if scan_token(/~/)
      range_end = parse_sample_value
      value..range_end
    else
      value
    end
  end

  # Parses one sample value. Plain digits give an Integer, a fraction
  # part gives a Float, and a "c"/"e" suffix scales the value by the
  # given power of ten (for example "1c6" is 1000000).
  def parse_sample_value
    value = parse_value
    return nil if value.nil?

    if @scanner.scan(/\./)
      skip_whitespaces
      decimal = @scanner.scan(/[0-9]+/)
      if decimal.nil?
        raise Error, "no decimal: #{@scanner.inspect}"
      end
      value += Float("0.#{decimal}")
      skip_whitespaces
    end
    if @scanner.scan(/[ce]/)
      # Workaround for a spec bug. "e1" should be accepted.
      #
      # Spec:
      #   sampleValue = value ('.' digit+)? ([ce] digitPos digit+)?
      #   digit       = [0-9]
      #   digitPos    = [1-9]
      e = @scanner.scan(/[1-9][0-9]*/)
      if e.nil?
        raise Error, "no exponent: #{@scanner.inspect}"
      end
      value *= 10**Integer(e, 10)
    end
    value
  end
end
383
+ private_constant :RuleParser
384
+ end
385
+ end