red-datasets 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +6 -0
- data/doc/text/news.md +25 -0
- data/lib/datasets.rb +4 -0
- data/lib/datasets/cldr-plurals.rb +385 -0
- data/lib/datasets/communities.rb +198 -0
- data/lib/datasets/dataset.rb +1 -0
- data/lib/datasets/e-stat-japan.rb +320 -0
- data/lib/datasets/error.rb +4 -0
- data/lib/datasets/mnist.rb +0 -2
- data/lib/datasets/penguins.rb +125 -0
- data/lib/datasets/version.rb +1 -1
- data/red-datasets.gemspec +1 -0
- data/test/run-test.rb +2 -0
- data/test/test-cldr-plurals.rb +180 -0
- data/test/test-communities.rb +290 -0
- data/test/test-e-stat-japan.rb +383 -0
- data/test/test-penguins.rb +239 -0
- metadata +41 -15
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: '0239c4ab86dd9f589b1f67b9d6c381570e25a29289c261470943ed48f7dfc3d0'
|
4
|
+
data.tar.gz: 2f3f3af1f17a1bd1e7aa307e2b182108790549754d907262105e18479997cde6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 04b3dbc23dc8679855a6104a9f3da39871594979f149295ef13b3be864a3dbbdb6bec3fb59153db9b5be4fade6819686e13b60a38f1d1721bf7e1163d4bb49b8
|
7
|
+
data.tar.gz: 476a9081fe0db32aad8a4e00c7e08f77002e58a2f2c68eb37aecf2a70054d43877707ddda29137b361a5a37ff979f3c28c5f2a03c2d1e96cbc7f7289f659ba9f
|
data/README.md
CHANGED
@@ -1,5 +1,8 @@
|
|
1
1
|
# Red Datasets
|
2
2
|
|
3
|
+
[![Build Status](https://travis-ci.org/red-data-tools/red-datasets.svg?branch=master)](https://travis-ci.org/red-data-tools/red-datasets)
|
4
|
+
[![Gem Version](https://badge.fury.io/rb/red-datasets.svg)](https://badge.fury.io/rb/red-datasets)
|
5
|
+
|
3
6
|
## Description
|
4
7
|
|
5
8
|
Red Datasets provides classes that provide common datasets such as iris dataset.
|
@@ -128,6 +131,9 @@ mnist.each do |record|
|
|
128
131
|
end
|
129
132
|
```
|
130
133
|
|
134
|
+
## NArray compatibility
|
135
|
+
|
136
|
+
* [red-datasets-numo-narray](https://github.com/red-data-tools/red-datasets-numo-narray)
|
131
137
|
|
132
138
|
## License
|
133
139
|
|
data/doc/text/news.md
CHANGED
@@ -1,5 +1,30 @@
|
|
1
1
|
# News
|
2
2
|
|
3
|
+
## 0.1.1 - 2021-04-11
|
4
|
+
|
5
|
+
### Improvements
|
6
|
+
|
7
|
+
* Added support for Ruby 3.0.
|
8
|
+
|
9
|
+
* `Datasets::Communities`: Added.
|
10
|
+
[GitHub#64][Patch by Yasuo Honda]
|
11
|
+
|
12
|
+
* `Datasets::EStatJapan`: Added.
|
13
|
+
[GitHub#90][Patch by Kunihiko Miyoshi]
|
14
|
+
|
15
|
+
* `Datasets::Penguins`: Added.
|
16
|
+
[GitHub#100][Patch by Kenta Murata]
|
17
|
+
|
18
|
+
* `Datasets::CLDRPlurals`: Added.
|
19
|
+
|
20
|
+
### Thanks
|
21
|
+
|
22
|
+
* Yasuo Honda
|
23
|
+
|
24
|
+
* Kunihiko Miyoshi
|
25
|
+
|
26
|
+
* Kenta Murata
|
27
|
+
|
3
28
|
## 0.1.0 - 2020-02-04
|
4
29
|
|
5
30
|
### Improvements
|
data/lib/datasets.rb
CHANGED
@@ -2,6 +2,9 @@ require_relative "datasets/version"
|
|
2
2
|
|
3
3
|
require_relative "datasets/adult"
|
4
4
|
require_relative "datasets/cifar"
|
5
|
+
require_relative "datasets/cldr-plurals"
|
6
|
+
require_relative "datasets/communities"
|
7
|
+
require_relative "datasets/e-stat-japan"
|
5
8
|
require_relative "datasets/fashion-mnist"
|
6
9
|
require_relative "datasets/hepatitis"
|
7
10
|
require_relative "datasets/iris"
|
@@ -9,6 +12,7 @@ require_relative "datasets/libsvm"
|
|
9
12
|
require_relative "datasets/libsvm-dataset-list"
|
10
13
|
require_relative "datasets/mnist"
|
11
14
|
require_relative "datasets/mushroom"
|
15
|
+
require_relative "datasets/penguins"
|
12
16
|
require_relative "datasets/penn-treebank"
|
13
17
|
require_relative "datasets/postal-code-japan"
|
14
18
|
require_relative "datasets/wikipedia"
|
@@ -0,0 +1,385 @@
|
|
1
|
+
require "rexml/streamlistener"
|
2
|
+
require "rexml/parsers/baseparser"
|
3
|
+
require "rexml/parsers/streamparser"
|
4
|
+
require "strscan"
|
5
|
+
|
6
|
+
require_relative "dataset"
|
7
|
+
|
8
|
+
module Datasets
|
9
|
+
class CLDRPlurals < Dataset
|
10
|
+
# One record yielded by #each: a locale name together with the list of
# Rule objects that apply to it.
Locale = Struct.new(:name, :rules)
|
12
|
+
|
13
|
+
# One plural rule: the plural category from the "count" attribute, the
# parsed condition and the integer/decimal sample lists.
Rule = Struct.new(*%i[count condition integer_samples decimal_samples])
|
17
|
+
|
18
|
+
# Fills in the dataset metadata (id, name, source URL, license and
# description) for the CLDR plural rules data.
#
# NOTE(review): @metadata is presumably created by the superclass
# initializer called through super() -- confirm against Dataset.
def initialize
  super()
  description = <<~DESCRIPTION
    Language plural rules in Unicode Common Locale Data Repository.
    See also: https://unicode-org.github.io/cldr-staging/charts/latest/supplemental/language_plural_rules.html
  DESCRIPTION
  @metadata.tap do |metadata|
    metadata.id = "cldr-plurals"
    metadata.name = "CLDR language plural rules"
    metadata.url = "https://raw.githubusercontent.com/unicode-org/cldr/master/common/supplemental/plurals.xml"
    metadata.licenses = ["Unicode-DFS-2016"]
    metadata.description = description
  end
end
|
29
|
+
|
30
|
+
# Streams the plural rules XML and calls the block once per record
# produced by Listener. Without a block, returns an Enumerator.
def each(&block)
  return enum_for(__method__) unless block

  open_data do |input|
    # The catch tag is handed to the listener; nothing in this method
    # throws it.
    catch do |abort_tag|
      REXML::Parsers::StreamParser.new(input, Listener.new(abort_tag, &block)).parse
    end
  end
end
|
41
|
+
|
42
|
+
private
|
43
|
+
# Yields an open File for "plurals.xml" in the cache directory,
# downloading it from @metadata.url first when the file does not exist.
# Returns the value of the block.
def open_data
  xml_path = cache_dir_path + "plurals.xml"
  download(xml_path, @metadata.url) unless xml_path.exist?
  ::File.open(xml_path) { |input| yield(input) }
end
|
52
|
+
|
53
|
+
# Spec: https://unicode.org/reports/tr35/tr35-numbers.html#Language_Plural_Rules
|
54
|
+
# REXML stream listener that collects the <pluralRule> children of each
# <pluralRules> element and, when that element closes, calls the block
# once per locale listed in its "locales" attribute.
class Listener
  include REXML::StreamListener

  # abort_tag is stored but not used by any method of this class.
  # block receives one Locale per locale name.
  def initialize(abort_tag, &block)
    @tag_name_stack = []
    @block = block
    @abort_tag = abort_tag
  end

  # Tracks the open element and starts a fresh rule list / rule.
  def tag_start(name, attributes)
    @tag_name_stack.push(name)
    if name == "pluralRules"
      # "locales" is a whitespace-separated list of locale names.
      @locales = attributes["locales"].split
      @rules = []
    elsif name == "pluralRule"
      @rule = Rule.new(attributes["count"])
    end
  end

  # Finishes the current rule or emits the collected rules per locale.
  def tag_end(name)
    if name == "pluralRules"
      # Every locale of the group receives the same rules array.
      @locales.each { |locale_name| @block.call(Locale.new(locale_name, @rules)) }
    elsif name == "pluralRule"
      @rules << @rule
    end
    @tag_name_stack.pop
  end

  # Text directly inside <pluralRule> is the rule source; parse it into
  # the current rule. Text anywhere else is ignored.
  def text(data)
    RuleParser.new(@rule, data).parse if @tag_name_stack.last == "pluralRule"
  end
end
|
99
|
+
private_constant :Listener
|
100
|
+
|
101
|
+
# Syntax: http://unicode.org/reports/tr35/tr35-numbers.html#Plural_rules_syntax
|
102
|
+
# Recursive-descent parser for the text of one plural rule, for example
#   "n % 10 = 1 and n % 100 != 11 @integer 1, 21, ... @decimal 1.0, ..."
#
# The condition is returned as nested Arrays:
#   [:or, and_condition, ...]
#   [:and, relation, ...]
#   [:is | :is_not, expr, Integer]
#   [:in | :not_in | :equal | :not_equal | :within | :not_within,
#    expr, range_list]
# where expr is an operand String ("n", "i", ...) or
# [:mod, operand, Integer], and range_list is an Array of Integer and
# Range values. A single relation or and-condition is returned without
# the :and / :or wrapper.
class RuleParser
  # rule: receives the results through #condition=, #integer_samples=
  #       and #decimal_samples=.
  # data: the rule text to parse.
  def initialize(rule, data)
    @rule = rule
    @data = data
    @scanner = StringScanner.new(@data)
  end

  # Parses the condition followed by the optional "@integer" and
  # "@decimal" sample lists and stores them into the rule.
  # The condition is nil when the text has no condition.
  def parse
    @rule.condition = parse_condition
    skip_whitespaces
    if @scanner.scan(/@integer/)
      @rule.integer_samples = parse_sample_list
    end
    skip_whitespaces
    if @scanner.scan(/@decimal/)
      @rule.decimal_samples = parse_sample_list
    end
  end

  private
  def skip_whitespaces
    @scanner.skip(/\p{Pattern_White_Space}+/)
  end

  # condition = and_condition ('or' and_condition)*
  def parse_condition
    and_condition = parse_and_condition
    return nil if and_condition.nil?
    and_conditions = [and_condition]
    while parse_or
      and_conditions << parse_and_condition
    end
    if and_conditions.size == 1
      and_condition
    else
      [:or, *and_conditions]
    end
  end

  def parse_or
    skip_whitespaces
    @scanner.scan(/or/)
  end

  # and_condition = relation ('and' relation)*
  def parse_and_condition
    skip_whitespaces
    relation = parse_relation
    return nil if relation.nil?
    relations = [relation]
    while parse_and
      relations << parse_relation
    end
    if relations.size == 1
      relation
    else
      [:and, *relations]
    end
  end

  def parse_and
    skip_whitespaces
    @scanner.scan(/and/)
  end

  # Tries each relation form in turn. Each parse_*_relation rewinds the
  # scanner and returns nil when its form does not match.
  def parse_relation
    parse_is_relation ||
      parse_in_relation ||
      parse_within_relation
  end

  # is_relation = expr 'is' ('not')? value
  # Raises Error when no value follows.
  def parse_is_relation
    position = @scanner.pos
    skip_whitespaces
    expr = parse_expr
    unless parse_is
      @scanner.pos = position
      return nil
    end
    if parse_not
      operator = :is_not
    else
      operator = :is
    end
    value = parse_value
    if value.nil?
      raise Error.new("no value for #{operator}: #{@scanner.inspect}")
    end
    [operator, expr, value]
  end

  def parse_is
    skip_whitespaces
    @scanner.scan(/is/)
  end

  def parse_not
    skip_whitespaces
    @scanner.scan(/not/)
  end

  # in_relation = expr (('not')? 'in' | '=' | '!=') range_list
  def parse_in_relation
    position = @scanner.pos
    skip_whitespaces
    expr = parse_expr
    if parse_not
      if parse_in
        operator = :not_in
      else
        # "not" without "in" may still be "not within": rewind so that
        # parse_within_relation can try from the start of the relation.
        @scanner.pos = position
        return nil
      end
    elsif parse_in
      operator = :in
    elsif parse_equal
      operator = :equal
    elsif parse_not_equal
      operator = :not_equal
    else
      @scanner.pos = position
      return nil
    end
    range_list = parse_range_list
    [operator, expr, range_list]
  end

  def parse_in
    skip_whitespaces
    @scanner.scan(/in/)
  end

  def parse_equal
    skip_whitespaces
    @scanner.scan(/=/)
  end

  def parse_not_equal
    skip_whitespaces
    @scanner.scan(/!=/)
  end

  # within_relation = expr ('not')? 'within' range_list
  def parse_within_relation
    position = @scanner.pos
    skip_whitespaces
    expr = parse_expr
    have_not = parse_not
    unless parse_within
      @scanner.pos = position
      return nil
    end
    if have_not
      operator = :not_within
    else
      operator = :within
    end
    range_list = parse_range_list
    [operator, expr, range_list]
  end

  def parse_within
    skip_whitespaces
    @scanner.scan(/within/)
  end

  # expr = operand (('mod' | '%') value)?
  # Returns the operand String, or [:mod, operand, value].
  # Raises Error when the modulus operator has no value.
  def parse_expr
    operand = parse_operand
    operator = parse_expr_operator
    if operator
      value = parse_value
      if value.nil?
        raise Error.new("no value for #{operator}: #{@scanner.inspect}")
      end
      [operator, operand, value]
    else
      operand
    end
  end

  # Returns the one-letter operand, or nil when there is none.
  def parse_operand
    skip_whitespaces
    @scanner.scan(/[niftvwce]/)
  end

  def parse_expr_operator
    skip_whitespaces
    if @scanner.scan(/(?:mod|%)/)
      :mod
    else
      nil
    end
  end

  # range_list = (range | value) (',' (range | value))*
  def parse_range_list
    ranges = [parse_range || parse_value]
    loop do
      skip_whitespaces
      break unless @scanner.scan(/,/)
      ranges << (parse_range || parse_value)
    end
    ranges
  end

  # range = value '..' value
  # Rewinds and returns nil when ".." does not follow the first value.
  def parse_range
    position = @scanner.pos
    range_start = parse_value
    skip_whitespaces
    unless @scanner.scan(/\.\./)
      @scanner.pos = position
      return nil
    end
    range_end = parse_value
    range_start..range_end
  end

  # Returns the next run of digits as an Integer, or nil.
  def parse_value
    skip_whitespaces
    value = @scanner.scan(/\d+/)
    return nil if value.nil?
    Integer(value, 10)
  end

  # sample_list = sample_range (',' sample_range)* (',' ('…' | '...'))?
  # A trailing ellipsis is reported as :elipsis. The spelling is kept
  # because the symbol is part of the data handed to callers.
  def parse_sample_list
    samples = [parse_sample_range]
    loop do
      position = @scanner.pos
      skip_whitespaces
      break unless @scanner.scan(/,/)
      sample_range = parse_sample_range
      unless sample_range
        # The comma belongs to the trailing ellipsis: give it back.
        @scanner.pos = position
        break
      end
      samples << sample_range
    end
    skip_whitespaces
    if @scanner.scan(/,/)
      skip_whitespaces
      # U+2026 HORIZONTAL ELLIPSIS
      unless @scanner.scan(/\u2026|\.\.\./)
        raise "no ellipsis: #{@scanner.inspect}"
      end
      samples << :elipsis
    end
    samples
  end

  # sample_range = sample_value ('~' sample_value)?
  def parse_sample_range
    value = parse_sample_value
    return nil if value.nil?
    skip_whitespaces
    if @scanner.scan(/~/)
      range_end = parse_sample_value
      value..range_end
    else
      value
    end
  end

  # Returns an Integer, or a Float when the sample has a fraction part,
  # scaled by the power of ten given after "c" or "e" (1c6 is 1000000).
  # Returns nil when no digits are found. Raises RuntimeError when the
  # digits after "." or after "c"/"e" are missing.
  def parse_sample_value
    value = parse_value
    return nil if value.nil?
    if @scanner.scan(/\./)
      skip_whitespaces
      decimal = @scanner.scan(/[0-9]+/)
      if decimal.nil?
        raise "no decimal: #{@scanner.inspect}"
      end
      value += Float("0.#{decimal}")
      skip_whitespaces
    end
    if @scanner.scan(/[ce]/)
      # Workaround for a spec bug. "e1" should be accepted.
      #
      # Spec:
      #   sampleValue = value ('.' digit+)? ([ce] digitPos digit+)?
      #   digit       = [0-9]
      #   digitPos    = [1-9]
      e = @scanner.scan(/[1-9][0-9]*/)
      if e.nil?
        raise "no exponent: #{@scanner.inspect}"
      end
      # The suffix is a power-of-ten exponent, not a multiplier.
      value *= 10 ** Integer(e, 10)
    end
    value
  end
end
|
383
|
+
private_constant :RuleParser
|
384
|
+
end
|
385
|
+
end
|