red-datasets 0.0.7 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +20 -4
- data/doc/text/news.md +102 -0
- data/lib/datasets.rb +19 -9
- data/lib/datasets/adult.rb +4 -3
- data/lib/datasets/cifar.rb +4 -12
- data/lib/datasets/cldr-plurals.rb +385 -0
- data/lib/datasets/communities.rb +198 -0
- data/lib/datasets/dataset.rb +20 -1
- data/lib/datasets/downloader.rb +54 -26
- data/lib/datasets/e-stat-japan.rb +320 -0
- data/lib/datasets/error.rb +4 -0
- data/lib/datasets/hepatitis.rb +207 -0
- data/lib/datasets/libsvm-dataset-list.rb +277 -0
- data/lib/datasets/libsvm.rb +135 -0
- data/lib/datasets/mnist.rb +0 -2
- data/lib/datasets/mushroom.rb +256 -0
- data/lib/datasets/penguins.rb +146 -0
- data/lib/datasets/postal-code-japan.rb +154 -0
- data/lib/datasets/rdatasets.rb +95 -0
- data/lib/datasets/table.rb +83 -3
- data/lib/datasets/tar_gz_readable.rb +14 -0
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia.rb +2 -10
- data/red-datasets.gemspec +4 -0
- data/test/run-test.rb +2 -0
- data/test/test-cldr-plurals.rb +180 -0
- data/test/test-communities.rb +290 -0
- data/test/test-dataset.rb +27 -0
- data/test/test-downloader.rb +29 -0
- data/test/test-e-stat-japan.rb +383 -0
- data/test/test-hepatitis.rb +74 -0
- data/test/test-libsvm-dataset-list.rb +47 -0
- data/test/test-libsvm.rb +205 -0
- data/test/test-mushroom.rb +80 -0
- data/test/test-penguins.rb +251 -0
- data/test/test-postal-code-japan.rb +69 -0
- data/test/test-rdatasets.rb +136 -0
- data/test/test-table.rb +123 -18
- metadata +88 -11
@@ -0,0 +1,154 @@
|
|
1
|
+
require "csv"
|
2
|
+
require "zip"
|
3
|
+
|
4
|
+
require_relative "dataset"
|
5
|
+
|
6
|
+
module Datasets
|
7
|
+
# Dataset of Japanese postal codes read from the ZIP archives published
# under https://www.post.japanpost.jp/zipcode/dl.
#
# The +reading+ option (one of VALID_READINGS) selects which archive is
# downloaded and therefore how each CSV row is mapped onto a Record.
class PostalCodeJapan < Dataset
  # One CSV row of the postal code data.
  #
  # For the :romaji reading only the postal code, the three reading
  # fields and the three address fields are filled from the row; the
  # codes and the change reason are nil and every flag is false.
  class Record < Struct.new(:organization_code,
                            :old_postal_code,
                            :postal_code,
                            :prefecture_reading,
                            :city_reading,
                            :address_reading,
                            :prefecture,
                            :city,
                            :address,
                            :have_multiple_postal_codes,
                            :have_address_number_per_koaza,
                            :have_chome,
                            :postal_code_is_shared,
                            :changed,
                            :change_reason)
    # Predicate-style aliases for the boolean members.
    alias_method :have_multiple_postal_codes?,
                 :have_multiple_postal_codes
    alias_method :have_address_number_per_koaza?,
                 :have_address_number_per_koaza
    alias_method :have_chome?,
                 :have_chome
    alias_method :postal_code_is_shared?,
                 :postal_code_is_shared
    alias_method :changed?,
                 :changed
  end

  # Accepted values for the +reading+ keyword of #initialize.
  # Frozen so the list cannot be altered by accident at runtime.
  VALID_READINGS = [
    :lowercase,
    :uppercase,
    :romaji,
  ].freeze

  # @param reading [Symbol] one of VALID_READINGS.
  # @raise [ArgumentError] when +reading+ is not in VALID_READINGS.
  def initialize(reading: :lowercase)
    super()
    @reading = reading
    unless VALID_READINGS.include?(@reading)
      # Built by interpolation rather than by appending to a string
      # literal, so it also works when string literals are frozen.
      candidates = VALID_READINGS.collect(&:inspect).join(", ")
      raise ArgumentError,
            ":reading must be one of [#{candidates}]: #{@reading.inspect}"
    end
    @metadata.id = "postal-code-japan-#{@reading}"
    @metadata.name = "Postal code in Japan (#{@reading})"
    @metadata.url = "https://www.post.japanpost.jp/zipcode/download.html"
    @metadata.licenses = [
      "CC0-1.0",
    ]
    @metadata.description = "Postal code in Japan (reading: #{@reading})"
  end

  # Yields one Record per CSV row of every file in the archive.
  #
  # @return [Enumerator] when called without a block.
  def each(&block)
    return to_enum(__method__) unless block_given?

    open_data do |input|
      # The raw bytes are interpreted as CP932 and converted to UTF-8.
      utf8_data = input.read.encode(Encoding::UTF_8, Encoding::CP932)
      # Quote handling is disabled in the parser; the double quote
      # characters are stripped from the fields instead.
      options = {
        quote_char: nil,
        strip: %Q["],
      }
      if @reading == :romaji
        CSV.parse(utf8_data, **options) do |row|
          # Columns 4-6 go to the reading members and columns 1-3 to
          # the prefecture/city/address members.
          yield(Record.new(nil,
                           nil,
                           row[0],
                           row[4],
                           row[5],
                           row[6],
                           row[1],
                           row[2],
                           row[3],
                           false,
                           false,
                           false,
                           false,
                           false,
                           nil))
        end
      else
        CSV.parse(utf8_data, **options) do |row|
          yield(Record.new(row[0],
                           row[1].rstrip,
                           row[2],
                           row[3],
                           row[4],
                           row[5],
                           row[6],
                           row[7],
                           row[8],
                           (row[9] == "1"),
                           (row[10] == "1"),
                           (row[11] == "1"),
                           (row[12] == "1"),
                           # Anything other than "0" counts as changed.
                           (row[13] != "0"),
                           convert_change_reason(row[14])))
        end
      end
    end
  end

  private
  # Downloads the archive for the current reading into the cache
  # directory (unless it is already there) and yields an input stream
  # for each file entry it contains.
  def open_data
    base_url = "https://www.post.japanpost.jp/zipcode/dl"
    archive_path =
      case @reading
      when :lowercase
        "/kogaki/zip/ken_all.zip"
      when :uppercase
        "/oogaki/zip/ken_all.zip"
      when :romaji
        "/roman/ken_all_rome.zip"
      end
    # Interpolated instead of appended to the literal with <<, so no
    # string literal is mutated.
    data_url = "#{base_url}#{archive_path}"
    data_path = cache_dir_path + "#{@reading}-ken-all.zip"
    unless data_path.exist?
      download(data_path, data_url)
    end

    Zip::File.open(data_path.to_s) do |zip_file|
      zip_file.each do |entry|
        next unless entry.file?
        entry.get_input_stream do |input|
          yield(input)
        end
      end
    end
  end

  # Maps the change reason column ("0".."6") to nil or a symbol.
  # Any other value, including nil, maps to :unknown.
  def convert_change_reason(reason)
    case reason
    when "0"
      nil
    when "1"
      :new
    when "2"
      :japanese_addressing_system
    when "3"
      :land_readjustment
    when "4"
      :postal_district_adjustment
    when "5"
      :correction
    when "6"
      :deletion
    else
      :unknown
    end
  end
end
|
154
|
+
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
require_relative "dataset"
|
2
|
+
require_relative "tar_gz_readable"
|
3
|
+
|
4
|
+
module Datasets
|
5
|
+
# Index of the datasets listed in the Rdatasets "datasets.csv" file.
class RdatasetsList < Dataset
  # One row of the index. Members are filled positionally from the
  # fields of a CSV row (see #filter).
  Record = Struct.new(:package, :dataset, :title,
                      :rows, :cols,
                      :n_binary, :n_character, :n_factor,
                      :n_logical, :n_numeric,
                      :csv, :doc)

  def initialize
    super
    @metadata.id = "rdatasets"
    @metadata.name = "Rdatasets"
    @metadata.url = "https://vincentarelbundock.github.io/Rdatasets/"
    @metadata.licenses = ["GPL-3"]
    @data_url = "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/datasets.csv"
    @data_path = cache_dir_path + "datasets.csv"
  end

  # Yields a Record for every index row whose "Package" and "Item"
  # columns equal the given values. A nil (or false) argument places
  # no restriction on its column.
  #
  # @return [Enumerator] when called without a block.
  def filter(package: nil, dataset: nil)
    unless block_given?
      return to_enum(__method__, package: package, dataset: dataset)
    end

    expected = {}
    expected["Package"] = package if package
    expected["Item"] = dataset if dataset
    each_row do |row|
      # With no restrictions the hash is empty and every row matches.
      next unless expected.all? {|header, value| row[header] == value }
      yield Record.new(*row.fields)
    end
  end

  # Yields every row of the index as a Record.
  def each(&block)
    filter(&block)
  end

  # Downloads the index file when it is not cached yet, then passes
  # each parsed CSV row (first line used as headers) to the block.
  private def each_row(&block)
    download(@data_path, @data_url) unless @data_path.exist?
    CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
      csv.each(&block)
    end
  end
end
|
57
|
+
|
58
|
+
# A single dataset from Rdatasets, looked up by package and dataset
# name in RdatasetsList.
class Rdatasets < Dataset
  # @param package_name [String] value matched against the index's
  #   "Package" column.
  # @param dataset_name [String] value matched against the index's
  #   "Item" column.
  # @raise [ArgumentError] when the index has no matching entry.
  def initialize(package_name, dataset_name)
    info = RdatasetsList.new.filter(package: package_name,
                                    dataset: dataset_name).first
    raise ArgumentError, "Unable to locate dataset #{package_name}/#{dataset_name}" unless info

    super()
    @metadata.id = "rdatasets-#{package_name}-#{dataset_name}"
    @metadata.name = "Rdatasets: #{package_name}: #{dataset_name}"
    @metadata.url = info.csv
    @metadata.licenses = ["GPL-3"]
    @metadata.description = info.title

    # The CSV is cached as "<dataset_name>.csv" inside this dataset's
    # cache directory.
    @data_path = cache_dir_path + (dataset_name + ".csv")

    @package_name = package_name
    @dataset_name = dataset_name
  end

  # Yields each CSV row as a Hash with Symbol keys; a column whose
  # header is the empty string is dropped.
  #
  # @return [Enumerator] when called without a block.
  def each(&block)
    return to_enum(__method__) unless block_given?

    download(@data_path, @metadata.url) unless @data_path.exist?
    CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
      csv.each do |row|
        named_fields = row.to_h.reject {|header, _value| header == "" }
        yield named_fields.transform_keys(&:to_sym)
      end
    end
  end
end
|
95
|
+
end
|
data/lib/datasets/table.rb
CHANGED
@@ -2,19 +2,99 @@ require "datasets/dictionary"
|
|
2
2
|
|
3
3
|
module Datasets
|
4
4
|
class Table
|
5
|
+
# A view of one row of a table: the values found at a fixed index in
# every column.
class Record
  include Enumerable

  # @param table the table whose columns are read; it must respond to
  #   #[], #each_column and (for #inspect) #dataset.
  # @param index [Integer] position of this record inside each column.
  def initialize(table, index)
    @table = table
    @index = index
  end

  # Returns this record's value in the given column.
  #
  # Returns nil when the table has no such column instead of failing
  # with NoMethodError on nil.
  def [](column_name_or_column_index)
    column = @table[column_name_or_column_index]
    column && column[@index]
  end

  # Yields (column_name, value) for every column of the table.
  #
  # @return [Enumerator] when called without a block.
  def each
    return to_enum(__method__) unless block_given?
    @table.each_column do |column_name, column_values|
      yield(column_name, column_values[@index])
    end
  end

  # Returns this record's values in column order.
  def values
    @table.each_column.map do |_column_name, column_values|
      column_values[@index]
    end
  end

  # Returns a Hash mapping each column name to this record's value.
  def to_h
    each_with_object({}) do |(column_name, column_value), hash|
      hash[column_name] = column_value
    end
  end

  def inspect
    "#<#{self.class.name} #{@table.dataset.metadata.name}[#{@index}] #{to_h.inspect}>"
  end
end
|
42
|
+
|
5
43
|
include Enumerable
|
6
44
|
|
45
|
+
attr_reader :dataset
|
7
46
|
# @param dataset the dataset this table is built from; exposed again
#   through the +dataset+ reader.
def initialize(dataset)
  # Starts out empty; presumably filled by the dictionary helpers
  # defined further down in this class (not visible here).
  @dictionaries = {}
  @dataset = dataset
end
|
11
50
|
|
12
|
-
def
|
51
|
+
# Returns the number of columns, i.e. the number of entries in
# +columner_data+.
def n_columns
  columner_data.length
end
|
54
|
+
alias_method :size, :n_columns
|
55
|
+
alias_method :length, :n_columns
|
56
|
+
|
57
|
+
# Returns the number of rows, taken from the size of the first
# column's values; 0 when there are no columns at all.
def n_rows
  leading_pair = columner_data.first
  leading_pair.nil? ? 0 : leading_pair[1].size
end
|
62
|
+
|
63
|
+
# Returns the column names, i.e. the keys of +columner_data+, in
# their stored order.
def column_names
  columner_data.keys
end
|
66
|
+
|
67
|
+
# Passes the block straight to +columner_data.each+, so every column
# is yielded as a (name, values) pair. Without a block the result is
# whatever +columner_data.each+ returns in that case.
def each_column(&block)
  columner_data.each(&block)
end
|
70
|
+
alias_method :each, :each_column
|
15
71
|
|
16
|
-
def
|
17
|
-
|
72
|
+
# Yields a Record for every row index from 0 up to n_rows - 1.
#
# @return [Enumerator] when called without a block.
def each_record
  unless block_given?
    return to_enum(__method__)
  end
  n_rows.times {|row_index| yield(Record.new(self, row_index)) }
end
|
78
|
+
|
79
|
+
# Returns the Record at the given row, or nil when the row is out of
# range. A negative row counts from the end (-1 is the last row).
def find_record(row)
  position = row < 0 ? row + n_rows : row
  out_of_range = position < 0 || position >= n_rows
  out_of_range ? nil : Record.new(self, position)
end
|
85
|
+
|
86
|
+
# Returns the values of one column.
#
# An Integer argument selects the column by its zero-based position
# (nil when no column has that position, which includes negative
# positions). Any other argument is passed through +normalize_name+
# and used as a key into +columner_data+.
def [](name_or_index)
  unless name_or_index.is_a?(Integer)
    return columner_data[normalize_name(name_or_index)]
  end

  found = columner_data.each_with_index.find do |_column, position|
    position == name_or_index
  end
  # +found+ is [[name, values], position] or nil.
  found && found[0][1]
end
|
19
99
|
|
20
100
|
def dictionary_encode(name)
|
data/lib/datasets/version.rb
CHANGED
data/lib/datasets/wikipedia.rb
CHANGED
@@ -52,7 +52,7 @@ module Datasets
|
|
52
52
|
end
|
53
53
|
|
54
54
|
private
|
55
|
-
def open_data
|
55
|
+
def open_data(&block)
|
56
56
|
base_name = "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
|
57
57
|
data_path = cache_dir_path + base_name
|
58
58
|
unless data_path.exist?
|
@@ -60,15 +60,7 @@ module Datasets
|
|
60
60
|
download(data_path, data_url)
|
61
61
|
end
|
62
62
|
|
63
|
-
|
64
|
-
pid = spawn("bzcat", data_path.to_s, {:out => output})
|
65
|
-
begin
|
66
|
-
output.close
|
67
|
-
yield(input)
|
68
|
-
ensure
|
69
|
-
input.close
|
70
|
-
Process.waitpid(pid)
|
71
|
-
end
|
63
|
+
extract_bz2(data_path, &block)
|
72
64
|
end
|
73
65
|
|
74
66
|
def type_in_path
|
data/red-datasets.gemspec
CHANGED
@@ -34,6 +34,10 @@ Gem::Specification.new do |spec|
|
|
34
34
|
spec.files += Dir.glob("doc/text/*")
|
35
35
|
spec.test_files += Dir.glob("test/**/*")
|
36
36
|
|
37
|
+
spec.add_runtime_dependency("csv", ">= 3.0.5")
|
38
|
+
spec.add_runtime_dependency("rexml")
|
39
|
+
spec.add_runtime_dependency("rubyzip")
|
40
|
+
|
37
41
|
spec.add_development_dependency("bundler")
|
38
42
|
spec.add_development_dependency("rake")
|
39
43
|
spec.add_development_dependency("test-unit")
|
data/test/run-test.rb
CHANGED
@@ -0,0 +1,180 @@
|
|
1
|
+
# Checks the locales produced by Datasets::CLDRPlurals against
# hand-written expectations.
class CLDRPluralsTest < Test::Unit::TestCase
  def setup
    @dataset = Datasets::CLDRPlurals.new
  end

  # Shorthand for building an expected Locale.
  def locale(*args)
    Datasets::CLDRPlurals::Locale.new(*args)
  end

  # Shorthand for building an expected Rule.
  def rule(*args)
    Datasets::CLDRPlurals::Rule.new(*args)
  end

  test("#each") do
    locales = @dataset.each.to_a
    # Compares the number of locales, the first locale and the fourth
    # locale from the end.
    assert_equal([
                   215,
                   expected_bm_locale,
                   expected_kw_locale,
                 ],
                 [
                   locales.size,
                   locales[0],
                   locales[-4],
                 ])
  end

  sub_test_case("#metadata") do
    test("#description") do
      description = @dataset.metadata.description
      assert do
        description.start_with?("Language plural rules in Unicode Common Locale Data Repository.")
      end
    end
  end

  private
  # Expected value of the first locale.
  def expected_bm_locale
    locale("bm",
           [
             rule("other",
                  nil,
                  [0..15, 100, 1000, 10000, 100000, 1000000, :elipsis],
                  [
                    0.0..1.5,
                    10.0, 100.0, 1000.0, 10000.0, 100000.0, 1000000.0,
                    :elipsis,
                  ])
           ])
  end

  # Expected value of the fourth locale from the end.
  def expected_kw_locale
    locale("kw",
           [
             rule("zero",
                  [:equal, "n", [0]],
                  [0],
                  [0.0, 0.00, 0.000, 0.0000]),
             rule("one",
                  [:equal, "n", [1]],
                  [1],
                  [1.0, 1.00, 1.000, 1.0000]),
             rule("two",
                  [:or,
                   [:equal,
                    [:mod, "n", 100],
                    [2, 22, 42, 62, 82]],
                   [:and,
                    [:equal, [:mod, "n", 1000], [0]],
                    [:equal,
                     [:mod, "n", 100000],
                     [1000..20000, 40000, 60000, 80000]]],
                   [:and,
                    [:not_equal, "n", [0]],
                    [:equal, [:mod, "n", 1000000], [100000]]]],
                  [
                    2, 22, 42, 62, 82,
                    102, 122, 142,
                    1000, 10000, 100000,
                    :elipsis,
                  ],
                  [
                    2.0, 22.0, 42.0, 62.0, 82.0,
                    102.0, 122.0, 142.0,
                    1000.0, 10000.0, 100000.0,
                    :elipsis,
                  ]),
             rule("few",
                  [:equal,
                   [:mod, "n", 100],
                   [3, 23, 43, 63, 83]],
                  [
                    3, 23, 43, 63, 83,
                    103, 123, 143,
                    1003,
                    :elipsis,
                  ],
                  [
                    3.0, 23.0, 43.0, 63.0, 83.0,
                    103.0, 123.0, 143.0,
                    1003.0,
                    :elipsis,
                  ]),
             rule("many",
                  [:and,
                   [:not_equal, "n", [1]],
                   [:equal,
                    [:mod, "n", 100],
                    [1, 21, 41, 61, 81]]],
                  [
                    21, 41, 61, 81,
                    101, 121, 141, 161,
                    1001,
                    :elipsis,
                  ],
                  [
                    21.0, 41.0, 61.0, 81.0,
                    101.0, 121.0, 141.0, 161.0,
                    1001.0,
                    :elipsis,
                  ]),
             rule("other",
                  nil,
                  [4..19, 100, 1004, 1000000, :elipsis],
                  [
                    0.1..0.9,
                    1.1..1.7,
                    10.0, 100.0, 1000.1, 1000000.0,
                    :elipsis,
                  ]),
           ])
  end
end
|