red-datasets 0.0.7 → 0.0.8

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: 222271b814e3a5ce23b5e0dd1d2578bffb84afdab10110b0869985c6056bfd3b
-   data.tar.gz: ac30931b3317ab04afd394b28a45a9206c784d78b3bcaf98fc3a2a48227c7930
+   metadata.gz: c7a9199546e7a001c97e45c6fa28db15c0d96b748e527d9705dfee4e4b1db6fd
+   data.tar.gz: c659f6ae1e658ad91210e4427be063463124d89ef90388d34ebfb73ceb49068a
  SHA512:
-   metadata.gz: 8a94a3d66baaed4948904e97dc53100d73ae96c528c09b02252caabd05b8545587abf6fbcba3a578725812327a9a2c8827bbb7e283ccd3d7e66753bf30035e2e
-   data.tar.gz: 2ab44b5aa3ee5da0ac8e8307546c71942938de4497bfec05fc929715a4e5ef6df1cb091bce0d5f12978582d2c9fa7eaffff9edd54be0d845627dccfce42a63dd
+   metadata.gz: d8a23c4a165a596df22ce5bbe1f8f0cd5c0f002deecafbb26cd5e5f75abb3c0224c1013898162a67787159258d1b801395fc4d949c17939d95940664cffd5600
+   data.tar.gz: f2fd4eb733e6205f138c4005627e815e3787040a8a4b6cce7eca9fd5d4adaa12263e17e8f5bd9394a851e5210f28736ee3c682c81e110da304ae17fb3f0bedba
data/README.md CHANGED
@@ -1,8 +1,4 @@
- # README
-
- ## Name
-
- Red Datasets
+ # Red Datasets
 
  ## Description
 
@@ -16,6 +12,20 @@ You can use datasets easily because you can access each dataset with multiple wa
  % gem install red-datasets
  ```
 
+ ## Available datasets
+
+ TODO: Document them in source code to list in document: https://www.rubydoc.info/gems/red-datasets
+
+ * Adult Dataset
+ * CIFAR-10 Dataset
+ * CIFAR-100 Dataset
+ * Fashion-MNIST
+ * Iris Dataset
+ * MNIST database
+ * The Penn Treebank Project
+ * Wikipedia
+ * Wine Dataset
+
  ## Usage
 
  Here is an example to access [Iris Data Set](https://archive.ics.uci.edu/ml/datasets/iris) by `#each` or `Table#to_h` or `Table#fetch_values`.
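The example itself is unchanged context, so the diff cuts it off here. For orientation only, a minimal sketch of the `#each` access pattern that sentence refers to (assuming `Datasets::Iris` is loaded via `require "datasets"`; the authoritative version, including the `Table#to_h` and `Table#fetch_values` forms, is in the full README):

```ruby
require "datasets"

iris = Datasets::Iris.new

# Row-wise access: each record is a Struct-like object with a #to_h helper.
iris.each do |record|
  p record.to_h
end
```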
data/doc/text/news.md CHANGED
@@ -1,5 +1,22 @@
  # News
 
+ ## 0.0.8 - 2019-03-24
+
+ ### Improvements
+
+   * Improved README.
+     [GitHub#40][Patch by kojix2]
+
+   * `Datasets::PostalCodeJapan`: Added.
+
+   * `Datasets::LIBSVMDatasetList`: Added.
+
+   * `Datasets::LIBSVM`: Added.
+
+ ### Thanks
+
+   * kojix2
+
  ## 0.0.7 - 2018-11-21
 
  ### Improvements
data/lib/datasets/libsvm-dataset-list.rb ADDED
@@ -0,0 +1,137 @@
+ require "English"
+ require "rexml/document"
+
+ require_relative "dataset"
+
+ module Datasets
+   class LIBSVMDatasetList < Dataset
+     File = Struct.new(:name,
+                       :url,
+                       :note)
+     class Record < Struct.new(:name,
+                               :source,
+                               :preprocessing,
+                               :n_classes,
+                               :n_data,
+                               :n_features,
+                               :files)
+       def to_h
+         hash = super
+         hash[:files] = hash[:files].collect(&:to_h)
+         hash
+       end
+     end
+
+     def initialize
+       super()
+       @metadata.id = "libsvm-dataset-list"
+       @metadata.name = "LIBSVM dataset list"
+       @metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
+       @metadata.description = lambda do
+         extract_description
+       end
+     end
+
+     def each
+       return to_enum(__method__) unless block_given?
+
+       open_data do |input|
+         # TODO: Improve performance
+         document = REXML::Document.new(input)
+         is_header = true
+         document.each_element("//tr") do |tr|
+           if is_header
+             is_header = false
+             next
+           end
+           name = tr.elements.first
+           a = name.elements.first
+           href = a.attributes["href"]
+           record = Record.new
+           record.name = a.text
+           record.files = []
+           parse_detail(href, record)
+           yield(record)
+         end
+       end
+     end
+
+     private
+     def open_data
+       data_path = cache_dir_path + "index.html"
+       unless data_path.exist?
+         download(data_path, @metadata.url)
+       end
+       ::File.open(data_path) do |input|
+         yield(input)
+       end
+     end
+
+     def extract_description
+       open_data do |input|
+         document = REXML::Document.new(input)
+         description = []
+         in_content = false
+         document.each_element("//body/*") do |element|
+           unless in_content
+             in_content = (element.name == "h1")
+             next
+           end
+           break if element.name == "hr"
+           content = extract_text(element)
+           description << content unless content.empty?
+         end
+         description.join("\n\n")
+       end
+     end
+
+     def extract_text(element)
+       texts = REXML::XPath.match(element, ".//text()")
+       texts.join("").gsub(/[ \t\n]+/, " ").strip
+     end
+
+     def open_detail(detail)
+       data_path = cache_dir_path + detail
+       unless data_path.exist?
+         download(data_path, @metadata.url + detail)
+       end
+       ::File.open(data_path) do |input|
+         yield(input)
+       end
+     end
+
+     def parse_detail(href, record)
+       path, id = href.split("#")
+       open_detail(path) do |detail|
+         detail_document = REXML::Document.new(detail)
+         anchor = REXML::XPath.match(detail_document, "//*[@name='#{id}']")[0]
+         ul = anchor.next_sibling
+         ul.each_element do |li|
+           text = extract_text(li)
+           case text
+           when /\ASource: /
+             record.source = $POSTMATCH
+           when /\APreprocessing: /
+             record.preprocessing = $POSTMATCH
+           when /\A\# of classes: (\d+)/
+             record.n_classes = Integer($1, 10)
+           when /\A\# of data: ([\d,]+)/
+             record.n_data = Integer($1.gsub(/,/, ""), 10)
+           when /\A\# of features: ([\d,]+)/
+             record.n_features = Integer($1.gsub(/,/, ""), 10)
+           when /\AFiles:/
+             li.elements.first.each_element do |file_li|
+               file_a = file_li.elements.first
+               file = File.new
+               file.name = file_a.text
+               file.url = @metadata.url + file_a.attributes["href"]
+               file_note = file_li.text
+               file.note = file_note.strip.gsub(/[()]/, "") if file_note
+               record.files << file
+             end
+           end
+         end
+       end
+     end
+   end
+ end
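For orientation, a minimal usage sketch of the class added above (hedged: it assumes the class is loaded through `require "datasets"`, as wired up in `data/lib/datasets.rb` later in this diff, and the records reflect whatever the LIBSVM index page serves at run time):

```ruby
require "datasets"

list = Datasets::LIBSVMDatasetList.new

# Each record describes one dataset scraped from the LIBSVM index page,
# including the downloadable files discovered on its detail page.
list.each do |record|
  puts "#{record.name}: #{record.n_data.inspect} data, #{record.n_features.inspect} features"
  record.files.each do |file|
    puts "  #{file.name} (note: #{file.note.inspect}) #{file.url}"
  end
end
```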
data/lib/datasets/libsvm.rb ADDED
@@ -0,0 +1,143 @@
+ require "csv"
+
+ require_relative "dataset"
+
+ module Datasets
+   class LIBSVM < Dataset
+     class Record
+       attr_reader :label
+       attr_reader :features
+       def initialize(label, features)
+         @label = label
+         @features = features
+       end
+
+       def [](index)
+         @features[index]
+       end
+
+       def to_h
+         hash = {
+           label: @label,
+         }
+         @features.each_with_index do |feature, i|
+           hash[i] = feature
+         end
+         hash
+       end
+
+       def values
+         [@label] + @features
+       end
+     end
+
+     def initialize(name,
+                    note: nil,
+                    default_feature_value: 0)
+       super()
+       @libsvm_dataset_metadata = fetch_dataset_info(name)
+       @file = choose_file(note)
+       @default_feature_value = default_feature_value
+       @metadata.id = "libsvm-#{normalize_name(name)}"
+       @metadata.name = "LIBSVM dataset: #{name}"
+       @metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
+     end
+
+     def each
+       return to_enum(__method__) unless block_given?
+
+       open_data do |input|
+         n_features = @libsvm_dataset_metadata.n_features
+         csv = CSV.new(input, col_sep: " ")
+         csv.each do |row|
+           label = parse_label(row.shift)
+           features = [@default_feature_value] * n_features
+           row.each do |column|
+             next if column.nil?
+             index, value = column.split(":", 2)
+             features[Integer(index, 10) - 1] = parse_value(value)
+           end
+           yield(Record.new(label, features))
+         end
+       end
+     end
+
+     private
+     def fetch_dataset_info(name)
+       list = LIBSVMDatasetList.new
+       available_datasets = []
+       list.each do |record|
+         available_datasets << record.name
+         if record.name == name
+           return record
+         end
+       end
+       message = "unavailable LIBSVM dataset: #{name.inspect}: "
+       message << "available datasets: ["
+       message << available_datasets.collect(&:inspect).join(", ")
+       message << "]"
+       raise ArgumentError, message
+     end
+
+     def choose_file(note)
+       files = @libsvm_dataset_metadata.files
+       return files.first if note.nil?
+
+       available_notes = []
+       @libsvm_dataset_metadata.files.find do |file|
+         return file if file.note == note
+         available_notes << file.note if file.note
+       end
+
+       name = @libsvm_dataset_metadata.name
+       message = "unavailable note: #{name}: #{note.inspect}: "
+       message << "available notes: ["
+       message << available_notes.collect(&:inspect).join(", ")
+       message << "]"
+       raise ArgumentError, message
+     end
+
+     def open_data(&block)
+       data_path = cache_dir_path + @file.name
+       unless data_path.exist?
+         download(data_path, @file.url)
+       end
+       if data_path.extname == ".bz2"
+         input, output = IO.pipe
+         pid = spawn("bzcat", data_path.to_s, {:out => output})
+         begin
+           output.close
+           yield(input)
+         ensure
+           input.close
+           Process.waitpid(pid)
+         end
+       else
+         File.open(data_path, &block)
+       end
+     end
+
+     def normalize_name(name)
+       name.gsub(/[()]/, "").gsub(/[ _;]+/, "-").downcase
+     end
+
+     def parse_label(label)
+       labels = label.split(",").collect do |value|
+         parse_value(value)
+       end
+       if labels.size == 1
+         labels[0]
+       else
+         labels
+       end
+     end
+
+     def parse_value(value)
+       if value.include?(".")
+         Float(value)
+       else
+         Integer(value, 10)
+       end
+     end
+   end
+ end
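A short usage sketch of `Datasets::LIBSVM` (hedged: the dataset name "a1a" and the "testing" note are simply the values exercised by the new tests later in this diff; availability depends on the LIBSVM index at run time):

```ruby
require "datasets"

# Training split of the "a1a" binary classification dataset.
a1a = Datasets::LIBSVM.new("a1a")
record = a1a.first
p record.label          # => -1
p record.features.size  # => 123; features absent from a row default to 0

# The companion file is selected by the note shown on the index page.
a1a_test = Datasets::LIBSVM.new("a1a", note: "testing")
p a1a_test.first.label
```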
data/lib/datasets/postal-code-japan.rb ADDED
@@ -0,0 +1,154 @@
+ require "csv"
+ require "zip"
+
+ require_relative "dataset"
+
+ module Datasets
+   class PostalCodeJapan < Dataset
+     class Record < Struct.new(:organization_code,
+                               :old_postal_code,
+                               :postal_code,
+                               :prefecture_reading,
+                               :city_reading,
+                               :address_reading,
+                               :prefecture,
+                               :city,
+                               :address,
+                               :have_multiple_postal_codes,
+                               :have_address_number_per_koaza,
+                               :have_chome,
+                               :postal_code_is_shared,
+                               :changed,
+                               :change_reason)
+       alias_method :have_multiple_postal_codes?,
+                    :have_multiple_postal_codes
+       alias_method :have_address_number_per_koaza?,
+                    :have_address_number_per_koaza
+       alias_method :have_chome?,
+                    :have_chome
+       alias_method :postal_code_is_shared?,
+                    :postal_code_is_shared
+       alias_method :changed?,
+                    :changed
+     end
+
+     VALID_READINGS = [
+       :lowercase,
+       :uppercase,
+       :romaji,
+     ]
+     def initialize(reading: :lowercase)
+       super()
+       @reading = reading
+       unless VALID_READINGS.include?(@reading)
+         message = ":reading must be one of ["
+         message << VALID_READINGS.collect(&:inspect).join(", ")
+         message << "]: #{@reading.inspect}"
+         raise ArgumentError, message
+       end
+       @metadata.id = "postal-code-japan-#{@reading}"
+       @metadata.name = "Postal code in Japan (#{@reading})"
+       @metadata.url = "https://www.post.japanpost.jp/zipcode/download.html"
+       @metadata.licenses = [
+         "CC0-1.0",
+       ]
+       @metadata.description = "Postal code in Japan (reading: #{@reading})"
+     end
+
+     def each(&block)
+       return to_enum(__method__) unless block_given?
+
+       open_data do |input|
+         utf8_data = input.read.encode(Encoding::UTF_8, Encoding::CP932)
+         options = {
+           quote_char: nil,
+           strip: %Q["],
+         }
+         if @reading == :romaji
+           CSV.parse(utf8_data, **options) do |row|
+             yield(Record.new(nil,
+                              nil,
+                              row[0],
+                              row[4],
+                              row[5],
+                              row[6],
+                              row[1],
+                              row[2],
+                              row[3],
+                              false,
+                              false,
+                              false,
+                              false,
+                              false,
+                              nil))
+           end
+         else
+           CSV.parse(utf8_data, **options) do |row|
+             yield(Record.new(row[0],
+                              row[1].rstrip,
+                              row[2],
+                              row[3],
+                              row[4],
+                              row[5],
+                              row[6],
+                              row[7],
+                              row[8],
+                              (row[9] == "1"),
+                              (row[10] == "1"),
+                              (row[11] == "1"),
+                              (row[12] == "1"),
+                              (row[13] != "0"),
+                              convert_change_reason(row[14])))
+           end
+         end
+       end
+     end
+
+     private
+     def open_data
+       data_url = "https://www.post.japanpost.jp/zipcode/dl"
+       case @reading
+       when :lowercase
+         data_url << "/kogaki/zip/ken_all.zip"
+       when :uppercase
+         data_url << "/oogaki/zip/ken_all.zip"
+       when :romaji
+         data_url << "/roman/ken_all_rome.zip"
+       end
+       data_path = cache_dir_path + "#{@reading}-ken-all.zip"
+       unless data_path.exist?
+         download(data_path, data_url)
+       end
+
+       Zip::File.open(data_path.to_s) do |zip_file|
+         zip_file.each do |entry|
+           next unless entry.file?
+           entry.get_input_stream do |input|
+             yield(input)
+           end
+         end
+       end
+     end
+
+     def convert_change_reason(reason)
+       case reason
+       when "0"
+         nil
+       when "1"
+         :new
+       when "2"
+         :japanese_addressing_system
+       when "3"
+         :land_readjustment
+       when "4"
+         :postal_district_adjustment
+       when "5"
+         :correction
+       when "6"
+         :deletion
+       else
+         :unknown
+       end
+     end
+   end
+ end
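And a brief usage sketch of `Datasets::PostalCodeJapan` (hedged: the record printed first depends on the ken_all CSV currently published by Japan Post):

```ruby
require "datasets"

# :lowercase (kogaki) kana readings; :uppercase and :romaji are also accepted.
postal_codes = Datasets::PostalCodeJapan.new(reading: :lowercase)

record = postal_codes.first
puts "#{record.postal_code} #{record.prefecture} #{record.city} #{record.address}"
puts "multiple postal codes? #{record.have_multiple_postal_codes?}"
```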
data/lib/datasets/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Datasets
-   VERSION = "0.0.7"
+   VERSION = "0.0.8"
  end
data/lib/datasets.rb CHANGED
@@ -1,10 +1,13 @@
- require "datasets/version"
+ require_relative "datasets/version"
 
- require "datasets/adult"
- require "datasets/cifar"
- require "datasets/fashion-mnist"
- require "datasets/iris"
- require "datasets/mnist"
- require "datasets/penn-treebank"
- require "datasets/wikipedia"
- require "datasets/wine"
+ require_relative "datasets/adult"
+ require_relative "datasets/cifar"
+ require_relative "datasets/fashion-mnist"
+ require_relative "datasets/iris"
+ require_relative "datasets/libsvm"
+ require_relative "datasets/libsvm-dataset-list"
+ require_relative "datasets/mnist"
+ require_relative "datasets/penn-treebank"
+ require_relative "datasets/postal-code-japan"
+ require_relative "datasets/wikipedia"
+ require_relative "datasets/wine"
data/red-datasets.gemspec CHANGED
@@ -34,6 +34,9 @@ Gem::Specification.new do |spec|
    spec.files += Dir.glob("doc/text/*")
    spec.test_files += Dir.glob("test/**/*")
 
+   spec.add_runtime_dependency("csv", ">= 3.0.5")
+   spec.add_runtime_dependency("rubyzip")
+
    spec.add_development_dependency("bundler")
    spec.add_development_dependency("rake")
    spec.add_development_dependency("test-unit")
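The two runtime dependencies added above are installed automatically by `gem install red-datasets`; for Bundler users the equivalent is a single Gemfile line (the version constraint is shown only as an illustration):

```ruby
# Gemfile
gem "red-datasets", ">= 0.0.8"
```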
data/test/test-libsvm-dataset-list.rb ADDED
@@ -0,0 +1,47 @@
+ class LIBSVMDatasetListTest < Test::Unit::TestCase
+   def setup
+     @dataset = Datasets::LIBSVMDatasetList.new
+   end
+
+   test("#each") do
+     assert_equal({
+                    name: "a1a",
+                    source: "UCI / Adult",
+                    preprocessing:
+                      "The original Adult data set has 14 features, " +
+                      "among which six are continuous and eight are " +
+                      "categorical. In this data set, continuous features " +
+                      "are discretized into quantiles, and each quantile is " +
+                      "represented by a binary feature. Also, a categorical " +
+                      "feature with m categories is converted to m binary " +
+                      "features. Details on how each feature is converted " +
+                      "can be found in the beginning of each file from this " +
+                      "page. [JP98a]",
+                    n_classes: 2,
+                    n_data: 1605,
+                    n_features: 123,
+                    files: [
+                      {
+                        name: "a1a",
+                        url: "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a1a",
+                        note: nil,
+                      },
+                      {
+                        name: "a1a.t",
+                        url: "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a1a.t",
+                        note: "testing",
+                      }
+                    ],
+                  },
+                  @dataset.first.to_h)
+   end
+
+   sub_test_case("#metadata") do
+     test("#description") do
+       description = @dataset.metadata.description
+       assert do
+         description.start_with?("This page contains many classification, ")
+       end
+     end
+   end
+ end
data/test/test-libsvm.rb ADDED
@@ -0,0 +1,205 @@
+ class LIBSVMDatasetTest < Test::Unit::TestCase
+   test(":note") do
+     dataset = Datasets::LIBSVM.new("a1a", note: "testing")
+     hash = {label: -1}
+     n_features = 123
+     n_features.times do |i|
+       hash[i] = 0
+     end
+     [5, 7, 14, 19, 39, 40, 51, 63, 67, 73, 74, 76, 78, 83].each do |i|
+       hash[i - 1] = 1
+     end
+     assert_equal(hash,
+                  dataset.first.to_h)
+   end
+
+   test(":default_feature_value") do
+     dataset = Datasets::LIBSVM.new("a1a", default_feature_value: nil)
+     hash = {label: -1}
+     n_features = 123
+     n_features.times do |i|
+       hash[i] = nil
+     end
+     [3, 11, 14, 19, 39, 42, 55, 64, 67, 73, 75, 76, 80, 83].each do |i|
+       hash[i - 1] = 1
+     end
+     assert_equal(hash,
+                  dataset.first.to_h)
+   end
+
+   test("classification") do
+     dataset = Datasets::LIBSVM.new("a1a")
+     hash = {label: -1}
+     n_features = 123
+     n_features.times do |i|
+       hash[i] = 0
+     end
+     [3, 11, 14, 19, 39, 42, 55, 64, 67, 73, 75, 76, 80, 83].each do |i|
+       hash[i - 1] = 1
+     end
+     assert_equal(hash,
+                  dataset.first.to_h)
+   end
+
+   test("regression") do
+     dataset = Datasets::LIBSVM.new("abalone")
+     hash = {label: 15}
+     n_features = 8
+     n_features.times do |i|
+       hash[i] = 0
+     end
+     [
+       [1, 1],
+       [2, 0.455],
+       [3, 0.365],
+       [4, 0.095],
+       [5, 0.514],
+       [6, 0.2245],
+       [7, 0.101],
+       [8, 0.15],
+     ].each do |i, value|
+       hash[i - 1] = value
+     end
+     assert_equal(hash,
+                  dataset.first.to_h)
+   end
+
+   test("multi-label") do
+     dataset = Datasets::LIBSVM.new("mediamill (exp1)")
+     hash = {label: [65, 67, 11, 31]}
+     n_features = 120
+     n_features.times do |i|
+       hash[i] = 0
+     end
+     [
+       [1, 0.380877],
+       [2, 0.494079],
+       [3, 0.540009],
+       [4, 0.422926],
+       [5, 0.158318],
+       [6, 0.326975],
+       [7, 0.390861],
+       [8, 0.527121],
+       [9, 0.254052],
+       [10, 0.223731],
+       [11, 0.040285],
+       [12, 0.141133],
+       [13, 0.112249],
+       [14, 0.263171],
+       [15, 0.147020],
+       [16, 0.472414],
+       [17, 0.592614],
+       [18, 0.653138],
+       [19, 0.499867],
+       [20, 0.196520],
+       [21, 0.403892],
+       [22, 0.482395],
+       [23, 0.619219],
+       [24, 0.320346],
+       [25, 0.281251],
+       [26, 0.054750],
+       [27, 0.180459],
+       [28, 0.139964],
+       [29, 0.319925],
+       [30, 0.181216],
+       [31, 0.364294],
+       [32, 0.407211],
+       [33, 0.368926],
+       [34, 0.427661],
+       [35, 0.211391],
+       [36, 0.364345],
+       [37, 0.370710],
+       [38, 0.409107],
+       [39, 0.289299],
+       [40, 0.243053],
+       [41, 0.063121],
+       [42, 0.193587],
+       [43, 0.158755],
+       [44, 0.316054],
+       [45, 0.197410],
+       [46, 0.656168],
+       [47, 0.678760],
+       [48, 0.650831],
+       [49, 0.674636],
+       [50, 0.492428],
+       [51, 0.623887],
+       [52, 0.610622],
+       [53, 0.678219],
+       [54, 0.574774],
+       [55, 0.523073],
+       [56, 0.206804],
+       [57, 0.496294],
+       [58, 0.429221],
+       [59, 0.586611],
+       [60, 0.471550],
+       [61, 0.284480],
+       [62, 0.432466],
+       [63, 0.498075],
+       [64, 0.408141],
+       [65, 0.102713],
+       [66, 0.303028],
+       [67, 0.309501],
+       [68, 0.444855],
+       [69, 0.191727],
+       [70, 0.174895],
+       [71, 0.034143],
+       [72, 0.153099],
+       [73, 0.068318],
+       [74, 0.217020],
+       [75, 0.099688],
+       [76, 0.409862],
+       [77, 0.561918],
+       [78, 0.612031],
+       [79, 0.514471],
+       [80, 0.146015],
+       [81, 0.398807],
+       [82, 0.383295],
+       [83, 0.548485],
+       [84, 0.282937],
+       [85, 0.252712],
+       [86, 0.051008],
+       [87, 0.223110],
+       [88, 0.098112],
+       [89, 0.299672],
+       [90, 0.144873],
+       [91, 0.308488],
+       [92, 0.358478],
+       [93, 0.352077],
+       [94, 0.394686],
+       [95, 0.157513],
+       [96, 0.339370],
+       [97, 0.321558],
+       [98, 0.341373],
+       [99, 0.247969],
+       [100, 0.206070],
+       [101, 0.061001],
+       [102, 0.216793],
+       [103, 0.112389],
+       [104, 0.273648],
+       [105, 0.152745],
+       [106, 0.598081],
+       [107, 0.621687],
+       [108, 0.607213],
+       [109, 0.644025],
+       [110, 0.394948],
+       [111, 0.593651],
+       [112, 0.551529],
+       [113, 0.574392],
+       [114, 0.511032],
+       [115, 0.463997],
+       [116, 0.202034],
+       [117, 0.492341],
+       [118, 0.317983],
+       [119, 0.547807],
+       [120, 0.393778],
+     ].each do |i, value|
+       hash[i - 1] = value
+     end
+     assert_equal(hash,
+                  dataset.first.to_h)
+   end
+
+   test("string") do
+     # TODO
+   end
+ end
data/test/test-postal-code-japan.rb ADDED
@@ -0,0 +1,69 @@
+ class PostalCodeJapanTest < Test::Unit::TestCase
+   sub_test_case(":reading") do
+     test(":lowercase") do
+       dataset = Datasets::PostalCodeJapan.new(reading: :lowercase)
+       assert_equal({
+                      organization_code: "01101",
+                      old_postal_code: "060",
+                      postal_code: "0600000",
+                      prefecture_reading: "ホッカイドウ",
+                      city_reading: "サッポロシチュウオウク",
+                      address_reading: "イカニケイサイガナイバアイ",
+                      prefecture: "北海道",
+                      city: "札幌市中央区",
+                      address: "以下に掲載がない場合",
+                      have_multiple_postal_codes: false,
+                      have_address_number_per_koaza: false,
+                      have_chome: false,
+                      postal_code_is_shared: false,
+                      changed: false,
+                      change_reason: nil,
+                    },
+                    dataset.first.to_h)
+     end
+
+     test(":uppercase") do
+       dataset = Datasets::PostalCodeJapan.new(reading: :uppercase)
+       assert_equal({
+                      organization_code: "01101",
+                      old_postal_code: "060",
+                      postal_code: "0600000",
+                      prefecture_reading: "ホツカイドウ",
+                      city_reading: "サツポロシチユウオウク",
+                      address_reading: "イカニケイサイガナイバアイ",
+                      prefecture: "北海道",
+                      city: "札幌市中央区",
+                      address: "以下に掲載がない場合",
+                      have_multiple_postal_codes: false,
+                      have_address_number_per_koaza: false,
+                      have_chome: false,
+                      postal_code_is_shared: false,
+                      changed: false,
+                      change_reason: nil,
+                    },
+                    dataset.first.to_h)
+     end
+
+     test(":romaji") do
+       dataset = Datasets::PostalCodeJapan.new(reading: :romaji)
+       assert_equal({
+                      organization_code: nil,
+                      old_postal_code: nil,
+                      postal_code: "0600000",
+                      prefecture_reading: "HOKKAIDO",
+                      city_reading: "SAPPORO SHI CHUO KU",
+                      address_reading: "IKANIKEISAIGANAIBAAI",
+                      prefecture: "北海道",
+                      city: "札幌市 中央区",
+                      address: "以下に掲載がない場合",
+                      have_multiple_postal_codes: false,
+                      have_address_number_per_koaza: false,
+                      have_chome: false,
+                      postal_code_is_shared: false,
+                      changed: false,
+                      change_reason: nil,
+                    },
+                    dataset.first.to_h)
+     end
+   end
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: red-datasets
  version: !ruby/object:Gem::Version
-   version: 0.0.7
+   version: 0.0.8
  platform: ruby
  authors:
  - tomisuker
@@ -9,8 +9,36 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2018-11-20 00:00:00.000000000 Z
+ date: 2019-03-24 00:00:00.000000000 Z
  dependencies:
+ - !ruby/object:Gem::Dependency
+   name: csv
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 3.0.5
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 3.0.5
+ - !ruby/object:Gem::Dependency
+   name: rubyzip
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
  - !ruby/object:Gem::Dependency
    name: bundler
    requirement: !ruby/object:Gem::Requirement
@@ -106,9 +134,12 @@ files:
  - lib/datasets/downloader.rb
  - lib/datasets/fashion-mnist.rb
  - lib/datasets/iris.rb
+ - lib/datasets/libsvm-dataset-list.rb
+ - lib/datasets/libsvm.rb
  - lib/datasets/metadata.rb
  - lib/datasets/mnist.rb
  - lib/datasets/penn-treebank.rb
+ - lib/datasets/postal-code-japan.rb
  - lib/datasets/table.rb
  - lib/datasets/version.rb
  - lib/datasets/wikipedia.rb
@@ -121,8 +152,11 @@ files:
  - test/test-dictionary.rb
  - test/test-fashion-mnist.rb
  - test/test-iris.rb
+ - test/test-libsvm-dataset-list.rb
+ - test/test-libsvm.rb
  - test/test-mnist.rb
  - test/test-penn-treebank.rb
+ - test/test-postal-code-japan.rb
  - test/test-table.rb
  - test/test-wikipedia.rb
  - test/test-wine.rb
@@ -146,20 +180,23 @@ required_rubygems_version: !ruby/object:Gem::Requirement
        version: '0'
  requirements: []
  rubyforge_project:
- rubygems_version: 3.0.0.beta2
+ rubygems_version: 2.7.6
  signing_key:
  specification_version: 4
  summary: Red Datasets provides classes that provide common datasets such as iris dataset.
  test_files:
- - test/test-wine.rb
- - test/run-test.rb
- - test/test-cifar.rb
- - test/test-fashion-mnist.rb
- - test/test-wikipedia.rb
  - test/test-iris.rb
- - test/helper.rb
+ - test/test-wikipedia.rb
+ - test/test-fashion-mnist.rb
+ - test/test-wine.rb
+ - test/test-postal-code-japan.rb
  - test/test-mnist.rb
- - test/test-table.rb
+ - test/helper.rb
  - test/test-adult.rb
+ - test/test-libsvm.rb
+ - test/run-test.rb
+ - test/test-table.rb
+ - test/test-cifar.rb
+ - test/test-libsvm-dataset-list.rb
  - test/test-penn-treebank.rb
  - test/test-dictionary.rb