red-datasets 0.0.7 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 222271b814e3a5ce23b5e0dd1d2578bffb84afdab10110b0869985c6056bfd3b
- data.tar.gz: ac30931b3317ab04afd394b28a45a9206c784d78b3bcaf98fc3a2a48227c7930
+ metadata.gz: c7a9199546e7a001c97e45c6fa28db15c0d96b748e527d9705dfee4e4b1db6fd
+ data.tar.gz: c659f6ae1e658ad91210e4427be063463124d89ef90388d34ebfb73ceb49068a
  SHA512:
- metadata.gz: 8a94a3d66baaed4948904e97dc53100d73ae96c528c09b02252caabd05b8545587abf6fbcba3a578725812327a9a2c8827bbb7e283ccd3d7e66753bf30035e2e
- data.tar.gz: 2ab44b5aa3ee5da0ac8e8307546c71942938de4497bfec05fc929715a4e5ef6df1cb091bce0d5f12978582d2c9fa7eaffff9edd54be0d845627dccfce42a63dd
+ metadata.gz: d8a23c4a165a596df22ce5bbe1f8f0cd5c0f002deecafbb26cd5e5f75abb3c0224c1013898162a67787159258d1b801395fc4d949c17939d95940664cffd5600
+ data.tar.gz: f2fd4eb733e6205f138c4005627e815e3787040a8a4b6cce7eca9fd5d4adaa12263e17e8f5bd9394a851e5210f28736ee3c682c81e110da304ae17fb3f0bedba
data/README.md CHANGED
@@ -1,8 +1,4 @@
- # README
-
- ## Name
-
- Red Datasets
+ # Red Datasets

  ## Description

@@ -16,6 +12,20 @@ You can use datasets easily because you can access each dataset with multiple wa
  % gem install red-datasets
  ```

+ ## Available datasets
+
+ TODO: Document them in source code to list in document: https://www.rubydoc.info/gems/red-datasets
+
+ * Adult Dataset
+ * CIFAR-10 Dataset
+ * CIFAR-100 Dataset
+ * Fashion-MNIST
+ * Iris Dataset
+ * MNIST database
+ * The Penn Treebank Project
+ * Wikipedia
+ * Wine Dataset
+
  ## Usage

  Here is an example to access [Iris Data Set](https://archive.ics.uci.edu/ml/datasets/iris) by `#each` or `Table#to_h` or `Table#fetch_values`.
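The usage example that sentence introduces is unchanged in this release, so it does not appear in the hunk. A minimal sketch of that access pattern, assuming the `sepal_length`/`petal_length`/`label` record attributes exposed by `Datasets::Iris` in this gem:

```ruby
require "datasets"

iris = Datasets::Iris.new
iris.each do |record|
  # Each record is a struct-like object; attribute names follow the dataset's columns.
  p [record.sepal_length, record.petal_length, record.label]
end
```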
data/doc/text/news.md CHANGED
@@ -1,5 +1,22 @@
  # News

+ ## 0.0.8 - 2019-03-24
+
+ ### Improvements
+
+   * Improved README.
+     [GitHub#40][Patch by kojix2]
+
+   * `Datasets::PostalCodeJapan`: Added.
+
+   * `Datasets::LIBSVMDatasetList`: Added.
+
+   * `Datasets::LIBSVM`: Added.
+
+ ### Thanks
+
+   * kojix2
+
  ## 0.0.7 - 2018-11-21

  ### Improvements
data/lib/datasets/libsvm-dataset-list.rb ADDED
@@ -0,0 +1,137 @@
+ require "English"
+ require "rexml/document"
+
+ require_relative "dataset"
+
+ module Datasets
+   class LIBSVMDatasetList < Dataset
+     File = Struct.new(:name,
+                       :url,
+                       :note)
+     class Record < Struct.new(:name,
+                               :source,
+                               :preprocessing,
+                               :n_classes,
+                               :n_data,
+                               :n_features,
+                               :files)
+       def to_h
+         hash = super
+         hash[:files] = hash[:files].collect(&:to_h)
+         hash
+       end
+     end
+
+     def initialize
+       super()
+       @metadata.id = "libsvm-dataset-list"
+       @metadata.name = "LIBSVM dataset list"
+       @metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
+       @metadata.description = lambda do
+         extract_description
+       end
+     end
+
+     def each
+       return to_enum(__method__) unless block_given?
+
+       open_data do |input|
+         # TODO: Improve performance
+         document = REXML::Document.new(input)
+         is_header = true
+         document.each_element("//tr") do |tr|
+           if is_header
+             is_header = false
+             next
+           end
+           name = tr.elements.first
+           a = name.elements.first
+           href = a.attributes["href"]
+           record = Record.new
+           record.name = a.text
+           record.files = []
+           parse_detail(href, record)
+           yield(record)
+         end
+       end
+     end
+
+     private
+     def open_data
+       data_path = cache_dir_path + "index.html"
+       unless data_path.exist?
+         download(data_path, @metadata.url)
+       end
+       ::File.open(data_path) do |input|
+         yield(input)
+       end
+     end
+
+     def extract_description
+       open_data do |input|
+         document = REXML::Document.new(input)
+         description = []
+         in_content = false
+         document.each_element("//body/*") do |element|
+           unless in_content
+             in_content = (element.name == "h1")
+             next
+           end
+           break if element.name == "hr"
+           content = extract_text(element)
+           description << content unless content.empty?
+         end
+         description.join("\n\n")
+       end
+     end
+
+     def extract_text(element)
+       texts = REXML::XPath.match(element, ".//text()")
+       texts.join("").gsub(/[ \t\n]+/, " ").strip
+     end
+
+     def open_detail(detail)
+       data_path = cache_dir_path + detail
+       unless data_path.exist?
+         download(data_path, @metadata.url + detail)
+       end
+       ::File.open(data_path) do |input|
+         yield(input)
+       end
+     end
+
+     def parse_detail(href, record)
+       path, id = href.split("#")
+       open_detail(path) do |detail|
+         detail_document = REXML::Document.new(detail)
+         anchor = REXML::XPath.match(detail_document, "//*[@name='#{id}']")[0]
+         ul = anchor.next_sibling
+         ul.each_element do |li|
+           text = extract_text(li)
+           case text
+           when /\ASource: /
+             record.source = $POSTMATCH
+           when /\APreprocessing: /
+             record.preprocessing = $POSTMATCH
+           when /\A\# of classes: (\d+)/
+             record.n_classes = Integer($1, 10)
+           when /\A\# of data: ([\d,]+)/
+             record.n_data = Integer($1.gsub(/,/, ""), 10)
+           when /\A\# of features: ([\d,]+)/
+             record.n_features = Integer($1.gsub(/,/, ""), 10)
+           when /\AFiles:/
+             li.elements.first.each_element do |file_li|
+               file_a = file_li.elements.first
+               file = File.new
+               file.name = file_a.text
+               file.url = @metadata.url + file_a.attributes["href"]
+               file_note = file_li.text
+               file.note = file_note.strip.gsub(/[()]/, "") if file_note
+               record.files << file
+             end
+           end
+         end
+       end
+     end
+   end
+ end
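`Datasets::LIBSVMDatasetList` scrapes the LIBSVM index page and each dataset's detail page into `Record` structs, caching the downloaded HTML. A minimal usage sketch (assuming network access to the LIBSVM site on the first run; field names are the struct members defined above):

```ruby
require "datasets"

list = Datasets::LIBSVMDatasetList.new
list.each do |record|
  # record.files is an array of File structs (name, url, note).
  puts "#{record.name}: #{record.n_data} data points, #{record.files.size} files"
end
```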
data/lib/datasets/libsvm.rb ADDED
@@ -0,0 +1,143 @@
+ require "csv"
+
+ require_relative "dataset"
+
+ module Datasets
+   class LIBSVM < Dataset
+     class Record
+       attr_reader :label
+       attr_reader :features
+       def initialize(label, features)
+         @label = label
+         @features = features
+       end
+
+       def [](index)
+         @features[index]
+       end
+
+       def to_h
+         hash = {
+           label: @label,
+         }
+         @features.each_with_index do |feature, i|
+           hash[i] = feature
+         end
+         hash
+       end
+
+       def values
+         [@label] + @features
+       end
+     end
+
+     def initialize(name,
+                    note: nil,
+                    default_feature_value: 0)
+       super()
+       @libsvm_dataset_metadata = fetch_dataset_info(name)
+       @file = choose_file(note)
+       @default_feature_value = default_feature_value
+       @metadata.id = "libsvm-#{normalize_name(name)}"
+       @metadata.name = "LIBSVM dataset: #{name}"
+       @metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
+     end
+
+     def each
+       return to_enum(__method__) unless block_given?
+
+       open_data do |input|
+         n_features = @libsvm_dataset_metadata.n_features
+         csv = CSV.new(input, col_sep: " ")
+         csv.each do |row|
+           label = parse_label(row.shift)
+           features = [@default_feature_value] * n_features
+           row.each do |column|
+             next if column.nil?
+             index, value = column.split(":", 2)
+             features[Integer(index, 10) - 1] = parse_value(value)
+           end
+           yield(Record.new(label, features))
+         end
+       end
+     end
+
+     private
+     def fetch_dataset_info(name)
+       list = LIBSVMDatasetList.new
+       available_datasets = []
+       list.each do |record|
+         available_datasets << record.name
+         if record.name == name
+           return record
+         end
+       end
+       message = "unavailable LIBSVM dataset: #{name.inspect}: "
+       message << "available datasets: ["
+       message << available_datasets.collect(&:inspect).join(", ")
+       message << "]"
+       raise ArgumentError, message
+     end
+
+     def choose_file(note)
+       files = @libsvm_dataset_metadata.files
+       return files.first if note.nil?
+
+       available_notes = []
+       @libsvm_dataset_metadata.files.find do |file|
+         return file if file.note == note
+         available_notes << file.note if file.note
+       end
+
+       name = @libsvm_dataset_metadata.name
+       message = "unavailable note: #{name}: #{note.inspect}: "
+       message << "available notes: ["
+       message << available_notes.collect(&:inspect).join(", ")
+       message << "]"
+       raise ArgumentError, message
+     end
+
+     def open_data(&block)
+       data_path = cache_dir_path + @file.name
+       unless data_path.exist?
+         download(data_path, @file.url)
+       end
+       if data_path.extname == ".bz2"
+         input, output = IO.pipe
+         pid = spawn("bzcat", data_path.to_s, {:out => output})
+         begin
+           output.close
+           yield(input)
+         ensure
+           input.close
+           Process.waitpid(pid)
+         end
+       else
+         File.open(data_path, &block)
+       end
+     end
+
+     def normalize_name(name)
+       name.gsub(/[()]/, "").gsub(/[ _;]+/, "-").downcase
+     end
+
+     def parse_label(label)
+       labels = label.split(",").collect do |value|
+         parse_value(value)
+       end
+       if labels.size == 1
+         labels[0]
+       else
+         labels
+       end
+     end
+
+     def parse_value(value)
+       if value.include?(".")
+         Float(value)
+       else
+         Integer(value, 10)
+       end
+     end
+   end
+ end
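`Datasets::LIBSVM` resolves a dataset name via `LIBSVMDatasetList`, downloads the file selected by `note:`, and expands the sparse `index:value` columns into a dense feature array padded with `default_feature_value`. A short sketch of the intended call, following the constructor above and the tests added below:

```ruby
require "datasets"

# "a1a" and the "testing" note come from the LIBSVM dataset list.
a1a_test = Datasets::LIBSVM.new("a1a", note: "testing", default_feature_value: 0)
record = a1a_test.first
p record.label          # -1 or 1 for this binary dataset
p record.features.size  # 123 features; absent ones are filled with 0
```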
data/lib/datasets/postal-code-japan.rb ADDED
@@ -0,0 +1,154 @@
+ require "csv"
+ require "zip"
+
+ require_relative "dataset"
+
+ module Datasets
+   class PostalCodeJapan < Dataset
+     class Record < Struct.new(:organization_code,
+                               :old_postal_code,
+                               :postal_code,
+                               :prefecture_reading,
+                               :city_reading,
+                               :address_reading,
+                               :prefecture,
+                               :city,
+                               :address,
+                               :have_multiple_postal_codes,
+                               :have_address_number_per_koaza,
+                               :have_chome,
+                               :postal_code_is_shared,
+                               :changed,
+                               :change_reason)
+       alias_method :have_multiple_postal_codes?,
+                    :have_multiple_postal_codes
+       alias_method :have_address_number_per_koaza?,
+                    :have_address_number_per_koaza
+       alias_method :have_chome?,
+                    :have_chome
+       alias_method :postal_code_is_shared?,
+                    :postal_code_is_shared
+       alias_method :changed?,
+                    :changed
+     end
+
+     VALID_READINGS = [
+       :lowercase,
+       :uppercase,
+       :romaji,
+     ]
+     def initialize(reading: :lowercase)
+       super()
+       @reading = reading
+       unless VALID_READINGS.include?(@reading)
+         message = ":reading must be one of ["
+         message << VALID_READINGS.collect(&:inspect).join(", ")
+         message << "]: #{@reading.inspect}"
+         raise ArgumentError, message
+       end
+       @metadata.id = "postal-code-japan-#{@reading}"
+       @metadata.name = "Postal code in Japan (#{@reading})"
+       @metadata.url = "https://www.post.japanpost.jp/zipcode/download.html"
+       @metadata.licenses = [
+         "CC0-1.0",
+       ]
+       @metadata.description = "Postal code in Japan (reading: #{@reading})"
+     end
+
+     def each(&block)
+       return to_enum(__method__) unless block_given?
+
+       open_data do |input|
+         utf8_data = input.read.encode(Encoding::UTF_8, Encoding::CP932)
+         options = {
+           quote_char: nil,
+           strip: %Q["],
+         }
+         if @reading == :romaji
+           CSV.parse(utf8_data, **options) do |row|
+             yield(Record.new(nil,
+                              nil,
+                              row[0],
+                              row[4],
+                              row[5],
+                              row[6],
+                              row[1],
+                              row[2],
+                              row[3],
+                              false,
+                              false,
+                              false,
+                              false,
+                              false,
+                              nil))
+           end
+         else
+           CSV.parse(utf8_data, **options) do |row|
+             yield(Record.new(row[0],
+                              row[1].rstrip,
+                              row[2],
+                              row[3],
+                              row[4],
+                              row[5],
+                              row[6],
+                              row[7],
+                              row[8],
+                              (row[9] == "1"),
+                              (row[10] == "1"),
+                              (row[11] == "1"),
+                              (row[12] == "1"),
+                              (row[13] != "0"),
+                              convert_change_reason(row[14])))
+           end
+         end
+       end
+     end
+
+     private
+     def open_data
+       data_url = "https://www.post.japanpost.jp/zipcode/dl"
+       case @reading
+       when :lowercase
+         data_url << "/kogaki/zip/ken_all.zip"
+       when :uppercase
+         data_url << "/oogaki/zip/ken_all.zip"
+       when :romaji
+         data_url << "/roman/ken_all_rome.zip"
+       end
+       data_path = cache_dir_path + "#{@reading}-ken-all.zip"
+       unless data_path.exist?
+         download(data_path, data_url)
+       end
+
+       Zip::File.open(data_path.to_s) do |zip_file|
+         zip_file.each do |entry|
+           next unless entry.file?
+           entry.get_input_stream do |input|
+             yield(input)
+           end
+         end
+       end
+     end
+
+     def convert_change_reason(reason)
+       case reason
+       when "0"
+         nil
+       when "1"
+         :new
+       when "2"
+         :japanese_addressing_system
+       when "3"
+         :land_readjustment
+       when "4"
+         :postal_district_adjustment
+       when "5"
+         :correction
+       when "6"
+         :deletion
+       else
+         :unknown
+       end
+     end
+   end
+ end
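`Datasets::PostalCodeJapan` downloads Japan Post's KEN_ALL ZIP for the chosen `reading:` (`:lowercase`, `:uppercase`, or `:romaji`), re-encodes the CP932 CSV to UTF-8, and yields one `Record` per row. A brief usage sketch along the same lines:

```ruby
require "datasets"

postal_codes = Datasets::PostalCodeJapan.new(reading: :romaji)
postal_codes.each do |record|
  puts "#{record.postal_code} #{record.prefecture} #{record.city} #{record.address}"
  break # the full file is large; stop after the first record
end
```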
data/lib/datasets/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Datasets
-   VERSION = "0.0.7"
+   VERSION = "0.0.8"
  end
data/lib/datasets.rb CHANGED
@@ -1,10 +1,13 @@
- require "datasets/version"
+ require_relative "datasets/version"

- require "datasets/adult"
- require "datasets/cifar"
- require "datasets/fashion-mnist"
- require "datasets/iris"
- require "datasets/mnist"
- require "datasets/penn-treebank"
- require "datasets/wikipedia"
- require "datasets/wine"
+ require_relative "datasets/adult"
+ require_relative "datasets/cifar"
+ require_relative "datasets/fashion-mnist"
+ require_relative "datasets/iris"
+ require_relative "datasets/libsvm"
+ require_relative "datasets/libsvm-dataset-list"
+ require_relative "datasets/mnist"
+ require_relative "datasets/penn-treebank"
+ require_relative "datasets/postal-code-japan"
+ require_relative "datasets/wikipedia"
+ require_relative "datasets/wine"
data/red-datasets.gemspec CHANGED
@@ -34,6 +34,9 @@ Gem::Specification.new do |spec|
  spec.files += Dir.glob("doc/text/*")
  spec.test_files += Dir.glob("test/**/*")

+ spec.add_runtime_dependency("csv", ">= 3.0.5")
+ spec.add_runtime_dependency("rubyzip")
+
  spec.add_development_dependency("bundler")
  spec.add_development_dependency("rake")
  spec.add_development_dependency("test-unit")
data/test/test-libsvm-dataset-list.rb ADDED
@@ -0,0 +1,47 @@
+ class LIBSVMDatasetListTest < Test::Unit::TestCase
+   def setup
+     @dataset = Datasets::LIBSVMDatasetList.new
+   end
+
+   test("#each") do
+     assert_equal({
+                    name: "a1a",
+                    source: "UCI / Adult",
+                    preprocessing:
+                      "The original Adult data set has 14 features, " +
+                      "among which six are continuous and eight are " +
+                      "categorical. In this data set, continuous features " +
+                      "are discretized into quantiles, and each quantile is " +
+                      "represented by a binary feature. Also, a categorical " +
+                      "feature with m categories is converted to m binary " +
+                      "features. Details on how each feature is converted " +
+                      "can be found in the beginning of each file from this " +
+                      "page. [JP98a]",
+                    n_classes: 2,
+                    n_data: 1605,
+                    n_features: 123,
+                    files: [
+                      {
+                        name: "a1a",
+                        url: "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a1a",
+                        note: nil,
+                      },
+                      {
+                        name: "a1a.t",
+                        url: "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a1a.t",
+                        note: "testing",
+                      }
+                    ],
+                  },
+                  @dataset.first.to_h)
+   end
+
+   sub_test_case("#metadata") do
+     test("#description") do
+       description = @dataset.metadata.description
+       assert do
+         description.start_with?("This page contains many classification, ")
+       end
+     end
+   end
+ end
data/test/test-libsvm.rb ADDED
@@ -0,0 +1,205 @@
+ class LIBSVMDatasetTest < Test::Unit::TestCase
+   test(":note") do
+     dataset = Datasets::LIBSVM.new("a1a", note: "testing")
+     hash = {label: -1}
+     n_features = 123
+     n_features.times do |i|
+       hash[i] = 0
+     end
+     [5, 7, 14, 19, 39, 40, 51, 63, 67, 73, 74, 76, 78, 83].each do |i|
+       hash[i - 1] = 1
+     end
+     assert_equal(hash,
+                  dataset.first.to_h)
+   end
+
+   test(":default_feature_value") do
+     dataset = Datasets::LIBSVM.new("a1a", default_feature_value: nil)
+     hash = {label: -1}
+     n_features = 123
+     n_features.times do |i|
+       hash[i] = nil
+     end
+     [3, 11, 14, 19, 39, 42, 55, 64, 67, 73, 75, 76, 80, 83].each do |i|
+       hash[i - 1] = 1
+     end
+     assert_equal(hash,
+                  dataset.first.to_h)
+   end
+
+   test("classification") do
+     dataset = Datasets::LIBSVM.new("a1a")
+     hash = {label: -1}
+     n_features = 123
+     n_features.times do |i|
+       hash[i] = 0
+     end
+     [3, 11, 14, 19, 39, 42, 55, 64, 67, 73, 75, 76, 80, 83].each do |i|
+       hash[i - 1] = 1
+     end
+     assert_equal(hash,
+                  dataset.first.to_h)
+   end
+
+   test("regression") do
+     dataset = Datasets::LIBSVM.new("abalone")
+     hash = {label: 15}
+     n_features = 8
+     n_features.times do |i|
+       hash[i] = 0
+     end
+     [
+       [1, 1],
+       [2, 0.455],
+       [3, 0.365],
+       [4, 0.095],
+       [5, 0.514],
+       [6, 0.2245],
+       [7, 0.101],
+       [8, 0.15],
+     ].each do |i, value|
+       hash[i - 1] = value
+     end
+     assert_equal(hash,
+                  dataset.first.to_h)
+   end
+
+   test("multi-label") do
+     dataset = Datasets::LIBSVM.new("mediamill (exp1)")
+     hash = {label: [65, 67, 11, 31]}
+     n_features = 120
+     n_features.times do |i|
+       hash[i] = 0
+     end
+     [
+       [1, 0.380877],
+       [2, 0.494079],
+       [3, 0.540009],
+       [4, 0.422926],
+       [5, 0.158318],
+       [6, 0.326975],
+       [7, 0.390861],
+       [8, 0.527121],
+       [9, 0.254052],
+       [10, 0.223731],
+       [11, 0.040285],
+       [12, 0.141133],
+       [13, 0.112249],
+       [14, 0.263171],
+       [15, 0.147020],
+       [16, 0.472414],
+       [17, 0.592614],
+       [18, 0.653138],
+       [19, 0.499867],
+       [20, 0.196520],
+       [21, 0.403892],
+       [22, 0.482395],
+       [23, 0.619219],
+       [24, 0.320346],
+       [25, 0.281251],
+       [26, 0.054750],
+       [27, 0.180459],
+       [28, 0.139964],
+       [29, 0.319925],
+       [30, 0.181216],
+       [31, 0.364294],
+       [32, 0.407211],
+       [33, 0.368926],
+       [34, 0.427661],
+       [35, 0.211391],
+       [36, 0.364345],
+       [37, 0.370710],
+       [38, 0.409107],
+       [39, 0.289299],
+       [40, 0.243053],
+       [41, 0.063121],
+       [42, 0.193587],
+       [43, 0.158755],
+       [44, 0.316054],
+       [45, 0.197410],
+       [46, 0.656168],
+       [47, 0.678760],
+       [48, 0.650831],
+       [49, 0.674636],
+       [50, 0.492428],
+       [51, 0.623887],
+       [52, 0.610622],
+       [53, 0.678219],
+       [54, 0.574774],
+       [55, 0.523073],
+       [56, 0.206804],
+       [57, 0.496294],
+       [58, 0.429221],
+       [59, 0.586611],
+       [60, 0.471550],
+       [61, 0.284480],
+       [62, 0.432466],
+       [63, 0.498075],
+       [64, 0.408141],
+       [65, 0.102713],
+       [66, 0.303028],
+       [67, 0.309501],
+       [68, 0.444855],
+       [69, 0.191727],
+       [70, 0.174895],
+       [71, 0.034143],
+       [72, 0.153099],
+       [73, 0.068318],
+       [74, 0.217020],
+       [75, 0.099688],
+       [76, 0.409862],
+       [77, 0.561918],
+       [78, 0.612031],
+       [79, 0.514471],
+       [80, 0.146015],
+       [81, 0.398807],
+       [82, 0.383295],
+       [83, 0.548485],
+       [84, 0.282937],
+       [85, 0.252712],
+       [86, 0.051008],
+       [87, 0.223110],
+       [88, 0.098112],
+       [89, 0.299672],
+       [90, 0.144873],
+       [91, 0.308488],
+       [92, 0.358478],
+       [93, 0.352077],
+       [94, 0.394686],
+       [95, 0.157513],
+       [96, 0.339370],
+       [97, 0.321558],
+       [98, 0.341373],
+       [99, 0.247969],
+       [100, 0.206070],
+       [101, 0.061001],
+       [102, 0.216793],
+       [103, 0.112389],
+       [104, 0.273648],
+       [105, 0.152745],
+       [106, 0.598081],
+       [107, 0.621687],
+       [108, 0.607213],
+       [109, 0.644025],
+       [110, 0.394948],
+       [111, 0.593651],
+       [112, 0.551529],
+       [113, 0.574392],
+       [114, 0.511032],
+       [115, 0.463997],
+       [116, 0.202034],
+       [117, 0.492341],
+       [118, 0.317983],
+       [119, 0.547807],
+       [120, 0.393778],
+     ].each do |i, value|
+       hash[i - 1] = value
+     end
+     assert_equal(hash,
+                  dataset.first.to_h)
+   end
+
+   test("string") do
+     # TODO
+   end
+ end
data/test/test-postal-code-japan.rb ADDED
@@ -0,0 +1,69 @@
+ class PostalCodeJapanTest < Test::Unit::TestCase
+   sub_test_case(":reading") do
+     test(":lowercase") do
+       dataset = Datasets::PostalCodeJapan.new(reading: :lowercase)
+       assert_equal({
+                      organization_code: "01101",
+                      old_postal_code: "060",
+                      postal_code: "0600000",
+                      prefecture_reading: "ホッカイドウ",
+                      city_reading: "サッポロシチュウオウク",
+                      address_reading: "イカニケイサイガナイバアイ",
+                      prefecture: "北海道",
+                      city: "札幌市中央区",
+                      address: "以下に掲載がない場合",
+                      have_multiple_postal_codes: false,
+                      have_address_number_per_koaza: false,
+                      have_chome: false,
+                      postal_code_is_shared: false,
+                      changed: false,
+                      change_reason: nil,
+                    },
+                    dataset.first.to_h)
+     end
+
+     test(":uppercase") do
+       dataset = Datasets::PostalCodeJapan.new(reading: :uppercase)
+       assert_equal({
+                      organization_code: "01101",
+                      old_postal_code: "060",
+                      postal_code: "0600000",
+                      prefecture_reading: "ホツカイドウ",
+                      city_reading: "サツポロシチユウオウク",
+                      address_reading: "イカニケイサイガナイバアイ",
+                      prefecture: "北海道",
+                      city: "札幌市中央区",
+                      address: "以下に掲載がない場合",
+                      have_multiple_postal_codes: false,
+                      have_address_number_per_koaza: false,
+                      have_chome: false,
+                      postal_code_is_shared: false,
+                      changed: false,
+                      change_reason: nil,
+                    },
+                    dataset.first.to_h)
+     end
+
+     test(":romaji") do
+       dataset = Datasets::PostalCodeJapan.new(reading: :romaji)
+       assert_equal({
+                      organization_code: nil,
+                      old_postal_code: nil,
+                      postal_code: "0600000",
+                      prefecture_reading: "HOKKAIDO",
+                      city_reading: "SAPPORO SHI CHUO KU",
+                      address_reading: "IKANIKEISAIGANAIBAAI",
+                      prefecture: "北海道",
+                      city: "札幌市 中央区",
+                      address: "以下に掲載がない場合",
+                      have_multiple_postal_codes: false,
+                      have_address_number_per_koaza: false,
+                      have_chome: false,
+                      postal_code_is_shared: false,
+                      changed: false,
+                      change_reason: nil,
+                    },
+                    dataset.first.to_h)
+     end
+   end
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: red-datasets
  version: !ruby/object:Gem::Version
-   version: 0.0.7
+   version: 0.0.8
  platform: ruby
  authors:
  - tomisuker
@@ -9,8 +9,36 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2018-11-20 00:00:00.000000000 Z
+ date: 2019-03-24 00:00:00.000000000 Z
  dependencies:
+ - !ruby/object:Gem::Dependency
+   name: csv
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 3.0.5
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 3.0.5
+ - !ruby/object:Gem::Dependency
+   name: rubyzip
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
  - !ruby/object:Gem::Dependency
    name: bundler
    requirement: !ruby/object:Gem::Requirement
@@ -106,9 +134,12 @@ files:
  - lib/datasets/downloader.rb
  - lib/datasets/fashion-mnist.rb
  - lib/datasets/iris.rb
+ - lib/datasets/libsvm-dataset-list.rb
+ - lib/datasets/libsvm.rb
  - lib/datasets/metadata.rb
  - lib/datasets/mnist.rb
  - lib/datasets/penn-treebank.rb
+ - lib/datasets/postal-code-japan.rb
  - lib/datasets/table.rb
  - lib/datasets/version.rb
  - lib/datasets/wikipedia.rb
@@ -121,8 +152,11 @@ files:
  - test/test-dictionary.rb
  - test/test-fashion-mnist.rb
  - test/test-iris.rb
+ - test/test-libsvm-dataset-list.rb
+ - test/test-libsvm.rb
  - test/test-mnist.rb
  - test/test-penn-treebank.rb
+ - test/test-postal-code-japan.rb
  - test/test-table.rb
  - test/test-wikipedia.rb
  - test/test-wine.rb
@@ -146,20 +180,23 @@ required_rubygems_version: !ruby/object:Gem::Requirement
        version: '0'
  requirements: []
  rubyforge_project:
- rubygems_version: 3.0.0.beta2
+ rubygems_version: 2.7.6
  signing_key:
  specification_version: 4
  summary: Red Datasets provides classes that provide common datasets such as iris dataset.
  test_files:
- - test/test-wine.rb
- - test/run-test.rb
- - test/test-cifar.rb
- - test/test-fashion-mnist.rb
- - test/test-wikipedia.rb
  - test/test-iris.rb
- - test/helper.rb
+ - test/test-wikipedia.rb
+ - test/test-fashion-mnist.rb
+ - test/test-wine.rb
+ - test/test-postal-code-japan.rb
  - test/test-mnist.rb
- - test/test-table.rb
+ - test/helper.rb
  - test/test-adult.rb
+ - test/test-libsvm.rb
+ - test/run-test.rb
+ - test/test-table.rb
+ - test/test-cifar.rb
+ - test/test-libsvm-dataset-list.rb
  - test/test-penn-treebank.rb
  - test/test-dictionary.rb