red-datasets 0.0.6 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +23 -7
  3. data/doc/text/news.md +124 -0
  4. data/lib/datasets.rb +18 -6
  5. data/lib/datasets/adult.rb +84 -0
  6. data/lib/datasets/cldr-plurals.rb +385 -0
  7. data/lib/datasets/communities.rb +198 -0
  8. data/lib/datasets/dataset.rb +13 -0
  9. data/lib/datasets/dictionary.rb +59 -0
  10. data/lib/datasets/downloader.rb +37 -62
  11. data/lib/datasets/e-stat-japan.rb +320 -0
  12. data/lib/datasets/error.rb +4 -0
  13. data/lib/datasets/fashion-mnist.rb +12 -0
  14. data/lib/datasets/hepatitis.rb +207 -0
  15. data/lib/datasets/iris.rb +1 -1
  16. data/lib/datasets/libsvm-dataset-list.rb +277 -0
  17. data/lib/datasets/libsvm.rb +135 -0
  18. data/lib/datasets/mnist.rb +11 -8
  19. data/lib/datasets/mushroom.rb +256 -0
  20. data/lib/datasets/penguins.rb +125 -0
  21. data/lib/datasets/penn-treebank.rb +2 -9
  22. data/lib/datasets/postal-code-japan.rb +154 -0
  23. data/lib/datasets/table.rb +99 -3
  24. data/lib/datasets/version.rb +1 -1
  25. data/lib/datasets/wikipedia.rb +2 -10
  26. data/lib/datasets/wine.rb +64 -0
  27. data/red-datasets.gemspec +4 -0
  28. data/test/helper.rb +1 -0
  29. data/test/run-test.rb +2 -0
  30. data/test/test-adult.rb +126 -0
  31. data/test/test-cldr-plurals.rb +180 -0
  32. data/test/test-communities.rb +290 -0
  33. data/test/test-dictionary.rb +43 -0
  34. data/test/test-e-stat-japan.rb +383 -0
  35. data/test/test-fashion-mnist.rb +137 -0
  36. data/test/test-hepatitis.rb +74 -0
  37. data/test/test-libsvm-dataset-list.rb +47 -0
  38. data/test/test-libsvm.rb +205 -0
  39. data/test/test-mnist.rb +95 -70
  40. data/test/test-mushroom.rb +80 -0
  41. data/test/test-penguins.rb +239 -0
  42. data/test/test-penn-treebank.rb +6 -6
  43. data/test/test-postal-code-japan.rb +69 -0
  44. data/test/test-table.rb +144 -19
  45. data/test/test-wine.rb +58 -0
  46. metadata +89 -8
@@ -2,7 +2,7 @@ require_relative "dataset"
2
2
 
3
3
  module Datasets
4
4
  class PennTreebank < Dataset
5
- Record = Struct.new(:word, :id)
5
+ Record = Struct.new(:word)
6
6
 
7
7
  DESCRIPTION = <<~DESC
8
8
  `Penn Tree Bank <https://www.cis.upenn.edu/~treebank/>`_ is originally a
@@ -46,17 +46,10 @@ module Datasets
46
46
 
47
47
  private
48
48
  def parse_data(data_path)
49
- index = 0
50
- vocabulary = {}
51
49
  File.open(data_path) do |f|
52
50
  f.each_line do |line|
53
51
  line.split.each do |word|
54
- word = word.strip
55
- unless vocabulary.key?(word)
56
- vocabulary[word] = index
57
- index += 1
58
- end
59
- yield(Record.new(word, vocabulary[word]))
52
+ yield(Record.new(word.strip))
60
53
  end
61
54
  end
62
55
  end
@@ -0,0 +1,154 @@
1
+ require "csv"
2
+ require "zip"
3
+
4
+ require_relative "dataset"
5
+
module Datasets
  # Postal code table for Japan, downloaded as a zipped CSV from
  # www.post.japanpost.jp and exposed as a stream of Record objects.
  #
  # The +reading+ keyword of #initialize selects which archive is used;
  # see VALID_READINGS for the accepted values.
  class PostalCodeJapan < Dataset
    # One row of the postal code table.
    #
    # Every boolean member is also reachable through a predicate alias
    # (for example +have_chome?+ for +have_chome+).
    class Record < Struct.new(:organization_code,
                              :old_postal_code,
                              :postal_code,
                              :prefecture_reading,
                              :city_reading,
                              :address_reading,
                              :prefecture,
                              :city,
                              :address,
                              :have_multiple_postal_codes,
                              :have_address_number_per_koaza,
                              :have_chome,
                              :postal_code_is_shared,
                              :changed,
                              :change_reason)
      alias_method :have_multiple_postal_codes?,
                   :have_multiple_postal_codes
      alias_method :have_address_number_per_koaza?,
                   :have_address_number_per_koaza
      alias_method :have_chome?,
                   :have_chome
      alias_method :postal_code_is_shared?,
                   :postal_code_is_shared
      alias_method :changed?,
                   :changed
    end

    # Accepted values for the +reading+ keyword of #initialize.
    VALID_READINGS = [
      :lowercase,
      :uppercase,
      :romaji,
    ].freeze

    # Maps the change-reason code found in the CSV to its symbolic value.
    # "0" deliberately maps to nil; codes missing from this table are
    # reported as :unknown by #convert_change_reason.
    CHANGE_REASONS = {
      "0" => nil,
      "1" => :new,
      "2" => :japanese_addressing_system,
      "3" => :land_readjustment,
      "4" => :postal_district_adjustment,
      "5" => :correction,
      "6" => :deletion,
    }.freeze

    # @param reading [Symbol] one of VALID_READINGS
    # @raise [ArgumentError] when +reading+ is not one of VALID_READINGS
    def initialize(reading: :lowercase)
      super()
      @reading = reading
      unless VALID_READINGS.include?(@reading)
        # Built by interpolation rather than by appending to a string
        # literal, so it also works when string literals are frozen.
        valid_readings = VALID_READINGS.collect(&:inspect).join(", ")
        raise ArgumentError,
              ":reading must be one of [#{valid_readings}]: #{@reading.inspect}"
      end
      @metadata.id = "postal-code-japan-#{@reading}"
      @metadata.name = "Postal code in Japan (#{@reading})"
      @metadata.url = "https://www.post.japanpost.jp/zipcode/download.html"
      @metadata.licenses = [
        "CC0-1.0",
      ]
      @metadata.description = "Postal code in Japan (reading: #{@reading})"
    end

    # Yields one Record per CSV row of the downloaded archive.
    #
    # The raw data is transcoded from CP932 to UTF-8 before parsing. For
    # the :romaji reading only the postal code, the three names and the
    # three readings are taken from the row; the remaining members are
    # set to nil or false.
    #
    # @yieldparam record [Record]
    # @return [Enumerator] when called without a block
    def each
      return to_enum(__method__) unless block_given?

      open_data do |input|
        utf8_data = input.read.encode(Encoding::UTF_8, Encoding::CP932)
        # Quoting is disabled and the double quotes are stripped from the
        # fields instead.
        options = {
          quote_char: nil,
          strip: %Q["],
        }
        if @reading == :romaji
          CSV.parse(utf8_data, **options) do |row|
            yield(Record.new(nil,
                             nil,
                             row[0],
                             row[4],
                             row[5],
                             row[6],
                             row[1],
                             row[2],
                             row[3],
                             false,
                             false,
                             false,
                             false,
                             false,
                             nil))
          end
        else
          CSV.parse(utf8_data, **options) do |row|
            yield(Record.new(row[0],
                             row[1].rstrip, # drop trailing whitespace
                             row[2],
                             row[3],
                             row[4],
                             row[5],
                             row[6],
                             row[7],
                             row[8],
                             (row[9] == "1"),
                             (row[10] == "1"),
                             (row[11] == "1"),
                             (row[12] == "1"),
                             (row[13] != "0"),
                             convert_change_reason(row[14])))
          end
        end
      end
    end

    private

    # Downloads the archive for the current reading into the cache
    # directory (unless it is already there) and yields an input stream
    # for every file entry it contains.
    def open_data
      archive_path =
        case @reading
        when :lowercase
          "/kogaki/zip/ken_all.zip"
        when :uppercase
          "/oogaki/zip/ken_all.zip"
        when :romaji
          "/roman/ken_all_rome.zip"
        end
      # Interpolated instead of appended with <<, which would mutate a
      # string literal.
      data_url = "https://www.post.japanpost.jp/zipcode/dl#{archive_path}"
      data_path = cache_dir_path + "#{@reading}-ken-all.zip"
      unless data_path.exist?
        download(data_path, data_url)
      end

      Zip::File.open(data_path.to_s) do |zip_file|
        zip_file.each do |entry|
          next unless entry.file?
          entry.get_input_stream do |input|
            yield(input)
          end
        end
      end
    end

    # Converts the change-reason code of a row to a symbol.
    #
    # @param reason [String, nil] code read from the CSV
    # @return [Symbol, nil] nil for "0", :unknown for unrecognized codes
    def convert_change_reason(reason)
      CHANGE_REASONS.fetch(reason, :unknown)
    end
  end
end
@@ -1,17 +1,109 @@
1
+ require "datasets/dictionary"
2
+
1
3
  module Datasets
2
4
  class Table
# A read-only view of one row of a table.
#
# The record keeps a reference to the table and a row index; every
# accessor looks the value up in the table's columns on demand.
class Record
  include Enumerable

  # @param table the table this record belongs to; it must respond to
  #   +[]+, +each_column+ and +dataset+
  # @param index [Integer] row index of this record inside each column
  def initialize(table, index)
    @table = table
    @index = index
  end

  # Returns the value of the given column for this row.
  #
  # @param column_name_or_column_index column key passed through to the
  #   table's +[]+
  def [](column_name_or_column_index)
    column = @table[column_name_or_column_index]
    column[@index]
  end

  # Yields +column_name, value+ for every column of the table.
  #
  # @return [Enumerator] when called without a block
  def each
    if block_given?
      @table.each_column.each do |name, column|
        yield(name, column[@index])
      end
    else
      to_enum(__method__)
    end
  end

  # @return [Array] the values of this row, in column order
  def values
    @table.each_column.map { |_name, column| column[@index] }
  end

  # @return [Hash] column name => value of this row
  def to_h
    each_with_object({}) do |(name, value), row|
      row[name] = value
    end
  end

  # @return [String] class name, dataset name, row index and row contents
  def inspect
    dataset_name = @table.dataset.metadata.name
    "#<#{self.class.name} #{dataset_name}[#{@index}] #{to_h.inspect}>"
  end
end
42
+
3
43
  include Enumerable
4
44
 
45
+ attr_reader :dataset
5
46
  def initialize(dataset)
6
47
  @dataset = dataset
48
+ @dictionaries = {}
7
49
  end
8
50
 
9
- def each(&block)
51
+ def n_columns
52
+ columner_data.size
53
+ end
54
+ alias_method :size, :n_columns
55
+ alias_method :length, :n_columns
56
+
57
+ def n_rows
58
+ first_column = columner_data.first
59
+ return 0 if first_column.nil?
60
+ first_column[1].size
61
+ end
62
+
63
+ def column_names
64
+ columner_data.keys
65
+ end
66
+
67
+ def each_column(&block)
10
68
  columner_data.each(&block)
11
69
  end
70
+ alias_method :each, :each_column
71
+
72
+ def each_record
73
+ return to_enum(__method__) unless block_given?
74
+ n_rows.times do |i|
75
+ yield(Record.new(self, i))
76
+ end
77
+ end
78
+
79
+ def find_record(row)
80
+ row += n_rows if row < 0
81
+ return nil if row < 0
82
+ return nil if row >= n_rows
83
+ Record.new(self, row)
84
+ end
85
+
86
+ def [](name_or_index)
87
+ case name_or_index
88
+ when Integer
89
+ index = name_or_index
90
+ columner_data.each_with_index do |(_name, values), i|
91
+ return values if i == index
92
+ end
93
+ nil
94
+ else
95
+ name = name_or_index
96
+ columner_data[normalize_name(name)]
97
+ end
98
+ end
12
99
 
13
- def [](name)
14
- columner_data[name.to_sym]
100
+ def dictionary_encode(name)
101
+ @dictionaries[normalize_name(name)] ||= Dictionary.new(self[name])
102
+ end
103
+
104
+ def label_encode(name)
105
+ dictionary = dictionary_encode(name)
106
+ dictionary.encode(self[name])
15
107
  end
16
108
 
17
109
  def fetch_values(*keys)
@@ -55,5 +147,9 @@ module Datasets
55
147
  def columner_data
56
148
  @columns ||= to_h
57
149
  end
150
+
151
+ def normalize_name(name)
152
+ name.to_sym
153
+ end
58
154
  end
59
155
  end
@@ -1,3 +1,3 @@
1
1
  module Datasets
2
- VERSION = "0.0.6"
2
+ VERSION = "0.1.1"
3
3
  end
@@ -52,7 +52,7 @@ module Datasets
52
52
  end
53
53
 
54
54
  private
55
- def open_data
55
+ def open_data(&block)
56
56
  base_name = "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
57
57
  data_path = cache_dir_path + base_name
58
58
  unless data_path.exist?
@@ -60,15 +60,7 @@ module Datasets
60
60
  download(data_path, data_url)
61
61
  end
62
62
 
63
- input, output = IO.pipe
64
- pid = spawn("bzcat", data_path.to_s, {:out => output})
65
- begin
66
- output.close
67
- yield(input)
68
- ensure
69
- input.close
70
- Process.waitpid(pid)
71
- end
63
+ extract_bz2(data_path, &block)
72
64
  end
73
65
 
74
66
  def type_in_path
@@ -0,0 +1,64 @@
1
+ require 'csv'
2
+
3
+ require_relative 'dataset'
4
+
module Datasets
  # The "Wine" data set, downloaded from archive.ics.uci.edu and exposed
  # as a stream of Record objects.
  class Wine < Dataset
    # One row of wine.data; members are filled positionally from the
    # CSV columns.
    Record = Struct.new(:label,
                        :alcohol,
                        :malic_acid,
                        :ash,
                        :alcalinity_of_ash,
                        :n_magnesiums,
                        :total_phenols,
                        :total_flavonoids,
                        :total_nonflavanoid_phenols,
                        :total_proanthocyanins,
                        :color_intensity,
                        :hue,
                        :optical_nucleic_acid_concentration,
                        :n_prolines)

    def initialize
      super
      @metadata.id = 'wine'
      @metadata.name = 'Wine'
      @metadata.url = 'http://archive.ics.uci.edu/ml/datasets/wine'
      # Wrapped in a lambda so that read_names is not run here.
      @metadata.description = lambda { read_names }
    end

    # Yields one Record per data row.
    #
    # @yieldparam record [Record]
    # @return [Enumerator] when called without a block
    def each
      return to_enum(__method__) unless block_given?

      open_data do |rows|
        rows.each do |fields|
          # Rows whose first column is empty are skipped.
          next if fields.first.nil?

          yield(Record.new(*fields))
        end
      end
    end

    private

    # Returns the contents of wine.names, downloading the file into the
    # cache directory first when it is not there yet.
    def read_names
      names_path = cache_dir_path + 'wine.names'
      download(names_path, 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.names') unless names_path.exist?
      names_path.read
    end

    # Opens wine.data (downloading it into the cache directory when
    # missing) as a CSV with numeric conversion and yields the CSV
    # object.
    def open_data
      data_path = cache_dir_path + 'wine.data'
      download(data_path, 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data') unless data_path.exist?
      CSV.open(data_path, converters: [:numeric]) { |csv| yield(csv) }
    end
  end
end
data/red-datasets.gemspec CHANGED
@@ -34,6 +34,10 @@ Gem::Specification.new do |spec|
34
34
  spec.files += Dir.glob("doc/text/*")
35
35
  spec.test_files += Dir.glob("test/**/*")
36
36
 
37
+ spec.add_runtime_dependency("csv", ">= 3.0.5")
38
+ spec.add_runtime_dependency("rexml")
39
+ spec.add_runtime_dependency("rubyzip")
40
+
37
41
  spec.add_development_dependency("bundler")
38
42
  spec.add_development_dependency("rake")
39
43
  spec.add_development_dependency("test-unit")
data/test/helper.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require "fileutils"
2
2
  require "pathname"
3
+ require "time"
3
4
 
4
5
  require "datasets"
5
6
 
data/test/run-test.rb CHANGED
@@ -13,4 +13,6 @@ $LOAD_PATH.unshift(lib_dir.to_s)
13
13
 
14
14
  require_relative "helper"
15
15
 
16
+ ARGV.unshift("--max-diff-target-string-size=#{10 * 1024}")
17
+
16
18
  exit(Test::Unit::AutoRunner.run(true, test_dir.to_s))
@@ -0,0 +1,126 @@
# Tests for Datasets::Adult: the record count plus the first and last
# records of the train and test splits, and the start of the metadata
# description.
class AdultTest < Test::Unit::TestCase
  sub_test_case("train") do
    def setup
      @dataset = Datasets::Adult.new(type: :train)
    end

    # NOTE(review): not called by the test in this sub test case.
    def record(*args)
      Datasets::Adult::Record.new(*args)
    end

    test("#each") do
      expected_first = {
        age: 39,
        work_class: "State-gov",
        final_weight: 77516,
        education: "Bachelors",
        n_education_years: 13,
        marital_status: "Never-married",
        occupation: "Adm-clerical",
        relationship: "Not-in-family",
        race: "White",
        sex: "Male",
        capital_gain: 2174,
        capital_loss: 0,
        hours_per_week: 40,
        native_country: "United-States",
        label: "<=50K",
      }
      expected_last = {
        age: 52,
        work_class: "Self-emp-inc",
        final_weight: 287927,
        education: "HS-grad",
        n_education_years: 9,
        marital_status: "Married-civ-spouse",
        occupation: "Exec-managerial",
        relationship: "Wife",
        race: "White",
        sex: "Female",
        capital_gain: 15024,
        capital_loss: 0,
        hours_per_week: 40,
        native_country: "United-States",
        label: ">50K",
      }
      records = @dataset.each.to_a
      assert_equal([32561, expected_first, expected_last],
                   [records.size, records.first.to_h, records.last.to_h])
    end
  end

  sub_test_case("test") do
    def setup
      @dataset = Datasets::Adult.new(type: :test)
    end

    # NOTE(review): not called by the test in this sub test case.
    def record(*args)
      Datasets::Adult::Record.new(*args)
    end

    test("#each") do
      expected_first = {
        age: 25,
        work_class: "Private",
        final_weight: 226802,
        education: "11th",
        n_education_years: 7,
        marital_status: "Never-married",
        occupation: "Machine-op-inspct",
        relationship: "Own-child",
        race: "Black",
        sex: "Male",
        capital_gain: 0,
        capital_loss: 0,
        hours_per_week: 40,
        native_country: "United-States",
        label: "<=50K.",
      }
      expected_last = {
        age: 35,
        work_class: "Self-emp-inc",
        final_weight: 182148,
        education: "Bachelors",
        n_education_years: 13,
        marital_status: "Married-civ-spouse",
        occupation: "Exec-managerial",
        relationship: "Husband",
        race: "White",
        sex: "Male",
        capital_gain: 0,
        capital_loss: 0,
        hours_per_week: 60,
        native_country: "United-States",
        label: ">50K.",
      }
      records = @dataset.each.to_a
      assert_equal([16281, expected_first, expected_last],
                   [records.size, records.first.to_h, records.last.to_h])
    end
  end

  sub_test_case("#metadata") do
    def setup
      @dataset = Datasets::Adult.new(type: :train)
    end

    test("#description") do
      description = @dataset.metadata.description
      expected_prefix =
        "| This data was extracted from the census bureau database found at"
      assert do
        description.start_with?(expected_prefix)
      end
    end
  end
end