red-datasets 0.0.6 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. checksums.yaml +4 -4
  2. data/README.md +23 -7
  3. data/doc/text/news.md +124 -0
  4. data/lib/datasets.rb +18 -6
  5. data/lib/datasets/adult.rb +84 -0
  6. data/lib/datasets/cldr-plurals.rb +385 -0
  7. data/lib/datasets/communities.rb +198 -0
  8. data/lib/datasets/dataset.rb +13 -0
  9. data/lib/datasets/dictionary.rb +59 -0
  10. data/lib/datasets/downloader.rb +37 -62
  11. data/lib/datasets/e-stat-japan.rb +320 -0
  12. data/lib/datasets/error.rb +4 -0
  13. data/lib/datasets/fashion-mnist.rb +12 -0
  14. data/lib/datasets/hepatitis.rb +207 -0
  15. data/lib/datasets/iris.rb +1 -1
  16. data/lib/datasets/libsvm-dataset-list.rb +277 -0
  17. data/lib/datasets/libsvm.rb +135 -0
  18. data/lib/datasets/mnist.rb +11 -8
  19. data/lib/datasets/mushroom.rb +256 -0
  20. data/lib/datasets/penguins.rb +125 -0
  21. data/lib/datasets/penn-treebank.rb +2 -9
  22. data/lib/datasets/postal-code-japan.rb +154 -0
  23. data/lib/datasets/table.rb +99 -3
  24. data/lib/datasets/version.rb +1 -1
  25. data/lib/datasets/wikipedia.rb +2 -10
  26. data/lib/datasets/wine.rb +64 -0
  27. data/red-datasets.gemspec +4 -0
  28. data/test/helper.rb +1 -0
  29. data/test/run-test.rb +2 -0
  30. data/test/test-adult.rb +126 -0
  31. data/test/test-cldr-plurals.rb +180 -0
  32. data/test/test-communities.rb +290 -0
  33. data/test/test-dictionary.rb +43 -0
  34. data/test/test-e-stat-japan.rb +383 -0
  35. data/test/test-fashion-mnist.rb +137 -0
  36. data/test/test-hepatitis.rb +74 -0
  37. data/test/test-libsvm-dataset-list.rb +47 -0
  38. data/test/test-libsvm.rb +205 -0
  39. data/test/test-mnist.rb +95 -70
  40. data/test/test-mushroom.rb +80 -0
  41. data/test/test-penguins.rb +239 -0
  42. data/test/test-penn-treebank.rb +6 -6
  43. data/test/test-postal-code-japan.rb +69 -0
  44. data/test/test-table.rb +144 -19
  45. data/test/test-wine.rb +58 -0
  46. metadata +89 -8
@@ -2,7 +2,7 @@ require_relative "dataset"
2
2
 
3
3
  module Datasets
4
4
  class PennTreebank < Dataset
5
- Record = Struct.new(:word, :id)
5
+ Record = Struct.new(:word)
6
6
 
7
7
  DESCRIPTION = <<~DESC
8
8
  `Penn Tree Bank <https://www.cis.upenn.edu/~treebank/>`_ is originally a
@@ -46,17 +46,10 @@ module Datasets
46
46
 
47
47
  private
48
48
  def parse_data(data_path)
49
- index = 0
50
- vocabulary = {}
51
49
  File.open(data_path) do |f|
52
50
  f.each_line do |line|
53
51
  line.split.each do |word|
54
- word = word.strip
55
- unless vocabulary.key?(word)
56
- vocabulary[word] = index
57
- index += 1
58
- end
59
- yield(Record.new(word, vocabulary[word]))
52
+ yield(Record.new(word.strip))
60
53
  end
61
54
  end
62
55
  end
@@ -0,0 +1,154 @@
1
+ require "csv"
2
+ require "zip"
3
+
4
+ require_relative "dataset"
5
+
6
+ module Datasets
7
+ class PostalCodeJapan < Dataset
8
+ class Record < Struct.new(:organization_code,
9
+ :old_postal_code,
10
+ :postal_code,
11
+ :prefecture_reading,
12
+ :city_reading,
13
+ :address_reading,
14
+ :prefecture,
15
+ :city,
16
+ :address,
17
+ :have_multiple_postal_codes,
18
+ :have_address_number_per_koaza,
19
+ :have_chome,
20
+ :postal_code_is_shared,
21
+ :changed,
22
+ :change_reason)
23
+ alias_method :have_multiple_postal_codes?,
24
+ :have_multiple_postal_codes
25
+ alias_method :have_address_number_per_koaza?,
26
+ :have_address_number_per_koaza
27
+ alias_method :have_chome?,
28
+ :have_chome
29
+ alias_method :postal_code_is_shared?,
30
+ :postal_code_is_shared
31
+ alias_method :changed?,
32
+ :changed
33
+ end
34
+
35
+ VALID_READINGS = [
36
+ :lowercase,
37
+ :uppercase,
38
+ :romaji,
39
+ ]
40
+ def initialize(reading: :lowercase)
41
+ super()
42
+ @reading = reading
43
+ unless VALID_READINGS.include?(@reading)
44
+ message = ":reading must be one of ["
45
+ message << VALID_READINGS.collect(&:inspect).join(", ")
46
+ message << "]: #{@reading.inspect}"
47
+ raise ArgumentError, message
48
+ end
49
+ @metadata.id = "postal-code-japan-#{@reading}"
50
+ @metadata.name = "Postal code in Japan (#{@reading})"
51
+ @metadata.url = "https://www.post.japanpost.jp/zipcode/download.html"
52
+ @metadata.licenses = [
53
+ "CC0-1.0",
54
+ ]
55
+ @metadata.description = "Postal code in Japan (reading: #{@reading})"
56
+ end
57
+
58
+ def each(&block)
59
+ return to_enum(__method__) unless block_given?
60
+
61
+ open_data do |input|
62
+ utf8_data = input.read.encode(Encoding::UTF_8, Encoding::CP932)
63
+ options = {
64
+ quote_char: nil,
65
+ strip: %Q["],
66
+ }
67
+ if @reading == :romaji
68
+ CSV.parse(utf8_data, **options) do |row|
69
+ yield(Record.new(nil,
70
+ nil,
71
+ row[0],
72
+ row[4],
73
+ row[5],
74
+ row[6],
75
+ row[1],
76
+ row[2],
77
+ row[3],
78
+ false,
79
+ false,
80
+ false,
81
+ false,
82
+ false,
83
+ nil))
84
+ end
85
+ else
86
+ CSV.parse(utf8_data, **options) do |row|
87
+ yield(Record.new(row[0],
88
+ row[1].rstrip,
89
+ row[2],
90
+ row[3],
91
+ row[4],
92
+ row[5],
93
+ row[6],
94
+ row[7],
95
+ row[8],
96
+ (row[9] == "1"),
97
+ (row[10] == "1"),
98
+ (row[11] == "1"),
99
+ (row[12] == "1"),
100
+ (row[13] != "0"),
101
+ convert_change_reason(row[14])))
102
+ end
103
+ end
104
+ end
105
+ end
106
+
107
+ private
108
+ def open_data
109
+ data_url = "https://www.post.japanpost.jp/zipcode/dl"
110
+ case @reading
111
+ when :lowercase
112
+ data_url << "/kogaki/zip/ken_all.zip"
113
+ when :uppercase
114
+ data_url << "/oogaki/zip/ken_all.zip"
115
+ when :romaji
116
+ data_url << "/roman/ken_all_rome.zip"
117
+ end
118
+ data_path = cache_dir_path + "#{@reading}-ken-all.zip"
119
+ unless data_path.exist?
120
+ download(data_path, data_url)
121
+ end
122
+
123
+ Zip::File.open(data_path.to_s) do |zip_file|
124
+ zip_file.each do |entry|
125
+ next unless entry.file?
126
+ entry.get_input_stream do |input|
127
+ yield(input)
128
+ end
129
+ end
130
+ end
131
+ end
132
+
133
+ def convert_change_reason(reason)
134
+ case reason
135
+ when "0"
136
+ nil
137
+ when "1"
138
+ :new
139
+ when "2"
140
+ :japanese_addressing_system
141
+ when "3"
142
+ :land_readjustment
143
+ when "4"
144
+ :postal_district_adjustment
145
+ when "5"
146
+ :correction
147
+ when "6"
148
+ :deletion
149
+ else
150
+ :unknown
151
+ end
152
+ end
153
+ end
154
+ end
@@ -1,17 +1,109 @@
1
+ require "datasets/dictionary"
2
+
1
3
  module Datasets
2
4
  class Table
5
+ class Record
6
+ include Enumerable
7
+
8
+ def initialize(table, index)
9
+ @table = table
10
+ @index = index
11
+ end
12
+
13
+ def [](column_name_or_column_index)
14
+ @table[column_name_or_column_index][@index]
15
+ end
16
+
17
+ def each
18
+ return to_enum(__method__) unless block_given?
19
+ @table.each_column.each do |column_name, column_values|
20
+ yield(column_name, column_values[@index])
21
+ end
22
+ end
23
+
24
+ def values
25
+ @table.each_column.collect do |_column_name, column_values|
26
+ column_values[@index]
27
+ end
28
+ end
29
+
30
+ def to_h
31
+ hash = {}
32
+ each do |column_name, column_value|
33
+ hash[column_name] = column_value
34
+ end
35
+ hash
36
+ end
37
+
38
+ def inspect
39
+ "#<#{self.class.name} #{@table.dataset.metadata.name}[#{@index}] #{to_h.inspect}>"
40
+ end
41
+ end
42
+
3
43
  include Enumerable
4
44
 
45
+ attr_reader :dataset
5
46
  def initialize(dataset)
6
47
  @dataset = dataset
48
+ @dictionaries = {}
7
49
  end
8
50
 
9
- def each(&block)
51
+ def n_columns
52
+ columner_data.size
53
+ end
54
+ alias_method :size, :n_columns
55
+ alias_method :length, :n_columns
56
+
57
+ def n_rows
58
+ first_column = columner_data.first
59
+ return 0 if first_column.nil?
60
+ first_column[1].size
61
+ end
62
+
63
+ def column_names
64
+ columner_data.keys
65
+ end
66
+
67
+ def each_column(&block)
10
68
  columner_data.each(&block)
11
69
  end
70
+ alias_method :each, :each_column
71
+
72
+ def each_record
73
+ return to_enum(__method__) unless block_given?
74
+ n_rows.times do |i|
75
+ yield(Record.new(self, i))
76
+ end
77
+ end
78
+
79
+ def find_record(row)
80
+ row += n_rows if row < 0
81
+ return nil if row < 0
82
+ return nil if row >= n_rows
83
+ Record.new(self, row)
84
+ end
85
+
86
+ def [](name_or_index)
87
+ case name_or_index
88
+ when Integer
89
+ index = name_or_index
90
+ columner_data.each_with_index do |(_name, values), i|
91
+ return values if i == index
92
+ end
93
+ nil
94
+ else
95
+ name = name_or_index
96
+ columner_data[normalize_name(name)]
97
+ end
98
+ end
12
99
 
13
- def [](name)
14
- columner_data[name.to_sym]
100
+ def dictionary_encode(name)
101
+ @dictionaries[normalize_name(name)] ||= Dictionary.new(self[name])
102
+ end
103
+
104
+ def label_encode(name)
105
+ dictionary = dictionary_encode(name)
106
+ dictionary.encode(self[name])
15
107
  end
16
108
 
17
109
  def fetch_values(*keys)
@@ -55,5 +147,9 @@ module Datasets
55
147
  def columner_data
56
148
  @columns ||= to_h
57
149
  end
150
+
151
+ def normalize_name(name)
152
+ name.to_sym
153
+ end
58
154
  end
59
155
  end
@@ -1,3 +1,3 @@
1
1
  module Datasets
2
- VERSION = "0.0.6"
2
+ VERSION = "0.1.1"
3
3
  end
@@ -52,7 +52,7 @@ module Datasets
52
52
  end
53
53
 
54
54
  private
55
- def open_data
55
+ def open_data(&block)
56
56
  base_name = "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
57
57
  data_path = cache_dir_path + base_name
58
58
  unless data_path.exist?
@@ -60,15 +60,7 @@ module Datasets
60
60
  download(data_path, data_url)
61
61
  end
62
62
 
63
- input, output = IO.pipe
64
- pid = spawn("bzcat", data_path.to_s, {:out => output})
65
- begin
66
- output.close
67
- yield(input)
68
- ensure
69
- input.close
70
- Process.waitpid(pid)
71
- end
63
+ extract_bz2(data_path, &block)
72
64
  end
73
65
 
74
66
  def type_in_path
@@ -0,0 +1,64 @@
1
+ require 'csv'
2
+
3
+ require_relative 'dataset'
4
+
5
+ module Datasets
6
+ class Wine < Dataset
7
+ Record = Struct.new(:label,
8
+ :alcohol,
9
+ :malic_acid,
10
+ :ash,
11
+ :alcalinity_of_ash,
12
+ :n_magnesiums,
13
+ :total_phenols,
14
+ :total_flavonoids,
15
+ :total_nonflavanoid_phenols,
16
+ :total_proanthocyanins,
17
+ :color_intensity,
18
+ :hue,
19
+ :optical_nucleic_acid_concentration,
20
+ :n_prolines)
21
+
22
+ def initialize
23
+ super
24
+ @metadata.id = 'wine'
25
+ @metadata.name = 'Wine'
26
+ @metadata.url = 'http://archive.ics.uci.edu/ml/datasets/wine'
27
+ @metadata.description = -> { read_names }
28
+ end
29
+
30
+ def each
31
+ return to_enum(__method__) unless block_given?
32
+
33
+ open_data do |csv|
34
+ csv.each do |row|
35
+ next if row[0].nil?
36
+ record = Record.new(*row)
37
+ yield(record)
38
+ end
39
+ end
40
+ end
41
+
42
+ private
43
+
44
+ def read_names
45
+ names_path = cache_dir_path + 'wine.names'
46
+ unless names_path.exist?
47
+ names_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.names'
48
+ download(names_path, names_url)
49
+ end
50
+ names_path.read
51
+ end
52
+
53
+ def open_data
54
+ data_path = cache_dir_path + 'wine.data'
55
+ unless data_path.exist?
56
+ data_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
57
+ download(data_path, data_url)
58
+ end
59
+ CSV.open(data_path, converters: %i[numeric]) do |csv|
60
+ yield(csv)
61
+ end
62
+ end
63
+ end
64
+ end
data/red-datasets.gemspec CHANGED
@@ -34,6 +34,10 @@ Gem::Specification.new do |spec|
34
34
  spec.files += Dir.glob("doc/text/*")
35
35
  spec.test_files += Dir.glob("test/**/*")
36
36
 
37
+ spec.add_runtime_dependency("csv", ">= 3.0.5")
38
+ spec.add_runtime_dependency("rexml")
39
+ spec.add_runtime_dependency("rubyzip")
40
+
37
41
  spec.add_development_dependency("bundler")
38
42
  spec.add_development_dependency("rake")
39
43
  spec.add_development_dependency("test-unit")
data/test/helper.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require "fileutils"
2
2
  require "pathname"
3
+ require "time"
3
4
 
4
5
  require "datasets"
5
6
 
data/test/run-test.rb CHANGED
@@ -13,4 +13,6 @@ $LOAD_PATH.unshift(lib_dir.to_s)
13
13
 
14
14
  require_relative "helper"
15
15
 
16
+ ARGV.unshift("--max-diff-target-string-size=#{10 * 1024}")
17
+
16
18
  exit(Test::Unit::AutoRunner.run(true, test_dir.to_s))
@@ -0,0 +1,126 @@
1
+ class AdultTest < Test::Unit::TestCase
2
+ sub_test_case("train") do
3
+ def setup
4
+ @dataset = Datasets::Adult.new(type: :train)
5
+ end
6
+
7
+ def record(*args)
8
+ Datasets::Adult::Record.new(*args)
9
+ end
10
+
11
+ test("#each") do
12
+ records = @dataset.each.to_a
13
+ assert_equal([
14
+ 32561,
15
+ {
16
+ :age => 39,
17
+ :work_class => "State-gov",
18
+ :final_weight => 77516,
19
+ :education => "Bachelors",
20
+ :n_education_years => 13,
21
+ :marital_status => "Never-married",
22
+ :occupation => "Adm-clerical",
23
+ :relationship => "Not-in-family",
24
+ :race => "White",
25
+ :sex => "Male",
26
+ :capital_gain => 2174,
27
+ :capital_loss => 0,
28
+ :hours_per_week => 40,
29
+ :native_country => "United-States",
30
+ :label => "<=50K"
31
+ },
32
+ {
33
+ :age => 52,
34
+ :work_class => "Self-emp-inc",
35
+ :final_weight => 287927,
36
+ :education => "HS-grad",
37
+ :n_education_years => 9,
38
+ :marital_status => "Married-civ-spouse",
39
+ :occupation => "Exec-managerial",
40
+ :relationship => "Wife",
41
+ :race => "White",
42
+ :sex => "Female",
43
+ :capital_gain => 15024,
44
+ :capital_loss => 0,
45
+ :hours_per_week => 40,
46
+ :native_country => "United-States",
47
+ :label => ">50K"
48
+ }
49
+ ],
50
+ [
51
+ records.size,
52
+ records[0].to_h,
53
+ records[-1].to_h
54
+ ])
55
+ end
56
+ end
57
+
58
+ sub_test_case("test") do
59
+ def setup
60
+ @dataset = Datasets::Adult.new(type: :test)
61
+ end
62
+
63
+ def record(*args)
64
+ Datasets::Adult::Record.new(*args)
65
+ end
66
+
67
+ test("#each") do
68
+ records = @dataset.each.to_a
69
+ assert_equal([
70
+ 16281,
71
+ {
72
+ :age => 25,
73
+ :work_class => "Private",
74
+ :final_weight => 226802,
75
+ :education => "11th",
76
+ :n_education_years => 7,
77
+ :marital_status => "Never-married",
78
+ :occupation => "Machine-op-inspct",
79
+ :relationship => "Own-child",
80
+ :race => "Black",
81
+ :sex => "Male",
82
+ :capital_gain => 0,
83
+ :capital_loss => 0,
84
+ :hours_per_week => 40,
85
+ :native_country => "United-States",
86
+ :label => "<=50K."
87
+ },
88
+ {
89
+ :age => 35,
90
+ :work_class => "Self-emp-inc",
91
+ :final_weight => 182148,
92
+ :education => "Bachelors",
93
+ :n_education_years => 13,
94
+ :marital_status => "Married-civ-spouse",
95
+ :occupation => "Exec-managerial",
96
+ :relationship => "Husband",
97
+ :race => "White",
98
+ :sex => "Male",
99
+ :capital_gain => 0,
100
+ :capital_loss => 0,
101
+ :hours_per_week => 60,
102
+ :native_country => "United-States",
103
+ :label => ">50K."
104
+ }
105
+ ],
106
+ [
107
+ records.size,
108
+ records[0].to_h,
109
+ records[-1].to_h
110
+ ])
111
+ end
112
+ end
113
+
114
+ sub_test_case("#metadata") do
115
+ def setup
116
+ @dataset = Datasets::Adult.new(type: :train)
117
+ end
118
+
119
+ test("#description") do
120
+ description = @dataset.metadata.description
121
+ assert do
122
+ description.start_with?("| This data was extracted from the census bureau database found at")
123
+ end
124
+ end
125
+ end
126
+ end