red-datasets 0.1.0 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,4 @@
+module Datasets
+  class Error < StandardError
+  end
+end
@@ -2,8 +2,6 @@ require 'zlib'
 
 require_relative "dataset"
 
-class SetTypeError < StandardError; end
-
 module Datasets
   class MNIST < Dataset
     BASE_URL = "http://yann.lecun.com/exdb/mnist/"
@@ -67,7 +65,9 @@ module Datasets
       n_bytes = n_uint32s * 4
       mnist_magic_number = 2051
       magic, n_images, n_rows, n_cols = f.read(n_bytes).unpack("N*")
-      raise "This is not #{dataset_name} image file" if magic != mnist_magic_number
+      if magic != mnist_magic_number
+        raise Error, "This is not #{dataset_name} image file"
+      end
       n_images.times do |i|
         data = f.read(n_rows * n_cols)
         label = labels[i]
@@ -101,7 +101,9 @@ module Datasets
       n_bytes = n_uint32s * 2
       mnist_magic_number = 2049
       magic, n_labels = f.read(n_bytes).unpack('N2')
-      raise "This is not #{dataset_name} label file" if magic != mnist_magic_number
+      if magic != mnist_magic_number
+        raise Error, "This is not #{dataset_name} label file"
+      end
       f.read(n_labels).unpack('C*')
     end
   end
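
The three hunks above replace the top-level SetTypeError and the bare raise calls with the new Datasets::Error base class, so a corrupted or mislabeled download surfaces as a single library-specific exception. A minimal sketch of what that means for callers; only Datasets::Error and the raise sites are shown in this diff, so the MNIST constructor call and record handling here are assumptions based on the gem's public API:

    require "datasets"

    begin
      Datasets::MNIST.new.each do |record|
        # each record carries the raw image bytes and its label
      end
    rescue Datasets::Error => e
      # A file with the wrong magic number now raises Datasets::Error
      # instead of a bare RuntimeError string.
      warn "broken MNIST download: #{e.message}"
    end
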
@@ -0,0 +1,146 @@
+require_relative "dataset"
+
+module Datasets
+  module PenguinsRawData
+    Record = Struct.new(:study_name,
+                        :sample_number,
+                        :species,
+                        :region,
+                        :island,
+                        :stage,
+                        :individual_id,
+                        :clutch_completion,
+                        :date_egg,
+                        :culmen_length_mm,
+                        :culmen_depth_mm,
+                        :flipper_length_mm,
+                        :body_mass_g,
+                        :sex,
+                        :delta_15_n_permil,
+                        :delta_13_c_permil,
+                        :comments)
+    class SpeciesBase < Dataset
+      def initialize
+        super
+        species = self.class.name.split("::").last.downcase
+        @metadata.id = "palmerpenguins-raw-#{species}"
+        @metadata.url = self.class::URL
+        @metadata.licenses = ["CC0"]
+        @data_path = cache_dir_path + "penguins" + (species + ".csv")
+      end
+
+      attr_reader :data_path
+
+      def each
+        return to_enum(__method__) unless block_given?
+
+        open_data do |csv|
+          csv.each do |row|
+            next if row[0].nil?
+            record = Record.new(*row.fields)
+            yield record
+          end
+        end
+      end
+
+      private def open_data
+        download unless data_path.exist?
+        CSV.open(data_path, headers: :first_row, converters: :all) do |csv|
+          yield csv
+        end
+      end
+
+      private def download
+        super(data_path, metadata.url)
+      end
+    end
+
+    # Adelie penguin data from: https://doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86
+    class Adelie < SpeciesBase
+      DOI = "doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86".freeze
+      URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.219.3&entityid=002f3893385f710df69eeebe893144ff".freeze
+    end
+
+    # Chinstrap penguin data from: https://doi.org/10.6073/pasta/409c808f8fc9899d02401bdb04580af7
+    class Chinstrap < SpeciesBase
+      DOI = "doi.org/10.6073/pasta/409c808f8fc9899d02401bdb04580af7".freeze
+      URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.221.2&entityid=fe853aa8f7a59aa84cdd3197619ef462".freeze
+    end
+
+    # Gentoo penguin data from: https://doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce
+    class Gentoo < SpeciesBase
+      DOI = "doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce".freeze
+      URL = "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.220.3&entityid=e03b43c924f226486f2f0ab6709d2381".freeze
+    end
+  end
+
+  # This dataset provides the same dataset as https://github.com/allisonhorst/palmerpenguins
+  class Penguins < Dataset
+    Record = Struct.new(:species,
+                        :island,
+                        :bill_length_mm,
+                        :bill_depth_mm,
+                        :flipper_length_mm,
+                        :body_mass_g,
+                        :sex,
+                        :year)
+
+    def initialize
+      super
+      @metadata.id = "palmerpenguins"
+      @metadata.name = "palmerpenguins"
+      @metadata.url = "https://allisonhorst.github.io/palmerpenguins/"
+      @metadata.licenses = ["CC0"]
+      @metadata.description = "A great dataset for data exploration & visualization, as an alternative to iris"
+    end
+
+    def each(&block)
+      return to_enum(__method__) unless block_given?
+
+      species_classes = [
+        PenguinsRawData::Adelie,
+        PenguinsRawData::Chinstrap,
+        PenguinsRawData::Gentoo,
+      ]
+
+      species_classes.each do |species_class|
+        species_class.new.each do |raw_record|
+          yield convert_record(raw_record)
+        end
+      end
+    end
+
+    private def convert_record(raw_record)
+      Record.new(*cleanse_fields(raw_record))
+    end
+
+    private def cleanse_fields(raw_record)
+      species = raw_record.species.split(' ')[0]
+      flipper_length_mm = raw_record.flipper_length_mm&.to_i
+      body_mass_g = raw_record.body_mass_g&.to_i
+      sex = normalize_sex(raw_record.sex)
+      year = raw_record.date_egg&.year
+
+      [
+        species,
+        raw_record.island,
+        raw_record.culmen_length_mm,
+        raw_record.culmen_depth_mm,
+        flipper_length_mm,
+        body_mass_g,
+        sex,
+        year
+      ]
+    end
+
+    private def normalize_sex(val)
+      val = val&.downcase
+      case val
+      when "female", "male", nil
+        val
+      else
+        nil
+      end
+    end
+  end
+end
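
The hunk above adds the palmerpenguins dataset in two layers: per-species raw data (PenguinsRawData::Adelie and friends, straight from the EDI repository CSVs) and the cleansed, combined Datasets::Penguins. A sketch of the intended consumption, based only on the classes defined above; the printed fields follow the two Record structs:

    require "datasets"

    # Combined, cleansed records across all three species
    Datasets::Penguins.new.each do |record|
      p [record.species, record.island, record.bill_length_mm, record.sex, record.year]
    end

    # The raw per-species records keep the original column names
    # (culmen_length_mm rather than bill_length_mm, etc.)
    Datasets::PenguinsRawData::Adelie.new.each do |raw|
      p [raw.individual_id, raw.culmen_length_mm]
    end
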
@@ -0,0 +1,95 @@
+require_relative "dataset"
+require_relative "tar-gz-readable"
+
+module Datasets
+  class RdatasetsList < Dataset
+    Record = Struct.new(:package,
+                        :dataset,
+                        :title,
+                        :rows,
+                        :cols,
+                        :n_binary,
+                        :n_character,
+                        :n_factor,
+                        :n_logical,
+                        :n_numeric,
+                        :csv,
+                        :doc)
+
+    def initialize
+      super
+      @metadata.id = "rdatasets"
+      @metadata.name = "Rdatasets"
+      @metadata.url = "https://vincentarelbundock.github.io/Rdatasets/"
+      @metadata.licenses = ["GPL-3"]
+      @data_url = "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/datasets.csv"
+      @data_path = cache_dir_path + "datasets.csv"
+    end
+
+    def filter(package: nil, dataset: nil)
+      return to_enum(__method__, package: package, dataset: dataset) unless block_given?
+
+      conds = {}
+      conds["Package"] = package if package
+      conds["Item"] = dataset if dataset
+      if conds.empty?
+        each_row {|row| yield Record.new(*row.fields) }
+      else
+        each_row do |row|
+          if conds.all? {|k, v| row[k] == v }
+            yield Record.new(*row.fields)
+          end
+        end
+      end
+    end
+
+    def each(&block)
+      filter(&block)
+    end
+
+    private def each_row(&block)
+      download(@data_path, @data_url) unless @data_path.exist?
+      CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
+        csv.each(&block)
+      end
+    end
+  end
+
+  class Rdatasets < Dataset
+    def initialize(package_name, dataset_name)
+      list = RdatasetsList.new
+
+      info = list.filter(package: package_name, dataset: dataset_name).first
+      unless info
+        raise ArgumentError, "Unable to locate dataset #{package_name}/#{dataset_name}"
+      end
+
+      super()
+      @metadata.id = "rdatasets-#{package_name}-#{dataset_name}"
+      @metadata.name = "Rdatasets: #{package_name}: #{dataset_name}"
+      @metadata.url = info.csv
+      @metadata.licenses = ["GPL-3"]
+      @metadata.description = info.title
+
+      # Follow the original directory structure in the cache directory
+      @data_path = cache_dir_path + (dataset_name + ".csv")
+
+      @package_name = package_name
+      @dataset_name = dataset_name
+    end
+
+    def each(&block)
+      return to_enum(__method__) unless block_given?
+
+      download(@data_path, @metadata.url) unless @data_path.exist?
+      CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
+        csv.each do |row|
+          record = row.to_h
+          record.delete("")
+          record.transform_keys!(&:to_sym)
+          yield record
+        end
+      end
+    end
+  end
+end
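
The hunk above exposes Vincent Arel-Bundock's Rdatasets collection: RdatasetsList iterates the index CSV, with optional filtering by package or dataset name, and Rdatasets downloads a single dataset and yields each row as a symbol-keyed hash. A sketch under those definitions; "datasets"/"iris" is an illustrative package/dataset pair from the Rdatasets index, not something this diff guarantees:

    require "datasets"

    # Browse the index, narrowed to one R package
    Datasets::RdatasetsList.new.filter(package: "datasets") do |record|
      puts "#{record.package}/#{record.dataset}: #{record.title}"
    end

    # Fetch one dataset; each row arrives as a Hash with Symbol keys
    Datasets::Rdatasets.new("datasets", "iris").each do |row|
      p row
    end
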
@@ -0,0 +1,49 @@
+module Datasets
+  class SeabornData < Dataset
+    URL_FORMAT = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/%{name}.csv".freeze
+
+    def initialize(name)
+      super()
+      @metadata.id = "seaborn-data-#{name}"
+      @metadata.name = "SeabornData: #{name}"
+      @metadata.url = URL_FORMAT % {name: name}
+
+      @data_path = cache_dir_path + (name + ".csv")
+      @name = name
+    end
+
+    def each(&block)
+      return to_enum(__method__) unless block_given?
+
+      download(@data_path, @metadata.url) unless @data_path.exist?
+      CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
+        csv.each do |row|
+          record = prepare_record(row)
+          yield record
+        end
+      end
+    end
+
+    private
+    def prepare_record(csv_row)
+      record = csv_row.to_h
+      record.transform_keys!(&:to_sym)
+
+      # Perform the same preprocessing as seaborn's load_dataset function
+      preprocessor = :"preprocess_#{@name}_record"
+      __send__(preprocessor, record) if respond_to?(preprocessor, true)
+
+      record
+    end
+
+    # The same preprocessing as seaborn.load_dataset
+    def preprocess_flights_record(record)
+      record[:month] &&= record[:month][0,3]
+    end
+
+    # The same preprocessing as seaborn.load_dataset
+    def preprocess_penguins_record(record)
+      record[:sex] &&= record[:sex].capitalize
+    end
+  end
+end
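
SeabornData, added above, fetches a named CSV from the seaborn-data repository and mirrors the light preprocessing seaborn.load_dataset applies to the flights and penguins datasets. A usage sketch; "flights" and its year/month/passengers columns come from that repository, not from this diff:

    require "datasets"

    Datasets::SeabornData.new("flights").each do |record|
      # Keys are symbols; for "flights", :month is truncated to three
      # letters by preprocess_flights_record above
      p [record[:year], record[:month], record[:passengers]]
    end
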
@@ -0,0 +1,171 @@
+require "csv"
+
+require_relative "dataset"
+
+module Datasets
+  class SudachiSynonymDictionary < Dataset
+    class Synonym < Struct.new(:group_id,
+                               :is_noun,
+                               :expansion_type,
+                               :lexeme_id,
+                               :form_type,
+                               :acronym_type,
+                               :variant_type,
+                               :categories,
+                               :notation)
+      alias_method :noun?, :is_noun
+    end
+
+    def initialize
+      super()
+      @metadata.id = "sudachi-synonym-dictionary"
+      @metadata.name = "Sudachi synonym dictionary"
+      @metadata.url = "https://github.com/WorksApplications/SudachiDict/blob/develop/docs/synonyms.md"
+      @metadata.licenses = [
+        "Apache-2.0",
+      ]
+      @metadata.description = lambda do
+        download_description
+      end
+    end
+
+    def each
+      return to_enum(__method__) unless block_given?
+
+      lexeme_id_context = {}
+      open_data do |csv|
+        csv.each do |row|
+          group_id = row[0]
+          if group_id != lexeme_id_context[:group_id]
+            lexeme_id_context[:group_id] = group_id
+            lexeme_id_context[:counter] = 0
+          end
+          is_noun = (row[1] == "1")
+          expansion_type = normalize_expansion_type(row[2])
+          lexeme_id = normalize_lexeme_id(row[3], lexeme_id_context)
+          form_type = normalize_form_type(row[4])
+          acronym_type = normalize_acronym_type(row[5])
+          variant_type = normalize_variant_type(row[6])
+          categories = normalize_categories(row[7])
+          notation = row[8]
+          synonym = Synonym.new(group_id,
+                                is_noun,
+                                expansion_type,
+                                lexeme_id,
+                                form_type,
+                                acronym_type,
+                                variant_type,
+                                categories,
+                                notation)
+          yield(synonym)
+        end
+      end
+    end
+
+    private
+    def open_data
+      data_path = cache_dir_path + "synonyms.txt"
+      unless data_path.exist?
+        data_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/src/main/text/synonyms.txt"
+        download(data_path, data_url)
+      end
+      CSV.open(data_path,
+               encoding: "UTF-8",
+               skip_blanks: true) do |csv|
+        yield(csv)
+      end
+    end
+
+    def download_description
+      description_path = cache_dir_path + "synonyms.md"
+      unless description_path.exist?
+        description_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/docs/synonyms.md"
+        download(description_path, description_url)
+      end
+      description_path.read
+    end
+
+    def normalize_expansion_type(type)
+      case type
+      when "0", ""
+        :always
+      when "1"
+        :expanded
+      when "2"
+        :never
+      else
+        raise Error, "unknown expansion type: #{type.inspect}"
+      end
+    end
+
+    def normalize_lexeme_id(id, context)
+      case id
+      when ""
+        context[:counter] += 1
+        context[:counter]
+      else
+        # Use only the first lexeme ID.
+        # Example:
+        #   000116,1,0,1/2,0,2,0,(IT/娯楽),ネットゲー,,
+        #   000116,1,0,1/2,0,2,0,(IT/娯楽),ネトゲ,,
+        Integer(id.split("/").first, 10)
+      end
+    end
+
+    def normalize_form_type(type)
+      case type
+      when "0", ""
+        :typical
+      when "1"
+        :translation
+      when "2"
+        :alias
+      when "3"
+        :old_name
+      when "4"
+        :misnomer
+      else
+        raise Error, "unknown form type: #{type.inspect}"
+      end
+    end
+
+    def normalize_acronym_type(type)
+      case type
+      when "0", ""
+        :typical
+      when "1"
+        :alphabet
+      when "2"
+        :others
+      else
+        raise Error, "unknown acronym type: #{type.inspect}"
+      end
+    end
+
+    def normalize_variant_type(type)
+      case type
+      when "0", ""
+        :typical
+      when "1"
+        :alphabet
+      when "2"
+        :general
+      when "3"
+        :misspelled
+      else
+        raise Error, "unknown variant type: #{type.inspect}"
+      end
+    end
+
+    def normalize_categories(categories)
+      case categories
+      when ""
+        nil
+      when /\A\((.*)\)\z/
+        $1.split("/")
+      else
+        raise Error, "invalid categories: #{categories.inspect}"
+      end
+    end
+  end
+end
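
(Note: inside normalize_lexeme_id the published code referenced lexeme_id_context, a variable that does not exist in that method's scope; the body above uses the context parameter, which is what the caller passes in.)

Finally, SudachiSynonymDictionary streams SudachiDict's synonyms.txt and turns each CSV row into a Synonym struct, with the numeric type codes normalized to symbols by the helpers above. A minimal sketch of iterating it, using only the API defined in this hunk:

    require "datasets"

    Datasets::SudachiSynonymDictionary.new.each do |synonym|
      next unless synonym.noun?
      # group_id links alternative notations of the same concept
      p [synonym.group_id, synonym.notation, synonym.expansion_type]
    end
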