red-datasets 0.1.4 → 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (74) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +23 -3
  3. data/Rakefile +56 -1
  4. data/doc/text/news.md +102 -0
  5. data/lib/datasets/adult.rb +6 -9
  6. data/lib/datasets/afinn.rb +48 -0
  7. data/lib/datasets/aozora-bunko.rb +196 -0
  8. data/lib/datasets/cache-path.rb +28 -0
  9. data/lib/datasets/california-housing.rb +60 -0
  10. data/lib/datasets/cifar.rb +2 -4
  11. data/lib/datasets/cldr-plurals.rb +2 -4
  12. data/lib/datasets/communities.rb +5 -8
  13. data/lib/datasets/dataset.rb +58 -23
  14. data/lib/datasets/diamonds.rb +26 -0
  15. data/lib/datasets/downloader.rb +110 -30
  16. data/lib/datasets/e-stat-japan.rb +2 -1
  17. data/lib/datasets/fashion-mnist.rb +4 -0
  18. data/lib/datasets/fuel-economy.rb +35 -0
  19. data/lib/datasets/geolonia.rb +67 -0
  20. data/lib/datasets/ggplot2-dataset.rb +79 -0
  21. data/lib/datasets/hepatitis.rb +5 -8
  22. data/lib/datasets/iris.rb +5 -8
  23. data/lib/datasets/ita-corpus.rb +57 -0
  24. data/lib/datasets/kuzushiji-mnist.rb +16 -0
  25. data/lib/datasets/lazy.rb +90 -0
  26. data/lib/datasets/libsvm-dataset-list.rb +5 -8
  27. data/lib/datasets/libsvm.rb +3 -4
  28. data/lib/datasets/license.rb +26 -0
  29. data/lib/datasets/livedoor-news.rb +80 -0
  30. data/lib/datasets/metadata.rb +14 -0
  31. data/lib/datasets/mnist.rb +7 -7
  32. data/lib/datasets/mushroom.rb +5 -8
  33. data/lib/datasets/nagoya-university-conversation-corpus.rb +109 -0
  34. data/lib/datasets/penguins.rb +6 -8
  35. data/lib/datasets/penn-treebank.rb +2 -4
  36. data/lib/datasets/pmjt-dataset-list.rb +67 -0
  37. data/lib/datasets/postal-code-japan.rb +2 -6
  38. data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
  39. data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
  40. data/lib/datasets/seaborn.rb +90 -0
  41. data/lib/datasets/sudachi-synonym-dictionary.rb +5 -11
  42. data/lib/datasets/version.rb +1 -1
  43. data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
  44. data/lib/datasets/wikipedia.rb +16 -8
  45. data/lib/datasets/wine.rb +6 -9
  46. data/lib/datasets/zip-extractor.rb +48 -0
  47. data/lib/datasets.rb +2 -22
  48. data/red-datasets.gemspec +1 -1
  49. data/test/helper.rb +21 -0
  50. data/test/test-afinn.rb +60 -0
  51. data/test/test-aozora-bunko.rb +190 -0
  52. data/test/test-california-housing.rb +56 -0
  53. data/test/test-cldr-plurals.rb +1 -1
  54. data/test/test-dataset.rb +15 -7
  55. data/test/test-diamonds.rb +71 -0
  56. data/test/test-fuel-economy.rb +75 -0
  57. data/test/test-geolonia.rb +65 -0
  58. data/test/test-ita-corpus.rb +69 -0
  59. data/test/test-kuzushiji-mnist.rb +137 -0
  60. data/test/test-license.rb +24 -0
  61. data/test/test-livedoor-news.rb +351 -0
  62. data/test/test-metadata.rb +36 -0
  63. data/test/test-nagoya-university-conversation-corpus.rb +132 -0
  64. data/test/test-penguins.rb +1 -1
  65. data/test/test-pmjt-dataset-list.rb +50 -0
  66. data/test/test-quora-duplicate-question-pair.rb +33 -0
  67. data/test/test-rdataset.rb +246 -0
  68. data/test/{test-seaborn-data.rb → test-seaborn.rb} +71 -4
  69. data/test/test-sudachi-synonym-dictionary.rb +5 -5
  70. data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
  71. data/test/test-wikipedia.rb +25 -71
  72. metadata +62 -14
  73. data/lib/datasets/seaborn-data.rb +0 -49
  74. data/test/test-rdatasets.rb +0 -136
@@ -2,7 +2,7 @@ require_relative "dataset"
2
2
  require_relative "tar-gz-readable"
3
3
 
4
4
  module Datasets
5
- class RdatasetsList < Dataset
5
+ class RdatasetList < Dataset
6
6
  Record = Struct.new(:package,
7
7
  :dataset,
8
8
  :title,
@@ -18,8 +18,8 @@ module Datasets
18
18
 
19
19
  def initialize
20
20
  super
21
- @metadata.id = "rdatasets"
22
- @metadata.name = "Rdatasets"
21
+ @metadata.id = "rdataset-list"
22
+ @metadata.name = "Rdataset"
23
23
  @metadata.url = "https://vincentarelbundock.github.io/Rdatasets/"
24
24
  @metadata.licenses = ["GPL-3"]
25
25
  @data_url = "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/datasets.csv"
@@ -48,16 +48,19 @@ module Datasets
48
48
  end
49
49
 
50
50
  private def each_row(&block)
51
- download(@data_path, @data_url) unless @data_path.exist?
51
+ download(@data_path, @data_url)
52
52
  CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
53
53
  csv.each(&block)
54
54
  end
55
55
  end
56
56
  end
57
57
 
58
- class Rdatasets < Dataset
58
+ # For backward compatibility
59
+ RdatasetsList = RdatasetList
60
+
61
+ class Rdataset < Dataset
59
62
  def initialize(package_name, dataset_name)
60
- list = RdatasetsList.new
63
+ list = RdatasetList.new
61
64
 
62
65
  info = list.filter(package: package_name, dataset: dataset_name).first
63
66
  unless info
@@ -65,8 +68,8 @@ module Datasets
65
68
  end
66
69
 
67
70
  super()
68
- @metadata.id = "rdatasets-#{package_name}-#{dataset_name}"
69
- @metadata.name = "Rdatasets: #{package_name}: #{dataset_name}"
71
+ @metadata.id = "rdataset-#{package_name}-#{dataset_name}"
72
+ @metadata.name = "Rdataset: #{package_name}: #{dataset_name}"
70
73
  @metadata.url = info.csv
71
74
  @metadata.licenses = ["GPL-3"]
72
75
  @metadata.description = info.title
@@ -81,15 +84,63 @@ module Datasets
81
84
  def each(&block)
82
85
  return to_enum(__method__) unless block_given?
83
86
 
84
- download(@data_path, @metadata.url) unless @data_path.exist?
85
- CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
86
- csv.each do |row|
87
- record = row.to_h
88
- record.delete("")
89
- record.transform_keys!(&:to_sym)
90
- yield record
87
+ download(@data_path, @metadata.url)
88
+
89
+ na_converter = lambda do |field|
90
+ begin
91
+ if field.encode(CSV::ConverterEncoding) == "NA"
92
+ nil
93
+ else
94
+ field
95
+ end
96
+ rescue
97
+ field
91
98
  end
92
99
  end
100
+
101
+ inf_converter = lambda do |field|
102
+ begin
103
+ if field.encode(CSV::ConverterEncoding) == "Inf"
104
+ Float::INFINITY
105
+ else
106
+ field
107
+ end
108
+ rescue
109
+ field
110
+ end
111
+ end
112
+
113
+ quote_preserving_converter = lambda do |field, info|
114
+ f = field.encode(CSV::ConverterEncoding)
115
+ return f if info.quoted?
116
+
117
+ begin
118
+ begin
119
+ begin
120
+ return DateTime.parse(f) if f.match?(DateTimeMatcher)
121
+ rescue
122
+ return Integer(f)
123
+ end
124
+ rescue
125
+ return Float(f)
126
+ end
127
+ rescue
128
+ field
129
+ end
130
+ end
131
+
132
+ table = CSV.table(@data_path,
133
+ header_converters: [:symbol_raw],
134
+ # quote_preserving_converter should be the last
135
+ converters: [na_converter, inf_converter, quote_preserving_converter])
136
+ table.delete(:"") # delete 1st column for indices.
137
+
138
+ table.each do |row|
139
+ yield row.to_h
140
+ end
93
141
  end
94
142
  end
143
+
144
+ # For backward compatibility
145
+ Rdatasets = Rdataset
95
146
  end
@@ -0,0 +1,90 @@
1
+ require "json"
2
+
3
+ module Datasets
4
+ class SeabornList < Dataset
5
+ def initialize
6
+ super
7
+ @metadata.id = "seaborn-data-list"
8
+ @metadata.name = "seaborn: data list"
9
+ @metadata.url = "https://github.com/mwaskom/seaborn-data"
10
+ # Treat as the same license as seaborn
11
+ @metadata.licenses = ["BSD-3-Clause"]
12
+ @metadata.description = "Datasets for seaborn examples."
13
+ end
14
+
15
+ def each(&block)
16
+ return to_enum(__method__) unless block_given?
17
+
18
+ data_path = cache_dir_path + "trees.json"
19
+ url = "https://api.github.com/repos/mwaskom/seaborn-data/git/trees/master"
20
+ download(data_path, url)
21
+
22
+ tree = JSON.parse(File.read(data_path))["tree"]
23
+ tree.each do |content|
24
+ path = content["path"]
25
+ next unless path.end_with?(".csv")
26
+ dataset = File.basename(path, ".csv")
27
+ record = {dataset: dataset}
28
+ yield record
29
+ end
30
+ end
31
+ end
32
+
33
+ class Seaborn < Dataset
34
+ URL_FORMAT = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/%{name}.csv".freeze
35
+
36
+ def initialize(name)
37
+ super()
38
+ @metadata.id = "seaborn-#{name}"
39
+ @metadata.name = "seaborn: #{name}"
40
+ @metadata.url = URL_FORMAT % {name: name}
41
+ # @metadata.licenses = TODO
42
+
43
+ @name = name
44
+ end
45
+
46
+ def each(&block)
47
+ return to_enum(__method__) unless block_given?
48
+
49
+ data_path = cache_dir_path + "#{@name}.csv"
50
+ download(data_path, @metadata.url)
51
+ CSV.open(data_path, headers: :first_row, converters: :all) do |csv|
52
+ csv.each do |row|
53
+ record = prepare_record(row)
54
+ yield record
55
+ end
56
+ end
57
+ end
58
+
59
+ private
60
+ def prepare_record(csv_row)
61
+ record = csv_row.to_h
62
+ record.transform_keys! do |key|
63
+ if key.nil?
64
+ :index
65
+ else
66
+ key.to_sym
67
+ end
68
+ end
69
+
70
+ # Perform the same preprocessing as seaborn's load_dataset function
71
+ preprocessor = :"preprocess_#{@name}_record"
72
+ __send__(preprocessor, record) if respond_to?(preprocessor, true)
73
+
74
+ record
75
+ end
76
+
77
+ # The same preprocessing as seaborn.load_dataset
78
+ def preprocess_flights_record(record)
79
+ record[:month] &&= record[:month][0,3]
80
+ end
81
+
82
+ # The same preprocessing as seaborn.load_dataset
83
+ def preprocess_penguins_record(record)
84
+ record[:sex] &&= record[:sex].capitalize
85
+ end
86
+ end
87
+
88
+ # For backward compatibility
89
+ SeabornData = Seaborn
90
+ end
@@ -21,9 +21,7 @@ module Datasets
21
21
  @metadata.id = "sudachi-synonym-dictionary"
22
22
  @metadata.name = "Sudachi synonym dictionary"
23
23
  @metadata.url = "https://github.com/WorksApplications/SudachiDict/blob/develop/docs/synonyms.md"
24
- @metadata.licenses = [
25
- "Apache-2.0",
26
- ]
24
+ @metadata.licenses = ["Apache-2.0"]
27
25
  @metadata.description = lambda do
28
26
  download_description
29
27
  end
@@ -65,10 +63,8 @@ module Datasets
65
63
  private
66
64
  def open_data
67
65
  data_path = cache_dir_path + "synonyms.txt"
68
- unless data_path.exist?
69
- data_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/src/main/text/synonyms.txt"
70
- download(data_path, data_url)
71
- end
66
+ data_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/src/main/text/synonyms.txt"
67
+ download(data_path, data_url)
72
68
  CSV.open(data_path,
73
69
  encoding: "UTF-8",
74
70
  skip_blanks: true) do |csv|
@@ -78,10 +74,8 @@ module Datasets
78
74
 
79
75
  def download_description
80
76
  description_path = cache_dir_path + "synonyms.md"
81
- unless description_path.exist?
82
- description_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/docs/synonyms.md"
83
- download(description_path, description_url)
84
- end
77
+ description_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/docs/synonyms.md"
78
+ download(description_path, description_url)
85
79
  description_path.read
86
80
  end
87
81
 
@@ -1,3 +1,3 @@
1
1
  module Datasets
2
- VERSION = "0.1.4"
2
+ VERSION = "0.1.6"
3
3
  end
@@ -0,0 +1,219 @@
1
+ require "csv"
2
+ require "rexml/streamlistener"
3
+ require "rexml/parsers/baseparser"
4
+ require "rexml/parsers/streamparser"
5
+ require "time"
6
+
7
+ require_relative "dataset"
8
+ require_relative "tar-gz-readable"
9
+
10
+ module Datasets
11
+ class WikipediaKyotoJapaneseEnglish < Dataset
12
+ include TarGzReadable
13
+
14
+ Article = Struct.new(:source,
15
+ :copyright,
16
+ :contents,
17
+ :sections)
18
+
19
+ Section = Struct.new(:id,
20
+ :title,
21
+ :contents)
22
+
23
+ class Title < Struct.new(:section,
24
+ :japanese,
25
+ :english)
26
+ def title?
27
+ true
28
+ end
29
+
30
+ def sentence?
31
+ false
32
+ end
33
+ end
34
+
35
+ Paragraph = Struct.new(:id,
36
+ :sentences)
37
+
38
+ class Sentence < Struct.new(:id,
39
+ :section,
40
+ :paragraph,
41
+ :japanese,
42
+ :english)
43
+ def title?
44
+ false
45
+ end
46
+
47
+ def sentence?
48
+ true
49
+ end
50
+ end
51
+
52
+ Entry = Struct.new(:japanese,
53
+ :english)
54
+
55
+ def initialize(type: :article)
56
+ unless [:article, :lexicon].include?(type)
57
+ raise ArgumentError, "Please set type :article or :lexicon: #{type.inspect}"
58
+ end
59
+
60
+ super()
61
+ @type = type
62
+ @metadata.id = "wikipedia-kyoto-japanese-english"
63
+ @metadata.name =
64
+ "The Japanese-English Bilingual Corpus of Wikipedia's Kyoto Articles"
65
+ @metadata.url = "https://alaginrc.nict.go.jp/WikiCorpus/index_E.html"
66
+ @metadata.licenses = ["CC-BY-SA-3.0"]
67
+ @metadata.description = <<-DESCRIPTION
68
+ "The Japanese-English Bilingual Corpus of Wikipedia's Kyoto Articles"
69
+ aims mainly at supporting research and development relevant to
70
+ high-performance multilingual machine translation, information
71
+ extraction, and other language processing technologies. The National
72
+ Institute of Information and Communications Technology (NICT) has
73
+ created this corpus by manually translating Japanese Wikipedia
74
+ articles (related to Kyoto) into English.
75
+ DESCRIPTION
76
+ end
77
+
78
+ def each(&block)
79
+ return to_enum(__method__) unless block_given?
80
+
81
+ data_path = download_tar_gz
82
+
83
+ open_tar_gz(data_path) do |tar|
84
+ tar.each do |entry|
85
+ next unless entry.file?
86
+ base_name = File.basename(entry.full_name)
87
+ case @type
88
+ when :article
89
+ next unless base_name.end_with?(".xml")
90
+ listener = ArticleListener.new(block)
91
+ parser = REXML::Parsers::StreamParser.new(entry.read, listener)
92
+ parser.parse
93
+ when :lexicon
94
+ next unless base_name == "kyoto_lexicon.csv"
95
+ is_header = true
96
+ CSV.parse(entry.read.force_encoding("UTF-8")) do |row|
97
+ if is_header
98
+ is_header = false
99
+ next
100
+ end
101
+ yield(Entry.new(row[0], row[1]))
102
+ end
103
+ end
104
+ end
105
+ end
106
+ end
107
+
108
+ private
109
+ def download_tar_gz
110
+ base_name = "wiki_corpus_2.01.tar.gz"
111
+ data_path = cache_dir_path + base_name
112
+ data_url = "https://alaginrc.nict.go.jp/WikiCorpus/src/#{base_name}"
113
+ download(data_path, data_url)
114
+ data_path
115
+ end
116
+
117
+ class ArticleListener
118
+ include REXML::StreamListener
119
+
120
+ def initialize(block)
121
+ @block = block
122
+ @article = nil
123
+ @title = nil
124
+ @section = nil
125
+ @page = nil
126
+ @sentence = nil
127
+ @text_container_stack = []
128
+ @element_stack = []
129
+ @text_stack = [""]
130
+ end
131
+
132
+ def tag_start(name, attributes)
133
+ push_stacks(name, attributes)
134
+ case name
135
+ when "art"
136
+ @article = Article.new
137
+ @article.contents = []
138
+ @article.sections = []
139
+ when "tit"
140
+ @title = Title.new
141
+ @title.section = @section
142
+ @text_container_stack.push(@title)
143
+ when "sec"
144
+ @section = Section.new
145
+ @section.id = attributes["id"]
146
+ @section.contents = []
147
+ @text_container_stack.push(@section)
148
+ when "par"
149
+ @paragraph = Paragraph.new
150
+ @paragraph.id = attributes["id"]
151
+ @paragraph.sentences = []
152
+ @text_container_stack.push(@paragraph)
153
+ when "sen"
154
+ @sentence = Sentence.new
155
+ @sentence.id = attributes["id"]
156
+ @text_container_stack.push(@sentence)
157
+ end
158
+ end
159
+
160
+ def tag_end(name)
161
+ case name
162
+ when "art"
163
+ @block.call(@article)
164
+ @article = nil
165
+ when "inf"
166
+ @article.source = @text_stack.last
167
+ when "copyright"
168
+ @article.copyright = @text_stack.last
169
+ when "tit"
170
+ @article.contents << @title
171
+ if @section
172
+ @section.title = @title
173
+ @section.contents << @title
174
+ end
175
+ @title = nil
176
+ @text_container_stack.pop
177
+ when "sec"
178
+ @article.sections << @section
179
+ @section = nil
180
+ @text_container_stack.pop
181
+ when "par"
182
+ @paragraph = nil
183
+ @text_container_stack.pop
184
+ when "sen"
185
+ @article.contents << @sentence
186
+ @sentence.section = @section
187
+ @section.contents << @sentence if @section
188
+ @sentence.paragraph = @paragraph
189
+ @paragraph.sentences << @sentence if @paragraph
190
+ @sentence = nil
191
+ @text_container_stack.pop
192
+ when "j"
193
+ @text_container_stack.last.japanese = @text_stack.last
194
+ when "e"
195
+ attributes = @element_stack.last[:attributes]
196
+ if attributes["type"] == "check"
197
+ @text_container_stack.last.english = @text_stack.last
198
+ end
199
+ end
200
+ pop_stacks
201
+ end
202
+
203
+ def text(data)
204
+ @text_stack.last << data
205
+ end
206
+
207
+ private
208
+ def push_stacks(name, attributes)
209
+ @element_stack.push({name: name, attributes: attributes})
210
+ @text_stack.push("")
211
+ end
212
+
213
+ def pop_stacks
214
+ @text_stack.pop
215
+ @element_stack.pop
216
+ end
217
+ end
218
+ end
219
+ end
@@ -1,6 +1,7 @@
1
1
  require "rexml/streamlistener"
2
2
  require "rexml/parsers/baseparser"
3
3
  require "rexml/parsers/streamparser"
4
+ require "time"
4
5
 
5
6
  require_relative "dataset"
6
7
 
@@ -52,15 +53,22 @@ module Datasets
52
53
  end
53
54
 
54
55
  private
56
+ def base_name
57
+ "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
58
+ end
59
+
60
+ def data_path
61
+ cache_dir_path + base_name
62
+ end
63
+
55
64
  def open_data(&block)
56
- base_name = "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
57
- data_path = cache_dir_path + base_name
58
- unless data_path.exist?
59
- data_url = "https://dumps.wikimedia.org/#{@language}wiki/latest/#{base_name}"
60
- download(data_path, data_url)
65
+ data_url = "https://dumps.wikimedia.org/#{@language}wiki/latest/#{base_name}"
66
+ bz2 = Enumerator.new do |yielder|
67
+ download(data_path, data_url) do |bz2_chunk|
68
+ yielder << bz2_chunk
69
+ end
61
70
  end
62
-
63
- extract_bz2(data_path, &block)
71
+ extract_bz2(bz2, &block)
64
72
  end
65
73
 
66
74
  def type_in_path
@@ -153,7 +161,7 @@ module Datasets
153
161
  @text_stack.last << data
154
162
  end
155
163
 
156
- def cdata(contnet)
164
+ def cdata(content)
157
165
  @text_stack.last << content
158
166
  end
159
167
 
data/lib/datasets/wine.rb CHANGED
@@ -23,7 +23,8 @@ module Datasets
23
23
  super
24
24
  @metadata.id = 'wine'
25
25
  @metadata.name = 'Wine'
26
- @metadata.url = 'http://archive.ics.uci.edu/ml/datasets/wine'
26
+ @metadata.url = 'https://archive.ics.uci.edu/ml/datasets/wine'
27
+ @metadata.licenses = ["CC-BY-4.0"]
27
28
  @metadata.description = -> { read_names }
28
29
  end
29
30
 
@@ -43,19 +44,15 @@ module Datasets
43
44
 
44
45
  def read_names
45
46
  names_path = cache_dir_path + 'wine.names'
46
- unless names_path.exist?
47
- names_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.names'
48
- download(names_path, names_url)
49
- end
47
+ names_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.names'
48
+ download(names_path, names_url)
50
49
  names_path.read
51
50
  end
52
51
 
53
52
  def open_data
54
53
  data_path = cache_dir_path + 'wine.data'
55
- unless data_path.exist?
56
- data_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
57
- download(data_path, data_url)
58
- end
54
+ data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
55
+ download(data_path, data_url)
59
56
  CSV.open(data_path, converters: %i[numeric]) do |csv|
60
57
  yield(csv)
61
58
  end
@@ -0,0 +1,48 @@
1
+ require 'zip'
2
+
3
+ module Datasets
4
+ class ZipExtractor
5
+ def initialize(path)
6
+ @path = path
7
+ end
8
+
9
+ def extract_first_file
10
+ Zip::File.open(@path) do |zip_file|
11
+ zip_file.each do |entry|
12
+ next unless entry.file?
13
+
14
+ entry.get_input_stream do |input|
15
+ return yield(input)
16
+ end
17
+ end
18
+ end
19
+ nil
20
+ end
21
+
22
+ def extract_file(file_path)
23
+ Zip::File.open(@path) do |zip_file|
24
+ zip_file.each do |entry|
25
+ next unless entry.file?
26
+ next unless entry.name == file_path
27
+
28
+ entry.get_input_stream do |input|
29
+ return yield(input)
30
+ end
31
+ end
32
+ end
33
+ nil
34
+ end
35
+
36
+ def extract_files
37
+ Zip::File.open(@path) do |zip_file|
38
+ zip_file.each do |entry|
39
+ next unless entry.file?
40
+
41
+ entry.get_input_stream do |input|
42
+ yield(input)
43
+ end
44
+ end
45
+ end
46
+ end
47
+ end
48
+ end
data/lib/datasets.rb CHANGED
@@ -1,22 +1,2 @@
1
- require_relative "datasets/version"
2
-
3
- require_relative "datasets/adult"
4
- require_relative "datasets/cifar"
5
- require_relative "datasets/cldr-plurals"
6
- require_relative "datasets/communities"
7
- require_relative "datasets/e-stat-japan"
8
- require_relative "datasets/fashion-mnist"
9
- require_relative "datasets/hepatitis"
10
- require_relative "datasets/iris"
11
- require_relative "datasets/libsvm"
12
- require_relative "datasets/libsvm-dataset-list"
13
- require_relative "datasets/mnist"
14
- require_relative "datasets/mushroom"
15
- require_relative "datasets/penguins"
16
- require_relative "datasets/penn-treebank"
17
- require_relative "datasets/postal-code-japan"
18
- require_relative "datasets/rdatasets"
19
- require_relative "datasets/seaborn-data"
20
- require_relative "datasets/sudachi-synonym-dictionary"
21
- require_relative "datasets/wikipedia"
22
- require_relative "datasets/wine"
1
+ require_relative "datasets/lazy"
2
+ Datasets::LAZY_LOADER.load_all
data/red-datasets.gemspec CHANGED
@@ -34,7 +34,7 @@ Gem::Specification.new do |spec|
34
34
  spec.files += Dir.glob("doc/text/*")
35
35
  spec.test_files += Dir.glob("test/**/*")
36
36
 
37
- spec.add_runtime_dependency("csv", ">= 3.0.5")
37
+ spec.add_runtime_dependency("csv", ">= 3.2.4")
38
38
  spec.add_runtime_dependency("rexml")
39
39
  spec.add_runtime_dependency("rubyzip")
40
40
 
data/test/helper.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  require "fileutils"
2
2
  require "pathname"
3
3
  require "time"
4
+ require "tmpdir"
4
5
 
5
6
  require "datasets"
6
7
 
@@ -18,4 +19,24 @@ module Helper
18
19
  FileUtils.rm_rf(@tmp_dir)
19
20
  end
20
21
  end
22
+
23
+ module PathRestorable
24
+ def restore_path(path)
25
+ unless path.exist?
26
+ return yield
27
+ end
28
+
29
+ Dir.mktmpdir do |dir|
30
+ FileUtils.cp_r(path, dir, preserve: true)
31
+ begin
32
+ yield
33
+ ensure
34
+ FileUtils.rmtree(path, secure: true) if path.exist?
35
+ FileUtils.cp_r(Pathname(dir) + path.basename,
36
+ path,
37
+ preserve: true)
38
+ end
39
+ end
40
+ end
41
+ end
21
42
  end