red-datasets 0.1.4 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. checksums.yaml +4 -4
  2. data/README.md +23 -3
  3. data/Rakefile +56 -1
  4. data/doc/text/news.md +102 -0
  5. data/lib/datasets/adult.rb +6 -9
  6. data/lib/datasets/afinn.rb +48 -0
  7. data/lib/datasets/aozora-bunko.rb +196 -0
  8. data/lib/datasets/cache-path.rb +28 -0
  9. data/lib/datasets/california-housing.rb +60 -0
  10. data/lib/datasets/cifar.rb +2 -4
  11. data/lib/datasets/cldr-plurals.rb +2 -4
  12. data/lib/datasets/communities.rb +5 -8
  13. data/lib/datasets/dataset.rb +58 -23
  14. data/lib/datasets/diamonds.rb +26 -0
  15. data/lib/datasets/downloader.rb +110 -30
  16. data/lib/datasets/e-stat-japan.rb +2 -1
  17. data/lib/datasets/fashion-mnist.rb +4 -0
  18. data/lib/datasets/fuel-economy.rb +35 -0
  19. data/lib/datasets/geolonia.rb +67 -0
  20. data/lib/datasets/ggplot2-dataset.rb +79 -0
  21. data/lib/datasets/hepatitis.rb +5 -8
  22. data/lib/datasets/iris.rb +5 -8
  23. data/lib/datasets/ita-corpus.rb +57 -0
  24. data/lib/datasets/kuzushiji-mnist.rb +16 -0
  25. data/lib/datasets/lazy.rb +90 -0
  26. data/lib/datasets/libsvm-dataset-list.rb +5 -8
  27. data/lib/datasets/libsvm.rb +3 -4
  28. data/lib/datasets/license.rb +26 -0
  29. data/lib/datasets/livedoor-news.rb +80 -0
  30. data/lib/datasets/metadata.rb +14 -0
  31. data/lib/datasets/mnist.rb +7 -7
  32. data/lib/datasets/mushroom.rb +5 -8
  33. data/lib/datasets/nagoya-university-conversation-corpus.rb +109 -0
  34. data/lib/datasets/penguins.rb +6 -8
  35. data/lib/datasets/penn-treebank.rb +2 -4
  36. data/lib/datasets/pmjt-dataset-list.rb +67 -0
  37. data/lib/datasets/postal-code-japan.rb +2 -6
  38. data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
  39. data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
  40. data/lib/datasets/seaborn.rb +90 -0
  41. data/lib/datasets/sudachi-synonym-dictionary.rb +5 -11
  42. data/lib/datasets/version.rb +1 -1
  43. data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
  44. data/lib/datasets/wikipedia.rb +16 -8
  45. data/lib/datasets/wine.rb +6 -9
  46. data/lib/datasets/zip-extractor.rb +48 -0
  47. data/lib/datasets.rb +2 -22
  48. data/red-datasets.gemspec +1 -1
  49. data/test/helper.rb +21 -0
  50. data/test/test-afinn.rb +60 -0
  51. data/test/test-aozora-bunko.rb +190 -0
  52. data/test/test-california-housing.rb +56 -0
  53. data/test/test-cldr-plurals.rb +1 -1
  54. data/test/test-dataset.rb +15 -7
  55. data/test/test-diamonds.rb +71 -0
  56. data/test/test-fuel-economy.rb +75 -0
  57. data/test/test-geolonia.rb +65 -0
  58. data/test/test-ita-corpus.rb +69 -0
  59. data/test/test-kuzushiji-mnist.rb +137 -0
  60. data/test/test-license.rb +24 -0
  61. data/test/test-livedoor-news.rb +351 -0
  62. data/test/test-metadata.rb +36 -0
  63. data/test/test-nagoya-university-conversation-corpus.rb +132 -0
  64. data/test/test-penguins.rb +1 -1
  65. data/test/test-pmjt-dataset-list.rb +50 -0
  66. data/test/test-quora-duplicate-question-pair.rb +33 -0
  67. data/test/test-rdataset.rb +246 -0
  68. data/test/{test-seaborn-data.rb → test-seaborn.rb} +71 -4
  69. data/test/test-sudachi-synonym-dictionary.rb +5 -5
  70. data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
  71. data/test/test-wikipedia.rb +25 -71
  72. metadata +62 -14
  73. data/lib/datasets/seaborn-data.rb +0 -49
  74. data/test/test-rdatasets.rb +0 -136
data/lib/datasets/{rdatasets.rb → rdataset.rb} CHANGED
@@ -2,7 +2,7 @@ require_relative "dataset"
2
2
  require_relative "tar-gz-readable"
3
3
 
4
4
  module Datasets
5
- class RdatasetsList < Dataset
5
+ class RdatasetList < Dataset
6
6
  Record = Struct.new(:package,
7
7
  :dataset,
8
8
  :title,
@@ -18,8 +18,8 @@ module Datasets
18
18
 
19
19
  def initialize
20
20
  super
21
- @metadata.id = "rdatasets"
22
- @metadata.name = "Rdatasets"
21
+ @metadata.id = "rdataset-list"
22
+ @metadata.name = "Rdataset"
23
23
  @metadata.url = "https://vincentarelbundock.github.io/Rdatasets/"
24
24
  @metadata.licenses = ["GPL-3"]
25
25
  @data_url = "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/datasets.csv"
@@ -48,16 +48,19 @@ module Datasets
48
48
  end
49
49
 
50
50
  private def each_row(&block)
51
- download(@data_path, @data_url) unless @data_path.exist?
51
+ download(@data_path, @data_url)
52
52
  CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
53
53
  csv.each(&block)
54
54
  end
55
55
  end
56
56
  end
57
57
 
58
- class Rdatasets < Dataset
58
+ # For backward compatibility
59
+ RdatasetsList = RdatasetList
60
+
61
+ class Rdataset < Dataset
59
62
  def initialize(package_name, dataset_name)
60
- list = RdatasetsList.new
63
+ list = RdatasetList.new
61
64
 
62
65
  info = list.filter(package: package_name, dataset: dataset_name).first
63
66
  unless info
@@ -65,8 +68,8 @@ module Datasets
65
68
  end
66
69
 
67
70
  super()
68
- @metadata.id = "rdatasets-#{package_name}-#{dataset_name}"
69
- @metadata.name = "Rdatasets: #{package_name}: #{dataset_name}"
71
+ @metadata.id = "rdataset-#{package_name}-#{dataset_name}"
72
+ @metadata.name = "Rdataset: #{package_name}: #{dataset_name}"
70
73
  @metadata.url = info.csv
71
74
  @metadata.licenses = ["GPL-3"]
72
75
  @metadata.description = info.title
@@ -81,15 +84,63 @@ module Datasets
81
84
  def each(&block)
82
85
  return to_enum(__method__) unless block_given?
83
86
 
84
- download(@data_path, @metadata.url) unless @data_path.exist?
85
- CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
86
- csv.each do |row|
87
- record = row.to_h
88
- record.delete("")
89
- record.transform_keys!(&:to_sym)
90
- yield record
87
+ download(@data_path, @metadata.url)
88
+
89
+ na_converter = lambda do |field|
90
+ begin
91
+ if field.encode(CSV::ConverterEncoding) == "NA"
92
+ nil
93
+ else
94
+ field
95
+ end
96
+ rescue
97
+ field
91
98
  end
92
99
  end
100
+
101
+ inf_converter = lambda do |field|
102
+ begin
103
+ if field.encode(CSV::ConverterEncoding) == "Inf"
104
+ Float::INFINITY
105
+ else
106
+ field
107
+ end
108
+ rescue
109
+ field
110
+ end
111
+ end
112
+
113
+ quote_preserving_converter = lambda do |field, info|
114
+ f = field.encode(CSV::ConverterEncoding)
115
+ return f if info.quoted?
116
+
117
+ begin
118
+ begin
119
+ begin
120
+ return DateTime.parse(f) if f.match?(DateTimeMatcher)
121
+ rescue
122
+ return Integer(f)
123
+ end
124
+ rescue
125
+ return Float(f)
126
+ end
127
+ rescue
128
+ field
129
+ end
130
+ end
131
+
132
+ table = CSV.table(@data_path,
133
+ header_converters: [:symbol_raw],
134
+ # quote_preserving_converter should be the last
135
+ converters: [na_converter, inf_converter, quote_preserving_converter])
136
+ table.delete(:"") # delete 1st column for indices.
137
+
138
+ table.each do |row|
139
+ yield row.to_h
140
+ end
93
141
  end
94
142
  end
143
+
144
+ # For backward compatibility
145
+ Rdatasets = Rdataset
95
146
  end
data/lib/datasets/seaborn.rb ADDED
@@ -0,0 +1,90 @@
1
+ require "json"
2
+
3
+ module Datasets
4
+ class SeabornList < Dataset
5
+ def initialize
6
+ super
7
+ @metadata.id = "seaborn-data-list"
8
+ @metadata.name = "seaborn: data list"
9
+ @metadata.url = "https://github.com/mwaskom/seaborn-data"
10
+ # Treat as the same license as seaborn
11
+ @metadata.licenses = ["BSD-3-Clause"]
12
+ @metadata.description = "Datasets for seaborn examples."
13
+ end
14
+
15
+ def each(&block)
16
+ return to_enum(__method__) unless block_given?
17
+
18
+ data_path = cache_dir_path + "trees.json"
19
+ url = "https://api.github.com/repos/mwaskom/seaborn-data/git/trees/master"
20
+ download(data_path, url)
21
+
22
+ tree = JSON.parse(File.read(data_path))["tree"]
23
+ tree.each do |content|
24
+ path = content["path"]
25
+ next unless path.end_with?(".csv")
26
+ dataset = File.basename(path, ".csv")
27
+ record = {dataset: dataset}
28
+ yield record
29
+ end
30
+ end
31
+ end
32
+
33
+ class Seaborn < Dataset
34
+ URL_FORMAT = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/%{name}.csv".freeze
35
+
36
+ def initialize(name)
37
+ super()
38
+ @metadata.id = "seaborn-#{name}"
39
+ @metadata.name = "seaborn: #{name}"
40
+ @metadata.url = URL_FORMAT % {name: name}
41
+ # @metadata.licenses = TODO
42
+
43
+ @name = name
44
+ end
45
+
46
+ def each(&block)
47
+ return to_enum(__method__) unless block_given?
48
+
49
+ data_path = cache_dir_path + "#{@name}.csv"
50
+ download(data_path, @metadata.url)
51
+ CSV.open(data_path, headers: :first_row, converters: :all) do |csv|
52
+ csv.each do |row|
53
+ record = prepare_record(row)
54
+ yield record
55
+ end
56
+ end
57
+ end
58
+
59
+ private
60
+ def prepare_record(csv_row)
61
+ record = csv_row.to_h
62
+ record.transform_keys! do |key|
63
+ if key.nil?
64
+ :index
65
+ else
66
+ key.to_sym
67
+ end
68
+ end
69
+
70
+ # Perform the same preprocessing as seaborn's load_dataset function
71
+ preprocessor = :"preprocess_#{@name}_record"
72
+ __send__(preprocessor, record) if respond_to?(preprocessor, true)
73
+
74
+ record
75
+ end
76
+
77
+ # The same preprocessing as seaborn.load_dataset
78
+ def preprocess_flights_record(record)
79
+ record[:month] &&= record[:month][0,3]
80
+ end
81
+
82
+ # The same preprocessing as seaborn.load_dataset
83
+ def preprocess_penguins_record(record)
84
+ record[:sex] &&= record[:sex].capitalize
85
+ end
86
+ end
87
+
88
+ # For backward compatibility
89
+ SeabornData = Seaborn
90
+ end
data/lib/datasets/sudachi-synonym-dictionary.rb CHANGED
@@ -21,9 +21,7 @@ module Datasets
21
21
  @metadata.id = "sudachi-synonym-dictionary"
22
22
  @metadata.name = "Sudachi synonym dictionary"
23
23
  @metadata.url = "https://github.com/WorksApplications/SudachiDict/blob/develop/docs/synonyms.md"
24
- @metadata.licenses = [
25
- "Apache-2.0",
26
- ]
24
+ @metadata.licenses = ["Apache-2.0"]
27
25
  @metadata.description = lambda do
28
26
  download_description
29
27
  end
@@ -65,10 +63,8 @@ module Datasets
65
63
  private
66
64
  def open_data
67
65
  data_path = cache_dir_path + "synonyms.txt"
68
- unless data_path.exist?
69
- data_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/src/main/text/synonyms.txt"
70
- download(data_path, data_url)
71
- end
66
+ data_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/src/main/text/synonyms.txt"
67
+ download(data_path, data_url)
72
68
  CSV.open(data_path,
73
69
  encoding: "UTF-8",
74
70
  skip_blanks: true) do |csv|
@@ -78,10 +74,8 @@ module Datasets
78
74
 
79
75
  def download_description
80
76
  description_path = cache_dir_path + "synonyms.md"
81
- unless description_path.exist?
82
- description_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/docs/synonyms.md"
83
- download(description_path, description_url)
84
- end
77
+ description_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/docs/synonyms.md"
78
+ download(description_path, description_url)
85
79
  description_path.read
86
80
  end
87
81
 
data/lib/datasets/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Datasets
2
- VERSION = "0.1.4"
2
+ VERSION = "0.1.6"
3
3
  end
data/lib/datasets/wikipedia-kyoto-japanese-english.rb ADDED
@@ -0,0 +1,219 @@
1
+ require "csv"
2
+ require "rexml/streamlistener"
3
+ require "rexml/parsers/baseparser"
4
+ require "rexml/parsers/streamparser"
5
+ require "time"
6
+
7
+ require_relative "dataset"
8
+ require_relative "tar-gz-readable"
9
+
10
+ module Datasets
11
+ class WikipediaKyotoJapaneseEnglish < Dataset
12
+ include TarGzReadable
13
+
14
+ Article = Struct.new(:source,
15
+ :copyright,
16
+ :contents,
17
+ :sections)
18
+
19
+ Section = Struct.new(:id,
20
+ :title,
21
+ :contents)
22
+
23
+ class Title < Struct.new(:section,
24
+ :japanese,
25
+ :english)
26
+ def title?
27
+ true
28
+ end
29
+
30
+ def sentence?
31
+ false
32
+ end
33
+ end
34
+
35
+ Paragraph = Struct.new(:id,
36
+ :sentences)
37
+
38
+ class Sentence < Struct.new(:id,
39
+ :section,
40
+ :paragraph,
41
+ :japanese,
42
+ :english)
43
+ def title?
44
+ false
45
+ end
46
+
47
+ def sentence?
48
+ true
49
+ end
50
+ end
51
+
52
+ Entry = Struct.new(:japanese,
53
+ :english)
54
+
55
+ def initialize(type: :article)
56
+ unless [:article, :lexicon].include?(type)
57
+ raise ArgumentError, "Please set type :article or :lexicon: #{type.inspect}"
58
+ end
59
+
60
+ super()
61
+ @type = type
62
+ @metadata.id = "wikipedia-kyoto-japanese-english"
63
+ @metadata.name =
64
+ "The Japanese-English Bilingual Corpus of Wikipedia's Kyoto Articles"
65
+ @metadata.url = "https://alaginrc.nict.go.jp/WikiCorpus/index_E.html"
66
+ @metadata.licenses = ["CC-BY-SA-3.0"]
67
+ @metadata.description = <<-DESCRIPTION
68
+ "The Japanese-English Bilingual Corpus of Wikipedia's Kyoto Articles"
69
+ aims mainly at supporting research and development relevant to
70
+ high-performance multilingual machine translation, information
71
+ extraction, and other language processing technologies. The National
72
+ Institute of Information and Communications Technology (NICT) has
73
+ created this corpus by manually translating Japanese Wikipedia
74
+ articles (related to Kyoto) into English.
75
+ DESCRIPTION
76
+ end
77
+
78
+ def each(&block)
79
+ return to_enum(__method__) unless block_given?
80
+
81
+ data_path = download_tar_gz
82
+
83
+ open_tar_gz(data_path) do |tar|
84
+ tar.each do |entry|
85
+ next unless entry.file?
86
+ base_name = File.basename(entry.full_name)
87
+ case @type
88
+ when :article
89
+ next unless base_name.end_with?(".xml")
90
+ listener = ArticleListener.new(block)
91
+ parser = REXML::Parsers::StreamParser.new(entry.read, listener)
92
+ parser.parse
93
+ when :lexicon
94
+ next unless base_name == "kyoto_lexicon.csv"
95
+ is_header = true
96
+ CSV.parse(entry.read.force_encoding("UTF-8")) do |row|
97
+ if is_header
98
+ is_header = false
99
+ next
100
+ end
101
+ yield(Entry.new(row[0], row[1]))
102
+ end
103
+ end
104
+ end
105
+ end
106
+ end
107
+
108
+ private
109
+ def download_tar_gz
110
+ base_name = "wiki_corpus_2.01.tar.gz"
111
+ data_path = cache_dir_path + base_name
112
+ data_url = "https://alaginrc.nict.go.jp/WikiCorpus/src/#{base_name}"
113
+ download(data_path, data_url)
114
+ data_path
115
+ end
116
+
117
+ class ArticleListener
118
+ include REXML::StreamListener
119
+
120
+ def initialize(block)
121
+ @block = block
122
+ @article = nil
123
+ @title = nil
124
+ @section = nil
125
+ @page = nil
126
+ @sentence = nil
127
+ @text_container_stack = []
128
+ @element_stack = []
129
+ @text_stack = [""]
130
+ end
131
+
132
+ def tag_start(name, attributes)
133
+ push_stacks(name, attributes)
134
+ case name
135
+ when "art"
136
+ @article = Article.new
137
+ @article.contents = []
138
+ @article.sections = []
139
+ when "tit"
140
+ @title = Title.new
141
+ @title.section = @section
142
+ @text_container_stack.push(@title)
143
+ when "sec"
144
+ @section = Section.new
145
+ @section.id = attributes["id"]
146
+ @section.contents = []
147
+ @text_container_stack.push(@section)
148
+ when "par"
149
+ @paragraph = Paragraph.new
150
+ @paragraph.id = attributes["id"]
151
+ @paragraph.sentences = []
152
+ @text_container_stack.push(@paragraph)
153
+ when "sen"
154
+ @sentence = Sentence.new
155
+ @sentence.id = attributes["id"]
156
+ @text_container_stack.push(@sentence)
157
+ end
158
+ end
159
+
160
+ def tag_end(name)
161
+ case name
162
+ when "art"
163
+ @block.call(@article)
164
+ @article = nil
165
+ when "inf"
166
+ @article.source = @text_stack.last
167
+ when "copyright"
168
+ @article.copyright = @text_stack.last
169
+ when "tit"
170
+ @article.contents << @title
171
+ if @section
172
+ @section.title = @title
173
+ @section.contents << @title
174
+ end
175
+ @title = nil
176
+ @text_container_stack.pop
177
+ when "sec"
178
+ @article.sections << @section
179
+ @section = nil
180
+ @text_container_stack.pop
181
+ when "par"
182
+ @paragraph = nil
183
+ @text_container_stack.pop
184
+ when "sen"
185
+ @article.contents << @sentence
186
+ @sentence.section = @section
187
+ @section.contents << @sentence if @section
188
+ @sentence.paragraph = @paragraph
189
+ @paragraph.sentences << @sentence if @paragraph
190
+ @sentence = nil
191
+ @text_container_stack.pop
192
+ when "j"
193
+ @text_container_stack.last.japanese = @text_stack.last
194
+ when "e"
195
+ attributes = @element_stack.last[:attributes]
196
+ if attributes["type"] == "check"
197
+ @text_container_stack.last.english = @text_stack.last
198
+ end
199
+ end
200
+ pop_stacks
201
+ end
202
+
203
+ def text(data)
204
+ @text_stack.last << data
205
+ end
206
+
207
+ private
208
+ def push_stacks(name, attributes)
209
+ @element_stack.push({name: name, attributes: attributes})
210
+ @text_stack.push("")
211
+ end
212
+
213
+ def pop_stacks
214
+ @text_stack.pop
215
+ @element_stack.pop
216
+ end
217
+ end
218
+ end
219
+ end
data/lib/datasets/wikipedia.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  require "rexml/streamlistener"
2
2
  require "rexml/parsers/baseparser"
3
3
  require "rexml/parsers/streamparser"
4
+ require "time"
4
5
 
5
6
  require_relative "dataset"
6
7
 
@@ -52,15 +53,22 @@ module Datasets
52
53
  end
53
54
 
54
55
  private
56
+ def base_name
57
+ "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
58
+ end
59
+
60
+ def data_path
61
+ cache_dir_path + base_name
62
+ end
63
+
55
64
  def open_data(&block)
56
- base_name = "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
57
- data_path = cache_dir_path + base_name
58
- unless data_path.exist?
59
- data_url = "https://dumps.wikimedia.org/#{@language}wiki/latest/#{base_name}"
60
- download(data_path, data_url)
65
+ data_url = "https://dumps.wikimedia.org/#{@language}wiki/latest/#{base_name}"
66
+ bz2 = Enumerator.new do |yielder|
67
+ download(data_path, data_url) do |bz2_chunk|
68
+ yielder << bz2_chunk
69
+ end
61
70
  end
62
-
63
- extract_bz2(data_path, &block)
71
+ extract_bz2(bz2, &block)
64
72
  end
65
73
 
66
74
  def type_in_path
@@ -153,7 +161,7 @@ module Datasets
153
161
  @text_stack.last << data
154
162
  end
155
163
 
156
- def cdata(contnet)
164
+ def cdata(content)
157
165
  @text_stack.last << content
158
166
  end
159
167
 
data/lib/datasets/wine.rb CHANGED
@@ -23,7 +23,8 @@ module Datasets
23
23
  super
24
24
  @metadata.id = 'wine'
25
25
  @metadata.name = 'Wine'
26
- @metadata.url = 'http://archive.ics.uci.edu/ml/datasets/wine'
26
+ @metadata.url = 'https://archive.ics.uci.edu/ml/datasets/wine'
27
+ @metadata.licenses = ["CC-BY-4.0"]
27
28
  @metadata.description = -> { read_names }
28
29
  end
29
30
 
@@ -43,19 +44,15 @@ module Datasets
43
44
 
44
45
  def read_names
45
46
  names_path = cache_dir_path + 'wine.names'
46
- unless names_path.exist?
47
- names_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.names'
48
- download(names_path, names_url)
49
- end
47
+ names_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.names'
48
+ download(names_path, names_url)
50
49
  names_path.read
51
50
  end
52
51
 
53
52
  def open_data
54
53
  data_path = cache_dir_path + 'wine.data'
55
- unless data_path.exist?
56
- data_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
57
- download(data_path, data_url)
58
- end
54
+ data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
55
+ download(data_path, data_url)
59
56
  CSV.open(data_path, converters: %i[numeric]) do |csv|
60
57
  yield(csv)
61
58
  end
data/lib/datasets/zip-extractor.rb ADDED
@@ -0,0 +1,48 @@
1
+ require 'zip'
2
+
3
+ module Datasets
4
+ class ZipExtractor
5
+ def initialize(path)
6
+ @path = path
7
+ end
8
+
9
+ def extract_first_file
10
+ Zip::File.open(@path) do |zip_file|
11
+ zip_file.each do |entry|
12
+ next unless entry.file?
13
+
14
+ entry.get_input_stream do |input|
15
+ return yield(input)
16
+ end
17
+ end
18
+ end
19
+ nil
20
+ end
21
+
22
+ def extract_file(file_path)
23
+ Zip::File.open(@path) do |zip_file|
24
+ zip_file.each do |entry|
25
+ next unless entry.file?
26
+ next unless entry.name == file_path
27
+
28
+ entry.get_input_stream do |input|
29
+ return yield(input)
30
+ end
31
+ end
32
+ end
33
+ nil
34
+ end
35
+
36
+ def extract_files
37
+ Zip::File.open(@path) do |zip_file|
38
+ zip_file.each do |entry|
39
+ next unless entry.file?
40
+
41
+ entry.get_input_stream do |input|
42
+ yield(input)
43
+ end
44
+ end
45
+ end
46
+ end
47
+ end
48
+ end
data/lib/datasets.rb CHANGED
@@ -1,22 +1,2 @@
1
- require_relative "datasets/version"
2
-
3
- require_relative "datasets/adult"
4
- require_relative "datasets/cifar"
5
- require_relative "datasets/cldr-plurals"
6
- require_relative "datasets/communities"
7
- require_relative "datasets/e-stat-japan"
8
- require_relative "datasets/fashion-mnist"
9
- require_relative "datasets/hepatitis"
10
- require_relative "datasets/iris"
11
- require_relative "datasets/libsvm"
12
- require_relative "datasets/libsvm-dataset-list"
13
- require_relative "datasets/mnist"
14
- require_relative "datasets/mushroom"
15
- require_relative "datasets/penguins"
16
- require_relative "datasets/penn-treebank"
17
- require_relative "datasets/postal-code-japan"
18
- require_relative "datasets/rdatasets"
19
- require_relative "datasets/seaborn-data"
20
- require_relative "datasets/sudachi-synonym-dictionary"
21
- require_relative "datasets/wikipedia"
22
- require_relative "datasets/wine"
1
+ require_relative "datasets/lazy"
2
+ Datasets::LAZY_LOADER.load_all
data/red-datasets.gemspec CHANGED
@@ -34,7 +34,7 @@ Gem::Specification.new do |spec|
34
34
  spec.files += Dir.glob("doc/text/*")
35
35
  spec.test_files += Dir.glob("test/**/*")
36
36
 
37
- spec.add_runtime_dependency("csv", ">= 3.0.5")
37
+ spec.add_runtime_dependency("csv", ">= 3.2.4")
38
38
  spec.add_runtime_dependency("rexml")
39
39
  spec.add_runtime_dependency("rubyzip")
40
40
 
data/test/helper.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  require "fileutils"
2
2
  require "pathname"
3
3
  require "time"
4
+ require "tmpdir"
4
5
 
5
6
  require "datasets"
6
7
 
@@ -18,4 +19,24 @@ module Helper
18
19
  FileUtils.rm_rf(@tmp_dir)
19
20
  end
20
21
  end
22
+
23
+ module PathRestorable
24
+ def restore_path(path)
25
+ unless path.exist?
26
+ return yield
27
+ end
28
+
29
+ Dir.mktmpdir do |dir|
30
+ FileUtils.cp_r(path, dir, preserve: true)
31
+ begin
32
+ yield
33
+ ensure
34
+ FileUtils.rmtree(path, secure: true) if path.exist?
35
+ FileUtils.cp_r(Pathname(dir) + path.basename,
36
+ path,
37
+ preserve: true)
38
+ end
39
+ end
40
+ end
41
+ end
21
42
  end