red-datasets 0.1.4 → 0.1.6
- checksums.yaml +4 -4
- data/README.md +23 -3
- data/Rakefile +56 -1
- data/doc/text/news.md +102 -0
- data/lib/datasets/adult.rb +6 -9
- data/lib/datasets/afinn.rb +48 -0
- data/lib/datasets/aozora-bunko.rb +196 -0
- data/lib/datasets/cache-path.rb +28 -0
- data/lib/datasets/california-housing.rb +60 -0
- data/lib/datasets/cifar.rb +2 -4
- data/lib/datasets/cldr-plurals.rb +2 -4
- data/lib/datasets/communities.rb +5 -8
- data/lib/datasets/dataset.rb +58 -23
- data/lib/datasets/diamonds.rb +26 -0
- data/lib/datasets/downloader.rb +110 -30
- data/lib/datasets/e-stat-japan.rb +2 -1
- data/lib/datasets/fashion-mnist.rb +4 -0
- data/lib/datasets/fuel-economy.rb +35 -0
- data/lib/datasets/geolonia.rb +67 -0
- data/lib/datasets/ggplot2-dataset.rb +79 -0
- data/lib/datasets/hepatitis.rb +5 -8
- data/lib/datasets/iris.rb +5 -8
- data/lib/datasets/ita-corpus.rb +57 -0
- data/lib/datasets/kuzushiji-mnist.rb +16 -0
- data/lib/datasets/lazy.rb +90 -0
- data/lib/datasets/libsvm-dataset-list.rb +5 -8
- data/lib/datasets/libsvm.rb +3 -4
- data/lib/datasets/license.rb +26 -0
- data/lib/datasets/livedoor-news.rb +80 -0
- data/lib/datasets/metadata.rb +14 -0
- data/lib/datasets/mnist.rb +7 -7
- data/lib/datasets/mushroom.rb +5 -8
- data/lib/datasets/nagoya-university-conversation-corpus.rb +109 -0
- data/lib/datasets/penguins.rb +6 -8
- data/lib/datasets/penn-treebank.rb +2 -4
- data/lib/datasets/pmjt-dataset-list.rb +67 -0
- data/lib/datasets/postal-code-japan.rb +2 -6
- data/lib/datasets/quora-duplicate-question-pair.rb +51 -0
- data/lib/datasets/{rdatasets.rb → rdataset.rb} +66 -15
- data/lib/datasets/seaborn.rb +90 -0
- data/lib/datasets/sudachi-synonym-dictionary.rb +5 -11
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia-kyoto-japanese-english.rb +219 -0
- data/lib/datasets/wikipedia.rb +16 -8
- data/lib/datasets/wine.rb +6 -9
- data/lib/datasets/zip-extractor.rb +48 -0
- data/lib/datasets.rb +2 -22
- data/red-datasets.gemspec +1 -1
- data/test/helper.rb +21 -0
- data/test/test-afinn.rb +60 -0
- data/test/test-aozora-bunko.rb +190 -0
- data/test/test-california-housing.rb +56 -0
- data/test/test-cldr-plurals.rb +1 -1
- data/test/test-dataset.rb +15 -7
- data/test/test-diamonds.rb +71 -0
- data/test/test-fuel-economy.rb +75 -0
- data/test/test-geolonia.rb +65 -0
- data/test/test-ita-corpus.rb +69 -0
- data/test/test-kuzushiji-mnist.rb +137 -0
- data/test/test-license.rb +24 -0
- data/test/test-livedoor-news.rb +351 -0
- data/test/test-metadata.rb +36 -0
- data/test/test-nagoya-university-conversation-corpus.rb +132 -0
- data/test/test-penguins.rb +1 -1
- data/test/test-pmjt-dataset-list.rb +50 -0
- data/test/test-quora-duplicate-question-pair.rb +33 -0
- data/test/test-rdataset.rb +246 -0
- data/test/{test-seaborn-data.rb → test-seaborn.rb} +71 -4
- data/test/test-sudachi-synonym-dictionary.rb +5 -5
- data/test/test-wikipedia-kyoto-japanese-english.rb +178 -0
- data/test/test-wikipedia.rb +25 -71
- metadata +62 -14
- data/lib/datasets/seaborn-data.rb +0 -49
- data/test/test-rdatasets.rb +0 -136
data/lib/datasets/{rdatasets.rb → rdataset.rb}
RENAMED

```diff
@@ -2,7 +2,7 @@ require_relative "dataset"
 require_relative "tar-gz-readable"
 
 module Datasets
-  class 
+  class RdatasetList < Dataset
     Record = Struct.new(:package,
                         :dataset,
                         :title,
@@ -18,8 +18,8 @@ module Datasets
 
     def initialize
       super
-      @metadata.id = "
-      @metadata.name = "
+      @metadata.id = "rdataset-list"
+      @metadata.name = "Rdataset"
       @metadata.url = "https://vincentarelbundock.github.io/Rdatasets/"
       @metadata.licenses = ["GPL-3"]
       @data_url = "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/datasets.csv"
@@ -48,16 +48,19 @@ module Datasets
     end
 
     private def each_row(&block)
-      download(@data_path, @data_url)
+      download(@data_path, @data_url)
       CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
        csv.each(&block)
       end
     end
   end
 
-
+  # For backward compatibility
+  RdatasetsList = RdatasetList
+
+  class Rdataset < Dataset
     def initialize(package_name, dataset_name)
-      list = 
+      list = RdatasetList.new
 
       info = list.filter(package: package_name, dataset: dataset_name).first
       unless info
@@ -65,8 +68,8 @@ module Datasets
       end
 
       super()
-      @metadata.id = "
-      @metadata.name = "
+      @metadata.id = "rdataset-#{package_name}-#{dataset_name}"
+      @metadata.name = "Rdataset: #{package_name}: #{dataset_name}"
       @metadata.url = info.csv
       @metadata.licenses = ["GPL-3"]
       @metadata.description = info.title
@@ -81,15 +84,63 @@ module Datasets
     def each(&block)
       return to_enum(__method__) unless block_given?
 
-      download(@data_path, @metadata.url)
-
-
-
-
-
-
+      download(@data_path, @metadata.url)
+
+      na_converter = lambda do |field|
+        begin
+          if field.encode(CSV::ConverterEncoding) == "NA"
+            nil
+          else
+            field
+          end
+        rescue
+          field
         end
       end
+
+      inf_converter = lambda do |field|
+        begin
+          if field.encode(CSV::ConverterEncoding) == "Inf"
+            Float::INFINITY
+          else
+            field
+          end
+        rescue
+          field
+        end
+      end
+
+      quote_preserving_converter = lambda do |field, info|
+        f = field.encode(CSV::ConverterEncoding)
+        return f if info.quoted?
+
+        begin
+          begin
+            begin
+              return DateTime.parse(f) if f.match?(DateTimeMatcher)
+            rescue
+              return Integer(f)
+            end
+          rescue
+            return Float(f)
+          end
+        rescue
+          field
+        end
+      end
+
+      table = CSV.table(@data_path,
+                        header_converters: [:symbol_raw],
+                        # quote_preserving_converter should be the last
+                        converters: [na_converter, inf_converter, quote_preserving_converter])
+      table.delete(:"") # delete 1st column for indices.
+
+      table.each do |row|
+        yield row.to_h
+      end
     end
   end
+
+  # For backward compatibility
+  Rdatasets = Rdataset
 end
```
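The rename keeps the old API intact. A hedged usage sketch (assuming network access; record keys follow the dataset's CSV headers) of the new names and the backward-compatibility aliases:

```ruby
require "datasets"

# New class names introduced by this diff:
iris = Datasets::Rdataset.new("datasets", "iris")
iris.each do |record|
  p record  # a Hash per row; the index column is removed by #each
  break
end

# The old constants remain usable via the aliases added above:
Datasets::Rdatasets.equal?(Datasets::Rdataset)          # => true
Datasets::RdatasetsList.equal?(Datasets::RdatasetList)  # => true
```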
data/lib/datasets/seaborn.rb
ADDED

```diff
@@ -0,0 +1,90 @@
+require "json"
+
+module Datasets
+  class SeabornList < Dataset
+    def initialize
+      super
+      @metadata.id = "seaborn-data-list"
+      @metadata.name = "seaborn: data list"
+      @metadata.url = "https://github.com/mwaskom/seaborn-data"
+      # Treat as the same license as seaborn
+      @metadata.licenses = ["BSD-3-Clause"]
+      @metadata.description = "Datasets for seaborn examples."
+    end
+
+    def each(&block)
+      return to_enum(__method__) unless block_given?
+
+      data_path = cache_dir_path + "trees.json"
+      url = "https://api.github.com/repos/mwaskom/seaborn-data/git/trees/master"
+      download(data_path, url)
+
+      tree = JSON.parse(File.read(data_path))["tree"]
+      tree.each do |content|
+        path = content["path"]
+        next unless path.end_with?(".csv")
+        dataset = File.basename(path, ".csv")
+        record = {dataset: dataset}
+        yield record
+      end
+    end
+  end
+
+  class Seaborn < Dataset
+    URL_FORMAT = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/%{name}.csv".freeze
+
+    def initialize(name)
+      super()
+      @metadata.id = "seaborn-#{name}"
+      @metadata.name = "seaborn: #{name}"
+      @metadata.url = URL_FORMAT % {name: name}
+      # @metadata.licenses = TODO
+
+      @name = name
+    end
+
+    def each(&block)
+      return to_enum(__method__) unless block_given?
+
+      data_path = cache_dir_path + "#{@name}.csv"
+      download(data_path, @metadata.url)
+      CSV.open(data_path, headers: :first_row, converters: :all) do |csv|
+        csv.each do |row|
+          record = prepare_record(row)
+          yield record
+        end
+      end
+    end
+
+    private
+    def prepare_record(csv_row)
+      record = csv_row.to_h
+      record.transform_keys! do |key|
+        if key.nil?
+          :index
+        else
+          key.to_sym
+        end
+      end
+
+      # Perform the same preprocessing as seaborn's load_dataset function
+      preprocessor = :"preprocess_#{@name}_record"
+      __send__(preprocessor, record) if respond_to?(preprocessor, true)
+
+      record
+    end
+
+    # The same preprocessing as seaborn.load_dataset
+    def preprocess_flights_record(record)
+      record[:month] &&= record[:month][0,3]
+    end
+
+    # The same preprocessing as seaborn.load_dataset
+    def preprocess_penguins_record(record)
+      record[:sex] &&= record[:sex].capitalize
+    end
+  end
+
+  # For backward compatibility
+  SeabornData = Seaborn
+end
```
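A minimal usage sketch of the two new classes, assuming network access; the record shapes follow the `each` implementations above:

```ruby
require "datasets"

# Enumerate the available seaborn example datasets:
Datasets::SeabornList.new.each do |record|
  p record  # e.g. {dataset: "flights"}
end

# Stream one dataset; records get the same preprocessing as
# seaborn.load_dataset (e.g. :sex is capitalized for penguins):
Datasets::Seaborn.new("penguins").each do |record|
  p record
  break
end

# The old constant still works via the alias added above:
Datasets::SeabornData.equal?(Datasets::Seaborn)  # => true
```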
data/lib/datasets/sudachi-synonym-dictionary.rb
CHANGED

```diff
@@ -21,9 +21,7 @@ module Datasets
       @metadata.id = "sudachi-synonym-dictionary"
       @metadata.name = "Sudachi synonym dictionary"
       @metadata.url = "https://github.com/WorksApplications/SudachiDict/blob/develop/docs/synonyms.md"
-      @metadata.licenses = [
-        "Apache-2.0",
-      ]
+      @metadata.licenses = ["Apache-2.0"]
       @metadata.description = lambda do
         download_description
       end
@@ -65,10 +63,8 @@ module Datasets
     private
     def open_data
       data_path = cache_dir_path + "synonyms.txt"
-
-
-      download(data_path, data_url)
-      end
+      data_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/src/main/text/synonyms.txt"
+      download(data_path, data_url)
       CSV.open(data_path,
                encoding: "UTF-8",
                skip_blanks: true) do |csv|
@@ -78,10 +74,8 @@ module Datasets
 
     def download_description
       description_path = cache_dir_path + "synonyms.md"
-
-
-      download(description_path, description_url)
-      end
+      description_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/docs/synonyms.md"
+      download(description_path, description_url)
       description_path.read
     end
 
```
data/lib/datasets/version.rb
CHANGED

```diff
-  VERSION = "0.1.4"
+  VERSION = "0.1.6"
```
data/lib/datasets/wikipedia-kyoto-japanese-english.rb
ADDED

```diff
@@ -0,0 +1,219 @@
+require "csv"
+require "rexml/streamlistener"
+require "rexml/parsers/baseparser"
+require "rexml/parsers/streamparser"
+require "time"
+
+require_relative "dataset"
+require_relative "tar-gz-readable"
+
+module Datasets
+  class WikipediaKyotoJapaneseEnglish < Dataset
+    include TarGzReadable
+
+    Article = Struct.new(:source,
+                         :copyright,
+                         :contents,
+                         :sections)
+
+    Section = Struct.new(:id,
+                         :title,
+                         :contents)
+
+    class Title < Struct.new(:section,
+                             :japanese,
+                             :english)
+      def title?
+        true
+      end
+
+      def sentence?
+        false
+      end
+    end
+
+    Paragraph = Struct.new(:id,
+                           :sentences)
+
+    class Sentence < Struct.new(:id,
+                                :section,
+                                :paragraph,
+                                :japanese,
+                                :english)
+      def title?
+        false
+      end
+
+      def sentence?
+        true
+      end
+    end
+
+    Entry = Struct.new(:japanese,
+                       :english)
+
+    def initialize(type: :article)
+      unless [:article, :lexicon].include?(type)
+        raise ArgumentError, "Please set type :article or :lexicon: #{type.inspect}"
+      end
+
+      super()
+      @type = type
+      @metadata.id = "wikipedia-kyoto-japanese-english"
+      @metadata.name =
+        "The Japanese-English Bilingual Corpus of Wikipedia's Kyoto Articles"
+      @metadata.url = "https://alaginrc.nict.go.jp/WikiCorpus/index_E.html"
+      @metadata.licenses = ["CC-BY-SA-3.0"]
+      @metadata.description = <<-DESCRIPTION
+"The Japanese-English Bilingual Corpus of Wikipedia's Kyoto Articles"
+aims mainly at supporting research and development relevant to
+high-performance multilingual machine translation, information
+extraction, and other language processing technologies. The National
+Institute of Information and Communications Technology (NICT) has
+created this corpus by manually translating Japanese Wikipedia
+articles (related to Kyoto) into English.
+      DESCRIPTION
+    end
+
+    def each(&block)
+      return to_enum(__method__) unless block_given?
+
+      data_path = download_tar_gz
+
+      open_tar_gz(data_path) do |tar|
+        tar.each do |entry|
+          next unless entry.file?
+          base_name = File.basename(entry.full_name)
+          case @type
+          when :article
+            next unless base_name.end_with?(".xml")
+            listener = ArticleListener.new(block)
+            parser = REXML::Parsers::StreamParser.new(entry.read, listener)
+            parser.parse
+          when :lexicon
+            next unless base_name == "kyoto_lexicon.csv"
+            is_header = true
+            CSV.parse(entry.read.force_encoding("UTF-8")) do |row|
+              if is_header
+                is_header = false
+                next
+              end
+              yield(Entry.new(row[0], row[1]))
+            end
+          end
+        end
+      end
+    end
+
+    private
+    def download_tar_gz
+      base_name = "wiki_corpus_2.01.tar.gz"
+      data_path = cache_dir_path + base_name
+      data_url = "https://alaginrc.nict.go.jp/WikiCorpus/src/#{base_name}"
+      download(data_path, data_url)
+      data_path
+    end
+
+    class ArticleListener
+      include REXML::StreamListener
+
+      def initialize(block)
+        @block = block
+        @article = nil
+        @title = nil
+        @section = nil
+        @page = nil
+        @sentence = nil
+        @text_container_stack = []
+        @element_stack = []
+        @text_stack = [""]
+      end
+
+      def tag_start(name, attributes)
+        push_stacks(name, attributes)
+        case name
+        when "art"
+          @article = Article.new
+          @article.contents = []
+          @article.sections = []
+        when "tit"
+          @title = Title.new
+          @title.section = @section
+          @text_container_stack.push(@title)
+        when "sec"
+          @section = Section.new
+          @section.id = attributes["id"]
+          @section.contents = []
+          @text_container_stack.push(@section)
+        when "par"
+          @paragraph = Paragraph.new
+          @paragraph.id = attributes["id"]
+          @paragraph.sentences = []
+          @text_container_stack.push(@paragraph)
+        when "sen"
+          @sentence = Sentence.new
+          @sentence.id = attributes["id"]
+          @text_container_stack.push(@sentence)
+        end
+      end
+
+      def tag_end(name)
+        case name
+        when "art"
+          @block.call(@article)
+          @article = nil
+        when "inf"
+          @article.source = @text_stack.last
+        when "copyright"
+          @article.copyright = @text_stack.last
+        when "tit"
+          @article.contents << @title
+          if @section
+            @section.title = @title
+            @section.contents << @title
+          end
+          @title = nil
+          @text_container_stack.pop
+        when "sec"
+          @article.sections << @section
+          @section = nil
+          @text_container_stack.pop
+        when "par"
+          @paragraph = nil
+          @text_container_stack.pop
+        when "sen"
+          @article.contents << @sentence
+          @sentence.section = @section
+          @section.contents << @sentence if @section
+          @sentence.paragraph = @paragraph
+          @paragraph.sentences << @sentence if @paragraph
+          @sentence = nil
+          @text_container_stack.pop
+        when "j"
+          @text_container_stack.last.japanese = @text_stack.last
+        when "e"
+          attributes = @element_stack.last[:attributes]
+          if attributes["type"] == "check"
+            @text_container_stack.last.english = @text_stack.last
+          end
+        end
+        pop_stacks
+      end
+
+      def text(data)
+        @text_stack.last << data
+      end
+
+      private
+      def push_stacks(name, attributes)
+        @element_stack.push({name: name, attributes: attributes})
+        @text_stack.push("")
+      end
+
+      def pop_stacks
+        @text_stack.pop
+        @element_stack.pop
+      end
+    end
+  end
+end
```
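A hedged usage sketch of the new class; both iteration modes come from the `type:` keyword shown above:

```ruby
require "datasets"

# type: :article yields Article structs whose contents mix Title and
# Sentence structs, each carrying a Japanese/English pair:
corpus = Datasets::WikipediaKyotoJapaneseEnglish.new(type: :article)
corpus.each do |article|
  article.contents.each do |content|
    puts "#{content.japanese} / #{content.english}" if content.sentence?
  end
  break
end

# type: :lexicon yields Entry structs from kyoto_lexicon.csv:
lexicon = Datasets::WikipediaKyotoJapaneseEnglish.new(type: :lexicon)
entry = lexicon.each.first
puts "#{entry.japanese} => #{entry.english}"
```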
data/lib/datasets/wikipedia.rb
CHANGED
```diff
@@ -1,6 +1,7 @@
 require "rexml/streamlistener"
 require "rexml/parsers/baseparser"
 require "rexml/parsers/streamparser"
+require "time"
 
 require_relative "dataset"
 
@@ -52,15 +53,22 @@ module Datasets
     end
 
     private
+    def base_name
+      "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
+    end
+
+    def data_path
+      cache_dir_path + base_name
+    end
+
     def open_data(&block)
-
-
-
-
-
+      data_url = "https://dumps.wikimedia.org/#{@language}wiki/latest/#{base_name}"
+      bz2 = Enumerator.new do |yielder|
+        download(data_path, data_url) do |bz2_chunk|
+          yielder << bz2_chunk
+        end
       end
-
-      extract_bz2(data_path, &block)
+      extract_bz2(bz2, &block)
     end
 
     def type_in_path
@@ -153,7 +161,7 @@ module Datasets
       @text_stack.last << data
     end
 
-    def cdata(
+    def cdata(content)
       @text_stack.last << content
     end
 
```
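The rewritten `open_data` no longer waits for the whole dump: it wraps the chunked `download` callback in an `Enumerator` and hands that to `extract_bz2`, so decompression can begin while the download is still running. A self-contained sketch of that pattern (the names here are illustrative, not the library's):

```ruby
# Producer: yields chunks as they "arrive" (stands in for
# download(data_path, data_url) { |bz2_chunk| ... }).
chunks = Enumerator.new do |yielder|
  ["chunk-1", "chunk-2", "chunk-3"].each do |chunk|
    yielder << chunk
  end
end

# Consumer: starts working on the first chunk immediately
# (stands in for extract_bz2(bz2, &block)).
chunks.each do |chunk|
  puts "processing #{chunk}"
end
```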
data/lib/datasets/wine.rb
CHANGED
```diff
@@ -23,7 +23,8 @@ module Datasets
       super
       @metadata.id = 'wine'
       @metadata.name = 'Wine'
-      @metadata.url = '
+      @metadata.url = 'https://archive.ics.uci.edu/ml/datasets/wine'
+      @metadata.licenses = ["CC-BY-4.0"]
       @metadata.description = -> { read_names }
     end
 
@@ -43,19 +44,15 @@ module Datasets
 
     def read_names
       names_path = cache_dir_path + 'wine.names'
-
-
-      download(names_path, names_url)
-      end
+      names_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.names'
+      download(names_path, names_url)
       names_path.read
     end
 
     def open_data
       data_path = cache_dir_path + 'wine.data'
-
-
-      download(data_path, data_url)
-      end
+      data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
+      download(data_path, data_url)
       CSV.open(data_path, converters: %i[numeric]) do |csv|
         yield(csv)
       end
```
data/lib/datasets/zip-extractor.rb
ADDED

```diff
@@ -0,0 +1,48 @@
+require 'zip'
+
+module Datasets
+  class ZipExtractor
+    def initialize(path)
+      @path = path
+    end
+
+    def extract_first_file
+      Zip::File.open(@path) do |zip_file|
+        zip_file.each do |entry|
+          next unless entry.file?
+
+          entry.get_input_stream do |input|
+            return yield(input)
+          end
+        end
+      end
+      nil
+    end
+
+    def extract_file(file_path)
+      Zip::File.open(@path) do |zip_file|
+        zip_file.each do |entry|
+          next unless entry.file?
+          next unless entry.name == file_path
+
+          entry.get_input_stream do |input|
+            return yield(input)
+          end
+        end
+      end
+      nil
+    end
+
+    def extract_files
+      Zip::File.open(@path) do |zip_file|
+        zip_file.each do |entry|
+          next unless entry.file?
+
+          entry.get_input_stream do |input|
+            yield(input)
+          end
+        end
+      end
+    end
+  end
+end
```
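A minimal usage sketch of the new helper; the archive path and entry name here are hypothetical:

```ruby
require "datasets/zip-extractor"

extractor = Datasets::ZipExtractor.new("example.zip")  # hypothetical archive

# Read just the first file entry; the block's return value is propagated:
first_content = extractor.extract_first_file { |input| input.read }

# Read one entry by its path inside the archive; returns nil if not found:
csv_content = extractor.extract_file("data/example.csv") { |input| input.read }

# Visit every file entry in turn:
extractor.extract_files do |input|
  puts input.read.bytesize
end
```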
data/lib/datasets.rb
CHANGED
```diff
@@ -1,22 +1,2 @@
-require_relative "datasets/
-
-require_relative "datasets/adult"
-require_relative "datasets/cifar"
-require_relative "datasets/cldr-plurals"
-require_relative "datasets/communities"
-require_relative "datasets/e-stat-japan"
-require_relative "datasets/fashion-mnist"
-require_relative "datasets/hepatitis"
-require_relative "datasets/iris"
-require_relative "datasets/libsvm"
-require_relative "datasets/libsvm-dataset-list"
-require_relative "datasets/mnist"
-require_relative "datasets/mushroom"
-require_relative "datasets/penguins"
-require_relative "datasets/penn-treebank"
-require_relative "datasets/postal-code-japan"
-require_relative "datasets/rdatasets"
-require_relative "datasets/seaborn-data"
-require_relative "datasets/sudachi-synonym-dictionary"
-require_relative "datasets/wikipedia"
-require_relative "datasets/wine"
+require_relative "datasets/lazy"
+Datasets::LAZY_LOADER.load_all
```
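The top-level entry point now goes through the new lazy loader (`data/lib/datasets/lazy.rb` in the file list) and then loads everything, so `require "datasets"` keeps its old eager behavior. Presumably requiring the lazy entry point directly is how callers defer loading; a hedged sketch:

```ruby
# Eager, as before: loads every dataset file via the lazy loader.
require "datasets"

# Presumed deferred variant based on this diff: register datasets without
# loading them all up front (the exact lazy-resolution behavior lives in
# lazy.rb, which is not shown on this page).
require "datasets/lazy"
```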
data/red-datasets.gemspec
CHANGED
```diff
@@ -34,7 +34,7 @@ Gem::Specification.new do |spec|
   spec.files += Dir.glob("doc/text/*")
   spec.test_files += Dir.glob("test/**/*")
 
-  spec.add_runtime_dependency("csv", ">= 3.
+  spec.add_runtime_dependency("csv", ">= 3.2.4")
   spec.add_runtime_dependency("rexml")
   spec.add_runtime_dependency("rubyzip")
 
```
data/test/helper.rb
CHANGED
```diff
@@ -1,6 +1,7 @@
 require "fileutils"
 require "pathname"
 require "time"
+require "tmpdir"
 
 require "datasets"
 
@@ -18,4 +19,24 @@ module Helper
       FileUtils.rm_rf(@tmp_dir)
     end
   end
+
+  module PathRestorable
+    def restore_path(path)
+      unless path.exist?
+        return yield
+      end
+
+      Dir.mktmpdir do |dir|
+        FileUtils.cp_r(path, dir, preserve: true)
+        begin
+          yield
+        ensure
+          FileUtils.rmtree(path, secure: true) if path.exist?
+          FileUtils.cp_r(Pathname(dir) + path.basename,
+                         path,
+                         preserve: true)
+        end
+      end
+    end
+  end
 end
```
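A hedged sketch of using the new `PathRestorable` helper in a test; the path is hypothetical and must exist beforehand for the restore to matter:

```ruby
require_relative "helper"

class DestructiveTest < Test::Unit::TestCase
  include Helper::PathRestorable

  def test_cache_survives_destruction
    path = Pathname("cache/some-dataset")  # hypothetical pre-existing dir
    restore_path(path) do
      FileUtils.rm_rf(path)  # destroy it inside the block...
    end
    assert do
      path.exist?  # ...restore_path copied the original back afterwards
    end
  end
end
```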