red-datasets 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/doc/text/news.md +21 -0
- data/lib/datasets/dataset.rb +5 -0
- data/lib/datasets/downloader.rb +258 -36
- data/lib/datasets/metadata.rb +1 -1
- data/lib/datasets/table.rb +18 -0
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia.rb +5 -0
- data/test/test-table.rb +20 -0
- metadata +6 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 6d071f7be3f241f1fb4327e63666c35879488f47c63e9844f8e86d099e385d79
|
4
|
+
data.tar.gz: 7e688dfc0ccc9d0ca8bc0070eef71dee1f3e7732e8887d443b37a577467dbf75
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f71bf4fbb25332709d4ef4c8ddc1121781ecac05097551d336091c875a5c885fd20bbba658b41085faf3d1433c29ece43d458c104c300f6eeaa0d8088eae6377
|
7
|
+
data.tar.gz: cca27dc33ed60f0093bcf940068590df4fb0848b6f40ba265b9bf53316c888267d8c561248fdc5c755cbf5c93ded2a5068d6f8369e5ec6951bee367c552d8677
|
data/doc/text/news.md
CHANGED
@@ -1,5 +1,26 @@
|
|
1
1
|
# News
|
2
2
|
|
3
|
+
## 0.0.3 - 2018-03-27
|
4
|
+
|
5
|
+
### Improvements
|
6
|
+
|
7
|
+
* `Datasets::Metadata#licenses`: Renamed from `#license`. This is a
|
8
|
+
breaking change.
|
9
|
+
|
10
|
+
* `Datasets::Wikipedia`: Added missing license information.
|
11
|
+
|
12
|
+
* Progress: Stopped progress bar when the process goes to background.
|
13
|
+
|
14
|
+
* Progress: Added remaining time and throughput information.
|
15
|
+
|
16
|
+
* Progress: Added dynamic terminal width change support.
|
17
|
+
|
18
|
+
* Progress: Added continuous download support.
|
19
|
+
|
20
|
+
* `Datasets::Dataset#to_table`: Added.
|
21
|
+
|
22
|
+
* `Datasets::Table`: Added.
|
23
|
+
|
3
24
|
## 0.0.2 - 2018-02-06
|
4
25
|
|
5
26
|
### Improvements
|
data/lib/datasets/dataset.rb
CHANGED
@@ -2,6 +2,7 @@ require "pathname"
|
|
2
2
|
|
3
3
|
require_relative "downloader"
|
4
4
|
require_relative "metadata"
|
5
|
+
require_relative "table"
|
5
6
|
|
6
7
|
module Datasets
|
7
8
|
class Dataset
|
@@ -10,6 +11,10 @@ module Datasets
|
|
10
11
|
@metadata = Metadata.new
|
11
12
|
end
|
12
13
|
|
14
|
+
def to_table
|
15
|
+
Table.new(self)
|
16
|
+
end
|
17
|
+
|
13
18
|
private
|
14
19
|
def cache_dir_path
|
15
20
|
case RUBY_PLATFORM
|
data/lib/datasets/downloader.rb
CHANGED
@@ -1,63 +1,285 @@
|
|
1
1
|
require "fileutils"
|
2
|
+
begin
|
3
|
+
require "io/console"
|
4
|
+
rescue LoadError
|
5
|
+
end
|
2
6
|
require "open-uri"
|
7
|
+
require "pathname"
|
3
8
|
|
4
9
|
module Datasets
|
5
10
|
class Downloader
|
6
11
|
def initialize(url)
|
7
|
-
|
12
|
+
if url.is_a?(URI::Generic)
|
13
|
+
url = url.dup
|
14
|
+
else
|
15
|
+
url = URI.parse(url)
|
16
|
+
end
|
8
17
|
@url = url
|
18
|
+
@url.extend(CurrentBufferReadable)
|
9
19
|
end
|
10
20
|
|
11
21
|
def download(output_path)
|
12
22
|
output_path.parent.mkpath
|
13
23
|
|
14
|
-
|
15
|
-
|
24
|
+
start = nil
|
25
|
+
partial_output_path = Pathname.new("#{output_path}.partial")
|
26
|
+
if partial_output_path.exist?
|
27
|
+
start = partial_output_path.size
|
28
|
+
end
|
29
|
+
|
30
|
+
progress_reporter = nil
|
31
|
+
content_length_proc = lambda do |content_length|
|
16
32
|
base_name = @url.path.split("/").last
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
:progress_proc => progress_proc,
|
32
|
-
}
|
33
|
-
else
|
34
|
-
options = {}
|
33
|
+
size_max = content_length
|
34
|
+
size_max += start if start
|
35
|
+
progress_reporter = ProgressReporter.new(base_name, size_max)
|
36
|
+
end
|
37
|
+
progress_proc = lambda do |size_current|
|
38
|
+
size_current += start if start
|
39
|
+
progress_reporter.report(size_current) if progress_reporter
|
40
|
+
end
|
41
|
+
options = {
|
42
|
+
:content_length_proc => content_length_proc,
|
43
|
+
:progress_proc => progress_proc,
|
44
|
+
}
|
45
|
+
if start
|
46
|
+
options["Range"] = "bytes=#{start}-"
|
35
47
|
end
|
36
48
|
|
37
49
|
begin
|
38
50
|
@url.open(options) do |input|
|
39
|
-
|
40
|
-
|
41
|
-
|
51
|
+
copy_stream(input, partial_output_path)
|
52
|
+
end
|
53
|
+
rescue Interrupt, Net::ReadTimeout
|
54
|
+
if @url.current_buffer
|
55
|
+
input = @url.current_buffer.io
|
56
|
+
input.rewind
|
57
|
+
copy_stream(input, partial_output_path)
|
42
58
|
end
|
43
|
-
rescue
|
44
|
-
FileUtils.rm_f(output_path)
|
45
59
|
raise
|
46
60
|
end
|
61
|
+
|
62
|
+
FileUtils.mv(partial_output_path, output_path)
|
47
63
|
end
|
48
64
|
|
49
65
|
private
|
50
|
-
def
|
51
|
-
if
|
52
|
-
"
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
66
|
+
def copy_stream(input, partial_output_path)
|
67
|
+
if partial_output_path.exist?
|
68
|
+
# TODO: It's better that we use "206 Partial Content" response
|
69
|
+
# to detect partial response.
|
70
|
+
partial_head = partial_output_path.open("rb") do |partial_output|
|
71
|
+
partial_output.read(256)
|
72
|
+
end
|
73
|
+
input_head = input.read(partial_head.bytesize)
|
74
|
+
input.rewind
|
75
|
+
if partial_head == input_head
|
76
|
+
mode = "wb"
|
77
|
+
else
|
78
|
+
mode = "ab"
|
79
|
+
end
|
59
80
|
else
|
60
|
-
"
|
81
|
+
mode = "wb"
|
82
|
+
end
|
83
|
+
partial_output_path.open(mode) do |partial_output|
|
84
|
+
IO.copy_stream(input, partial_output)
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
module CurrentBufferReadable
|
89
|
+
attr_reader :current_buffer
|
90
|
+
def buffer_open(buffer, proxy, options)
|
91
|
+
@current_buffer = buffer
|
92
|
+
super
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
class ProgressReporter
|
97
|
+
def initialize(base_name, size_max)
|
98
|
+
@base_name = base_name
|
99
|
+
@size_max = size_max
|
100
|
+
|
101
|
+
@time_previous = Time.now
|
102
|
+
@size_previous = 0
|
103
|
+
|
104
|
+
@need_report = ($stderr == STDERR and $stderr.tty?)
|
105
|
+
end
|
106
|
+
|
107
|
+
def report(size_current)
|
108
|
+
return unless @need_report
|
109
|
+
return if @size_max.nil?
|
110
|
+
return unless foreground?
|
111
|
+
|
112
|
+
done = (size_current == @size_max)
|
113
|
+
time_current = Time.now
|
114
|
+
if not done and time_current - @time_previous <= 1
|
115
|
+
return
|
116
|
+
end
|
117
|
+
|
118
|
+
read_bytes = size_current - @size_previous
|
119
|
+
throughput = read_bytes.to_f / (time_current - @time_previous)
|
120
|
+
@time_previous = time_current
|
121
|
+
@size_previous = size_current
|
122
|
+
|
123
|
+
message = build_message(size_current, throughput)
|
124
|
+
$stderr.print("\r#{message}") if message
|
125
|
+
$stderr.puts if done
|
126
|
+
end
|
127
|
+
|
128
|
+
private
|
129
|
+
def build_message(size_current, throughput)
|
130
|
+
percent = (size_current / @size_max.to_f) * 100
|
131
|
+
formatted_size = "[%s/%s]" % [
|
132
|
+
format_size(size_current),
|
133
|
+
format_size(@size_max),
|
134
|
+
]
|
135
|
+
rest_second = (@size_max - size_current) / throughput
|
136
|
+
separator = " - "
|
137
|
+
progress = "%05.1f%% %s %s %s" % [
|
138
|
+
percent,
|
139
|
+
formatted_size,
|
140
|
+
format_time_interval(rest_second),
|
141
|
+
format_throughput(throughput),
|
142
|
+
]
|
143
|
+
base_name = @base_name
|
144
|
+
|
145
|
+
width = guess_terminal_width
|
146
|
+
return "#{base_name}#{separator}#{progress}" if width.nil?
|
147
|
+
|
148
|
+
return nil if progress.size > width
|
149
|
+
|
150
|
+
base_name_width = width - progress.size - separator.size
|
151
|
+
if base_name.size > base_name_width
|
152
|
+
ellipsis = "..."
|
153
|
+
shorten_base_name_width = base_name_width - ellipsis.size
|
154
|
+
if shorten_base_name_width < 1
|
155
|
+
return progress
|
156
|
+
else
|
157
|
+
base_name = base_name[0, shorten_base_name_width] + ellipsis
|
158
|
+
end
|
159
|
+
end
|
160
|
+
"#{base_name}#{separator}#{progress}"
|
161
|
+
end
|
162
|
+
|
163
|
+
def format_size(size)
|
164
|
+
if size < 1000
|
165
|
+
"%d" % size
|
166
|
+
elsif size < (1000 ** 2)
|
167
|
+
"%6.2fKB" % (size.to_f / 1000)
|
168
|
+
elsif size < (1000 ** 3)
|
169
|
+
"%6.2fMB" % (size.to_f / (1000 ** 2))
|
170
|
+
elsif size < (1000 ** 4)
|
171
|
+
"%6.2fGB" % (size.to_f / (1000 ** 3))
|
172
|
+
else
|
173
|
+
"%.2fTB" % (size.to_f / (1000 ** 4))
|
174
|
+
end
|
175
|
+
end
|
176
|
+
|
177
|
+
def format_time_interval(interval)
|
178
|
+
if interval < 60
|
179
|
+
"00:00:%02d" % interval
|
180
|
+
elsif interval < (60 * 60)
|
181
|
+
minute, second = interval.divmod(60)
|
182
|
+
"00:%02d:%02d" % [minute, second]
|
183
|
+
elsif interval < (60 * 60 * 24)
|
184
|
+
minute, second = interval.divmod(60)
|
185
|
+
hour, minute = minute.divmod(60)
|
186
|
+
"%02d:%02d:%02d" % [hour, minute, second]
|
187
|
+
else
|
188
|
+
minute, second = interval.divmod(60)
|
189
|
+
hour, minute = minute.divmod(60)
|
190
|
+
day, hour = hour.divmod(24)
|
191
|
+
"%dd %02d:%02d:%02d" % [day, hour, minute, second]
|
192
|
+
end
|
193
|
+
end
|
194
|
+
|
195
|
+
def format_throughput(throughput)
|
196
|
+
throughput_byte = throughput / 8
|
197
|
+
if throughput_byte <= 1000
|
198
|
+
"%3dB/s" % throughput_byte
|
199
|
+
elsif throughput_byte <= (1000 ** 2)
|
200
|
+
"%3dKB/s" % (throughput_byte / 1000)
|
201
|
+
elsif throughput_byte <= (1000 ** 3)
|
202
|
+
"%3dMB/s" % (throughput_byte / (1000 ** 2))
|
203
|
+
else
|
204
|
+
"%3dGB/s" % (throughput_byte / (1000 ** 3))
|
205
|
+
end
|
206
|
+
end
|
207
|
+
|
208
|
+
def foreground?
|
209
|
+
proc_stat_path = "/proc/self/stat"
|
210
|
+
ps_path = "/bin/ps"
|
211
|
+
|
212
|
+
if File.exist?(proc_stat_path)
|
213
|
+
stat = File.read(proc_stat_path).sub(/\A.+\) /, "").split
|
214
|
+
process_group_id = stat[2]
|
215
|
+
terminal_process_group_id = stat[5]
|
216
|
+
process_group_id == terminal_process_group_id
|
217
|
+
elsif File.executable?(ps_path)
|
218
|
+
IO.pipe do |input, output|
|
219
|
+
pid = spawn(ps_path, "-o", "stat", "-p", Process.pid.to_s,
|
220
|
+
{:out => output, :err => output})
|
221
|
+
output.close
|
222
|
+
_, status = Process.waitpid2(pid)
|
223
|
+
return false unless status.success?
|
224
|
+
|
225
|
+
input.each_line.to_a.last.include?("+")
|
226
|
+
end
|
227
|
+
else
|
228
|
+
false
|
229
|
+
end
|
230
|
+
end
|
231
|
+
|
232
|
+
def guess_terminal_width
|
233
|
+
guess_terminal_width_from_io ||
|
234
|
+
guess_terminal_width_from_command ||
|
235
|
+
guess_terminal_width_from_env ||
|
236
|
+
80
|
237
|
+
end
|
238
|
+
|
239
|
+
def guess_terminal_width_from_io
|
240
|
+
if IO.respond_to?(:console)
|
241
|
+
IO.console.winsize[1]
|
242
|
+
elsif $stderr.respond_to?(:winsize)
|
243
|
+
begin
|
244
|
+
$stderr.winsize[1]
|
245
|
+
rescue SystemCallError
|
246
|
+
nil
|
247
|
+
end
|
248
|
+
else
|
249
|
+
nil
|
250
|
+
end
|
251
|
+
end
|
252
|
+
|
253
|
+
def guess_terminal_width_from_command
|
254
|
+
IO.pipe do |input, output|
|
255
|
+
begin
|
256
|
+
pid = spawn("tput", "cols", {:out => output, :err => output})
|
257
|
+
rescue SystemCallError
|
258
|
+
return nil
|
259
|
+
end
|
260
|
+
|
261
|
+
output.close
|
262
|
+
_, status = Process.waitpid2(pid)
|
263
|
+
return nil unless status.success?
|
264
|
+
|
265
|
+
result = input.read.chomp
|
266
|
+
begin
|
267
|
+
Integer(result, 10)
|
268
|
+
rescue ArgumentError
|
269
|
+
nil
|
270
|
+
end
|
271
|
+
end
|
272
|
+
end
|
273
|
+
|
274
|
+
def guess_terminal_width_from_env
|
275
|
+
env = ENV["COLUMNS"] || ENV["TERM_WIDTH"]
|
276
|
+
return nil if env.nil?
|
277
|
+
|
278
|
+
begin
|
279
|
+
Integer(env, 10)
|
280
|
+
rescue ArgumentError
|
281
|
+
nil
|
282
|
+
end
|
61
283
|
end
|
62
284
|
end
|
63
285
|
end
|
data/lib/datasets/metadata.rb
CHANGED
data/lib/datasets/table.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
module Datasets
|
2
|
+
class Table
|
3
|
+
def initialize(dataset)
|
4
|
+
@dataset = dataset
|
5
|
+
end
|
6
|
+
|
7
|
+
def to_h
|
8
|
+
columns = {}
|
9
|
+
@dataset.each do |record|
|
10
|
+
record.to_h.each do |name, value|
|
11
|
+
values = (columns[name] ||= [])
|
12
|
+
values << value
|
13
|
+
end
|
14
|
+
end
|
15
|
+
columns
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
data/lib/datasets/version.rb
CHANGED
data/lib/datasets/wikipedia.rb
CHANGED
@@ -32,6 +32,11 @@ module Datasets
|
|
32
32
|
@type = type
|
33
33
|
@metadata.name = "wikipedia-#{@language}-#{@type}"
|
34
34
|
@metadata.url = "https://dumps.wikimedia.org/"
|
35
|
+
@metadata.licenses = [
|
36
|
+
"CC-BY-SA-3.0",
|
37
|
+
"CC-BY-SA-4.0",
|
38
|
+
"GFDL-1.3-or-later",
|
39
|
+
]
|
35
40
|
@metadata.description = "Wikipedia #{@type} (#{@language})"
|
36
41
|
end
|
37
42
|
|
data/test/test-table.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
class TableTest < Test::Unit::TestCase
|
2
|
+
def setup
|
3
|
+
@table = Datasets::Iris.new.to_table
|
4
|
+
end
|
5
|
+
|
6
|
+
test("#to_h") do
|
7
|
+
shorten_hash = {}
|
8
|
+
@table.to_h.each do |name, values|
|
9
|
+
shorten_hash[name] = values.first(5)
|
10
|
+
end
|
11
|
+
assert_equal({
|
12
|
+
:class => ["Iris-setosa"] * 5,
|
13
|
+
:petal_length => [1.4, 1.4, 1.3, 1.5, 1.4],
|
14
|
+
:petal_width => [0.2, 0.2, 0.2, 0.2, 0.2],
|
15
|
+
:sepal_length => [5.1, 4.9, 4.7, 4.6, 5.0],
|
16
|
+
:sepal_width => [3.5, 3.0, 3.2, 3.1, 3.6],
|
17
|
+
},
|
18
|
+
shorten_hash)
|
19
|
+
end
|
20
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: red-datasets
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- tomisuker
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2018-
|
12
|
+
date: 2018-03-27 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -103,12 +103,14 @@ files:
|
|
103
103
|
- lib/datasets/downloader.rb
|
104
104
|
- lib/datasets/iris.rb
|
105
105
|
- lib/datasets/metadata.rb
|
106
|
+
- lib/datasets/table.rb
|
106
107
|
- lib/datasets/version.rb
|
107
108
|
- lib/datasets/wikipedia.rb
|
108
109
|
- red-datasets.gemspec
|
109
110
|
- test/helper.rb
|
110
111
|
- test/run-test.rb
|
111
112
|
- test/test-iris.rb
|
113
|
+
- test/test-table.rb
|
112
114
|
- test/test-wikipedia.rb
|
113
115
|
homepage: https://github.com/red-data-tools/red-datasets
|
114
116
|
licenses:
|
@@ -130,7 +132,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
130
132
|
version: '0'
|
131
133
|
requirements: []
|
132
134
|
rubyforge_project:
|
133
|
-
rubygems_version: 2.
|
135
|
+
rubygems_version: 2.7.6
|
134
136
|
signing_key:
|
135
137
|
specification_version: 4
|
136
138
|
summary: Red Datasets provides classes that provide common datasets such as iris dataset.
|
@@ -139,3 +141,4 @@ test_files:
|
|
139
141
|
- test/test-wikipedia.rb
|
140
142
|
- test/helper.rb
|
141
143
|
- test/run-test.rb
|
144
|
+
- test/test-table.rb
|