red-datasets 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/doc/text/news.md +21 -0
- data/lib/datasets/dataset.rb +5 -0
- data/lib/datasets/downloader.rb +258 -36
- data/lib/datasets/metadata.rb +1 -1
- data/lib/datasets/table.rb +18 -0
- data/lib/datasets/version.rb +1 -1
- data/lib/datasets/wikipedia.rb +5 -0
- data/test/test-table.rb +20 -0
- metadata +6 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 6d071f7be3f241f1fb4327e63666c35879488f47c63e9844f8e86d099e385d79
|
4
|
+
data.tar.gz: 7e688dfc0ccc9d0ca8bc0070eef71dee1f3e7732e8887d443b37a577467dbf75
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f71bf4fbb25332709d4ef4c8ddc1121781ecac05097551d336091c875a5c885fd20bbba658b41085faf3d1433c29ece43d458c104c300f6eeaa0d8088eae6377
|
7
|
+
data.tar.gz: cca27dc33ed60f0093bcf940068590df4fb0848b6f40ba265b9bf53316c888267d8c561248fdc5c755cbf5c93ded2a5068d6f8369e5ec6951bee367c552d8677
|
data/doc/text/news.md
CHANGED
@@ -1,5 +1,26 @@
|
|
1
1
|
# News
|
2
2
|
|
3
|
+
## 0.0.3 - 2018-03-27
|
4
|
+
|
5
|
+
### Improvements
|
6
|
+
|
7
|
+
* `Datasets::Metadata#licenses`: Renamed from `#license`. This is a
|
8
|
+
breaking change.
|
9
|
+
|
10
|
+
* `Datasets::Wikipedia`: Added missing license information.
|
11
|
+
|
12
|
+
* Progress: Stopped progress bar when the process goes to background.
|
13
|
+
|
14
|
+
* Progress: Added remaining time and throughput information.
|
15
|
+
|
16
|
+
* Progress: Added dynamic terminal width change support.
|
17
|
+
|
18
|
+
* Progress: Added continuous download support.
|
19
|
+
|
20
|
+
* `Datasets::Dataset#to_table`: Added.
|
21
|
+
|
22
|
+
* `Datasets::Table`: Added.
|
23
|
+
|
3
24
|
## 0.0.2 - 2018-02-06
|
4
25
|
|
5
26
|
### Improvements
|
data/lib/datasets/dataset.rb
CHANGED
@@ -2,6 +2,7 @@ require "pathname"
|
|
2
2
|
|
3
3
|
require_relative "downloader"
|
4
4
|
require_relative "metadata"
|
5
|
+
require_relative "table"
|
5
6
|
|
6
7
|
module Datasets
|
7
8
|
class Dataset
|
@@ -10,6 +11,10 @@ module Datasets
|
|
10
11
|
@metadata = Metadata.new
|
11
12
|
end
|
12
13
|
|
14
|
+
def to_table
|
15
|
+
Table.new(self)
|
16
|
+
end
|
17
|
+
|
13
18
|
private
|
14
19
|
def cache_dir_path
|
15
20
|
case RUBY_PLATFORM
|
data/lib/datasets/downloader.rb
CHANGED
@@ -1,63 +1,285 @@
|
|
1
1
|
require "fileutils"
|
2
|
+
begin
|
3
|
+
require "io/console"
|
4
|
+
rescue LoadError
|
5
|
+
end
|
2
6
|
require "open-uri"
|
7
|
+
require "pathname"
|
3
8
|
|
4
9
|
module Datasets
|
5
10
|
class Downloader
|
6
11
|
def initialize(url)
|
7
|
-
|
12
|
+
if url.is_a?(URI::Generic)
|
13
|
+
url = url.dup
|
14
|
+
else
|
15
|
+
url = URI.parse(url)
|
16
|
+
end
|
8
17
|
@url = url
|
18
|
+
@url.extend(CurrentBufferReadable)
|
9
19
|
end
|
10
20
|
|
11
21
|
def download(output_path)
|
12
22
|
output_path.parent.mkpath
|
13
23
|
|
14
|
-
|
15
|
-
|
24
|
+
start = nil
|
25
|
+
partial_output_path = Pathname.new("#{output_path}.partial")
|
26
|
+
if partial_output_path.exist?
|
27
|
+
start = partial_output_path.size
|
28
|
+
end
|
29
|
+
|
30
|
+
progress_reporter = nil
|
31
|
+
content_length_proc = lambda do |content_length|
|
16
32
|
base_name = @url.path.split("/").last
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
:progress_proc => progress_proc,
|
32
|
-
}
|
33
|
-
else
|
34
|
-
options = {}
|
33
|
+
size_max = content_length
|
34
|
+
size_max += start if start
|
35
|
+
progress_reporter = ProgressReporter.new(base_name, size_max)
|
36
|
+
end
|
37
|
+
progress_proc = lambda do |size_current|
|
38
|
+
size_current += start if start
|
39
|
+
progress_reporter.report(size_current) if progress_reporter
|
40
|
+
end
|
41
|
+
options = {
|
42
|
+
:content_length_proc => content_length_proc,
|
43
|
+
:progress_proc => progress_proc,
|
44
|
+
}
|
45
|
+
if start
|
46
|
+
options["Range"] = "bytes=#{start}-"
|
35
47
|
end
|
36
48
|
|
37
49
|
begin
|
38
50
|
@url.open(options) do |input|
|
39
|
-
|
40
|
-
|
41
|
-
|
51
|
+
copy_stream(input, partial_output_path)
|
52
|
+
end
|
53
|
+
rescue Interrupt, Net::ReadTimeout
|
54
|
+
if @url.current_buffer
|
55
|
+
input = @url.current_buffer.io
|
56
|
+
input.rewind
|
57
|
+
copy_stream(input, partial_output_path)
|
42
58
|
end
|
43
|
-
rescue
|
44
|
-
FileUtils.rm_f(output_path)
|
45
59
|
raise
|
46
60
|
end
|
61
|
+
|
62
|
+
FileUtils.mv(partial_output_path, output_path)
|
47
63
|
end
|
48
64
|
|
49
65
|
private
|
50
|
-
def
|
51
|
-
if
|
52
|
-
"
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
66
|
+
def copy_stream(input, partial_output_path)
|
67
|
+
if partial_output_path.exist?
|
68
|
+
# TODO: It's better that we use "206 Partial Content" response
|
69
|
+
# to detect partial response.
|
70
|
+
partial_head = partial_output_path.open("rb") do |partial_output|
|
71
|
+
partial_output.read(256)
|
72
|
+
end
|
73
|
+
input_head = input.read(partial_head.bytesize)
|
74
|
+
input.rewind
|
75
|
+
if partial_head == input_head
|
76
|
+
mode = "wb"
|
77
|
+
else
|
78
|
+
mode = "ab"
|
79
|
+
end
|
59
80
|
else
|
60
|
-
"
|
81
|
+
mode = "wb"
|
82
|
+
end
|
83
|
+
partial_output_path.open(mode) do |partial_output|
|
84
|
+
IO.copy_stream(input, partial_output)
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
module CurrentBufferReadable
|
89
|
+
attr_reader :current_buffer
|
90
|
+
def buffer_open(buffer, proxy, options)
|
91
|
+
@current_buffer = buffer
|
92
|
+
super
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
class ProgressReporter
|
97
|
+
def initialize(base_name, size_max)
|
98
|
+
@base_name = base_name
|
99
|
+
@size_max = size_max
|
100
|
+
|
101
|
+
@time_previous = Time.now
|
102
|
+
@size_previous = 0
|
103
|
+
|
104
|
+
@need_report = ($stderr == STDERR and $stderr.tty?)
|
105
|
+
end
|
106
|
+
|
107
|
+
def report(size_current)
|
108
|
+
return unless @need_report
|
109
|
+
return if @size_max.nil?
|
110
|
+
return unless foreground?
|
111
|
+
|
112
|
+
done = (size_current == @size_max)
|
113
|
+
time_current = Time.now
|
114
|
+
if not done and time_current - @time_previous <= 1
|
115
|
+
return
|
116
|
+
end
|
117
|
+
|
118
|
+
read_bytes = size_current - @size_previous
|
119
|
+
throughput = read_bytes.to_f / (time_current - @time_previous)
|
120
|
+
@time_previous = time_current
|
121
|
+
@size_previous = size_current
|
122
|
+
|
123
|
+
message = build_message(size_current, throughput)
|
124
|
+
$stderr.print("\r#{message}") if message
|
125
|
+
$stderr.puts if done
|
126
|
+
end
|
127
|
+
|
128
|
+
private
|
129
|
+
def build_message(size_current, throughput)
|
130
|
+
percent = (size_current / @size_max.to_f) * 100
|
131
|
+
formatted_size = "[%s/%s]" % [
|
132
|
+
format_size(size_current),
|
133
|
+
format_size(@size_max),
|
134
|
+
]
|
135
|
+
rest_second = (@size_max - size_current) / throughput
|
136
|
+
separator = " - "
|
137
|
+
progress = "%05.1f%% %s %s %s" % [
|
138
|
+
percent,
|
139
|
+
formatted_size,
|
140
|
+
format_time_interval(rest_second),
|
141
|
+
format_throughput(throughput),
|
142
|
+
]
|
143
|
+
base_name = @base_name
|
144
|
+
|
145
|
+
width = guess_terminal_width
|
146
|
+
return "#{base_name}#{separator}#{progress}" if width.nil?
|
147
|
+
|
148
|
+
return nil if progress.size > width
|
149
|
+
|
150
|
+
base_name_width = width - progress.size - separator.size
|
151
|
+
if base_name.size > base_name_width
|
152
|
+
ellipsis = "..."
|
153
|
+
shorten_base_name_width = base_name_width - ellipsis.size
|
154
|
+
if shorten_base_name_width < 1
|
155
|
+
return progress
|
156
|
+
else
|
157
|
+
base_name = base_name[0, shorten_base_name_width] + ellipsis
|
158
|
+
end
|
159
|
+
end
|
160
|
+
"#{base_name}#{separator}#{progress}"
|
161
|
+
end
|
162
|
+
|
163
|
+
def format_size(size)
|
164
|
+
if size < 1000
|
165
|
+
"%d" % size
|
166
|
+
elsif size < (1000 ** 2)
|
167
|
+
"%6.2fKB" % (size.to_f / 1000)
|
168
|
+
elsif size < (1000 ** 3)
|
169
|
+
"%6.2fMB" % (size.to_f / (1000 ** 2))
|
170
|
+
elsif size < (1000 ** 4)
|
171
|
+
"%6.2fGB" % (size.to_f / (1000 ** 3))
|
172
|
+
else
|
173
|
+
"%.2fTB" % (size.to_f / (1000 ** 4))
|
174
|
+
end
|
175
|
+
end
|
176
|
+
|
177
|
+
def format_time_interval(interval)
|
178
|
+
if interval < 60
|
179
|
+
"00:00:%02d" % interval
|
180
|
+
elsif interval < (60 * 60)
|
181
|
+
minute, second = interval.divmod(60)
|
182
|
+
"00:%02d:%02d" % [minute, second]
|
183
|
+
elsif interval < (60 * 60 * 24)
|
184
|
+
minute, second = interval.divmod(60)
|
185
|
+
hour, minute = minute.divmod(60)
|
186
|
+
"%02d:%02d:%02d" % [hour, minute, second]
|
187
|
+
else
|
188
|
+
minute, second = interval.divmod(60)
|
189
|
+
hour, minute = minute.divmod(60)
|
190
|
+
day, hour = hour.divmod(24)
|
191
|
+
"%dd %02d:%02d:%02d" % [day, hour, minute, second]
|
192
|
+
end
|
193
|
+
end
|
194
|
+
|
195
|
+
def format_throughput(throughput)
|
196
|
+
throughput_byte = throughput / 8
|
197
|
+
if throughput_byte <= 1000
|
198
|
+
"%3dB/s" % throughput_byte
|
199
|
+
elsif throughput_byte <= (1000 ** 2)
|
200
|
+
"%3dKB/s" % (throughput_byte / 1000)
|
201
|
+
elsif throughput_byte <= (1000 ** 3)
|
202
|
+
"%3dMB/s" % (throughput_byte / (1000 ** 2))
|
203
|
+
else
|
204
|
+
"%3dGB/s" % (throughput_byte / (1000 ** 3))
|
205
|
+
end
|
206
|
+
end
|
207
|
+
|
208
|
+
def foreground?
|
209
|
+
proc_stat_path = "/proc/self/stat"
|
210
|
+
ps_path = "/bin/ps"
|
211
|
+
|
212
|
+
if File.exist?(proc_stat_path)
|
213
|
+
stat = File.read(proc_stat_path).sub(/\A.+\) /, "").split
|
214
|
+
process_group_id = stat[2]
|
215
|
+
terminal_process_group_id = stat[5]
|
216
|
+
process_group_id == terminal_process_group_id
|
217
|
+
elsif File.executable?(ps_path)
|
218
|
+
IO.pipe do |input, output|
|
219
|
+
pid = spawn(ps_path, "-o", "stat", "-p", Process.pid.to_s,
|
220
|
+
{:out => output, :err => output})
|
221
|
+
output.close
|
222
|
+
_, status = Process.waitpid2(pid)
|
223
|
+
return false unless status.success?
|
224
|
+
|
225
|
+
input.each_line.to_a.last.include?("+")
|
226
|
+
end
|
227
|
+
else
|
228
|
+
false
|
229
|
+
end
|
230
|
+
end
|
231
|
+
|
232
|
+
def guess_terminal_width
|
233
|
+
guess_terminal_width_from_io ||
|
234
|
+
guess_terminal_width_from_command ||
|
235
|
+
guess_terminal_width_from_env ||
|
236
|
+
80
|
237
|
+
end
|
238
|
+
|
239
|
+
def guess_terminal_width_from_io
|
240
|
+
if IO.respond_to?(:console)
|
241
|
+
IO.console.winsize[1]
|
242
|
+
elsif $stderr.respond_to?(:winsize)
|
243
|
+
begin
|
244
|
+
$stderr.winsize[1]
|
245
|
+
rescue SystemCallError
|
246
|
+
nil
|
247
|
+
end
|
248
|
+
else
|
249
|
+
nil
|
250
|
+
end
|
251
|
+
end
|
252
|
+
|
253
|
+
def guess_terminal_width_from_command
|
254
|
+
IO.pipe do |input, output|
|
255
|
+
begin
|
256
|
+
pid = spawn("tput", "cols", {:out => output, :err => output})
|
257
|
+
rescue SystemCallError
|
258
|
+
return nil
|
259
|
+
end
|
260
|
+
|
261
|
+
output.close
|
262
|
+
_, status = Process.waitpid2(pid)
|
263
|
+
return nil unless status.success?
|
264
|
+
|
265
|
+
result = input.read.chomp
|
266
|
+
begin
|
267
|
+
Integer(result, 10)
|
268
|
+
rescue ArgumentError
|
269
|
+
nil
|
270
|
+
end
|
271
|
+
end
|
272
|
+
end
|
273
|
+
|
274
|
+
def guess_terminal_width_from_env
|
275
|
+
env = ENV["COLUMNS"] || ENV["TERM_WIDTH"]
|
276
|
+
return nil if env.nil?
|
277
|
+
|
278
|
+
begin
|
279
|
+
Integer(env, 10)
|
280
|
+
rescue ArgumentError
|
281
|
+
nil
|
282
|
+
end
|
61
283
|
end
|
62
284
|
end
|
63
285
|
end
|
data/lib/datasets/metadata.rb
CHANGED
data/lib/datasets/table.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
module Datasets
|
2
|
+
class Table
|
3
|
+
def initialize(dataset)
|
4
|
+
@dataset = dataset
|
5
|
+
end
|
6
|
+
|
7
|
+
def to_h
|
8
|
+
columns = {}
|
9
|
+
@dataset.each do |record|
|
10
|
+
record.to_h.each do |name, value|
|
11
|
+
values = (columns[name] ||= [])
|
12
|
+
values << value
|
13
|
+
end
|
14
|
+
end
|
15
|
+
columns
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
data/lib/datasets/version.rb
CHANGED
data/lib/datasets/wikipedia.rb
CHANGED
@@ -32,6 +32,11 @@ module Datasets
|
|
32
32
|
@type = type
|
33
33
|
@metadata.name = "wikipedia-#{@language}-#{@type}"
|
34
34
|
@metadata.url = "https://dumps.wikimedia.org/"
|
35
|
+
@metadata.licenses = [
|
36
|
+
"CC-BY-SA-3.0",
|
37
|
+
"CC-BY-SA-4.0",
|
38
|
+
"GFDL-1.3-or-later",
|
39
|
+
]
|
35
40
|
@metadata.description = "Wikipedia #{@type} (#{@language})"
|
36
41
|
end
|
37
42
|
|
data/test/test-table.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
class TableTest < Test::Unit::TestCase
|
2
|
+
def setup
|
3
|
+
@table = Datasets::Iris.new.to_table
|
4
|
+
end
|
5
|
+
|
6
|
+
test("#to_h") do
|
7
|
+
shorten_hash = {}
|
8
|
+
@table.to_h.each do |name, values|
|
9
|
+
shorten_hash[name] = values.first(5)
|
10
|
+
end
|
11
|
+
assert_equal({
|
12
|
+
:class => ["Iris-setosa"] * 5,
|
13
|
+
:petal_length => [1.4, 1.4, 1.3, 1.5, 1.4],
|
14
|
+
:petal_width => [0.2, 0.2, 0.2, 0.2, 0.2],
|
15
|
+
:sepal_length => [5.1, 4.9, 4.7, 4.6, 5.0],
|
16
|
+
:sepal_width => [3.5, 3.0, 3.2, 3.1, 3.6],
|
17
|
+
},
|
18
|
+
shorten_hash)
|
19
|
+
end
|
20
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: red-datasets
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- tomisuker
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2018-
|
12
|
+
date: 2018-03-27 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -103,12 +103,14 @@ files:
|
|
103
103
|
- lib/datasets/downloader.rb
|
104
104
|
- lib/datasets/iris.rb
|
105
105
|
- lib/datasets/metadata.rb
|
106
|
+
- lib/datasets/table.rb
|
106
107
|
- lib/datasets/version.rb
|
107
108
|
- lib/datasets/wikipedia.rb
|
108
109
|
- red-datasets.gemspec
|
109
110
|
- test/helper.rb
|
110
111
|
- test/run-test.rb
|
111
112
|
- test/test-iris.rb
|
113
|
+
- test/test-table.rb
|
112
114
|
- test/test-wikipedia.rb
|
113
115
|
homepage: https://github.com/red-data-tools/red-datasets
|
114
116
|
licenses:
|
@@ -130,7 +132,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
130
132
|
version: '0'
|
131
133
|
requirements: []
|
132
134
|
rubyforge_project:
|
133
|
-
rubygems_version: 2.
|
135
|
+
rubygems_version: 2.7.6
|
134
136
|
signing_key:
|
135
137
|
specification_version: 4
|
136
138
|
summary: Red Datasets provides classes that provide common datasets such as iris dataset.
|
@@ -139,3 +141,4 @@ test_files:
|
|
139
141
|
- test/test-wikipedia.rb
|
140
142
|
- test/helper.rb
|
141
143
|
- test/run-test.rb
|
144
|
+
- test/test-table.rb
|