red-datasets 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 47a9f2cf4c17f8b64f0a88dc7738affbbcd316a0
4
- data.tar.gz: 7255596f70ff903f9103b3d72e78799a622e626e
2
+ SHA256:
3
+ metadata.gz: 6d071f7be3f241f1fb4327e63666c35879488f47c63e9844f8e86d099e385d79
4
+ data.tar.gz: 7e688dfc0ccc9d0ca8bc0070eef71dee1f3e7732e8887d443b37a577467dbf75
5
5
  SHA512:
6
- metadata.gz: e4065c07b451443e7ea2ff395144ecce0d9fa09e2a48979ba3b35c0cd0f18e48b27858a608d9b52e42bb112072a8fee1bffabbad33572028e25ef4aa163fb4cd
7
- data.tar.gz: 2e76c88a1bc14ffd4d4808d05a8c395415fa34dbb8b1189dbbe8e0afb8b573dd20b1f06338bd82f9cd01d413c1f0e83130748c7bffd3e623ac4e2bbe826bc9c2
6
+ metadata.gz: f71bf4fbb25332709d4ef4c8ddc1121781ecac05097551d336091c875a5c885fd20bbba658b41085faf3d1433c29ece43d458c104c300f6eeaa0d8088eae6377
7
+ data.tar.gz: cca27dc33ed60f0093bcf940068590df4fb0848b6f40ba265b9bf53316c888267d8c561248fdc5c755cbf5c93ded2a5068d6f8369e5ec6951bee367c552d8677
data/doc/text/news.md CHANGED
@@ -1,5 +1,26 @@
1
1
  # News
2
2
 
3
+ ## 0.0.3 - 2018-03-27
4
+
5
+ ### Improvements
6
+
7
+ * `Datasets::Metadata#licenses`: Renamed from `#license`. This is a
8
+ breaking change.
9
+
10
+ * `Datasets::Wikipedia`: Added missing license information.
11
+
12
+ * Progress: Stopped updating the progress bar while the process is in the background.
13
+
14
+ * Progress: Added remaining time and throughput information.
15
+
16
+ * Progress: Added dynamic terminal width change support.
17
+
18
+ * Progress: Added resumable download support (an interrupted download continues from its `.partial` file).
19
+
20
+ * `Datasets::Dataset#to_table`: Added.
21
+
22
+ * `Datasets::Table`: Added.
23
+
3
24
  ## 0.0.2 - 2018-02-06
4
25
 
5
26
  ### Improvements
@@ -2,6 +2,7 @@ require "pathname"
2
2
 
3
3
  require_relative "downloader"
4
4
  require_relative "metadata"
5
+ require_relative "table"
5
6
 
6
7
  module Datasets
7
8
  class Dataset
@@ -10,6 +11,10 @@ module Datasets
10
11
  @metadata = Metadata.new
11
12
  end
12
13
 
14
+ def to_table
15
+ Table.new(self)
16
+ end
17
+
13
18
  private
14
19
  def cache_dir_path
15
20
  case RUBY_PLATFORM
@@ -1,63 +1,285 @@
1
1
  require "fileutils"
2
+ begin
3
+ require "io/console"
4
+ rescue LoadError
5
+ end
2
6
  require "open-uri"
7
+ require "pathname"
3
8
 
4
9
  module Datasets
5
10
  class Downloader
6
11
  def initialize(url)
7
- url = URI.parse(url) unless url.is_a?(URI::Generic)
12
+ if url.is_a?(URI::Generic)
13
+ url = url.dup
14
+ else
15
+ url = URI.parse(url)
16
+ end
8
17
  @url = url
18
+ @url.extend(CurrentBufferReadable)
9
19
  end
10
20
 
11
21
  def download(output_path)
12
22
  output_path.parent.mkpath
13
23
 
14
- if $stderr == STDERR and $stderr.tty?
15
- max = nil
24
+ start = nil
25
+ partial_output_path = Pathname.new("#{output_path}.partial")
26
+ if partial_output_path.exist?
27
+ start = partial_output_path.size
28
+ end
29
+
30
+ progress_reporter = nil
31
+ content_length_proc = lambda do |content_length|
16
32
  base_name = @url.path.split("/").last
17
- content_length_proc = lambda do |content_length|
18
- max = content_length
19
- end
20
- progress_proc = lambda do |current|
21
- if max
22
- percent = (current / max.to_f) * 100
23
- formatted_size = "[%s/%s]" % [format_size(current), format_size(max)]
24
- $stderr.print("\r%s - %06.2f%% %s" %
25
- [base_name, percent, formatted_size])
26
- $stderr.puts if current == max
27
- end
28
- end
29
- options = {
30
- :content_length_proc => content_length_proc,
31
- :progress_proc => progress_proc,
32
- }
33
- else
34
- options = {}
33
+ size_max = content_length
34
+ size_max += start if start
35
+ progress_reporter = ProgressReporter.new(base_name, size_max)
36
+ end
37
+ progress_proc = lambda do |size_current|
38
+ size_current += start if start
39
+ progress_reporter.report(size_current) if progress_reporter
40
+ end
41
+ options = {
42
+ :content_length_proc => content_length_proc,
43
+ :progress_proc => progress_proc,
44
+ }
45
+ if start
46
+ options["Range"] = "bytes=#{start}-"
35
47
  end
36
48
 
37
49
  begin
38
50
  @url.open(options) do |input|
39
- output_path.open("wb") do |output|
40
- IO.copy_stream(input, output)
41
- end
51
+ copy_stream(input, partial_output_path)
52
+ end
53
+ rescue Interrupt, Net::ReadTimeout
54
+ if @url.current_buffer
55
+ input = @url.current_buffer.io
56
+ input.rewind
57
+ copy_stream(input, partial_output_path)
42
58
  end
43
- rescue
44
- FileUtils.rm_f(output_path)
45
59
  raise
46
60
  end
61
+
62
+ FileUtils.mv(partial_output_path, output_path)
47
63
  end
48
64
 
49
65
  private
50
- def format_size(size)
51
- if size < 1024
52
- "%d" % size
53
- elsif size < (1024 ** 2)
54
- "%7.2fKiB" % (size.to_f / 1024)
55
- elsif size < (1024 ** 3)
56
- "%7.2fMiB" % (size.to_f / (1024 ** 2))
57
- elsif size < (1024 ** 4)
58
- "%7.2fGiB" % (size.to_f / (1024 ** 3))
66
+ def copy_stream(input, partial_output_path)
67
+ if partial_output_path.exist?
68
+ # TODO: It's better that we use "206 Partial Content" response
69
+ # to detect partial response.
70
+ partial_head = partial_output_path.open("rb") do |partial_output|
71
+ partial_output.read(256)
72
+ end
73
+ input_head = input.read(partial_head.bytesize)
74
+ input.rewind
75
+ if partial_head == input_head
76
+ mode = "wb"
77
+ else
78
+ mode = "ab"
79
+ end
59
80
  else
60
- "%.2fTiB" % (size.to_f / (1024 ** 4))
81
+ mode = "wb"
82
+ end
83
+ partial_output_path.open(mode) do |partial_output|
84
+ IO.copy_stream(input, partial_output)
85
+ end
86
+ end
87
+
88
+ module CurrentBufferReadable
89
+ attr_reader :current_buffer
90
+ def buffer_open(buffer, proxy, options)
91
+ @current_buffer = buffer
92
+ super
93
+ end
94
+ end
95
+
96
+ class ProgressReporter
97
+ def initialize(base_name, size_max)
98
+ @base_name = base_name
99
+ @size_max = size_max
100
+
101
+ @time_previous = Time.now
102
+ @size_previous = 0
103
+
104
+ @need_report = ($stderr == STDERR and $stderr.tty?)
105
+ end
106
+
107
+ def report(size_current)
108
+ return unless @need_report
109
+ return if @size_max.nil?
110
+ return unless foreground?
111
+
112
+ done = (size_current == @size_max)
113
+ time_current = Time.now
114
+ if not done and time_current - @time_previous <= 1
115
+ return
116
+ end
117
+
118
+ read_bytes = size_current - @size_previous
119
+ throughput = read_bytes.to_f / (time_current - @time_previous)
120
+ @time_previous = time_current
121
+ @size_previous = size_current
122
+
123
+ message = build_message(size_current, throughput)
124
+ $stderr.print("\r#{message}") if message
125
+ $stderr.puts if done
126
+ end
127
+
128
+ private
129
+ def build_message(size_current, throughput)
130
+ percent = (size_current / @size_max.to_f) * 100
131
+ formatted_size = "[%s/%s]" % [
132
+ format_size(size_current),
133
+ format_size(@size_max),
134
+ ]
135
+ rest_second = (@size_max - size_current) / throughput
136
+ separator = " - "
137
+ progress = "%05.1f%% %s %s %s" % [
138
+ percent,
139
+ formatted_size,
140
+ format_time_interval(rest_second),
141
+ format_throughput(throughput),
142
+ ]
143
+ base_name = @base_name
144
+
145
+ width = guess_terminal_width
146
+ return "#{base_name}#{separator}#{progress}" if width.nil?
147
+
148
+ return nil if progress.size > width
149
+
150
+ base_name_width = width - progress.size - separator.size
151
+ if base_name.size > base_name_width
152
+ ellipsis = "..."
153
+ shorten_base_name_width = base_name_width - ellipsis.size
154
+ if shorten_base_name_width < 1
155
+ return progress
156
+ else
157
+ base_name = base_name[0, shorten_base_name_width] + ellipsis
158
+ end
159
+ end
160
+ "#{base_name}#{separator}#{progress}"
161
+ end
162
+
163
+ def format_size(size)
164
+ if size < 1000
165
+ "%d" % size
166
+ elsif size < (1000 ** 2)
167
+ "%6.2fKB" % (size.to_f / 1000)
168
+ elsif size < (1000 ** 3)
169
+ "%6.2fMB" % (size.to_f / (1000 ** 2))
170
+ elsif size < (1000 ** 4)
171
+ "%6.2fGB" % (size.to_f / (1000 ** 3))
172
+ else
173
+ "%.2fTB" % (size.to_f / (1000 ** 4))
174
+ end
175
+ end
176
+
177
+ def format_time_interval(interval)
178
+ if interval < 60
179
+ "00:00:%02d" % interval
180
+ elsif interval < (60 * 60)
181
+ minute, second = interval.divmod(60)
182
+ "00:%02d:%02d" % [minute, second]
183
+ elsif interval < (60 * 60 * 24)
184
+ minute, second = interval.divmod(60)
185
+ hour, minute = minute.divmod(60)
186
+ "%02d:%02d:%02d" % [hour, minute, second]
187
+ else
188
+ minute, second = interval.divmod(60)
189
+ hour, minute = minute.divmod(60)
190
+ day, hour = hour.divmod(24)
191
+ "%dd %02d:%02d:%02d" % [day, hour, minute, second]
192
+ end
193
+ end
194
+
195
+ def format_throughput(throughput)
196
+ throughput_byte = throughput / 8
197
+ if throughput_byte <= 1000
198
+ "%3dB/s" % throughput_byte
199
+ elsif throughput_byte <= (1000 ** 2)
200
+ "%3dKB/s" % (throughput_byte / 1000)
201
+ elsif throughput_byte <= (1000 ** 3)
202
+ "%3dMB/s" % (throughput_byte / (1000 ** 2))
203
+ else
204
+ "%3dGB/s" % (throughput_byte / (1000 ** 3))
205
+ end
206
+ end
207
+
208
+ def foreground?
209
+ proc_stat_path = "/proc/self/stat"
210
+ ps_path = "/bin/ps"
211
+
212
+ if File.exist?(proc_stat_path)
213
+ stat = File.read(proc_stat_path).sub(/\A.+\) /, "").split
214
+ process_group_id = stat[2]
215
+ terminal_process_group_id = stat[5]
216
+ process_group_id == terminal_process_group_id
217
+ elsif File.executable?(ps_path)
218
+ IO.pipe do |input, output|
219
+ pid = spawn(ps_path, "-o", "stat", "-p", Process.pid.to_s,
220
+ {:out => output, :err => output})
221
+ output.close
222
+ _, status = Process.waitpid2(pid)
223
+ return false unless status.success?
224
+
225
+ input.each_line.to_a.last.include?("+")
226
+ end
227
+ else
228
+ false
229
+ end
230
+ end
231
+
232
+ def guess_terminal_width
233
+ guess_terminal_width_from_io ||
234
+ guess_terminal_width_from_command ||
235
+ guess_terminal_width_from_env ||
236
+ 80
237
+ end
238
+
239
+ def guess_terminal_width_from_io
240
+ if IO.respond_to?(:console)
241
+ IO.console.winsize[1]
242
+ elsif $stderr.respond_to?(:winsize)
243
+ begin
244
+ $stderr.winsize[1]
245
+ rescue SystemCallError
246
+ nil
247
+ end
248
+ else
249
+ nil
250
+ end
251
+ end
252
+
253
+ def guess_terminal_width_from_command
254
+ IO.pipe do |input, output|
255
+ begin
256
+ pid = spawn("tput", "cols", {:out => output, :err => output})
257
+ rescue SystemCallError
258
+ return nil
259
+ end
260
+
261
+ output.close
262
+ _, status = Process.waitpid2(pid)
263
+ return nil unless status.success?
264
+
265
+ result = input.read.chomp
266
+ begin
267
+ Integer(result, 10)
268
+ rescue ArgumentError
269
+ nil
270
+ end
271
+ end
272
+ end
273
+
274
+ def guess_terminal_width_from_env
275
+ env = ENV["COLUMNS"] || ENV["TERM_WIDTH"]
276
+ return nil if env.nil?
277
+
278
+ begin
279
+ Integer(env, 10)
280
+ rescue ArgumentError
281
+ nil
282
+ end
61
283
  end
62
284
  end
63
285
  end
@@ -1,7 +1,7 @@
1
1
  module Datasets
2
2
  class Metadata < Struct.new(:name,
3
3
  :url,
4
- :license,
4
+ :licenses,
5
5
  :description)
6
6
  def description
7
7
  description_raw = super
@@ -0,0 +1,18 @@
1
+ module Datasets
2
+ class Table
3
+ def initialize(dataset)
4
+ @dataset = dataset
5
+ end
6
+
7
+ def to_h
8
+ columns = {}
9
+ @dataset.each do |record|
10
+ record.to_h.each do |name, value|
11
+ values = (columns[name] ||= [])
12
+ values << value
13
+ end
14
+ end
15
+ columns
16
+ end
17
+ end
18
+ end
@@ -1,3 +1,3 @@
1
1
  module Datasets
2
- VERSION = "0.0.2"
2
+ VERSION = "0.0.3"
3
3
  end
@@ -32,6 +32,11 @@ module Datasets
32
32
  @type = type
33
33
  @metadata.name = "wikipedia-#{@language}-#{@type}"
34
34
  @metadata.url = "https://dumps.wikimedia.org/"
35
+ @metadata.licenses = [
36
+ "CC-BY-SA-3.0",
37
+ "CC-BY-SA-4.0",
38
+ "GFDL-1.3-or-later",
39
+ ]
35
40
  @metadata.description = "Wikipedia #{@type} (#{@language})"
36
41
  end
37
42
 
@@ -0,0 +1,20 @@
1
+ class TableTest < Test::Unit::TestCase
2
+ def setup
3
+ @table = Datasets::Iris.new.to_table
4
+ end
5
+
6
+ test("#to_h") do
7
+ shorten_hash = {}
8
+ @table.to_h.each do |name, values|
9
+ shorten_hash[name] = values.first(5)
10
+ end
11
+ assert_equal({
12
+ :class => ["Iris-setosa"] * 5,
13
+ :petal_length => [1.4, 1.4, 1.3, 1.5, 1.4],
14
+ :petal_width => [0.2, 0.2, 0.2, 0.2, 0.2],
15
+ :sepal_length => [5.1, 4.9, 4.7, 4.6, 5.0],
16
+ :sepal_width => [3.5, 3.0, 3.2, 3.1, 3.6],
17
+ },
18
+ shorten_hash)
19
+ end
20
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: red-datasets
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - tomisuker
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2018-02-06 00:00:00.000000000 Z
12
+ date: 2018-03-27 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -103,12 +103,14 @@ files:
103
103
  - lib/datasets/downloader.rb
104
104
  - lib/datasets/iris.rb
105
105
  - lib/datasets/metadata.rb
106
+ - lib/datasets/table.rb
106
107
  - lib/datasets/version.rb
107
108
  - lib/datasets/wikipedia.rb
108
109
  - red-datasets.gemspec
109
110
  - test/helper.rb
110
111
  - test/run-test.rb
111
112
  - test/test-iris.rb
113
+ - test/test-table.rb
112
114
  - test/test-wikipedia.rb
113
115
  homepage: https://github.com/red-data-tools/red-datasets
114
116
  licenses:
@@ -130,7 +132,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
130
132
  version: '0'
131
133
  requirements: []
132
134
  rubyforge_project:
133
- rubygems_version: 2.5.2.2
135
+ rubygems_version: 2.7.6
134
136
  signing_key:
135
137
  specification_version: 4
136
138
  summary: Red Datasets provides classes that provide common datasets such as iris dataset.
@@ -139,3 +141,4 @@ test_files:
139
141
  - test/test-wikipedia.rb
140
142
  - test/helper.rb
141
143
  - test/run-test.rb
144
+ - test/test-table.rb