red-datasets 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 47a9f2cf4c17f8b64f0a88dc7738affbbcd316a0
4
- data.tar.gz: 7255596f70ff903f9103b3d72e78799a622e626e
2
+ SHA256:
3
+ metadata.gz: 6d071f7be3f241f1fb4327e63666c35879488f47c63e9844f8e86d099e385d79
4
+ data.tar.gz: 7e688dfc0ccc9d0ca8bc0070eef71dee1f3e7732e8887d443b37a577467dbf75
5
5
  SHA512:
6
- metadata.gz: e4065c07b451443e7ea2ff395144ecce0d9fa09e2a48979ba3b35c0cd0f18e48b27858a608d9b52e42bb112072a8fee1bffabbad33572028e25ef4aa163fb4cd
7
- data.tar.gz: 2e76c88a1bc14ffd4d4808d05a8c395415fa34dbb8b1189dbbe8e0afb8b573dd20b1f06338bd82f9cd01d413c1f0e83130748c7bffd3e623ac4e2bbe826bc9c2
6
+ metadata.gz: f71bf4fbb25332709d4ef4c8ddc1121781ecac05097551d336091c875a5c885fd20bbba658b41085faf3d1433c29ece43d458c104c300f6eeaa0d8088eae6377
7
+ data.tar.gz: cca27dc33ed60f0093bcf940068590df4fb0848b6f40ba265b9bf53316c888267d8c561248fdc5c755cbf5c93ded2a5068d6f8369e5ec6951bee367c552d8677
data/doc/text/news.md CHANGED
@@ -1,5 +1,26 @@
1
1
  # News
2
2
 
3
+ ## 0.0.3 - 2018-03-27
4
+
5
+ ### Improvements
6
+
7
+ * `Datasets::Metadata#licenses`: Renamed from `#license`. This is a
8
+ breaking change.
9
+
10
+ * `Datasets::Wikipedia`: Added missing license information.
11
+
12
+ * Progress: Stopped progress bar when the process goes to background.
13
+
14
+ * Progress: Added remaining time and throughput information.
15
+
16
+ * Progress: Added dynamic terminal width change support.
17
+
18
+ * Progress: Added continuous download support.
19
+
20
+ * `Datasets::Dataset#to_table`: Added.
21
+
22
+ * `Datasets::Table`: Added.
23
+
3
24
  ## 0.0.2 - 2018-02-06
4
25
 
5
26
  ### Improvements
@@ -2,6 +2,7 @@ require "pathname"
2
2
 
3
3
  require_relative "downloader"
4
4
  require_relative "metadata"
5
+ require_relative "table"
5
6
 
6
7
  module Datasets
7
8
  class Dataset
@@ -10,6 +11,10 @@ module Datasets
10
11
  @metadata = Metadata.new
11
12
  end
12
13
 
14
+ def to_table
15
+ Table.new(self)
16
+ end
17
+
13
18
  private
14
19
  def cache_dir_path
15
20
  case RUBY_PLATFORM
@@ -1,63 +1,285 @@
1
1
  require "fileutils"
2
+ begin
3
+ require "io/console"
4
+ rescue LoadError
5
+ end
2
6
  require "open-uri"
7
+ require "pathname"
3
8
 
4
9
  module Datasets
5
10
  class Downloader
6
11
  def initialize(url)
7
- url = URI.parse(url) unless url.is_a?(URI::Generic)
12
+ if url.is_a?(URI::Generic)
13
+ url = url.dup
14
+ else
15
+ url = URI.parse(url)
16
+ end
8
17
  @url = url
18
+ @url.extend(CurrentBufferReadable)
9
19
  end
10
20
 
11
21
  def download(output_path)
12
22
  output_path.parent.mkpath
13
23
 
14
- if $stderr == STDERR and $stderr.tty?
15
- max = nil
24
+ start = nil
25
+ partial_output_path = Pathname.new("#{output_path}.partial")
26
+ if partial_output_path.exist?
27
+ start = partial_output_path.size
28
+ end
29
+
30
+ progress_reporter = nil
31
+ content_length_proc = lambda do |content_length|
16
32
  base_name = @url.path.split("/").last
17
- content_length_proc = lambda do |content_length|
18
- max = content_length
19
- end
20
- progress_proc = lambda do |current|
21
- if max
22
- percent = (current / max.to_f) * 100
23
- formatted_size = "[%s/%s]" % [format_size(current), format_size(max)]
24
- $stderr.print("\r%s - %06.2f%% %s" %
25
- [base_name, percent, formatted_size])
26
- $stderr.puts if current == max
27
- end
28
- end
29
- options = {
30
- :content_length_proc => content_length_proc,
31
- :progress_proc => progress_proc,
32
- }
33
- else
34
- options = {}
33
+ size_max = content_length
34
+ size_max += start if start
35
+ progress_reporter = ProgressReporter.new(base_name, size_max)
36
+ end
37
+ progress_proc = lambda do |size_current|
38
+ size_current += start if start
39
+ progress_reporter.report(size_current) if progress_reporter
40
+ end
41
+ options = {
42
+ :content_length_proc => content_length_proc,
43
+ :progress_proc => progress_proc,
44
+ }
45
+ if start
46
+ options["Range"] = "bytes=#{start}-"
35
47
  end
36
48
 
37
49
  begin
38
50
  @url.open(options) do |input|
39
- output_path.open("wb") do |output|
40
- IO.copy_stream(input, output)
41
- end
51
+ copy_stream(input, partial_output_path)
52
+ end
53
+ rescue Interrupt, Net::ReadTimeout
54
+ if @url.current_buffer
55
+ input = @url.current_buffer.io
56
+ input.rewind
57
+ copy_stream(input, partial_output_path)
42
58
  end
43
- rescue
44
- FileUtils.rm_f(output_path)
45
59
  raise
46
60
  end
61
+
62
+ FileUtils.mv(partial_output_path, output_path)
47
63
  end
48
64
 
49
65
  private
50
- def format_size(size)
51
- if size < 1024
52
- "%d" % size
53
- elsif size < (1024 ** 2)
54
- "%7.2fKiB" % (size.to_f / 1024)
55
- elsif size < (1024 ** 3)
56
- "%7.2fMiB" % (size.to_f / (1024 ** 2))
57
- elsif size < (1024 ** 4)
58
- "%7.2fGiB" % (size.to_f / (1024 ** 3))
66
# Writes +input+ to +partial_output_path+, either resuming or restarting.
#
# When a partial file already exists, its first bytes (up to 256) are
# compared with the first bytes of +input+:
#
# * identical heads: +input+ carries the content from the beginning,
#   so the partial file is overwritten ("wb");
# * different heads: +input+ is treated as the continuation of the
#   partial file and is appended ("ab").
#
# An empty partial file carries nothing to resume from and is simply
# overwritten.
#
# @param input [#read, #rewind] readable, rewindable source stream.
# @param partial_output_path [Pathname] path of the ".partial" file.
# @return [Integer] the number of bytes copied by IO.copy_stream.
def copy_stream(input, partial_output_path)
  mode = "wb"
  if partial_output_path.exist?
    # TODO: It's better that we use "206 Partial Content" response
    # to detect partial response.
    partial_head = partial_output_path.open("rb") do |partial_output|
      partial_output.read(256)
    end
    # IO#read(length) returns nil at EOF, which is what an empty
    # partial file yields; there is no head to compare in that case.
    unless partial_head.nil?
      input_head = input.read(partial_head.bytesize)
      input.rewind
      mode = "ab" unless partial_head == input_head
    end
  end
  partial_output_path.open(mode) do |partial_output|
    IO.copy_stream(input, partial_output)
  end
end
87
+
88
+ module CurrentBufferReadable
89
+ attr_reader :current_buffer
90
+ def buffer_open(buffer, proxy, options)
91
+ @current_buffer = buffer
92
+ super
93
+ end
94
+ end
95
+
96
+ class ProgressReporter
97
+ def initialize(base_name, size_max)
98
+ @base_name = base_name
99
+ @size_max = size_max
100
+
101
+ @time_previous = Time.now
102
+ @size_previous = 0
103
+
104
+ @need_report = ($stderr == STDERR and $stderr.tty?)
105
+ end
106
+
107
+ def report(size_current)
108
+ return unless @need_report
109
+ return if @size_max.nil?
110
+ return unless foreground?
111
+
112
+ done = (size_current == @size_max)
113
+ time_current = Time.now
114
+ if not done and time_current - @time_previous <= 1
115
+ return
116
+ end
117
+
118
+ read_bytes = size_current - @size_previous
119
+ throughput = read_bytes.to_f / (time_current - @time_previous)
120
+ @time_previous = time_current
121
+ @size_previous = size_current
122
+
123
+ message = build_message(size_current, throughput)
124
+ $stderr.print("\r#{message}") if message
125
+ $stderr.puts if done
126
+ end
127
+
128
+ private
129
+ def build_message(size_current, throughput)
130
+ percent = (size_current / @size_max.to_f) * 100
131
+ formatted_size = "[%s/%s]" % [
132
+ format_size(size_current),
133
+ format_size(@size_max),
134
+ ]
135
+ rest_second = (@size_max - size_current) / throughput
136
+ separator = " - "
137
+ progress = "%05.1f%% %s %s %s" % [
138
+ percent,
139
+ formatted_size,
140
+ format_time_interval(rest_second),
141
+ format_throughput(throughput),
142
+ ]
143
+ base_name = @base_name
144
+
145
+ width = guess_terminal_width
146
+ return "#{base_name}#{separator}#{progress}" if width.nil?
147
+
148
+ return nil if progress.size > width
149
+
150
+ base_name_width = width - progress.size - separator.size
151
+ if base_name.size > base_name_width
152
+ ellipsis = "..."
153
+ shorten_base_name_width = base_name_width - ellipsis.size
154
+ if shorten_base_name_width < 1
155
+ return progress
156
+ else
157
+ base_name = base_name[0, shorten_base_name_width] + ellipsis
158
+ end
159
+ end
160
+ "#{base_name}#{separator}#{progress}"
161
+ end
162
+
163
+ def format_size(size)
164
+ if size < 1000
165
+ "%d" % size
166
+ elsif size < (1000 ** 2)
167
+ "%6.2fKB" % (size.to_f / 1000)
168
+ elsif size < (1000 ** 3)
169
+ "%6.2fMB" % (size.to_f / (1000 ** 2))
170
+ elsif size < (1000 ** 4)
171
+ "%6.2fGB" % (size.to_f / (1000 ** 3))
172
+ else
173
+ "%.2fTB" % (size.to_f / (1000 ** 4))
174
+ end
175
+ end
176
+
177
# Formats a duration in seconds as "HH:MM:SS", or as "Nd HH:MM:SS"
# once it reaches a full day.
#
# A non-finite interval (Infinity or NaN, e.g. the result of dividing
# by a zero throughput) cannot be split with divmod, so it is rendered
# as the placeholder "--:--:--". A negative interval is treated as 0.
#
# @param interval [Numeric] duration in seconds.
# @return [String] the formatted duration.
def format_time_interval(interval)
  return "--:--:--" unless interval.to_f.finite?

  total = interval < 0 ? 0 : interval
  minute, second = total.divmod(60)
  hour, minute = minute.divmod(60)
  day, hour = hour.divmod(24)
  if day.zero?
    "%02d:%02d:%02d" % [hour, minute, second]
  else
    "%dd %02d:%02d:%02d" % [day, hour, minute, second]
  end
end
194
+
195
# Formats a transfer rate with a decimal (power of 1000) unit suffix.
#
# The rate is taken as bytes per second, matching the "B/s" labels, so
# it is used as is: no bits-to-bytes conversion is applied.
# A non-finite rate (Infinity or NaN, e.g. from a zero elapsed time)
# cannot be formatted with "%d" and is rendered as "---B/s".
#
# @param throughput [Numeric] transfer rate in bytes per second.
# @return [String] e.g. "500B/s", "  8KB/s", "  2MB/s".
def format_throughput(throughput)
  return "---B/s" unless throughput.to_f.finite?

  if throughput <= 1000
    "%3dB/s" % throughput
  elsif throughput <= (1000 ** 2)
    "%3dKB/s" % (throughput / 1000)
  elsif throughput <= (1000 ** 3)
    "%3dMB/s" % (throughput / (1000 ** 2))
  else
    "%3dGB/s" % (throughput / (1000 ** 3))
  end
end
207
+
208
+ def foreground?
209
+ proc_stat_path = "/proc/self/stat"
210
+ ps_path = "/bin/ps"
211
+
212
+ if File.exist?(proc_stat_path)
213
+ stat = File.read(proc_stat_path).sub(/\A.+\) /, "").split
214
+ process_group_id = stat[2]
215
+ terminal_process_group_id = stat[5]
216
+ process_group_id == terminal_process_group_id
217
+ elsif File.executable?(ps_path)
218
+ IO.pipe do |input, output|
219
+ pid = spawn(ps_path, "-o", "stat", "-p", Process.pid.to_s,
220
+ {:out => output, :err => output})
221
+ output.close
222
+ _, status = Process.waitpid2(pid)
223
+ return false unless status.success?
224
+
225
+ input.each_line.to_a.last.include?("+")
226
+ end
227
+ else
228
+ false
229
+ end
230
+ end
231
+
232
+ def guess_terminal_width
233
+ guess_terminal_width_from_io ||
234
+ guess_terminal_width_from_command ||
235
+ guess_terminal_width_from_env ||
236
+ 80
237
+ end
238
+
239
# Asks a terminal device for its width in columns.
#
# The console returned by IO.console is preferred; when io/console is
# not loaded, or no console is available (IO.console returns nil),
# $stderr is used if it responds to #winsize.
#
# @return [Integer, nil] a positive column count, or nil when no
#   device can report one (no terminal, failing winsize call, or a
#   non-positive reported width).
def guess_terminal_width_from_io
  device = IO.respond_to?(:console) ? IO.console : nil
  device ||= $stderr if $stderr.respond_to?(:winsize)
  return nil if device.nil?

  width = device.winsize[1]
  width if width.is_a?(Integer) && width > 0
rescue SystemCallError
  # winsize fails (e.g. ENOTTY) when the device is not a terminal.
  nil
end
252
+
253
+ def guess_terminal_width_from_command
254
+ IO.pipe do |input, output|
255
+ begin
256
+ pid = spawn("tput", "cols", {:out => output, :err => output})
257
+ rescue SystemCallError
258
+ return nil
259
+ end
260
+
261
+ output.close
262
+ _, status = Process.waitpid2(pid)
263
+ return nil unless status.success?
264
+
265
+ result = input.read.chomp
266
+ begin
267
+ Integer(result, 10)
268
+ rescue ArgumentError
269
+ nil
270
+ end
271
+ end
272
+ end
273
+
274
# Reads the terminal width from the environment.
#
# COLUMNS is consulted first, then TERM_WIDTH. A variable is skipped
# when it is unset, is not a base-10 integer, or is not positive, so
# an unusable COLUMNS no longer hides a usable TERM_WIDTH.
#
# @return [Integer, nil] a positive column count, or nil when neither
#   variable provides one.
def guess_terminal_width_from_env
  ["COLUMNS", "TERM_WIDTH"].each do |name|
    raw = ENV[name]
    next if raw.nil?

    begin
      width = Integer(raw, 10)
    rescue ArgumentError
      next
    end
    return width if width > 0
  end
  nil
end
62
284
  end
63
285
  end
@@ -1,7 +1,7 @@
1
1
  module Datasets
2
2
  class Metadata < Struct.new(:name,
3
3
  :url,
4
- :license,
4
+ :licenses,
5
5
  :description)
6
6
  def description
7
7
  description_raw = super
@@ -0,0 +1,18 @@
1
+ module Datasets
2
+ class Table
3
+ def initialize(dataset)
4
+ @dataset = dataset
5
+ end
6
+
7
+ def to_h
8
+ columns = {}
9
+ @dataset.each do |record|
10
+ record.to_h.each do |name, value|
11
+ values = (columns[name] ||= [])
12
+ values << value
13
+ end
14
+ end
15
+ columns
16
+ end
17
+ end
18
+ end
@@ -1,3 +1,3 @@
1
1
  module Datasets
2
- VERSION = "0.0.2"
2
+ VERSION = "0.0.3"
3
3
  end
@@ -32,6 +32,11 @@ module Datasets
32
32
  @type = type
33
33
  @metadata.name = "wikipedia-#{@language}-#{@type}"
34
34
  @metadata.url = "https://dumps.wikimedia.org/"
35
+ @metadata.licenses = [
36
+ "CC-BY-SA-3.0",
37
+ "CC-BY-SA-4.0",
38
+ "GFDL-1.3-or-later",
39
+ ]
35
40
  @metadata.description = "Wikipedia #{@type} (#{@language})"
36
41
  end
37
42
 
@@ -0,0 +1,20 @@
1
+ class TableTest < Test::Unit::TestCase
2
+ def setup
3
+ @table = Datasets::Iris.new.to_table
4
+ end
5
+
6
+ test("#to_h") do
7
+ shorten_hash = {}
8
+ @table.to_h.each do |name, values|
9
+ shorten_hash[name] = values.first(5)
10
+ end
11
+ assert_equal({
12
+ :class => ["Iris-setosa"] * 5,
13
+ :petal_length => [1.4, 1.4, 1.3, 1.5, 1.4],
14
+ :petal_width => [0.2, 0.2, 0.2, 0.2, 0.2],
15
+ :sepal_length => [5.1, 4.9, 4.7, 4.6, 5.0],
16
+ :sepal_width => [3.5, 3.0, 3.2, 3.1, 3.6],
17
+ },
18
+ shorten_hash)
19
+ end
20
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: red-datasets
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - tomisuker
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2018-02-06 00:00:00.000000000 Z
12
+ date: 2018-03-27 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -103,12 +103,14 @@ files:
103
103
  - lib/datasets/downloader.rb
104
104
  - lib/datasets/iris.rb
105
105
  - lib/datasets/metadata.rb
106
+ - lib/datasets/table.rb
106
107
  - lib/datasets/version.rb
107
108
  - lib/datasets/wikipedia.rb
108
109
  - red-datasets.gemspec
109
110
  - test/helper.rb
110
111
  - test/run-test.rb
111
112
  - test/test-iris.rb
113
+ - test/test-table.rb
112
114
  - test/test-wikipedia.rb
113
115
  homepage: https://github.com/red-data-tools/red-datasets
114
116
  licenses:
@@ -130,7 +132,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
130
132
  version: '0'
131
133
  requirements: []
132
134
  rubyforge_project:
133
- rubygems_version: 2.5.2.2
135
+ rubygems_version: 2.7.6
134
136
  signing_key:
135
137
  specification_version: 4
136
138
  summary: Red Datasets provides classes that provide common datasets such as iris dataset.
@@ -139,3 +141,4 @@ test_files:
139
141
  - test/test-wikipedia.rb
140
142
  - test/helper.rb
141
143
  - test/run-test.rb
144
+ - test/test-table.rb