iij-dag-client 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/Gemfile +13 -0
  4. data/LICENSE.txt +174 -0
  5. data/Rakefile +43 -0
  6. data/config/settings.yml +11 -0
  7. data/iij-dag-client.gemspec +31 -0
  8. data/lib/dag.rb +33 -0
  9. data/lib/dag/client.rb +36 -0
  10. data/lib/dag/client/api.rb +295 -0
  11. data/lib/dag/client/api/cluster.rb +111 -0
  12. data/lib/dag/client/api/database.rb +58 -0
  13. data/lib/dag/client/api/job.rb +116 -0
  14. data/lib/dag/client/api/list_params.rb +36 -0
  15. data/lib/dag/client/api/rest_parameter.rb +149 -0
  16. data/lib/dag/client/api/storage.rb +354 -0
  17. data/lib/dag/client/api/storage_result.rb +52 -0
  18. data/lib/dag/client/api/table.rb +131 -0
  19. data/lib/dag/client/cluster.rb +26 -0
  20. data/lib/dag/client/cluster_validation.rb +59 -0
  21. data/lib/dag/client/database.rb +79 -0
  22. data/lib/dag/client/exception.rb +43 -0
  23. data/lib/dag/client/job.rb +56 -0
  24. data/lib/dag/client/job_validation.rb +22 -0
  25. data/lib/dag/client/model.rb +9 -0
  26. data/lib/dag/client/model/bucket.rb +20 -0
  27. data/lib/dag/client/model/bucket_collection.rb +34 -0
  28. data/lib/dag/client/model/cluster.rb +100 -0
  29. data/lib/dag/client/model/cluster_collection.rb +76 -0
  30. data/lib/dag/client/model/database.rb +34 -0
  31. data/lib/dag/client/model/database_collection.rb +51 -0
  32. data/lib/dag/client/model/job.rb +125 -0
  33. data/lib/dag/client/model/job_collection.rb +114 -0
  34. data/lib/dag/client/model/object.rb +56 -0
  35. data/lib/dag/client/model/object_collection.rb +64 -0
  36. data/lib/dag/client/model/table.rb +55 -0
  37. data/lib/dag/client/model/table_collection.rb +60 -0
  38. data/lib/dag/client/storage.rb +41 -0
  39. data/lib/dag/client/table.rb +16 -0
  40. data/lib/dag/client/version.rb +5 -0
  41. data/lib/dag/settings.rb +9 -0
  42. metadata +210 -0
data/lib/dag/client/api/storage.rb
@@ -0,0 +1,354 @@
+ require 'zlib'
+ require 'stringio'
+ require 'mime-types'
+ require 'singleton'
+
+ module Dag
+   class Client::API
+     module Storage
+       def buckets
+         xml_doc = execute_storage(RestParameter.new(:get, '/'))
+         Dag::Client::API::BucketsResult.new(xml_doc)
+       end
+
+       def objects(bucket, prefix: nil, max: nil, marker: nil, delimiter: nil)
+         resource = '/'
+         query_params = {}
+         query_params['prefix'] = prefix if prefix
+         query_params['max-keys'] = max if max
+         query_params['marker'] = marker if marker
+         query_params['delimiter'] = delimiter if delimiter
+
+         xml_doc = execute_storage(RestParameter.new(:get, resource, bucket: bucket, query_params: query_params))
+         Dag::Client::API::ObjectsResult.new(xml_doc)
+       end
+
+       def create_bucket(bucket)
+         execute_storage(RestParameter.new(:put, '/', bucket: bucket, content_type: 'application/json'))
+       end
+
+       def create_object(bucket, object_name, options = {}, &block)
+         resource = "/#{object_name}"
+
+         # Guess the Content-Type from the object name; fall back to a binary type.
+         type = MIME::Types.type_for(object_name).first
+         content_type = type ? type.to_s : 'application/octet-stream'
+         options = options.merge(bucket: bucket, content_type: content_type)
+         execute_storage(RestParameter.new(:put, resource, options), &block)
+       end
+
+       def create_multipart_object(bucket, object_name, options = {}, &block)
+         mu = MultipartUpload.new(bucket, object_name, options) do
+           self
+         end
+
+         # Initiate Multipart Upload
+         upload_id = mu.initiate_multipart_upload
+
+         begin
+           # Upload Part
+           upload_objects = mu.upload_part(upload_id, &block)
+
+           # Complete Multipart Upload
+           mu.complete_multipart_upload(upload_id, upload_objects)
+         rescue => e
+           # Abort Multipart Upload, then re-raise
+           mu.abort_multipart_upload(upload_id)
+           raise e
+         end
+       end
+
+       def get_object(bucket, object, range = nil)
+         resource = "/#{object}"
+         headers = {}
+         if range
+           # Build an HTTP Range header; -1 as the upper bound means "to the end".
+           bt = "bytes=#{range.first}-"
+           bt += range.last.to_s if range.last != -1
+           headers[:Range] = bt
+         end
+         execute_storage(RestParameter.new(:get, resource, bucket: bucket, raw_data: true, headers: headers))
+       end
+
+       def delete_bucket(bucket)
+         execute_storage(RestParameter.new(:delete, '/', bucket: bucket))
+       end
+
+       def delete_object(bucket, object)
+         execute_storage(RestParameter.new(:delete, "/#{object}", bucket: bucket, content_type: 'application/json'))
+       end
+
+       def import(db_name, tbl_name, file_paths, options = {})
+         _import = Import.new(db_name, tbl_name, file_paths, options) do
+           self
+         end
+
+         # Compute the next label suffix (an Integer) from existing objects.
+         suffix = _import.calc_label_suffix
+
+         # Run the import.
+         upload_objects = _import.execute(suffix)
+
+         STDERR.puts "finished upload #{upload_objects.size} objects."
+         STDERR.puts
+         STDERR.puts 'upload_objects:'
+         upload_objects.each do |o|
+           STDERR.puts o
+         end
+       end
+
+       private
+
+       class Import
+         def initialize(db_name, tbl_name, file_paths, options = {}, &block)
+           @db_name = db_name
+           @tbl_name = tbl_name
+           @file_paths = file_paths
+           @jobs = options.delete(:jobs) || 1
+           @label = options.delete(:label) || 'label'
+           @splitsz = options.delete(:splitsz) || 100 * 1024 ** 2 # 100 MB
+           @api = block[]
+
+           import_parameter = ImportParameter.instance
+           import_parameter.db_name = db_name
+           import_parameter.tbl_name = tbl_name
+           import_parameter.label = @label
+
+           if %w(_ .).include? @label[0]
+             raise Dag::Client::ParameterInvalid.new("label should not start with '_' or '.'")
+           end
+
+           STDERR.puts "Initialize...\njobs: #{@jobs}, splitsz: #{@splitsz}"
+         end
+
+         def calc_label_suffix
+           # Scan existing objects under the storage prefix and pick the next suffix.
+           prefix = ImportParameter.instance.storage_prefix
+           objects = @api.objects(@db_name, prefix: prefix).objects
+
+           return 0 if objects.blank?
+
+           objects.map { |o| o.scan(/#{@label}_(\d+)/) }.flatten.map(&:to_i).max + 1
+         end
+
+         def execute(suffix)
+           file_paths = @file_paths.is_a?(String) ? [@file_paths] : @file_paths
+
+           upload_objects = []
+           file_paths.each do |file_path|
+             file_index = if file_path.end_with?('.gz')
+               import_gz_file(file_path, suffix, upload_objects)
+             elsif file_path == '-'
+               import_stream($stdin, suffix, upload_objects)
+             else
+               import_text_file(file_path, suffix, upload_objects)
+             end
+
+             suffix += file_index
+           end
+
+           upload_objects
+         end
+
+         def import_gz_file(file_path, suffix, upload_objects)
+           import_stream(Zlib::GzipReader.open(file_path), suffix, upload_objects)
+         rescue Zlib::Error
+           # Fall back to plain text if the file is not actually gzip.
+           import_text_file(file_path, suffix, upload_objects)
+         end
+
+         def import_text_file(file_path, suffix, upload_objects)
+           import_stream(File.open(file_path), suffix, upload_objects)
+         end
+
+         def import_stream(ifp, suffix, upload_objects)
+           # Producer/consumer: @jobs worker threads pop chunks from a bounded queue.
+           q = SizedQueue.new(@jobs)
+           th = Array.new(@jobs) {
+             Thread.new {
+               while data = q.pop
+                 STDERR.puts "> starting upload part #{data[2]}, #{data[1].length}"
+                 execute_storage_detail(data[1], suffix + data[0])
+                 STDERR.puts "< finished upload part #{data[2]}, #{data[1].length}"
+                 upload_objects << ImportParameter.instance.object_label(suffix + data[0])
+               end
+               # Propagate the terminator so sibling workers also stop.
+               q.push nil
+             }
+           }
+
+           file_index = 0
+           import_index = ImportParameter.instance.index
+           while true
+             buffer = ifp.read(@splitsz)
+             break unless buffer
+             buffer.force_encoding("ASCII-8BIT")
+             # Extend the chunk to the next newline so records are not split mid-line.
+             nline = ifp.gets
+             if nline
+               nline.force_encoding("ASCII-8BIT")
+               buffer.concat(nline)
+             end
+             q.push [file_index, buffer, import_index]
+             file_index += 1
+             import_index += 1
+           end
+           q.push nil
+
+           th.map(&:join)
+           ifp.close
+
+           file_index
+         end
+
+         def execute_storage_detail(data, suffix)
+           # Gzip-compress the chunk in memory before upload.
+           str = StringIO.new
+           gz = Zlib::GzipWriter.new(str)
+           gz.write data
+           gz.close
+
+           options = {
+             content_type: 'application/x-gzip',
+             bucket: @db_name,
+             import: true
+           }
+
+           resource = ImportParameter.instance.url(suffix)
+           @api.execute_storage(RestParameter.new(:put, resource, options)) do
+             str.string
+           end
+         end
+
+         class ImportParameter
+           include Singleton
+
+           attr_accessor :db_name, :tbl_name, :label, :index
+
+           def initialize
+             @index = 1
+           end
+
+           def url(suffix)
+             "/#{@tbl_name}/#{@label}_#{suffix}.gz"
+           end
+
+           def object_label(suffix)
+             "/#{@db_name}/#{@tbl_name}/#{@label}_#{suffix}.gz"
+           end
+
+           def file_label(suffix)
+             "#{@label}_#{suffix}"
+           end
+
+           def storage_prefix
+             "#{@tbl_name}/#{@label}"
+           end
+         end
+       end
+
+       class MultipartUpload
+         def initialize(bucket, object, options = {}, &block)
+           type = MIME::Types.type_for(object).first
+           content_type = type ? type.to_s : 'application/octet-stream'
+           options = options.merge(bucket: bucket, content_type: content_type)
+
+           @bucket = bucket
+           @object = object
+           @splitsz = options.delete(:splitsz) || 100 * 1024 ** 2 # 100 MB
+           @jobs = options.delete(:jobs) || 1
+           @options = options
+           @api = block[]
+         end
+
+         def initiate_multipart_upload
+           STDERR.puts "Initiate multipart upload...\njobs:#{@jobs}, splitsz:#{@splitsz}"
+           resource = "/#{@object}?uploads"
+           response = @api.execute_storage(RestParameter.new(:post, resource, @options))
+           response.elements['InitiateMultipartUploadResult/UploadId'].text
+         end
+
+         def upload_part(upload_id, &block)
+           upload_objects = {}
+           split_stream(upload_id, upload_objects, &block)
+           # Parts must be listed in ascending part-number order on completion.
+           Hash[upload_objects.sort]
+         end
+
+         def complete_multipart_upload(upload_id, upload_objects)
+           resource = "/#{@object}?uploadId=#{upload_id}"
+
+           payload = '<CompleteMultipartUpload>'
+           upload_objects.each do |part, etag|
+             payload += "<Part><PartNumber>#{part}</PartNumber><ETag>#{etag}</ETag></Part>"
+           end
+           payload += '</CompleteMultipartUpload>'
+
+           @api.execute_storage(RestParameter.new(:post, resource, @options)) do
+             payload
+           end
+
+           puts 'complete multipart upload.'
+         end
+
+         def abort_multipart_upload(upload_id)
+           resource = "/#{@object}?uploadId=#{upload_id}"
+           @api.execute_storage(RestParameter.new(:delete, resource, @options))
+         end
+
+         private
+
+         def split_stream(upload_id, upload_objects, &block)
+           limit = 5 * 1024 ** 2 # 5 MB
+           raise "split size is invalid. below lower limit of #{limit} byte" if @splitsz < limit
+
+           ifp = block[]
+
+           q = SizedQueue.new(@jobs)
+           th = Array.new(@jobs) {
+             Thread.new {
+               while data = q.pop
+                 puts "> starting upload part #{data[0]}, #{data[1].length}"
+                 resource = "/#{@object}?partNumber=#{data[0]}&uploadId=#{upload_id}"
+                 response = @api.execute_storage(RestParameter.new(:put, resource, @options)) do
+                   data[1]
+                 end
+                 puts "< finished upload part #{data[0]}, #{data[1].length}"
+                 upload_objects[data[0]] = response.headers['ETag'].first
+               end
+               # Propagate the terminator so sibling workers also stop.
+               q.push nil
+             }
+           }
+
+           file_index = 1
+           while true
+             buffer = ifp.read(@splitsz)
+             break unless buffer
+             buffer.force_encoding("ASCII-8BIT")
+
+             q.push [file_index, buffer]
+             file_index += 1
+           end
+           q.push nil
+
+           th.map(&:join)
+           puts "finished upload #{file_index - 1} part objects."
+         end
+       end
+     end
+   end
+ end
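
Usage note (editor's sketch, not part of the published diff): the Storage module above is mixed into Dag::Client::API, so its entry points are called on a client instance. The no-argument constructor below is an assumption about the surrounding library (credentials presumably come from data/config/settings.yml); the method names and result accessors come from this file and from storage_result.rb.

require 'dag'

api = Dag::Client.new  # assumed constructor; not shown in this hunk

# List buckets.
api.buckets.buckets.each { |name| puts name }

# Page through object keys using the 'prefix', 'max-keys' and 'marker'
# query parameters built by #objects, driven by truncated?/next_marker.
marker = nil
loop do
  result = api.objects('mybucket', prefix: 'logs/', max: 1000, marker: marker)
  result.objects.each { |key| puts key }
  break unless result.truncated?
  # NextMarker may be absent in some listings (assumption); fall back to the last key.
  marker = result.next_marker || result.objects.last
end

# Upload a small object (the block supplies the request body),
# then read the first KiB back via an HTTP Range header.
api.create_object('mybucket', 'hello.txt') { 'hello, storage' }
head = api.get_object('mybucket', 'hello.txt', 0..1023)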
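
For the bulk paths, create_multipart_object expects its block to return an IO that split_stream reads in splitsz-byte parts (5 MB minimum), and import accepts plain files, gzip files, or '-' for stdin, gzip-compressing each chunk before upload. A sketch against the same assumed client; bucket, table, and file names are illustrative:

# Multipart upload: 4 worker threads, 100 MB parts.
api.create_multipart_object('mybucket', 'backup.tar', jobs: 4, splitsz: 100 * 1024 ** 2) do
  File.open('/tmp/backup.tar', 'rb')
end

# Import local files into database 'mydb', table 'mytable'; uploaded objects
# are labeled /mydb/mytable/batch1_<suffix>.gz per ImportParameter#object_label.
api.import('mydb', 'mytable', ['data1.tsv', 'data2.tsv.gz'],
           jobs: 2, label: 'batch1', splitsz: 50 * 1024 ** 2)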
data/lib/dag/client/api/storage_result.rb
@@ -0,0 +1,52 @@
+ module Dag
+   class Client::API
+     class StorageResult
+       def initialize(xml_doc)
+         @xml_doc = xml_doc
+       end
+     end
+
+     class BucketsResult < StorageResult
+       def buckets
+         REXML::XPath.match(@xml_doc, "/ListAllMyBucketsResult/Buckets/Bucket/Name").map(&:text)
+       end
+
+       def owner_id
+         REXML::XPath.match(@xml_doc, "/ListAllMyBucketsResult/Owner/ID").map(&:text).first
+       end
+
+       def owner_display_name
+         REXML::XPath.match(@xml_doc, "/ListAllMyBucketsResult/Owner/DisplayName").map(&:text).first
+       end
+     end
+
+     class ObjectsResult < StorageResult
+       def objects
+         REXML::XPath.match(@xml_doc, "/ListBucketResult/Contents/Key").map(&:text)
+       end
+
+       def full_objects
+         REXML::XPath.match(@xml_doc, "/ListBucketResult/Contents").map { |m|
+           XmlSimple.xml_in(m.to_s)
+         }
+       end
+
+       def truncated?
+         REXML::XPath.match(@xml_doc, "/ListBucketResult/IsTruncated").map(&:text).first == 'true'
+       end
+
+       def marker
+         REXML::XPath.match(@xml_doc, "/ListBucketResult/Marker").map(&:text).first
+       end
+
+       def next_marker
+         REXML::XPath.match(@xml_doc, "/ListBucketResult/NextMarker").map(&:text).first
+       end
+
+       def max
+         REXML::XPath.match(@xml_doc, "/ListBucketResult/MaxKeys").map(&:text).first.to_i
+       end
+     end
+   end
+ end
+
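These result classes are thin wrappers over a REXML document (REXML and XmlSimple are presumably required elsewhere in the gem, e.g. in api.rb). A self-contained illustration of how a bucket listing maps through ObjectsResult, using a hand-built document:

require 'rexml/document'

xml = REXML::Document.new(<<~XML)
  <ListBucketResult>
    <IsTruncated>true</IsTruncated>
    <NextMarker>logs/0042.gz</NextMarker>
    <MaxKeys>2</MaxKeys>
    <Contents><Key>logs/0041.gz</Key></Contents>
    <Contents><Key>logs/0042.gz</Key></Contents>
  </ListBucketResult>
XML

result = Dag::Client::API::ObjectsResult.new(xml)
result.objects     # => ["logs/0041.gz", "logs/0042.gz"]
result.truncated?  # => true
result.next_marker # => "logs/0042.gz"
result.max         # => 2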
data/lib/dag/client/api/table.rb
@@ -0,0 +1,131 @@
+ require 'active_support/core_ext/object/to_query'
+
+ module Dag
+   class Client::API
+     module Table
+       def table_info_list(cluster_name, database_name, options = {})
+         resource = %Q(/v1/#{cluster_name}/#{database_name})
+         execute(RestParameter.new(:get, resource, cano_resource: 'table', query_params: list_params(options)))
+       end
+
+       def table(cluster_name, database_name, tbl_name, params: {})
+         resource = %Q(/v1/#{cluster_name}/#{database_name}/#{tbl_name})
+         begin
+           execute(RestParameter.new(:get, resource, cano_resource: 'table'))
+         rescue Dag::Client::APIFailure => e
+           # Return nil when the table does not exist; re-raise anything else.
+           raise e if e.api_code != "TableNotFound"
+           nil
+         end
+       end
+
+       def create_table(cluster_name, db_name, params: {})
+         tbl_name = params[:table]
+         raise Dag::Client::ParameterInvalid.new('table name is blank') if tbl_name.blank?
+
+         if tbl_name !~ /\A[a-z0-9_]+\Z/
+           raise Dag::Client::ParameterInvalid.new("tbl_name is invalid: #{tbl_name}")
+         end
+
+         if tbl_name.length > 128
+           raise Dag::Client::ParameterInvalid.new("tbl_name is too long: #{tbl_name}")
+         end
+
+         format = params[:format]
+         if format && !['csv', 'tsv', 'json', 'json_agent'].include?(format)
+           raise Dag::Client::ParameterInvalid.new("format is invalid: #{format}")
+         end
+
+         comment = params[:comment]
+         if comment.present? && comment !~ /\A[[:ascii:]]+\Z/
+           raise Dag::Client::ParameterInvalid.new('comment is not ascii')
+         end
+
+         if comment && comment.length > 100
+           raise Dag::Client::ParameterInvalid.new('comment is too long')
+         end
+
+         resource = %Q(/v1/#{cluster_name}/#{db_name}/#{tbl_name})
+         parameters = {}
+         parameters.merge!('format' => format) if format
+
+         schema = params[:schema]
+         parameters.merge!('schema' => schema) if schema
+         parameters.merge!('comment' => comment) if comment
+
+         # Table existence check
+         if params[:create_api] && (response = table(cluster_name, db_name, tbl_name))
+           if response['tableName'] == tbl_name
+             raise Dag::Client::TableAlreadyExists.new('Table already exists')
+           end
+         end
+
+         execute(RestParameter.new(:put, resource, cano_resource: 'table', content_type: 'application/json', parameters: parameters))
+       end
+
+       def split_table(cluster_name, database_name, tbl_name, params)
+         raise Dag::Client::ParameterInvalid.new('params is blank') if params.blank?
+
+         input_object_keys = params[:input_object_keys]
+         unless input_object_keys.instance_of?(Array)
+           raise Dag::Client::ParameterInvalid.new('input_object_keys is not an array')
+         end
+         raise Dag::Client::ParameterInvalid.new('input_object_keys is blank') if input_object_keys.blank?
+
+         input_object_keys.each do |input_object_key|
+           unless input_object_key.start_with?('dag://')
+             raise Dag::Client::ParameterInvalid.new("input_object_key should start with 'dag://'")
+           end
+         end
+
+         input_format = params[:input_format]
+         raise Dag::Client::ParameterInvalid.new('input_format is blank') if input_format.blank?
+         unless ['csv', 'tsv', 'json'].include?(input_format)
+           raise Dag::Client::ParameterInvalid.new("input_format is invalid: #{input_format}")
+         end
+
+         parameters = {
+           'inputObjectKeys' => input_object_keys,
+           'inputFormat' => input_format,
+           'outputDatabase' => database_name,
+           'outputTable' => tbl_name,
+           'clusterName' => cluster_name
+         }
+
+         label = params[:label]
+         parameters.merge!('label' => label) if label.present?
+
+         schema = params[:schema]
+         parameters.merge!('schema' => schema) if schema.present?
+
+         execute(RestParameter.new(:post, '/v1/', cano_resource: 'split', content_type: 'application/json', parameters: parameters))
+       end
+
+       def delete_table(cluster_name, database_name, tbl_name)
+         execute(RestParameter.new(:delete, "/v1/#{cluster_name}/#{database_name}/#{tbl_name}", content_type: 'application/json', cano_resource: 'table'))
+       end
+
+       private
+
+       # Default schema applied per format when none is supplied.
+       def default_schema(format)
+         case format
+         when 'csv', 'tsv'
+           'v array<string>'
+         when 'json'
+           'v map<string, string>'
+         when 'json_agent'
+           'time int, v map<string, string>'
+         end
+       end
+     end
+   end
+ end
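
A sketch of the table endpoints above against the same assumed client; cluster, database, and object names are illustrative. create_table validates the name ([a-z0-9_], at most 128 characters) and the optional format/schema/comment before issuing the PUT, and split_table loads previously imported dag:// storage objects into a table:

api = Dag::Client.new  # assumed constructor, as in the earlier sketches

api.create_table('mycluster', 'mydb',
                 params: { table: 'access_log', format: 'json',
                           schema: 'v map<string, string>', comment: 'raw logs' })

api.split_table('mycluster', 'mydb', 'access_log',
                { input_object_keys: ['dag://mydb/access_log/batch1_0.gz'],
                  input_format: 'json', label: 'split1' })

api.delete_table('mycluster', 'mydb', 'access_log')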