bigquery_migration 0.1.0.pre1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,105 @@
1
require 'thor'
require 'json'
require 'bigquery_migration'
require_relative 'action_runner'
require_relative 'hash_util'

class BigqueryMigration
  # Thor-based command line interface for bigquery_migration.
  class CLI < Thor
    # Exit with a non-zero status when thor fails to parse or dispatch.
    # cf. http://qiita.com/KitaitiMakoto/items/c6b9d6311c20a3cc21f9
    def self.exit_on_failure?
      true
    end

    # `run` is reserved by thor, we have to use def _run
    map "run" => "_run"

    option :config_path, aliases: ['-c'], type: :string,
      default: 'config.yml'
    option :log_level, aliases: ['-l'], type: :string,
      desc: 'Log level such as fatal, error, warn, info, or debug',
      default: 'info'
    option :log, type: :string,
      desc: 'Output log to a file',
      default: 'STDOUT'
    option :stdout, type: :string,
      desc: 'Redirect STDOUT to a file',
      default: 'STDOUT'
    option :stderr, type: :string,
      desc: 'Redirect STDERR to a file',
      default: 'STDERR'
    option :exec, type: :boolean,
      desc: 'Execute or dry-run (Default: dry-run)',
      default: false
    option :vars, type: :hash,
      desc: 'Variables used in ERB, thor hash format'
    option :output, aliases: ['-o'], type: :string,
      desc: 'Output result yaml to a file',
      default: 'STDOUT'

    desc 'run <config.yml>', 'run bigquery_migration'
    # Runs the migration described by the config file. Dry-run unless --exec.
    # Exits 1 when the action runner reports failure.
    def _run(config_path)
      opts = options.merge(dry_run: !options[:exec])

      init_logger
      reopen_stdout
      reopen_stderr

      result = ActionRunner.new(config_path, opts).run
      open_output do |io|
        io.puts mask_secret(HashUtil.deep_stringify_keys(result).to_yaml)
        logger.info { "DRY-RUN has finished. Use --exec option to run." } if opts[:dry_run]
      end
      exit(1) unless result[:success]
    end

    private

    # The process-wide logger installed by init_logger.
    def logger
      BigqueryMigration.logger
    end

    # Builds a BigqueryMigration::Logger from --log/--log-level and installs it.
    def init_logger
      new_logger = BigqueryMigration::Logger.new(options[:log])
      new_logger.level = options[:log_level]
      BigqueryMigration.logger = new_logger
    end

    # Redirects $stdout to the --stdout file unless it is the literal 'STDOUT'.
    def reopen_stdout
      $stdout.reopen(options[:stdout]) if options[:stdout] != 'STDOUT'
      $stdout.sync = true
    end

    # Redirects $stderr to the --stderr file unless it is the literal 'STDERR'.
    def reopen_stderr
      $stderr.reopen(options[:stderr]) if options[:stderr] != 'STDERR'
      $stderr.sync = true
    end

    # Yields the IO selected by --output: $stdout, $stderr, or a freshly
    # opened (truncated) file that is closed after the block returns.
    def open_output
      case (output = options[:output])
      when 'STDOUT'
        yield($stdout)
      when 'STDERR'
        yield($stderr)
      else
        File.open(output, 'w') { |io| yield(io) }
      end
    end

    # Masks secret-looking YAML values in place: any key ending in
    # `password`/`key`, plus PEM private key bodies.
    def mask_secret(yaml_string)
      %w(password key).each do |secret|
        yaml_string.gsub!(/([^ ]*#{secret}): .*$/, '\1: xxxxx')
      end
      yaml_string.gsub!(/(-----BEGIN\s+PRIVATE\s+KEY-----)[0-9A-Za-z+\/=\s\\]+(-----END\s+PRIVATE\s+KEY-----)/m, '\1 xxxxx \2')
      yaml_string
    end
  end
end
@@ -0,0 +1,51 @@
1
require 'set'
require 'yaml'
require 'erb'
require 'ostruct'

class BigqueryMigration
  # Loads a YAML (optionally ERB-templated) configuration file.
  #
  # ERB templates can call `include_file('path')` to inline another file,
  # resolved relative to the including file. Each file may only be ERB-rendered
  # once per loader, which guards against include cycles.
  class ConfigLoader
    attr_reader :config_path, :namespace

    # Raised when the same file is ERB-included twice.
    class AlreadyIncluded < ::StandardError; end
    # Backward-compatible alias for the historical, misspelled constant name;
    # existing `rescue AlreayIncluded` call sites keep working.
    AlreayIncluded = AlreadyIncluded

    # @param config_path [String] path to config.yml or config.yml.erb
    # @param vars [Hash] variables exposed to the ERB template via #namespace
    def initialize(config_path, vars = {})
      @config_path = File.expand_path(config_path)
      @included_files = Set.new
      @namespace = OpenStruct.new(vars)

      unless @namespace.respond_to?(:include_file)
        itself = self
        # ToDo: better way?
        @namespace.define_singleton_method(:include_file) do |path|
          # Resolve `path` relative to the file whose template invoked us.
          # NOTE(review): this parses `caller` backtrace text; the format can
          # change between Ruby versions — verify on interpreter upgrades.
          caller_path = caller[0][/^([^:]+):\d+:in `[^']*'$/, 1]
          abs_path = File.expand_path(path, File.dirname(caller_path))
          if File.extname(path) == '.erb'
            itself.load_erb(abs_path)
          else
            File.read(abs_path)
          end
        end
      end
    end

    # Parses the config file and returns the deserialized YAML
    # (rendering ERB first when the file has an .erb extension).
    def load
      if File.extname(config_path) == '.erb'
        YAML.load(load_erb(config_path))
      else
        YAML.load(File.read(config_path))
      end
    end

    # Renders the ERB template at `path` in #namespace's binding.
    # @raise [AlreadyIncluded] if `path` was already rendered by this loader
    def load_erb(path = config_path)
      unless @included_files.add?(path)
        raise AlreadyIncluded, "#{path} was included twice"
      end

      raw = File.read(path)
      # FIX: the positional trim-mode argument (ERB.new(raw, nil, "-")) has
      # been deprecated since Ruby 2.6; use the trim_mode keyword instead.
      erb = ERB.new(raw, trim_mode: "-")
      erb.filename = path
      erb.result(namespace.instance_eval { binding })
    end
  end
end
@@ -0,0 +1,6 @@
1
class BigqueryMigration
  # Error hierarchy: every library error descends from BigqueryMigration::Error,
  # so callers can rescue the single base class.
  Error = Class.new(StandardError)

  # Invalid or unsupported configuration / schema definition.
  ConfigError = Class.new(Error)

  # Job-timeout condition (name-derived; confirm semantics at raise sites).
  JobTimeoutError = Class.new(Error)

  # Missing-resource condition (name-derived; confirm semantics at raise sites).
  NotFoundError = Class.new(Error)
end
@@ -0,0 +1,35 @@
1
class BigqueryMigration
  # Recursive key-conversion helpers for nested Hash/Array structures.
  class HashUtil
    # Returns a copy of `hash` with every Hash key converted to a Symbol,
    # descending through nested Hashes and Arrays. Any non-collection value
    # (including a non-collection top-level argument) is returned unchanged.
    def self.deep_symbolize_keys(hash)
      case hash
      when Hash
        hash.each_with_object({}) do |(key, val), acc|
          acc[key.to_sym] = deep_symbolize_keys(val)
        end
      when Array
        hash.map { |val| deep_symbolize_keys(val) }
      else
        hash
      end
    end

    # Returns a copy of `hash` with every Hash key converted to a String,
    # descending through nested Hashes and Arrays. Any non-collection value
    # is returned unchanged.
    def self.deep_stringify_keys(hash)
      case hash
      when Hash
        hash.each_with_object({}) do |(key, val), acc|
          acc[key.to_s] = deep_stringify_keys(val)
        end
      when Array
        hash.map { |val| deep_stringify_keys(val) }
      else
        hash
      end
    end
  end
end
@@ -0,0 +1,45 @@
1
require 'logger'
# FIX: Time#iso8601 is provided by the stdlib 'time' extension; without this
# require, LogFormatter#call raised NoMethodError unless another file had
# happened to load it first.
require 'time'

class BigqueryMigration
  # Formats log lines as "<iso8601 time> [SEVERITY] <message>\n".
  class LogFormatter
    FORMAT = "%s [%s] %s\n"

    def initialize(opts={})
    end

    # ::Logger formatter interface. `progname` is accepted but not rendered.
    def call(severity, time, progname, msg)
      # FIX: route severity through format_severity — it was defined but
      # never called (currently an identity mapping, kept as a seam).
      FORMAT % [format_datetime(time), format_severity(severity), format_message(msg)]
    end

    private
    def format_datetime(time)
      time.iso8601
    end

    def format_severity(severity)
      severity
    end

    # Exceptions render as "Class (message)" plus an indented backtrace;
    # everything else renders with #to_s.
    def format_message(message)
      case message
      when ::Exception
        e = message
        # FIX: an exception that was never raised has a nil backtrace.
        "#{e.class} (#{e.message})\n #{(e.backtrace || []).join("\n ")}"
      else
        message.to_s
      end
    end
  end

  # ::Logger subclass that accepts the literal string 'STDOUT' as a device
  # and installs LogFormatter by default.
  class Logger < ::Logger
    def initialize(logdev, shift_age = 0, shift_size = 1048576)
      logdev = STDOUT if logdev == 'STDOUT'
      super(logdev, shift_age, shift_size)
      @formatter = LogFormatter.new
    end

    # Raw write to the underlying device, bypassing severity and formatting.
    def write(msg)
      @logdev.write msg
    end
  end
end
@@ -0,0 +1,388 @@
1
require 'csv'
require 'json'
require 'set'
require_relative 'error'
4
+
5
class BigqueryMigration
  # A BigQuery table schema: an Array of column hashes shaped like
  #   { name: 'id', type: 'INTEGER', mode: 'NULLABLE', fields: [...] }
  # where :fields is present only for RECORD columns. Columns are normalized
  # (symbol keys, upcased type, defaulted mode) and validated on construction.
  class Schema < ::Array
    ALLOWED_FIELD_TYPES = Set.new(['STRING', 'INTEGER', 'FLOAT', 'BOOLEAN', 'RECORD', 'TIMESTAMP'])
    ALLOWED_FIELD_MODES = Set.new(['NULLABLE', 'REQUIRED', 'REPEATED'])

    def initialize(columns = [])
      normalized = self.class.normalize_columns(columns)
      super(normalized)
      validate_columns!
    end

    def find_column_by_name(name)
      self.class.find_column_by_name(self, name)
    end

    def validate_columns!
      self.class.validate_columns!(self)
    end

    def validate_permitted_operations!(source_columns)
      target_columns = self
      self.class.validate_permitted_operations!(source_columns, target_columns)
    end

    def normalize_columns
      self.class.normalize_columns(self)
    end

    def shallow_normalize_columns
      self.class.shallow_normalize_columns(self)
    end

    def shallow_normalize_columns!
      # BUGFIX: previously called `shallow_normalize_column!(self)` (singular),
      # which treated the whole schema array as a single column hash and
      # corrupted it. Delegate to the plural class method.
      self.class.shallow_normalize_columns!(self)
    end

    def flattened_columns
      self.class.flattened_columns(self)
    end

    def equals?(source_columns)
      self.class.equals?(source_columns, self)
    end

    # self - source_columns
    def diff_columns(source_columns)
      self.class.diff_columns(source_columns, self)
    end

    # diff with only column names
    # self - source_columns
    def diff_columns_by_name(source_columns)
      self.class.diff_columns_by_name(source_columns, self)
    end

    # A.merge!(B) => B overwrites A
    # A.reverse_merge!(B) => A overwrites B, but A is modified
    def reverse_merge!(source_columns)
      self.class.reverse_merge!(source_columns, self)
    end

    def reject_columns!(drop_columns)
      self.class.reject_columns!(drop_columns, self)
    end

    def build_query_fields(source_columns)
      self.class.build_query_fields(source_columns, self)
    end

    class << self
      # The name must contain only letters (a-z, A-Z), numbers (0-9), or underscores (_),
      # and must start with a letter or underscore. The maximum length is 128 characters.
      def validate_name!(name)
        # \z (not \Z) so a name with a trailing newline is rejected too
        unless name =~ /\A[a-zA-Z_]+\w*\z/
          raise ConfigError, "Column name `#{name}` is invalid format"
        end
        # BUGFIX: was `name.length < 128`, which wrongly rejected names of
        # exactly 128 characters despite the documented maximum of 128.
        unless name.length <= 128
          raise ConfigError, "Column name `#{name}` must be 128 characters or less"
        end
      end

      def validate_type!(type)
        unless ALLOWED_FIELD_TYPES.include?(type)
          raise ConfigError, "Column type `#{type}` is not allowed type"
        end
      end

      def validate_mode!(mode)
        unless ALLOWED_FIELD_MODES.include?(mode)
          raise ConfigError, "Column mode `#{mode}` is not allowed mode"
        end
      end

      # Validates name/type/mode of every column, recursing into RECORD fields.
      def validate_columns!(columns)
        columns.each do |column|
          validate_name!(column[:name])
          validate_type!(column[:type])
          validate_mode!(column[:mode]) if column[:mode]

          if column[:type] == 'RECORD'
            validate_columns!(column[:fields])
          end
        end
      end

      def find_column_by_name(columns, name)
        (columns || []).find { |c| c[:name] == name }
      end

      # validates permitted changes from old schema to new schema
      def validate_permitted_operations!(source_columns, target_columns)
        flattened_source_columns = flattened_columns(normalize_columns(source_columns))
        flattened_target_columns = flattened_columns(normalize_columns(target_columns))

        flattened_target_columns.keys.each do |flattened_name|
          next unless flattened_source_columns.key?(flattened_name)
          validate_permitted_operations_for_type!(
            flattened_source_columns[flattened_name],
            flattened_target_columns[flattened_name]
          )
          validate_permitted_operations_for_mode!(
            flattened_source_columns[flattened_name],
            flattened_target_columns[flattened_name]
          )
        end
      end

      # @param [Hash] source_column
      # @param [Hash] target_column
      #
      # Disallowed conversion rule is as follows:
      #
      # type: RECORD => type: others
      # mode: REPEATED => change type
      #
      def validate_permitted_operations_for_type!(source_column, target_column)
        source_column = shallow_normalize_column(source_column)
        target_column = shallow_normalize_column(target_column)

        msg = "(#{source_column.to_h} => #{target_column.to_h})"
        if source_column[:type] == 'RECORD'
          if target_column[:type] != 'RECORD'
            raise ConfigError, "`RECORD` can not be changed #{msg}"
          end
        end
        if source_column[:mode] and source_column[:mode] == 'REPEATED'
          if source_column[:type] != target_column[:type]
            raise ConfigError, "`REPEATED` mode column's type can not be changed #{msg}"
          end
        end
      end

      # @param [Hash] source_column
      # @param [Hash] target_column
      #
      # Allowed conversion rule is as follows:
      #
      # (new)    => NULLABLE, REPEATED
      # NULLABLE => NULLABLE
      # REQUIRED => REQUIRED, NULLABLE
      # REPEATED => REPEATED
      def validate_permitted_operations_for_mode!(source_column, target_column)
        source_column = shallow_normalize_column(source_column)
        target_column = shallow_normalize_column(target_column)
        source_mode = source_column[:mode]
        target_mode = target_column[:mode]

        return if source_mode == target_mode
        msg = "(#{source_column.to_h} => #{target_column.to_h})"

        case source_mode
        when nil
          if target_mode == 'REQUIRED'
            raise ConfigError, "Newly adding a `REQUIRED` column is not allowed #{msg}"
          end
        when 'NULLABLE'
          raise ConfigError, "`NULLABLE` column can not be changed #{msg}"
        when 'REQUIRED'
          if target_mode == 'REPEATED'
            raise ConfigError, "`REQUIRED` column can not be changed to `REPEATED` #{msg}"
          end
        when 'REPEATED'
          raise ConfigError, "`REPEATED` column can not be changed #{msg}"
        end
      end

      # Shallow-normalizes every column and recurses into RECORD fields.
      def normalize_columns(columns)
        columns = shallow_normalize_columns(columns)
        columns.map do |column|
          if column[:type] == 'RECORD' and column[:fields]
            column[:fields] = normalize_columns(column[:fields])
          end
          column
        end
      end

      def shallow_normalize_columns(columns)
        columns.map {|column| shallow_normalize_column(column) }
      end

      def shallow_normalize_columns!(columns)
        columns.each {|column| shallow_normalize_column!(column) }
        columns
      end

      # Non-destructive single-column normalization (works on a shallow dup).
      def shallow_normalize_column(column)
        shallow_normalize_column!(column.dup)
      end

      # Symbolizes keys, upcases :type, defaults :mode to 'NULLABLE' — in place.
      def shallow_normalize_column!(column)
        symbolize_keys!(column)
        column[:type] = column[:type].upcase if column[:type]
        column[:mode] ||= 'NULLABLE'
        column[:mode] = column[:mode].upcase
        column
      end

      def symbolize_keys!(column)
        new_column = column.map do |key, val|
          [key.to_sym, val]
        end.to_h
        column.replace(new_column)
      end

      # @param [Array] columns
      # [{
      #   name: 'citiesLived',
      #   type: 'RECORD',
      #   fields: [
      #     {
      #       name: 'place', type: 'RECORD',
      #       fields: [
      #         { name: 'city', type: 'STRING' }, { name: 'postcode', type: 'STRING' }
      #       ]
      #     },
      #     { name: 'yearsLived', type: 'INTEGER' }
      #   ]
      # }]
      # @return Hash
      # {
      #   'citiesLived.place.city' => {
      #     type: 'STRING'
      #   },
      #   'citiesLived.place.postcode' => {
      #     type: 'STRING'
      #   },
      #   'citiesLived.yearsLived' => {
      #     type: 'INTEGER'
      #   }
      # }
      def flattened_columns(columns, parent_name: nil)
        result = {}
        columns.each do |column|
          column_name = parent_name.nil? ? column[:name] : "#{parent_name}.#{column[:name]}"
          if column[:type].upcase != 'RECORD'
            result[column_name] = {}.tap do |value|
              value[:type] = column[:type]
              value[:mode] = column[:mode] if column[:mode]
            end
          else
            result.merge!(flattened_columns(column[:fields], parent_name: column_name))
          end
        end
        result
      end

      def equals?(source_columns, target_columns)
        diff_columns(source_columns, target_columns).empty? and \
          diff_columns(target_columns, source_columns).empty?
      end

      # target_columns - source_columns
      def diff_columns(source_columns, target_columns)
        _target_columns = shallow_normalize_columns(target_columns)
        _source_columns = shallow_normalize_columns(source_columns)
        diff_columns = _target_columns - _source_columns # shallow diff

        diff_columns.map do |target_column|
          t = target_column
          source_column = find_column_by_name(_source_columns, target_column[:name])
          next t unless source_column
          next t unless target_column[:type] == 'RECORD' and source_column[:type] == 'RECORD'
          next t unless target_column[:fields] and source_column[:fields]
          # recursive diff for RECORD columns
          diff_fields = diff_columns(source_column[:fields], target_column[:fields])
          next nil if diff_fields.empty? # remove
          target_column[:fields] = diff_fields
          target_column
        end.compact
      end

      # diff with only column_names
      # target_columns - source_columns
      def diff_columns_by_name(source_columns, target_columns)
        _target_columns = shallow_normalize_columns(target_columns)
        _source_columns = shallow_normalize_columns(source_columns)
        diff_columns = _target_columns - _source_columns # shallow diff

        diff_columns.map do |target_column|
          t = target_column
          source_column = find_column_by_name(_source_columns, target_column[:name])
          next t unless source_column
          next nil unless target_column[:type] == 'RECORD' and source_column[:type] == 'RECORD'
          next nil unless target_column[:fields] and source_column[:fields]
          # recursive diff for RECORD columns
          diff_fields = diff_columns_by_name(source_column[:fields], target_column[:fields])
          next nil if diff_fields.empty? # remove
          target_column[:fields] = diff_fields
          target_column
        end.compact
      end

      # 1. target_column[:mode] ||= source_column[:mode] || 'NULLABLE' (not overwrite, but set if does not exist)
      # 2. Add into target_columns if a source column does not exist in target_columns
      #
      # @param [Array] source_columns
      # @param [Array] target_columns
      def reverse_merge!(source_columns, target_columns)
        shallow_normalize_columns!(source_columns)
        shallow_normalize_columns!(target_columns)

        source_columns.map do |source_column|
          if target_column = find_column_by_name(target_columns, source_column[:name])
            target_column[:mode] ||= source_column[:mode] || 'NULLABLE'
            target_column[:type] ||= source_column[:type] # should never be happened
            # Recursive merge fields of `RECORD` type
            if target_column[:type] == 'RECORD' and target_column[:fields] and source_column[:fields]
              reverse_merge!(source_column[:fields], target_column[:fields])
            end
          else
            target_column = source_column.dup
            target_column[:mode] ||= 'NULLABLE'
            target_columns << target_column
          end
        end
        target_columns
      end

      # Removes every column listed in drop_columns from target_columns,
      # following dotted paths into RECORD fields. Mutates target_columns.
      def reject_columns!(drop_columns, target_columns)
        flattened_drop_columns = flattened_columns(drop_columns)

        flattened_drop_columns.keys.each do |flattened_name|
          # paths like a %w(citiesLived place city child1)
          paths = flattened_name.split('.')
          # object_id of fields and target_columns are different.
          # But the internal elements refer to the same ones
          fields = target_columns
          paths.each do |path|
            # The last element of the path does not have the fields
            next if path == paths.last
            # find recursively
            column = fields.find { |f| f[:name] == path }
            next if column.nil?
            fields = column[:fields]
          end

          unless fields.empty?
            fields.delete_if { |f| f[:name] == paths.last }
          end
        end
        target_columns
      end

      # Builds SELECT field expressions that cast columns existing in the
      # source schema to the target type, and pass new columns through bare.
      def build_query_fields(source_columns, target_columns)
        flattened_source_columns = flattened_columns(source_columns)
        flattened_target_columns = flattened_columns(target_columns)

        query_fields = flattened_target_columns.map do |flattened_name, flattened_target_column|
          flattened_source_column = flattened_source_columns[flattened_name]
          target_type = flattened_target_column[:type].upcase

          if flattened_source_column
            "#{target_type}(#{flattened_name}) AS #{flattened_name}"
          else
            flattened_name
            # MEMO: NULL cast like "#{target_type}(NULL) AS #{flattened_name}" breaks RECORD columns as
            #   INTEGER(NULL) AS add_record.add_record.add_column1 => add_record_add_record_add_column1
            # We have to add columns with patch_table beforehand
          end
        end
      end
    end
  end
end