bigquery_migration 0.1.0.pre1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,105 @@
1
+ require 'thor'
2
+ require 'json'
3
+ require 'bigquery_migration'
4
+ require_relative 'action_runner'
5
+ require_relative 'hash_util'
6
+
7
class BigqueryMigration
  class CLI < Thor
    # Make thor exit with a non-zero status when a command fails.
    # cf. http://qiita.com/KitaitiMakoto/items/c6b9d6311c20a3cc21f9
    def self.exit_on_failure?
      true
    end

    # `run` is reserved by thor, so the implementation lives in #_run.
    map "run" => "_run"

    option :config_path, aliases: ['-c'], type: :string,
      default: 'config.yml'
    option :log_level, aliases: ["-l"], type: :string,
      desc: 'Log level such as fatal, error, warn, info, or debug',
      default: 'info'
    option :log, type: :string,
      desc: 'Output log to a file',
      default: 'STDOUT'
    option :stdout, type: :string,
      desc: 'Redirect STDOUT to a file',
      default: 'STDOUT'
    option :stderr, type: :string,
      desc: 'Redirect STDERR to a file',
      default: 'STDERR'
    option :exec, type: :boolean,
      desc: 'Execute or dry-run (Default: dry-run)',
      default: false
    option :vars, type: :hash,
      desc: 'Variables used in ERB, thor hash format'
    option :output, aliases: ["-o"], type: :string,
      desc: 'Output result yaml to a file',
      default: 'STDOUT'

    desc 'run <config.yml>', 'run bigquery_migration'
    # Entry point for `run <config.yml>`: dry-run unless --exec is given,
    # prints the (secret-masked) result YAML, exits 1 when the run failed.
    def _run(config_path)
      opts = options.merge(dry_run: !options[:exec])

      init_logger
      reopen_stdout
      reopen_stderr

      result = ActionRunner.new(config_path, opts).run
      open_output do |out|
        out.puts mask_secret(HashUtil.deep_stringify_keys(result).to_yaml)
        logger.info { "DRY-RUN has finished. Use --exec option to run." } if opts[:dry_run]
      end
      exit(1) unless result[:success]
    end

    private

    # The process-wide logger installed by #init_logger.
    def logger
      BigqueryMigration.logger
    end

    # Build a logger from --log / --log_level and install it globally.
    def init_logger
      new_logger = BigqueryMigration::Logger.new(options[:log])
      new_logger.level = options[:log_level]
      BigqueryMigration.logger = new_logger
    end

    # Redirect $stdout to the --stdout file unless it is the literal 'STDOUT'.
    def reopen_stdout
      $stdout.reopen(options[:stdout]) if options[:stdout] != 'STDOUT'
      $stdout.sync = true
    end

    # Redirect $stderr to the --stderr file unless it is the literal 'STDERR'.
    def reopen_stderr
      $stderr.reopen(options[:stderr]) if options[:stderr] != 'STDERR'
      $stderr.sync = true
    end

    # Yield the IO selected by --output: $stdout, $stderr, or a file opened
    # for writing (closed automatically by the block form of File.open).
    def open_output
      case (output = options[:output])
      when 'STDOUT'
        yield($stdout)
      when 'STDERR'
        yield($stderr)
      else
        File.open(output, 'w') { |io| yield(io) }
      end
    end

    # Mask secret-looking values in the result YAML before printing:
    # any "...password:"/"...key:" entry value, and PEM private key bodies.
    def mask_secret(yaml_string)
      %w(password key).each do |secret|
        yaml_string.gsub!(/([^ ]*#{secret}): .*$/, '\1: xxxxx')
      end
      yaml_string.gsub!(/(-----BEGIN\s+PRIVATE\s+KEY-----)[0-9A-Za-z+\/=\s\\]+(-----END\s+PRIVATE\s+KEY-----)/m, '\1 xxxxx \2')
      yaml_string
    end
  end
end
@@ -0,0 +1,51 @@
1
+ require 'set'
2
+ require 'yaml'
3
+ require 'erb'
4
+ require 'ostruct'
5
+
6
class BigqueryMigration
  # Loads a YAML config file, optionally rendered through ERB first
  # (when the file extension is .erb). ERB templates may pull in other
  # files via the `include_file` helper exposed on the namespace.
  class ConfigLoader
    attr_reader :config_path, :namespace

    # Raised when the same file is rendered twice (guards against
    # include loops). NOTE: the historical name is misspelled; it is kept
    # for backward compatibility, with a correctly spelled alias below.
    class AlreayIncluded < ::StandardError; end
    AlreadyIncluded = AlreayIncluded

    # @param config_path [String] path to a YAML (or .yml.erb) config file
    # @param vars [Hash] variables exposed to ERB via an OpenStruct namespace
    def initialize(config_path, vars = {})
      @config_path = File.expand_path(config_path)
      @included_files = Set.new
      @namespace = OpenStruct.new(vars)

      unless @namespace.respond_to?(:include_file)
        itself = self
        # ToDo: better way?
        # `include_file(path)` inlines another file into the template,
        # resolving `path` relative to the file that called it.
        @namespace.define_singleton_method(:include_file) do |path|
          caller_path = caller[0][/^([^:]+):\d+:in `[^']*'$/, 1]
          abs_path = File.expand_path(path, File.dirname(caller_path))
          if File.extname(path) == '.erb'
            itself.load_erb(abs_path)
          else
            File.read(abs_path)
          end
        end
      end
    end

    # Parse the config file, rendering ERB first when the extension is .erb.
    # @return [Object] the YAML document (usually a Hash)
    def load
      if File.extname(config_path) == '.erb'
        YAML.load(load_erb(config_path))
      else
        YAML.load(File.read(config_path))
      end
    end

    # Render an ERB template with the namespace as its binding.
    # @raise [AlreayIncluded] if `path` was already rendered once
    def load_erb(path = config_path)
      unless @included_files.add?(path)
        raise AlreayIncluded, "#{path} was included twice"
      end

      raw = File.read(path)
      # ERB 4.0 (shipped with Ruby 3.2) removed the legacy positional
      # safe_level/trim_mode arguments; use the keyword form when available
      # and fall back to the positional form on older Rubies.
      erb =
        if ERB.instance_method(:initialize).parameters.any? { |_kind, pname| pname == :trim_mode }
          ERB.new(raw, trim_mode: "-")
        else
          ERB.new(raw, nil, "-")
        end
      erb.filename = path
      erb.result(namespace.instance_eval { binding })
    end
  end
end
@@ -0,0 +1,6 @@
1
class BigqueryMigration
  # Base class for all bigquery_migration specific errors.
  class Error < StandardError
  end

  # Raised for invalid configuration / schema definitions (see Schema validators).
  class ConfigError < Error
  end

  # Presumably raised when a BigQuery job exceeds its wait deadline — not
  # referenced in this chunk; confirm against the migrator code.
  class JobTimeoutError < Error
  end

  # Presumably raised when a requested resource does not exist — not
  # referenced in this chunk; confirm against the migrator code.
  class NotFoundError < Error
  end
end
@@ -0,0 +1,35 @@
1
class BigqueryMigration
  # Key-conversion helpers for nested Hash/Array structures.
  class HashUtil
    # Recursively convert every Hash key to a Symbol.
    # Non-Hash/Array values are returned unchanged; input is not mutated.
    def self.deep_symbolize_keys(hash)
      deep_transform_keys(hash, &:to_sym)
    end

    # Recursively convert every Hash key to a String.
    # Non-Hash/Array values are returned unchanged; input is not mutated.
    def self.deep_stringify_keys(hash)
      deep_transform_keys(hash, &:to_s)
    end

    # Shared recursive walker: applies `block` to each Hash key, descending
    # through nested Hashes and Arrays, leaving other objects as-is.
    def self.deep_transform_keys(obj, &block)
      case obj
      when Hash
        obj.map { |key, val| [block.call(key), deep_transform_keys(val, &block)] }.to_h
      when Array
        obj.map { |val| deep_transform_keys(val, &block) }
      else
        obj
      end
    end
    private_class_method :deep_transform_keys
  end
end
@@ -0,0 +1,45 @@
1
+ require 'logger'
2
+
3
# Time#iso8601 (used by LogFormatter#format_datetime) lives in the 'time'
# stdlib extension; the file only required 'logger', so load it explicitly.
require 'time'

class BigqueryMigration
  # Formats log records as "<iso8601 time> [<severity>] <message>\n".
  class LogFormatter
    FORMAT = "%s [%s] %s\n"

    def initialize(opts={})
    end

    # ::Logger formatter entry point (progname is accepted but unused).
    def call(severity, time, progname, msg)
      FORMAT % [format_datetime(time), severity, format_message(msg)]
    end

    private
    def format_datetime(time)
      time.iso8601
    end

    # NOTE(review): defined but not referenced by #call in this chunk.
    def format_severity(severity)
      severity
    end

    # Exceptions are rendered with class, message, and indented backtrace;
    # everything else is stringified.
    def format_message(message)
      case message
      when ::Exception
        e = message
        "#{e.class} (#{e.message})\n #{e.backtrace.join("\n ")}"
      else
        message.to_s
      end
    end
  end

  # ::Logger subclass that maps the literal device name 'STDOUT' to the
  # STDOUT stream and installs LogFormatter by default.
  class Logger < ::Logger
    def initialize(logdev, shift_age = 0, shift_size = 1048576)
      logdev = STDOUT if logdev == 'STDOUT'
      super(logdev, shift_age, shift_size)
      @formatter = LogFormatter.new
    end

    # Write raw text to the underlying device, bypassing formatting.
    def write(msg)
      @logdev.write msg
    end
  end
end
@@ -0,0 +1,388 @@
1
+ require 'csv'
2
+ require 'json'
3
+ require_relative 'error'
4
+
5
require 'set' # ALLOWED_FIELD_* constants below rely on Set

class BigqueryMigration
  # A BigQuery table schema represented as an Array of column Hashes such as
  #   { name: 'id', type: 'INTEGER', mode: 'NULLABLE', fields: [...] }
  # The class methods implement the schema algebra (normalize / diff / merge);
  # instance methods are thin wrappers applying them to self.
  class Schema < ::Array
    ALLOWED_FIELD_TYPES = Set.new(['STRING', 'INTEGER', 'FLOAT', 'BOOLEAN', 'RECORD', 'TIMESTAMP'])
    ALLOWED_FIELD_MODES = Set.new(['NULLABLE', 'REQUIRED', 'REPEATED'])

    # Normalizes and validates the given columns; raises ConfigError on
    # invalid names/types/modes.
    def initialize(columns = [])
      normalized = self.class.normalize_columns(columns)
      super(normalized)
      validate_columns!
    end

    def find_column_by_name(name)
      self.class.find_column_by_name(self, name)
    end

    def validate_columns!
      self.class.validate_columns!(self)
    end

    def validate_permitted_operations!(source_columns)
      target_columns = self
      self.class.validate_permitted_operations!(source_columns, target_columns)
    end

    def normalize_columns
      self.class.normalize_columns(self)
    end

    def shallow_normalize_columns
      self.class.shallow_normalize_columns(self)
    end

    # BUGFIX: previously delegated to the singular shallow_normalize_column!,
    # passing the whole array where a single column Hash was expected, which
    # raised NoMethodError. Delegate to the plural class method instead.
    def shallow_normalize_columns!
      self.class.shallow_normalize_columns!(self)
    end

    def flattened_columns
      self.class.flattened_columns(self)
    end

    def equals?(source_columns)
      self.class.equals?(source_columns, self)
    end

    # self - source_columns
    def diff_columns(source_columns)
      self.class.diff_columns(source_columns, self)
    end

    # diff with only column names
    # self - source_columns
    def diff_columns_by_name(source_columns)
      self.class.diff_columns_by_name(source_columns, self)
    end

    # A.merge!(B) => B overwrites A
    # A.reverse_merge!(B) => A overwrites B, but A is modified
    def reverse_merge!(source_columns)
      self.class.reverse_merge!(source_columns, self)
    end

    def reject_columns!(drop_columns)
      self.class.reject_columns!(drop_columns, self)
    end

    def build_query_fields(source_columns)
      self.class.build_query_fields(source_columns, self)
    end

    class << self
      # The name must contain only letters (a-z, A-Z), numbers (0-9), or underscores (_),
      # and must start with a letter or underscore. The maximum length is 128 characters.
      # NOTE: \z (not \Z) so that a trailing newline cannot sneak past validation,
      # and <= 128 so that exactly-128-character names are accepted as documented.
      def validate_name!(name)
        unless name =~ /\A[a-zA-Z_]+\w*\z/
          raise ConfigError, "Column name `#{name}` is invalid format"
        end
        unless name.length <= 128
          raise ConfigError, "Column name `#{name}` must be less than 128"
        end
      end

      def validate_type!(type)
        unless ALLOWED_FIELD_TYPES.include?(type)
          raise ConfigError, "Column type `#{type}` is not allowed type"
        end
      end

      def validate_mode!(mode)
        unless ALLOWED_FIELD_MODES.include?(mode)
          raise ConfigError, "Column mode `#{mode}` is not allowed mode"
        end
      end

      # Validates name/type/mode of every column, recursing into RECORD fields.
      def validate_columns!(columns)
        columns.each do |column|
          validate_name!(column[:name])
          validate_type!(column[:type])
          validate_mode!(column[:mode]) if column[:mode]

          if column[:type] == 'RECORD'
            validate_columns!(column[:fields])
          end
        end
      end

      def find_column_by_name(columns, name)
        (columns || []).find { |c| c[:name] == name }
      end

      # validates permitted changes from old schema to new schema
      def validate_permitted_operations!(source_columns, target_columns)
        flattened_source_columns = flattened_columns(normalize_columns(source_columns))
        flattened_target_columns = flattened_columns(normalize_columns(target_columns))

        flattened_target_columns.keys.each do |flattened_name|
          next unless flattened_source_columns.key?(flattened_name)
          validate_permitted_operations_for_type!(
            flattened_source_columns[flattened_name],
            flattened_target_columns[flattened_name]
          )
          validate_permitted_operations_for_mode!(
            flattened_source_columns[flattened_name],
            flattened_target_columns[flattened_name]
          )
        end
      end

      # @param [Hash] source_column
      # @param [Hash] target_column
      #
      # Disallowed conversion rule is as follows:
      #
      #   type: RECORD => type: others
      #   mode: REPEATED => change type
      #
      def validate_permitted_operations_for_type!(source_column, target_column)
        source_column = shallow_normalize_column(source_column)
        target_column = shallow_normalize_column(target_column)

        msg = "(#{source_column.to_h} => #{target_column.to_h})"
        if source_column[:type] == 'RECORD'
          if target_column[:type] != 'RECORD'
            raise ConfigError, "`RECORD` can not be changed #{msg}"
          end
        end
        if source_column[:mode] and source_column[:mode] == 'REPEATED'
          if source_column[:type] != target_column[:type]
            raise ConfigError, "`REPEATED` mode column's type can not be changed #{msg}"
          end
        end
      end

      # @param [Hash] source_column
      # @param [Hash] target_column
      #
      # Allowed conversion rule is as follows:
      #
      #   (new)    => NULLABLE, REPEATED
      #   NULLABLE => NULLABLE
      #   REQUIRED => REQUIRED, NULLABLE
      #   REPEATED => REPEATED
      def validate_permitted_operations_for_mode!(source_column, target_column)
        source_column = shallow_normalize_column(source_column)
        target_column = shallow_normalize_column(target_column)
        source_mode = source_column[:mode]
        target_mode = target_column[:mode]

        return if source_mode == target_mode
        msg = "(#{source_column.to_h} => #{target_column.to_h})"

        case source_mode
        when nil
          if target_mode == 'REQUIRED'
            raise ConfigError, "Newly adding a `REQUIRED` column is not allowed #{msg}"
          end
        when 'NULLABLE'
          raise ConfigError, "`NULLABLE` column can not be changed #{msg}"
        when 'REQUIRED'
          if target_mode == 'REPEATED'
            raise ConfigError, "`REQUIRED` column can not be changed to `REPEATED` #{msg}"
          end
        when 'REPEATED'
          raise ConfigError, "`REPEATED` column can not be changed #{msg}"
        end
      end

      # Deeply normalize: symbolize keys, upcase type, default mode to
      # NULLABLE, recursing into RECORD fields.
      def normalize_columns(columns)
        columns = shallow_normalize_columns(columns)
        columns.map do |column|
          if column[:type] == 'RECORD' and column[:fields]
            column[:fields] = normalize_columns(column[:fields])
          end
          column
        end
      end

      def shallow_normalize_columns(columns)
        columns.map {|column| shallow_normalize_column(column) }
      end

      def shallow_normalize_columns!(columns)
        columns.each {|column| shallow_normalize_column!(column) }
        columns
      end

      # Non-destructive variant (works on a dup of the column Hash).
      def shallow_normalize_column(column)
        shallow_normalize_column!(column.dup)
      end

      # Destructive: symbolize keys, upcase :type, default/upcase :mode.
      def shallow_normalize_column!(column)
        symbolize_keys!(column)
        column[:type] = column[:type].upcase if column[:type]
        column[:mode] ||= 'NULLABLE'
        column[:mode] = column[:mode].upcase
        column
      end

      def symbolize_keys!(column)
        new_column = column.map do |key, val|
          [key.to_sym, val]
        end.to_h
        column.replace(new_column)
      end

      # @param [Array] columns
      # [{
      #   name: 'citiesLived',
      #   type: 'RECORD',
      #   fields: [
      #     {
      #       name: 'place', type: 'RECORD',
      #       fields: [
      #         { name: 'city', type: 'STRING' }, { name: 'postcode', type: 'STRING' }
      #       ]
      #     },
      #     { name: 'yearsLived', type: 'INTEGER' }
      #   ]
      # }]
      # @return Hash
      # {
      #   'citiesLived.place.city' => {
      #     type: 'STRING'
      #   },
      #   'citiesLived.place.postcode' => {
      #     type: 'STRING'
      #   },
      #   'citiesLived.yearsLived' => {
      #     type: 'INTEGER'
      #   }
      # }
      def flattened_columns(columns, parent_name: nil)
        result = {}
        columns.each do |column|
          column_name = parent_name.nil? ? column[:name] : "#{parent_name}.#{column[:name]}"
          if column[:type].upcase != 'RECORD'
            result[column_name] = {}.tap do |value|
              value[:type] = column[:type]
              value[:mode] = column[:mode] if column[:mode]
            end
          else
            result.merge!(flattened_columns(column[:fields], parent_name: column_name))
          end
        end
        result
      end

      # True when both diffs are empty, i.e. the schemas are equivalent
      # after normalization.
      def equals?(source_columns, target_columns)
        diff_columns(source_columns, target_columns).empty? and \
          diff_columns(target_columns, source_columns).empty?
      end

      # target_columns - source_columns
      def diff_columns(source_columns, target_columns)
        _target_columns = shallow_normalize_columns(target_columns)
        _source_columns = shallow_normalize_columns(source_columns)
        diff_columns = _target_columns - _source_columns # shallow diff

        diff_columns.map do |target_column|
          t = target_column
          source_column = find_column_by_name(_source_columns, target_column[:name])
          next t unless source_column
          next t unless target_column[:type] == 'RECORD' and source_column[:type] == 'RECORD'
          next t unless target_column[:fields] and source_column[:fields]
          # recursive diff for RECORD columns
          diff_fields = diff_columns(source_column[:fields], target_column[:fields])
          next nil if diff_fields.empty? # remove
          target_column[:fields] = diff_fields
          target_column
        end.compact
      end

      # diff with only column_names
      # target_columns - source_columns
      def diff_columns_by_name(source_columns, target_columns)
        _target_columns = shallow_normalize_columns(target_columns)
        _source_columns = shallow_normalize_columns(source_columns)
        diff_columns = _target_columns - _source_columns # shallow diff

        diff_columns.map do |target_column|
          t = target_column
          source_column = find_column_by_name(_source_columns, target_column[:name])
          next t unless source_column
          next nil unless target_column[:type] == 'RECORD' and source_column[:type] == 'RECORD'
          next nil unless target_column[:fields] and source_column[:fields]
          # recursive diff for RECORD columns
          diff_fields = diff_columns_by_name(source_column[:fields], target_column[:fields])
          next nil if diff_fields.empty? # remove
          target_column[:fields] = diff_fields
          target_column
        end.compact
      end

      # 1. target_column[:mode] ||= source_column[:mode] || 'NULLABLE' (not overwrite, but set if does not exist)
      # 2. Add into target_columns if a source column does not exist in target_columns
      #
      # @param [Array] source_columns
      # @param [Array] target_columns
      def reverse_merge!(source_columns, target_columns)
        shallow_normalize_columns!(source_columns)
        shallow_normalize_columns!(target_columns)

        source_columns.map do |source_column|
          if target_column = find_column_by_name(target_columns, source_column[:name])
            target_column[:mode] ||= source_column[:mode] || 'NULLABLE'
            target_column[:type] ||= source_column[:type] # should never happen
            # Recursive merge fields of `RECORD` type
            if target_column[:type] == 'RECORD' and target_column[:fields] and source_column[:fields]
              reverse_merge!(source_column[:fields], target_column[:fields])
            end
          else
            target_column = source_column.dup
            target_column[:mode] ||= 'NULLABLE'
            target_columns << target_column
          end
        end
        target_columns
      end

      # Destructively removes the columns named in drop_columns (given as a
      # schema, possibly nested) from target_columns.
      def reject_columns!(drop_columns, target_columns)
        flattened_drop_columns = flattened_columns(drop_columns)

        flattened_drop_columns.keys.each do |flattened_name|
          # paths like a %w(citiesLived place city child1)
          paths = flattened_name.split('.')
          # object_id of fields and target_columns are different.
          # But the internal elements refer to the same ones
          fields = target_columns
          paths.each do |path|
            # The last element of the path does not have the fields
            next if path == paths.last
            # find recursively
            column = fields.find { |f| f[:name] == path }
            next if column.nil?
            # Guard against non-RECORD intermediates having no :fields
            fields = column[:fields] || []
          end

          unless fields.empty?
            fields.delete_if { |f| f[:name] == paths.last }
          end
        end
        target_columns
      end

      # Builds legacy-SQL SELECT expressions casting each target column from
      # the source schema; columns absent from the source are selected as-is
      # (no NULL cast — see the MEMO below).
      def build_query_fields(source_columns, target_columns)
        flattened_source_columns = flattened_columns(source_columns)
        flattened_target_columns = flattened_columns(target_columns)

        query_fields = flattened_target_columns.map do |flattened_name, flattened_target_column|
          flattened_source_column = flattened_source_columns[flattened_name]
          target_type = flattened_target_column[:type].upcase

          if flattened_source_column
            "#{target_type}(#{flattened_name}) AS #{flattened_name}"
          else
            flattened_name
            # MEMO: NULL cast like "#{target_type}(NULL) AS #{flattened_name}" breaks RECORD columns as
            # INTEGER(NULL) AS add_record.add_record.add_column1 => add_record_add_record_add_column1
            # We have to add columns with patch_table beforehand
          end
        end
      end
    end
  end
end