naginegi 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,6 @@
+ require 'bundler/gem_tasks'
+ require 'rspec/core/rake_task'
+
+ RSpec::Core::RakeTask.new(:spec)
+
+ task :default => :spec
@@ -0,0 +1,14 @@
+ #!/usr/bin/env ruby
+
+ require "bundler/setup"
+ require "naginegi"
+
+ # You can add fixtures and/or initialization code here to make experimenting
+ # with your gem easier. You can also use a different console, if you like.
+
+ # (If you use this, don't forget to add pry to your Gemfile!)
+ # require "pry"
+ # Pry.start
+
+ require "irb"
+ IRB.start(__FILE__)
@@ -0,0 +1,8 @@
+ #!/usr/bin/env bash
+ set -euo pipefail
+ IFS=$'\n\t'
+ set -vx
+
+ bundle install
+
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,58 @@
+ require 'naginegi/version'
+ require 'naginegi/embulk_config'
+ require 'naginegi/embulk'
+ require 'naginegi/mysql'
+ require 'naginegi/postgresql'
+ require 'logger'
+ require 'yaml'
+
+ module Naginegi
+   class EmbulkRunner
+     def initialize(db_configs: nil, log_level: 'warn', embulk_run_option: '')
+       @logger = Logger.new(STDOUT)
+       @logger.datetime_format = '%Y-%m-%d %H:%M:%S'
+
+       @db_configs = db_configs || YAML.load_file('database.yml')
+       @log_level = log_level
+       @embulk_run_option = embulk_run_option
+     end
+
+     def generate_config(bq_config)
+       Naginegi::EmbulkConfig.new.generate_config(@db_configs, bq_config)
+     end
+
+     def run(bq_config, target_table_names = [], retry_max = 0)
+       cmd = 'embulk --version'
+       unless system(cmd)
+         @logger.error('Cannot execute Embulk!!')
+         @logger.error('Confirm the Embulk installation and environment')
+         return
+       end
+
+       error_tables = run_and_retry(bq_config, target_table_names, retry_max, 0)
+       error_tables.empty?
+     end
+
+     private
+
+     def run_and_retry(bq_config, target_table_names, retry_max, retry_count)
+       error_tables = Naginegi::Embulk.new(@log_level, @embulk_run_option).run(
+         @db_configs,
+         table_configs,
+         bq_config,
+         target_table_names
+       )
+       if !error_tables.empty? && retry_count < retry_max
+         @logger.warn('------------------------------------')
+         @logger.warn("retry start (attempt #{retry_count + 1} of #{retry_max})")
+         @logger.warn('------------------------------------')
+         error_tables = run_and_retry(bq_config, error_tables, retry_max, retry_count + 1)
+       end
+       error_tables
+     end
+
+     def table_configs
+       @table_configs ||= Naginegi::TableConfig.generate_table_configs
+     end
+   end
+ end
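
EmbulkRunner is the entry point: it loads database.yml (unless configs are passed in), writes the Embulk YAML configs, shells out to `embulk run`, and retries only the failed tables up to `retry_max` times, returning true when everything loaded. A minimal usage sketch — the hash keys below are the ones this diff reads, but all concrete values are hypothetical:

  require 'naginegi'

  # Keys consumed by Naginegi::BigQuery and Naginegi::Embulk below.
  bq_config = {
    'auth_method'   => 'json_key',
    'json_keyfile'  => '/path/to/service_account.json',
    'project_id'    => 'my-gcp-project',
    'service_email' => 'loader@my-gcp-project.iam.gserviceaccount.com',
    'schema_dir'    => 'bq_schema',
    'config_dir'    => 'embulk_config'
  }

  runner = Naginegi::EmbulkRunner.new      # reads ./database.yml
  runner.generate_config(bq_config)        # write schema + Embulk YAML files
  runner.run(bq_config, [], 2)             # run all tables, retry failures twice
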
@@ -0,0 +1,115 @@
+ require 'json'
+ require 'erb'
+ require 'google/cloud/bigquery'
+ require 'unindent'
+ require 'date'
+
+ module Naginegi
+   class BigQuery
+     CONTENTS = <<-EOS.unindent
+       in:
+         type: <%= db_type %>
+         host: <%= host %>
+         user: <%= user %>
+         password: <%= password %>
+         database: <%= database %>
+         ssl: <%= ssl %>
+         query: |
+           <%= query %>
+         <%= options %>
+       out:
+         type: bigquery
+         auth_method: <%= auth_method %>
+         json_keyfile: <%= json_keyfile %>
+         <%= json_key_content %>
+         project: <%= project %>
+         service_account_email: <%= service_account_email %>
+         dataset: <%= dataset %>
+         table: <%= table_name %>
+         schema_file: <%= schema_file %>
+         auto_create_table: true
+         path_prefix: <%= path_prefix %>
+         source_format: NEWLINE_DELIMITED_JSON
+         file_ext: .json.gz
+         delete_from_local_when_job_end: 1
+         formatter:
+           type: jsonl
+         encoders:
+           - {type: gzip}
+     EOS
+
+     def initialize(config)
+       @config = config.dup
+       @current_date = Date.today
+     end
+
+     def self.generate_schema(columns)
+       json_body = columns.map(&:to_json).join(",\n")
+       "[\n" + json_body + "\n]\n"
+     end
+
+     def self.generate_sql(table_config, columns)
+       columns = columns.map(&:converted_value)
+       sql = "SELECT #{columns.join(',')}"
+       sql << " FROM #{table_config.name}"
+       sql << " WHERE #{table_config.condition}" if table_config.condition
+       sql << "\n"
+       sql
+     end
+
+     def generate_embulk_config(db_name, db_config, table_config, columns)
+       db_type = db_config['db_type']
+       host = db_config['host']
+       user = db_config['username']
+       password = db_config['password']
+       database = db_config['database']
+       ssl = db_config['embulk_ssl_enable'] || false
+       options = if db_type == 'mysql'
+                   "options: {useLegacyDatetimeCode: false, serverTimezone: #{db_config['timezone']}}"
+                 else
+                   ''
+                 end
+       query = Naginegi::BigQuery.generate_sql(table_config, columns)
+
+       auth_method = @config['auth_method']
+       if @config['json_key']
+         values = @config['json_key'].map do |k, v|
+           value = v.gsub("\n", '\\n')
+           "\"#{k}\": \"#{value}\""
+         end
+         json_key_content = "content: |\n {#{values.join(',')}}"
+       else
+         json_keyfile = @config['json_keyfile']
+       end
+       project = @config['project_id']
+       service_account_email = @config['service_email']
+       dataset = db_config['bq_dataset']
+       table_name = actual_table_name(table_config.name, db_config['daily_snapshot'] || table_config.daily_snapshot)
+       schema_file = "#{@config['schema_dir']}/#{db_name}/#{table_config.name}.json"
+       path_prefix = "/var/tmp/embulk_#{db_name}_#{table_config.name}"
+
+       ERB.new(CONTENTS).result(binding)
+     end
+
+     def delete_table(dataset, table_name)
+       keyfile = if @config['json_key']
+                   value = @config['json_key'].dup
+                   value['private_key'] = value['private_key'].gsub('\\n', "\n")
+                   value
+                 else
+                   @config['json_keyfile']
+                 end
+
+       bq = Google::Cloud::Bigquery.new(
+         project: @config['project_id'],
+         keyfile: keyfile
+       )
+       bq.service.delete_table(dataset, table_name)
+     end
+
+     def actual_table_name(table_name, daily_snapshot)
+       return table_name unless daily_snapshot
+       table_name + @current_date.strftime('%Y%m%d')
+     end
+   end
+ end
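
`generate_schema` and `generate_sql` are pure functions of the table config and column list, so their output is easy to sanity-check. A sketch with a hypothetical table and columns (return values computed from the code above):

  table_config = Naginegi::TableConfig.new(
    'name' => 'users', 'condition' => 'deleted_at IS NULL'
  )
  columns = [
    Naginegi::MySQL::Column.new('id', 'int'),
    Naginegi::MySQL::Column.new('name', 'varchar')
  ]

  Naginegi::BigQuery.generate_sql(table_config, columns)
  # => "SELECT `id`,`name` FROM users WHERE deleted_at IS NULL\n"

  Naginegi::BigQuery.generate_schema(columns)
  # => "[\n{\"name\":\"id\",\"type\":\"INT64\"},\n{\"name\":\"name\",\"type\":\"STRING\"}\n]\n"
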
@@ -0,0 +1,75 @@
+ require 'logger'
+
+ module Naginegi
+   class Embulk
+     def initialize(log_level, embulk_run_option)
+       @logger = Logger.new(STDOUT)
+       @logger.datetime_format = '%Y-%m-%d %H:%M:%S'
+
+       @log_level = log_level
+       @embulk_run_option = embulk_run_option
+     end
+
+     def run(db_configs, all_table_configs, bq_config, target_table_names = [])
+       error_tables = []
+       db_configs.keys.each do |db_name|
+         table_configs = select_table_configs(all_table_configs[db_name], target_table_names)
+         error_tables += run_by_database(
+           db_name,
+           table_configs,
+           bq_config,
+           db_configs[db_name]['bq_dataset']
+         )
+       end
+       error_tables
+     end
+
+     def select_table_configs(table_configs, target_table_names)
+       return table_configs if target_table_names.empty?
+       table_configs.select { |table_config| target_table_names.include?(table_config.name) }
+     end
+
+     private
+
+     def run_by_database(db_name, table_configs, bq_config, bq_dataset)
+       process_times = []
+       error_tables = []
+
+       bq_utility = Naginegi::BigQuery.new(bq_config)
+
+       table_configs.each do |table_config|
+         start_time = Time.now
+         @logger.info("table: #{table_config.name} - start")
+
+         begin
+           bq_utility.delete_table(bq_dataset, table_config.name)
+           @logger.info("#{table_config.name} is deleted")
+         rescue => e
+           @logger.warn(e.message)
+         end
+
+         cmd = "embulk run #{@embulk_run_option} #{bq_config['config_dir']}/#{db_name}/#{table_config.name}.yml --log-level #{@log_level}"
+         @logger.info("cmd: #{cmd}")
+
+         if system(cmd)
+           result = 'success'
+         else
+           result = 'error'
+           error_tables << table_config.name
+         end
+
+         process_time = "table: #{table_config.name} - result: #{result} #{format('%10.1f', Time.now - start_time)}sec"
+         @logger.info(process_time)
+
+         process_times << process_time
+       end
+
+       @logger.info('------------------------------------')
+       @logger.info("db_name: #{db_name}")
+
+       process_times.each { |process_time| @logger.info(process_time) }
+
+       error_tables
+     end
+   end
+ end
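
`run` loops over every configured database and table; `select_table_configs` narrows a run to an allow-list of table names, which is also how EmbulkRunner's retry path re-runs only the failed tables. A sketch with hypothetical configs:

  configs = [
    Naginegi::TableConfig.new('name' => 'users'),
    Naginegi::TableConfig.new('name' => 'events')
  ]
  embulk = Naginegi::Embulk.new('warn', '')

  embulk.select_table_configs(configs, ['users']).map(&:name)  # => ["users"]
  embulk.select_table_configs(configs, []).map(&:name)         # => ["users", "events"]
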
@@ -0,0 +1,72 @@
+ module Naginegi
+   class EmbulkConfig
+     def generate_config(db_configs, bq_config)
+       bq_utility = BigQuery.new(bq_config)
+
+       db_configs.keys.each do |db_name|
+         db_config = db_configs[db_name]
+         table_configs = all_table_configs[db_name]
+         db_type = db_config['db_type']
+
+         case db_type
+         when 'mysql'
+           sql_client = MySQL::MySQLClient.new(db_config)
+         when 'postgresql'
+           sql_client = PostgreSQL::PgClient.new(db_config)
+         end
+
+         table_configs.each do |table_config|
+           write(
+             "#{bq_config['schema_dir']}/#{db_name}",
+             "#{table_config.name}.json",
+             sql_client.generate_bq_schema(table_config.name)
+           )
+           write(
+             "#{bq_config['config_dir']}/#{db_name}",
+             "#{table_config.name}.yml",
+             bq_utility.generate_embulk_config(
+               db_name,
+               db_config,
+               table_config,
+               sql_client.columns(table_config.name)
+             )
+           )
+         end
+       end
+     end
+
+     private
+
+     def write(directory, file_name, content)
+       FileUtils.mkdir_p(directory) unless FileTest.exist?(directory)
+       File.write("#{directory}/#{file_name}", content)
+     end
+
+     def all_table_configs
+       @all_table_configs ||= Naginegi::TableConfig.generate_table_configs
+     end
+   end
+
+   class TableConfig
+     attr_reader :name, :daily_snapshot, :condition
+
+     def initialize(config)
+       @name = config['name']
+       @daily_snapshot = config['daily_snapshot'] || false
+       @condition = config['condition']
+     end
+
+     def self.generate_table_configs(file_path = 'table.yml')
+       configs = YAML.load_file(file_path)
+       configs.each_with_object({}) do |(db, database_config), table_configs|
+         table_configs[db] = database_config['tables'].map { |config| TableConfig.new(config) }
+       end
+     end
+
+     def ==(other)
+       instance_variables.all? do |v|
+         instance_variable_get(v) == other.instance_variable_get(v)
+       end
+     end
+   end
+ end
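
`TableConfig.generate_table_configs` expects table.yml to parse into a hash of databases, each with a 'tables' list; EmbulkRunner reads database.yml the same way. Sketches of the parsed structures, shown as the Ruby hashes YAML.load_file would return — the keys come from the accessors in this diff, the values are hypothetical:

  # YAML.load_file('table.yml') should yield something like:
  {
    'my_app' => {
      'tables' => [
        { 'name' => 'users' },
        { 'name'           => 'events',
          'daily_snapshot' => true,
          'condition'      => 'created_at < CURRENT_DATE' }
      ]
    }
  }

  # YAML.load_file('database.yml'): keys read by the SQL clients and
  # by the Embulk template above.
  {
    'my_app' => {
      'db_type'    => 'mysql',
      'host'       => 'db.example.com',
      'username'   => 'loader',
      'password'   => 'secret',
      'database'   => 'my_app_production',
      'bq_dataset' => 'my_app',
      'timezone'   => 'UTC'
    }
  }
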
@@ -0,0 +1,95 @@
+ require 'mysql2-cs-bind'
+ require 'json'
+ require 'yaml'
+ require 'fileutils'
+ require 'naginegi/bigquery'
+
+ module Naginegi
+   module MySQL
+     class MySQLClient
+       COLUMN_SQL = <<-SQL.freeze
+         SELECT column_name, data_type
+         FROM INFORMATION_SCHEMA.COLUMNS
+         WHERE table_schema = ?
+           AND table_name = ?
+         ORDER BY ordinal_position
+       SQL
+
+       def initialize(database_config)
+         @database_config = database_config
+       end
+
+       def client
+         @client ||= Mysql2::Client.new(
+           host: @database_config['host'],
+           username: @database_config['username'],
+           password: @database_config['password'],
+           database: @database_config['database']
+         )
+       end
+
+       def generate_bq_schema(table_name)
+         infos = columns(table_name)
+         BigQuery.generate_schema(infos)
+       end
+
+       def columns(table_name)
+         rows = client.xquery(COLUMN_SQL, @database_config['database'], table_name)
+         rows.map { |row| Column.new(row['column_name'], row['data_type']) }
+       end
+     end
+
+     class Column
+       attr_reader :column_name, :data_type
+
+       TYPE_MAPPINGS = {
+         'int' => 'INT64',
+         'tinyint' => 'INT64',
+         'smallint' => 'INT64',
+         'mediumint' => 'INT64',
+         'bigint' => 'INT64',
+         'float' => 'FLOAT64',
+         'double' => 'FLOAT64',
+         'decimal' => 'FLOAT64',
+         'char' => 'STRING',
+         'varchar' => 'STRING',
+         'tinytext' => 'STRING',
+         'text' => 'STRING',
+         'date' => 'TIMESTAMP',
+         'datetime' => 'TIMESTAMP',
+         'timestamp' => 'TIMESTAMP'
+       }.freeze
+
+       def initialize(column_name, data_type)
+         @column_name = column_name
+         @data_type = data_type
+       end
+
+       def bigquery_data_type
+         TYPE_MAPPINGS[@data_type] || 'STRING'
+       end
+
+       def converted_value
+         if bigquery_data_type == 'TIMESTAMP'
+           # translate the time zone to UTC
+           "UNIX_TIMESTAMP(#{escaped_column_name}) AS #{escaped_column_name}"
+         elsif data_type == 'tinyint'
+           # work around MySQL tinyint(1) being treated as boolean
+           "CAST(#{escaped_column_name} AS signed) AS #{escaped_column_name}"
+         else
+           escaped_column_name
+         end
+       end
+
+       def to_json(*a)
+         { 'name' => @column_name, 'type' => bigquery_data_type }.to_json(*a)
+       end
+
+       private
+
+       def escaped_column_name
+         "`#{@column_name}`"
+       end
+     end
+   end
+ end
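
Column drives both the BigQuery schema and the generated SELECT list: each MySQL type maps to a BigQuery type, and temporal and tinyint columns are rewritten in SQL so they load cleanly. A sketch with hypothetical columns (return values follow from the code above):

  col = Naginegi::MySQL::Column.new('created_at', 'datetime')
  col.bigquery_data_type  # => "TIMESTAMP"
  col.converted_value     # => "UNIX_TIMESTAMP(`created_at`) AS `created_at`"

  flag = Naginegi::MySQL::Column.new('admin', 'tinyint')
  flag.bigquery_data_type # => "INT64"
  flag.converted_value    # => "CAST(`admin` AS signed) AS `admin`"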