naginegi 0.1.0

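naginegi generates Embulk configurations from MySQL/PostgreSQL schema metadata and bulk-loads the tables into Google BigQuery. This initial release introduces the full pipeline shown in the diffs below.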

Rakefile
@@ -0,0 +1,6 @@
+ require 'bundler/gem_tasks'
+ require 'rspec/core/rake_task'
+
+ RSpec::Core::RakeTask.new(:spec)
+
+ task :default => :spec

bin/console
@@ -0,0 +1,14 @@
+ #!/usr/bin/env ruby
+
+ require "bundler/setup"
+ require "naginegi"
+
+ # You can add fixtures and/or initialization code here to make experimenting
+ # with your gem easier. You can also use a different console, if you like.
+
+ # (If you use this, don't forget to add pry to your Gemfile!)
+ # require "pry"
+ # Pry.start
+
+ require "irb"
+ IRB.start(__FILE__)

bin/setup
@@ -0,0 +1,8 @@
+ #!/usr/bin/env bash
+ set -euo pipefail
+ IFS=$'\n\t'
+ set -vx
+
+ bundle install
+
+ # Do any other automated setup that you need to do here

lib/naginegi.rb
@@ -0,0 +1,57 @@
+ require 'naginegi/version'
+ require 'naginegi/embulk_config'
+ require 'naginegi/embulk'
+ require 'naginegi/mysql'
+ require 'naginegi/postgresql'
+ require 'logger'
+
+ module Naginegi
+   class EmbulkRunner
+     def initialize
+       @logger = Logger.new(STDOUT)
+       @logger.datetime_format = '%Y-%m-%d %H:%M:%S'
+     end
+
+     def generate_config(bq_config)
+       Naginegi::EmbulkConfig.new.generate_config(db_configs, bq_config)
+     end
+
+     def run(bq_config, target_table_names = [], retry_max = 0)
+       cmd = 'embulk --version'
+       unless system(cmd)
+         @logger.error('Cannot execute Embulk!!')
+         @logger.error('Confirm the Embulk installation and environment')
+         return
+       end
+
+       error_tables = run_and_retry(bq_config, target_table_names, retry_max, 0)
+       error_tables.empty?
+     end
+
+     private
+
+     def run_and_retry(bq_config, target_table_names, retry_max, retry_count)
+       error_tables = Naginegi::Embulk.new.run(
+         db_configs,
+         table_configs,
+         bq_config,
+         target_table_names
+       )
+       if !error_tables.empty? && retry_count < retry_max
+         @logger.warn('------------------------------------')
+         @logger.warn("retry start -> attempt #{retry_count + 1}")
+         @logger.warn('------------------------------------')
+         error_tables = run_and_retry(bq_config, error_tables, retry_max, retry_count + 1)
+       end
+       error_tables
+     end
+
+     def db_configs
+       @db_configs ||= YAML.load_file('database.yml')
+     end
+
+     def table_configs
+       @table_configs ||= Naginegi::MySQL::TableConfig.generate_table_configs
+     end
+   end
+ end
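How the pieces fit together: EmbulkRunner reads connection settings from database.yml and table lists from table.yml (via MySQL::TableConfig.generate_table_configs), then threads a bq_config hash through to the generators. Below is a minimal driving sketch; the YAML layouts and key names are inferred from where this code reads them, and every host, path, and credential is a placeholder:

    require 'naginegi'

    # database.yml (hypothetical layout, keyed by database name):
    #   my_app:
    #     db_type: mysql
    #     host: localhost
    #     username: root
    #     password: secret
    #     database: my_app_production
    #     timezone: UTC
    #     bq_dataset: my_app
    #
    # table.yml (hypothetical layout; daily_snapshot and condition are optional):
    #   my_app:
    #     tables:
    #       - name: users
    #       - name: events
    #         daily_snapshot: true
    #         condition: created_at < CURRENT_DATE

    bq_config = {
      'auth_method'   => 'json_key',
      'json_keyfile'  => '/path/to/keyfile.json',
      'project_id'    => 'my-gcp-project',
      'service_email' => 'loader@my-gcp-project.iam.gserviceaccount.com',
      'schema_dir'    => './schema',   # BigQuery schema JSON files land here
      'config_dir'    => './config'    # generated Embulk YAML files land here
    }

    runner = Naginegi::EmbulkRunner.new
    runner.generate_config(bq_config)  # write Embulk configs and schemas
    runner.run(bq_config, [], 2)       # load every table, retrying failures up to twice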

lib/naginegi/bigquery.rb
@@ -0,0 +1,96 @@
+ require 'json'
+ require 'erb'
+ require 'google/cloud/bigquery'
+ require 'unindent'
+ require 'date'
+
+ module Naginegi
+   class BigQuery
+     CONTENTS = <<-EOS.unindent
+       in:
+         type: <%= db_type %>
+         user: <%= user %>
+         password: <%= password %>
+         database: <%= database %>
+         host: <%= host %>
+         query: |
+           <%= query %>
+         <%= options %>
+       out:
+         type: bigquery
+         auth_method: <%= auth_method %>
+         json_keyfile: <%= json_keyfile %>
+         project: <%= project %>
+         service_account_email: <%= service_account_email %>
+         dataset: <%= dataset %>
+         table: <%= table_name %>
+         schema_file: <%= schema_file %>
+         auto_create_table: true
+         path_prefix: <%= path_prefix %>
+         source_format: NEWLINE_DELIMITED_JSON
+         file_ext: .json.gz
+         delete_from_local_when_job_end: 1
+         formatter:
+           type: jsonl
+         encoders:
+           - {type: gzip}
+     EOS
+
+     def initialize(config)
+       @config = config.dup
+       @current_date = Date.today
+     end
+
+     def self.generate_schema(columns)
+       json_body = columns.map(&:to_json).join(",\n")
+       "[\n" + json_body + "\n]\n"
+     end
+
+     def self.generate_sql(table_config, columns)
+       columns = columns.map(&:converted_value)
+       sql = "SELECT #{columns.join(',')}"
+       sql << " FROM #{table_config.name}"
+       sql << " WHERE #{table_config.condition}" if table_config.condition
+       sql << "\n"
+       sql
+     end
+
+     def generate_embulk_config(db_name, db_config, table_config, columns)
+       db_type = db_config['db_type']
+       host = db_config['host']
+       user = db_config['username']
+       password = db_config['password']
+       database = db_config['database']
+       options = if db_type == 'mysql'
+                   "options: {useLegacyDatetimeCode: false, serverTimezone: #{db_config['timezone']}}"
+                 else
+                   ''
+                 end
+       query = Naginegi::BigQuery.generate_sql(table_config, columns)
+
+       auth_method = @config['auth_method']
+       json_keyfile = @config['json_keyfile']
+       project = @config['project_id']
+       service_account_email = @config['service_email']
+       dataset = db_config['bq_dataset']
+       table_name = actual_table_name(table_config.name, db_config['daily_snapshot'] || table_config.daily_snapshot)
+       schema_file = "#{@config['schema_dir']}/#{db_name}/#{table_config.name}.json"
+       path_prefix = "/var/tmp/embulk_#{db_name}_#{table_config.name}"
+
+       ERB.new(CONTENTS).result(binding)
+     end
+
+     def delete_table(dataset, table_name)
+       bq = Google::Cloud::Bigquery.new(
+         project: @config['project_id'],
+         keyfile: @config['json_keyfile']
+       )
+       bq.service.delete_table(dataset, table_name)
+     end
+
+     def actual_table_name(table_name, daily_snapshot)
+       return table_name unless daily_snapshot
+       table_name + @current_date.strftime('%Y%m%d')
+     end
+   end
+ end
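generate_sql assembles the extraction query from a table config plus its mapped columns. A quick trace, using the MySQL classes that appear later in this diff:

    table_config = Naginegi::MySQL::TableConfig.new(
      'name' => 'users', 'condition' => 'deleted_at IS NULL'
    )
    columns = [
      Naginegi::MySQL::Column.new('id', 'bigint'),
      Naginegi::MySQL::Column.new('created_at', 'datetime')
    ]
    Naginegi::BigQuery.generate_sql(table_config, columns)
    # => "SELECT `id`,UNIX_TIMESTAMP(`created_at`) AS `created_at` FROM users WHERE deleted_at IS NULL\n"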

lib/naginegi/embulk.rb
@@ -0,0 +1,72 @@
+ require 'logger'
+
+ module Naginegi
+   class Embulk
+     def initialize
+       @logger = Logger.new(STDOUT)
+       @logger.datetime_format = '%Y-%m-%d %H:%M:%S'
+     end
+
+     def run(db_configs, all_table_configs, bq_config, target_table_names = [])
+       error_tables = []
+       db_configs.keys.each do |db_name|
+         table_configs = select_table_configs(all_table_configs[db_name], target_table_names)
+         error_tables += run_by_database(
+           db_name,
+           table_configs,
+           bq_config,
+           db_configs[db_name]['bq_dataset']
+         )
+       end
+       error_tables
+     end
+
+     def select_table_configs(table_configs, target_table_names)
+       return table_configs if target_table_names.empty?
+       table_configs.select { |table_config| target_table_names.include?(table_config.name) }
+     end
+
+     private
+
+     def run_by_database(db_name, table_configs, bq_config, bq_dataset)
+       process_times = []
+       error_tables = []
+
+       bq_utility = Naginegi::BigQuery.new(bq_config)
+
+       table_configs.each do |table_config|
+         start_time = Time.now
+         @logger.info("table: #{table_config.name} - start")
+
+         begin
+           bq_utility.delete_table(bq_dataset, table_config.name)
+           @logger.info("#{table_config.name} is deleted")
+         rescue => e
+           @logger.warn(e.message)
+         end
+
+         cmd = "embulk run #{bq_config['config_dir']}/#{db_name}/#{table_config.name}.yml"
+         @logger.info("cmd: #{cmd}")
+
+         if system(cmd)
+           result = 'success'
+         else
+           result = 'error'
+           error_tables << table_config.name
+         end
+
+         process_time = "table: #{table_config.name} - result: #{result} #{format('%10.1f', Time.now - start_time)}sec"
+         @logger.info(process_time)
+
+         process_times << process_time
+       end
+
+       @logger.info('------------------------------------')
+       @logger.info("db_name: #{db_name}")
+
+       process_times.each { |process_time| @logger.info(process_time) }
+
+       error_tables
+     end
+   end
+ end
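select_table_configs is the whitelist behind the target_table_names argument: an empty list loads every configured table, and the retry loop in EmbulkRunner reuses it by passing back only the failed names. For example:

    configs = [
      Naginegi::MySQL::TableConfig.new('name' => 'users'),
      Naginegi::MySQL::TableConfig.new('name' => 'events')
    ]
    embulk = Naginegi::Embulk.new
    embulk.select_table_configs(configs, []).map(&:name)         # => ["users", "events"]
    embulk.select_table_configs(configs, ['users']).map(&:name)  # => ["users"]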

lib/naginegi/embulk_config.rb
@@ -0,0 +1,49 @@
+ module Naginegi
+   class EmbulkConfig
+     def generate_config(db_configs, bq_config)
+       bq_utility = BigQuery.new(bq_config)
+
+       db_configs.keys.each do |db_name|
+         db_config = db_configs[db_name]
+         table_configs = all_table_configs[db_name]
+         db_type = db_config['db_type']
+
+         case db_type
+         when 'mysql'
+           sql_client = MySQL::MySQLClient.new(db_config)
+         when 'postgresql'
+           sql_client = PostgreSQL::PgClient.new(db_config)
+         end
+
+         table_configs.each do |table_config|
+           write(
+             "#{bq_config['schema_dir']}/#{db_name}",
+             "#{table_config.name}.json",
+             sql_client.generate_bq_schema(table_config.name)
+           )
+           write(
+             "#{bq_config['config_dir']}/#{db_name}",
+             "#{table_config.name}.yml",
+             bq_utility.generate_embulk_config(
+               db_name,
+               db_config,
+               table_config,
+               sql_client.columns(table_config.name)
+             )
+           )
+         end
+       end
+     end
+
+     private
+
+     def write(directory, file_name, content)
+       FileUtils.mkdir_p(directory) unless FileTest.exist?(directory)
+       File.write("#{directory}/#{file_name}", content)
+     end
+
+     def all_table_configs
+       @all_table_configs ||= MySQL::TableConfig.generate_table_configs
+     end
+   end
+ end
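generate_config writes two files per table under the directories named in bq_config. A sketch using the hypothetical './schema' and './config' settings from the earlier example, a my_app database, and a users table:

    Naginegi::EmbulkConfig.new.generate_config(db_configs, bq_config)
    # writes ./schema/my_app/users.json  (BigQuery schema via generate_bq_schema)
    # and    ./config/my_app/users.yml   (Embulk run config via generate_embulk_config)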

lib/naginegi/mysql.rb
@@ -0,0 +1,119 @@
+ require 'mysql2-cs-bind'
+ require 'json'
+ require 'yaml'
+ require 'fileutils'
+ require 'naginegi/bigquery'
+
+ module Naginegi
+   module MySQL
+     class MySQLClient
+       COLUMN_SQL = <<-SQL.freeze
+         SELECT column_name, data_type
+         FROM INFORMATION_SCHEMA.COLUMNS
+         WHERE table_schema = ?
+           AND table_name = ?
+         ORDER BY ordinal_position
+       SQL
+
+       def initialize(database_config)
+         @database_config = database_config
+       end
+
+       def client
+         @client ||= Mysql2::Client.new(
+           host: @database_config['host'],
+           username: @database_config['username'],
+           password: @database_config['password'],
+           database: @database_config['database']
+         )
+       end
+
+       def generate_bq_schema(table_name)
+         infos = columns(table_name)
+         BigQuery.generate_schema(infos)
+       end
+
+       def columns(table_name)
+         rows = client.xquery(COLUMN_SQL, @database_config['database'], table_name)
+         rows.map { |row| Column.new(row['column_name'], row['data_type']) }
+       end
+     end
+
+     class TableConfig
+       attr_reader :name, :daily_snapshot, :condition
+
+       def initialize(config)
+         @name = config['name']
+         @daily_snapshot = config['daily_snapshot'] || false
+         @condition = config['condition']
+       end
+
+       def self.generate_table_configs(file_path = 'table.yml')
+         configs = YAML.load_file(file_path)
+         configs.each_with_object({}) do |(db, database_config), table_configs|
+           table_configs[db] = database_config['tables'].map { |config| TableConfig.new(config) }
+           table_configs
+         end
+       end
+
+       def ==(other)
+         instance_variables.all? do |v|
+           instance_variable_get(v) == other.instance_variable_get(v)
+         end
+       end
+     end
+
+     class Column
+       attr_reader :column_name, :data_type
+
+       TYPE_MAPPINGS = {
+         'int' => 'INT64',
+         'tinyint' => 'INT64',
+         'smallint' => 'INT64',
+         'mediumint' => 'INT64',
+         'bigint' => 'INT64',
+         'float' => 'FLOAT64',
+         'double' => 'FLOAT64',
+         'decimal' => 'FLOAT64',
+         'char' => 'STRING',
+         'varchar' => 'STRING',
+         'tinytext' => 'STRING',
+         'text' => 'STRING',
+         'date' => 'TIMESTAMP',
+         'datetime' => 'TIMESTAMP',
+         'timestamp' => 'TIMESTAMP'
+       }.freeze
+
+       def initialize(column_name, data_type)
+         @column_name = column_name
+         @data_type = data_type
+       end
+
+       def bigquery_data_type
+         TYPE_MAPPINGS[@data_type]
+       end
+
+       def converted_value
+         if bigquery_data_type == 'TIMESTAMP'
+           # convert to epoch seconds so the value lands in BigQuery as UTC
+           "UNIX_TIMESTAMP(#{escaped_column_name}) AS #{escaped_column_name}"
+         elsif data_type == 'tinyint'
+           # cast so tinyint(1) is not coerced to a boolean by the JDBC driver
+           "CAST(#{escaped_column_name} AS signed) AS #{escaped_column_name}"
+         else
+           escaped_column_name
+         end
+       end
+
+       def to_json(*a)
+         { 'name' => @column_name, 'type' => bigquery_data_type }.to_json(*a)
+       end
+
+       private
+
+       def escaped_column_name
+         "`#{@column_name}`"
+       end
+     end
+   end
+ end
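The MySQL Column mapper feeds both the schema JSON and the SELECT list. Its three converted_value cases, traced through the code above:

    Naginegi::MySQL::Column.new('id', 'bigint').converted_value
    # => "`id`"
    Naginegi::MySQL::Column.new('flag', 'tinyint').converted_value
    # => "CAST(`flag` AS signed) AS `flag`"
    Naginegi::MySQL::Column.new('created_at', 'datetime').to_json
    # => "{\"name\":\"created_at\",\"type\":\"TIMESTAMP\"}"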

lib/naginegi/postgresql.rb
@@ -0,0 +1,117 @@
+ require 'pg'
+ require 'json'
+ require 'yaml'
+ require 'fileutils'
+ require 'naginegi/bigquery'
+
+ module Naginegi
+   module PostgreSQL
+     class PgClient
+       COLUMN_SQL = <<-SQL.freeze
+         SELECT column_name, data_type
+         FROM INFORMATION_SCHEMA.COLUMNS
+         WHERE table_name = $1
+         ORDER BY ordinal_position
+       SQL
+
+       def initialize(db_config)
+         @db_config = db_config
+       end
+
+       def client
+         @client ||= PG::Connection.new(
+           host: @db_config['host'],
+           user: @db_config['username'],
+           password: @db_config['password'],
+           dbname: @db_config['database']
+         )
+       end
+
+       def generate_bq_schema(table_name)
+         infos = columns(table_name)
+         BigQuery.generate_schema(infos)
+       end
+
+       def columns(table_name)
+         rows = client.exec_params(COLUMN_SQL, [table_name])
+         rows.map { |row| Column.new(row['column_name'], row['data_type']) }
+       end
+     end
+
+     class TableConfig
+       attr_reader :name, :daily_snapshot, :condition
+
+       def initialize(config)
+         @name = config['name']
+         @daily_snapshot = config['daily_snapshot'] || false
+         @condition = config['condition']
+       end
+
+       def self.generate_table_configs(file_path = 'table.yml')
+         configs = YAML.load_file(file_path)
+         configs.each_with_object({}) do |(db, db_config), table_configs|
+           table_configs[db] = db_config['tables'].map { |config| TableConfig.new(config) }
+           table_configs
+         end
+       end
+
+       def ==(other)
+         instance_variables.all? do |v|
+           instance_variable_get(v) == other.instance_variable_get(v)
+         end
+       end
+     end
+
+     class Column
+       attr_reader :column_name, :data_type
+
+       TYPE_MAPPINGS = {
+         'smallint' => 'INT64',
+         'integer' => 'INT64',
+         'bigint' => 'INT64',
+         'smallserial' => 'INT64',
+         'serial' => 'INT64',
+         'bigserial' => 'INT64',
+         'decimal' => 'FLOAT64',
+         'numeric' => 'FLOAT64',
+         'real' => 'FLOAT64',
+         'double precision' => 'FLOAT64',
+         'character' => 'STRING',
+         'character varying' => 'STRING',
+         'text' => 'STRING',
+         'date' => 'TIMESTAMP',
+         'timestamp' => 'TIMESTAMP',
+         'timestamp with time zone' => 'TIMESTAMP',
+         'boolean' => 'BOOL'
+       }.freeze
+
+       def initialize(column_name, data_type)
+         @column_name = column_name
+         @data_type = data_type
+       end
+
+       def bigquery_data_type
+         TYPE_MAPPINGS[@data_type]
+       end
+
+       def converted_value
+         if bigquery_data_type == 'TIMESTAMP'
+           # convert to epoch seconds so the value lands in BigQuery as UTC
+           "EXTRACT(EPOCH FROM #{escaped_column_name}) AS #{escaped_column_name}"
+         else
+           escaped_column_name
+         end
+       end
+
+       def to_json(*a)
+         { 'name' => @column_name, 'type' => bigquery_data_type }.to_json(*a)
+       end
+
+       private
+
+       def escaped_column_name
+         "\"#{@column_name}\""
+       end
+     end
+   end
+ end
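The PostgreSQL mapper mirrors the MySQL one, but quotes identifiers with double quotes and converts timestamps with EXTRACT(EPOCH FROM ...). One caveat visible in the code: a data_type absent from TYPE_MAPPINGS (json, uuid, bytea, and so on) maps to nil, so generated schema files for such tables deserve a manual check:

    Naginegi::PostgreSQL::Column.new('created_at', 'timestamp with time zone').converted_value
    # => "EXTRACT(EPOCH FROM \"created_at\") AS \"created_at\""
    Naginegi::PostgreSQL::Column.new('payload', 'json').bigquery_data_type
    # => nil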