naginegi 0.3.0

@@ -0,0 +1,6 @@
+ require 'bundler/gem_tasks'
+ require 'rspec/core/rake_task'
+
+ RSpec::Core::RakeTask.new(:spec)
+
+ task :default => :spec
@@ -0,0 +1,14 @@
+ #!/usr/bin/env ruby
+
+ require "bundler/setup"
+ require "naginegi"
+
+ # You can add fixtures and/or initialization code here to make experimenting
+ # with your gem easier. You can also use a different console, if you like.
+
+ # (If you use this, don't forget to add pry to your Gemfile!)
+ # require "pry"
+ # Pry.start
+
+ require "irb"
+ IRB.start(__FILE__)
@@ -0,0 +1,8 @@
+ #!/usr/bin/env bash
+ set -euo pipefail
+ IFS=$'\n\t'
+ set -vx
+
+ bundle install
+
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,57 @@
+ require 'naginegi/version'
+ require 'naginegi/embulk_config'
+ require 'naginegi/embulk'
+ require 'naginegi/mysql'
+ require 'naginegi/postgresql'
+ require 'logger'
+
+ module Naginegi
+   class EmbulkRunner
+     def initialize(db_configs: nil, log_level: 'warn', embulk_run_option: '')
+       @logger = Logger.new(STDOUT)
+       @logger.datetime_format = '%Y-%m-%d %H:%M:%S'
+
+       @db_configs = db_configs || YAML.load_file('database.yml')
+       @log_level = log_level
+       @embulk_run_option = embulk_run_option
+     end
+
+     def generate_config(bq_config)
+       Naginegi::EmbulkConfig.new.generate_config(@db_configs, bq_config)
+     end
+
+     def run(bq_config, target_table_names = [], retry_max = 0)
+       cmd = 'embulk --version'
+       unless system(cmd)
+         @logger.error('Cannot execute Embulk!!')
+ @logger.error('Cofirm Embulk install and environment')
28
+ return
29
+ end
30
+
31
+ error_tables = run_and_retry(bq_config, target_table_names, retry_max, 0)
32
+ error_tables.empty?
33
+ end
34
+
35
+ private
36
+
37
+ def run_and_retry(bq_config, target_table_names, retry_max, retry_count)
38
+ error_tables = Naginegi::Embulk.new(@log_level, @embulk_run_option).run(
39
+ @db_configs,
40
+ table_configs,
41
+ bq_config,
42
+ target_table_names
43
+ )
44
+ if !error_tables.empty? && retry_count < retry_max
45
+ @logger.warn('------------------------------------')
46
+ @logger.warn("retry start -> #{retry_count + 1} time")
47
+ @logger.warn('------------------------------------')
48
+ error_tables = run_and_retry(bq_config, error_tables, retry_max, retry_count + 1)
49
+ end
50
+ error_tables
51
+ end
52
+
53
+ def table_configs
54
+ @table_configs ||= Naginegi::TableConfig.generate_table_configs
55
+ end
56
+ end
57
+ end
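
A hedged usage sketch of the runner above. The `bq_config` key names (`auth_method`, `json_keyfile`, `project_id`, `service_email`, `schema_dir`, `config_dir`) are the ones read elsewhere in this diff; all concrete values are hypothetical:

    require 'naginegi'

    bq_config = {
      'auth_method'   => 'json_key',
      'json_keyfile'  => '/path/to/keyfile.json',   # hypothetical path
      'project_id'    => 'my-gcp-project',          # hypothetical project
      'service_email' => 'svc@example.iam.gserviceaccount.com',
      'schema_dir'    => 'schema',   # BigQuery JSON schemas are written here
      'config_dir'    => 'config'    # Embulk YAML configs are written here
    }

    runner = Naginegi::EmbulkRunner.new   # reads database.yml by default
    runner.generate_config(bq_config)     # emit schema + Embulk config per table
    runner.run(bq_config, ['users'], 2)   # load only `users`, retry up to twice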
@@ -0,0 +1,107 @@
+ require 'json'
+ require 'erb'
+ require 'google/cloud/bigquery'
+ require 'unindent'
+ require 'date'
+
+ module Naginegi
+   class BigQuery
+     CONTENTS = <<-EOS.unindent
+       in:
+         type: <%= db_type %>
+         host: <%= host %>
+         user: <%= user %>
+         password: <%= password %>
+         database: <%= database %>
+         ssl: <%= ssl %>
+         query: |
+           <%= query %>
+         <%= options %>
+       out:
+         type: bigquery
+         auth_method: <%= auth_method %>
+         json_keyfile: <%= json_keyfile %>
+         <%= json_key_content %>
+         project: <%= project %>
+         service_account_email: <%= service_account_email %>
+         dataset: <%= dataset %>
+         table: <%= table_name %>
+         schema_file: <%= schema_file %>
+         auto_create_table: true
+         path_prefix: <%= path_prefix %>
+         source_format: NEWLINE_DELIMITED_JSON
+         file_ext: .json.gz
+         delete_from_local_when_job_end: 1
+         formatter:
+           type: jsonl
+         encoders:
+           - {type: gzip}
+     EOS
+
+     def initialize(config)
+       @config = config.dup
+       @current_date = Date.today
+     end
+
+     def self.generate_schema(columns)
+       json_body = columns.map(&:to_json).join(",\n")
+       "[\n" + json_body + "\n]\n"
+     end
+
+     def self.generate_sql(table_config, columns)
+       columns = columns.map(&:converted_value)
+       sql = "SELECT #{columns.join(',')}"
+       sql << " FROM #{table_config.name}"
+       sql << " WHERE #{table_config.condition}" if table_config.condition
+       sql << "\n"
+       sql
+     end
+
+     def generate_embulk_config(db_name, db_config, table_config, columns)
+       db_type = db_config['db_type']
+       host = db_config['host']
+       user = db_config['username']
+       password = db_config['password']
+       database = db_config['database']
+       ssl = db_config['embulk_ssl_enable'] || false
+       options = if db_type == 'mysql'
+                   "options: {useLegacyDatetimeCode: false, serverTimezone: #{db_config['timezone']}}"
+                 else
+                   ''
+                 end
+       query = Naginegi::BigQuery.generate_sql(table_config, columns)
+
+       auth_method = @config['auth_method']
+       if @config['json_key']
+         values = @config['json_key'].map do |k, v|
+           value = v.gsub("\n", '\\n')
+           "\"#{k}\": \"#{value}\""
+         end
+         json_key_content = "content: |\n {#{values.join(',')}}"
+       else
+         json_keyfile = @config['json_keyfile']
+       end
+       project = @config['project_id']
+       service_account_email = @config['service_email']
+       dataset = db_config['bq_dataset']
+       table_name = actual_table_name(table_config.name, db_config['daily_snapshot'] || table_config.daily_snapshot)
+       schema_file = "#{@config['schema_dir']}/#{db_name}/#{table_config.name}.json"
+       path_prefix = "/var/tmp/embulk_#{db_name}_#{table_config.name}"
+
+       ERB.new(CONTENTS).result(binding)
+     end
+
+     def delete_table(dataset, table_name)
+       bq = Google::Cloud::Bigquery.new(
+         project: @config['project_id'],
+         keyfile: @config['json_keyfile']
+       )
+       bq.service.delete_table(dataset, table_name)
+     end
+
+     def actual_table_name(table_name, daily_snapshot)
+       return table_name unless daily_snapshot
+       table_name + @current_date.strftime('%Y%m%d')
+     end
+   end
+ end
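
For reference, a hypothetical `database.yml` entry using the keys `generate_embulk_config` reads (`db_type`, `host`, `username`, `password`, `database`, `embulk_ssl_enable`, `timezone`, `bq_dataset`, `daily_snapshot`); all values are made up:

    my_app:
      db_type: mysql
      host: localhost
      username: root
      password: secret
      database: my_app_production
      embulk_ssl_enable: false
      timezone: UTC            # interpolated into the MySQL JDBC options line
      bq_dataset: my_app
      daily_snapshot: false    # when true, table names get a YYYYMMDD suffix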
@@ -0,0 +1,75 @@
+ require 'logger'
+
+ module Naginegi
+   class Embulk
+     def initialize(log_level, embulk_run_option)
+       @logger = Logger.new(STDOUT)
+       @logger.datetime_format = '%Y-%m-%d %H:%M:%S'
+
+       @log_level = log_level
+       @embulk_run_option = embulk_run_option
+     end
+
+     def run(db_configs, all_table_configs, bq_config, target_table_names = [])
+       error_tables = []
+       db_configs.keys.each do |db_name|
+         table_configs = select_table_configs(all_table_configs[db_name], target_table_names)
+         error_tables += run_by_database(
+           db_name,
+           table_configs,
+           bq_config,
+           db_configs[db_name]['bq_dataset']
+         )
+       end
+       error_tables
+     end
+
+     def select_table_configs(table_configs, target_table_names)
+       return table_configs if target_table_names.empty?
+       table_configs.select { |table_config| target_table_names.include?(table_config.name) }
+     end
+
+     private
+
+     def run_by_database(db_name, table_configs, bq_config, bq_dataset)
+       process_times = []
+       error_tables = []
+
+       bq_utility = Naginegi::BigQuery.new(bq_config)
+
+       table_configs.each do |table_config|
+         start_time = Time.now
+         @logger.info("table: #{table_config.name} - start")
+
+         begin
+           bq_utility.delete_table(bq_dataset, table_config.name)
+           @logger.info("#{table_config.name} is deleted")
+         rescue => e
+           @logger.warn(e.message)
+         end
+
+         cmd = "embulk run #{@embulk_run_option} #{bq_config['config_dir']}/#{db_name}/#{table_config.name}.yml --log-level #{@log_level}"
+         @logger.info("cmd: #{cmd}")
+
+         if system(cmd)
+           result = 'success'
+         else
+           result = 'error'
+           error_tables << table_config.name
+         end
+
+         process_time = "table: #{table_config.name} - result: #{result} #{format('%10.1f', Time.now - start_time)}sec"
+         @logger.info(process_time)
+
+         process_times << process_time
+       end
+
+       @logger.info('------------------------------------')
+       @logger.info("db_name: #{db_name}")
+
+       process_times.each { |process_time| @logger.info(process_time) }
+
+       error_tables
+     end
+   end
+ end
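
Each table load shells out to the Embulk CLI. Assuming `config_dir: config`, a database key `my_app`, an empty run option, and the default `warn` log level, the generated command would look like:

    embulk run config/my_app/users.yml --log-level warn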
@@ -0,0 +1,73 @@
+ module Naginegi
+   class EmbulkConfig
+     def generate_config(db_configs, bq_config)
+       bq_utility = BigQuery.new(bq_config)
+
+       db_configs.keys.each do |db_name|
+         db_config = db_configs[db_name]
+         table_configs = all_table_configs[db_name]
+         db_type = db_config['db_type']
+
+         case db_type
+         when 'mysql'
+           sql_client = MySQL::MySQLClient.new(db_config)
+         when 'postgresql'
+           sql_client = PostgreSQL::PgClient.new(db_config)
+         end
+
+         table_configs.each do |table_config|
+           write(
+             "#{bq_config['schema_dir']}/#{db_name}",
+             "#{table_config.name}.json",
+             sql_client.generate_bq_schema(table_config.name)
+           )
+           write(
+             "#{bq_config['config_dir']}/#{db_name}",
+             "#{table_config.name}.yml",
+             bq_utility.generate_embulk_config(
+               db_name,
+               db_config,
+               table_config,
+               sql_client.columns(table_config.name)
+             )
+           )
+         end
+       end
+     end
+
+     private
+
+     def write(directory, file_name, content)
+       FileUtils.mkdir_p(directory) unless FileTest.exist?(directory)
+       File.write("#{directory}/#{file_name}", content)
+     end
+
+     def all_table_configs
+       @all_table_configs ||= Naginegi::TableConfig.generate_table_configs
+     end
+   end
+
+   class TableConfig
+     attr_reader :name, :daily_snapshot, :condition
+
+     def initialize(config)
+       @name = config['name']
+       @daily_snapshot = config['daily_snapshot'] || false
+       @condition = config['condition']
+     end
+
+     def self.generate_table_configs(file_path = 'table.yml')
+       configs = YAML.load_file(file_path)
+       configs.each_with_object({}) do |(db, database_config), table_configs|
+         table_configs[db] = database_config['tables'].map { |config| TableConfig.new(config) }
+       end
+     end
+
+     def ==(other)
+       instance_variables.all? do |v|
+         instance_variable_get(v) == other.instance_variable_get(v)
+       end
+     end
+   end
+ end
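
A hypothetical `table.yml` matching what `TableConfig.generate_table_configs` parses: top-level keys are the database names from `database.yml`, each holding a `tables` list whose entries have a `name` plus optional `daily_snapshot` and `condition`:

    my_app:
      tables:
        - name: users
        - name: events
          daily_snapshot: true
          condition: created_at < CURRENT_DATE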
@@ -0,0 +1,95 @@
+ require 'mysql2-cs-bind'
+ require 'json'
+ require 'yaml'
+ require 'fileutils'
+ require 'naginegi/bigquery'
+
+ module Naginegi
+   module MySQL
+     class MySQLClient
+       COLUMN_SQL = <<-SQL.freeze
+         SELECT column_name, data_type
+         FROM INFORMATION_SCHEMA.COLUMNS
+         WHERE table_schema = ?
+           AND table_name = ?
+         ORDER BY ordinal_position
+       SQL
+
+       def initialize(database_config)
+         @database_config = database_config
+       end
+
+       def client
+         @client ||= Mysql2::Client.new(
+           host: @database_config['host'],
+           username: @database_config['username'],
+           password: @database_config['password'],
+           database: @database_config['database']
+         )
+       end
+
+       def generate_bq_schema(table_name)
+         infos = columns(table_name)
+         BigQuery.generate_schema(infos)
+       end
+
+       def columns(table_name)
+         rows = client.xquery(COLUMN_SQL, @database_config['database'], table_name)
+         rows.map { |row| Column.new(row['column_name'], row['data_type']) }
+       end
+     end
+
+     class Column
+       attr_reader :column_name, :data_type
+
+       TYPE_MAPPINGS = {
+         'int' => 'INT64',
+         'tinyint' => 'INT64',
+         'smallint' => 'INT64',
+         'mediumint' => 'INT64',
+         'bigint' => 'INT64',
+         'float' => 'FLOAT64',
+         'double' => 'FLOAT64',
+         'decimal' => 'FLOAT64',
+         'char' => 'STRING',
+         'varchar' => 'STRING',
+         'tinytext' => 'STRING',
+         'text' => 'STRING',
+         'date' => 'TIMESTAMP',
+         'datetime' => 'TIMESTAMP',
+         'timestamp' => 'TIMESTAMP'
+       }.freeze
+
+       def initialize(column_name, data_type)
+         @column_name = column_name
+         @data_type = data_type
+       end
+
+       def bigquery_data_type
+         TYPE_MAPPINGS[@data_type]
+       end
+
+       def converted_value
+         if bigquery_data_type == 'TIMESTAMP'
+           # export as epoch seconds so BigQuery interprets the value as UTC
+           "UNIX_TIMESTAMP(#{escaped_column_name}) AS #{escaped_column_name}"
+         elsif data_type == 'tinyint'
+           # cast so tinyint(1) is not coerced to a boolean by the JDBC driver
+           "CAST(#{escaped_column_name} AS signed) AS #{escaped_column_name}"
+         else
+           escaped_column_name
+         end
+       end
+
+       def to_json(*a)
+         { 'name' => @column_name, 'type' => bigquery_data_type }.to_json(*a)
+       end
+
+       private
+
+       def escaped_column_name
+         "`#{@column_name}`"
+       end
+     end
+   end
+ end
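
To make the timestamp and tinyint handling concrete, here is what `converted_value` returns for a few hypothetical MySQL columns:

    Naginegi::MySQL::Column.new('id', 'bigint').converted_value
    # => "`id`"
    Naginegi::MySQL::Column.new('created_at', 'datetime').converted_value
    # => "UNIX_TIMESTAMP(`created_at`) AS `created_at`"
    Naginegi::MySQL::Column.new('active', 'tinyint').converted_value
    # => "CAST(`active` AS signed) AS `active`"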
@@ -0,0 +1,93 @@
+ require 'pg'
+ require 'json'
+ require 'yaml'
+ require 'fileutils'
+ require 'naginegi/bigquery'
+
+ module Naginegi
+   module PostgreSQL
+     class PgClient
+       COLUMN_SQL = <<-SQL.freeze
+         SELECT column_name, data_type
+         FROM INFORMATION_SCHEMA.COLUMNS
+         WHERE table_name = $1
+         ORDER BY ordinal_position
+       SQL
+
+       def initialize(db_config)
+         @db_config = db_config
+       end
+
+       def client
+         @client ||= PG::Connection.new(
+           host: @db_config['host'],
+           user: @db_config['username'],
+           password: @db_config['password'],
+           dbname: @db_config['database']
+         )
+       end
+
+       def generate_bq_schema(table_name)
+         infos = columns(table_name)
+         BigQuery.generate_schema(infos)
+       end
+
+       def columns(table_name)
+         rows = client.exec_params(COLUMN_SQL, [table_name])
+         rows.map { |row| Column.new(row['column_name'], row['data_type']) }
+       end
+     end
+
+     class Column
+       attr_reader :column_name, :data_type
+
+       TYPE_MAPPINGS = {
+         'smallint' => 'INT64',
+         'integer' => 'INT64',
+         'bigint' => 'INT64',
+         'smallserial' => 'INT64',
+         'serial' => 'INT64',
+         'bigserial' => 'INT64',
+         'decimal' => 'FLOAT64',
+         'numeric' => 'FLOAT64',
+         'real' => 'FLOAT64',
+         'double precision' => 'FLOAT64',
+         'character' => 'STRING',
+         'character varying' => 'STRING',
+         'text' => 'STRING',
+         'date' => 'TIMESTAMP',
+         'timestamp' => 'TIMESTAMP',
+         'timestamp with time zone' => 'TIMESTAMP',
+         'boolean' => 'BOOL'
+       }.freeze
+
+       def initialize(column_name, data_type)
+         @column_name = column_name
+         @data_type = data_type
+       end
+
+       def bigquery_data_type
+         TYPE_MAPPINGS[@data_type]
+       end
+
+       def converted_value
+         if bigquery_data_type == 'TIMESTAMP'
+           # export as epoch seconds so BigQuery interprets the value as UTC
+           "EXTRACT(EPOCH FROM #{escaped_column_name}) AS #{escaped_column_name}"
+         else
+           escaped_column_name
+         end
+       end
+
+       def to_json(*a)
+         { 'name' => @column_name, 'type' => bigquery_data_type }.to_json(*a)
+       end
+
+       private
+
+       def escaped_column_name
+         "\"#{@column_name}\""
+       end
+     end
+   end
+ end
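
And the PostgreSQL equivalent, where timestamps are exported as epoch seconds and identifiers are double-quoted (column names hypothetical):

    Naginegi::PostgreSQL::Column.new('updated_at', 'timestamp with time zone').converted_value
    # => "EXTRACT(EPOCH FROM \"updated_at\") AS \"updated_at\""
    Naginegi::PostgreSQL::Column.new('email', 'character varying').converted_value
    # => "\"email\""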