naginegi 0.1.0

This diff shows the content of publicly available package versions as released to one of the supported registries, and is provided for informational purposes only.
@@ -0,0 +1,6 @@
+ require 'bundler/gem_tasks'
+ require 'rspec/core/rake_task'
+
+ RSpec::Core::RakeTask.new(:spec)
+
+ task :default => :spec
@@ -0,0 +1,14 @@
+ #!/usr/bin/env ruby
+
+ require "bundler/setup"
+ require "naginegi"
+
+ # You can add fixtures and/or initialization code here to make experimenting
+ # with your gem easier. You can also use a different console, if you like.
+
+ # (If you use this, don't forget to add pry to your Gemfile!)
+ # require "pry"
+ # Pry.start
+
+ require "irb"
+ IRB.start(__FILE__)
@@ -0,0 +1,8 @@
+ #!/usr/bin/env bash
+ set -euo pipefail
+ IFS=$'\n\t'
+ set -vx
+
+ bundle install
+
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,57 @@
+ require 'naginegi/version'
+ require 'naginegi/embulk_config'
+ require 'naginegi/embulk'
+ require 'naginegi/mysql'
+ require 'naginegi/postgresql'
+ require 'logger'
+
+ module Naginegi
+   class EmbulkRunner
+     def initialize
+       @logger = Logger.new(STDOUT)
+       @logger.datetime_format = '%Y-%m-%d %H:%M:%S'
+     end
+
+     def generate_config(bq_config)
+       Naginegi::EmbulkConfig.new.generate_config(db_configs, bq_config)
+     end
+
+     def run(bq_config, target_table_names = [], retry_max = 0)
+       cmd = 'embulk --version'
+       unless system(cmd)
+         @logger.error('Cannot execute Embulk!!')
+         @logger.error('Confirm Embulk installation and environment')
+         return
+       end
+
+       error_tables = run_and_retry(bq_config, target_table_names, retry_max, 0)
+       error_tables.empty?
+     end
+
+     private
+
+     def run_and_retry(bq_config, target_table_names, retry_max, retry_count)
+       error_tables = Naginegi::Embulk.new.run(
+         db_configs,
+         table_configs,
+         bq_config,
+         target_table_names
+       )
+       if !error_tables.empty? && retry_count < retry_max
+         @logger.warn('------------------------------------')
+         @logger.warn("retry start -> attempt #{retry_count + 1}")
+         @logger.warn('------------------------------------')
+         error_tables = run_and_retry(bq_config, error_tables, retry_max, retry_count + 1)
+       end
+       error_tables
+     end
+
+     def db_configs
+       @db_configs ||= YAML.load_file('database.yml')
+     end
+
+     def table_configs
+       @table_configs ||= Naginegi::MySQL::TableConfig.generate_table_configs
+     end
+   end
+ end
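
The runner above is driven by two YAML files read from the working directory: database.yml for connection settings and table.yml for the table list. A minimal sketch of database.yml, assuming a single MySQL database; the key names are taken from the lookups in this diff, while every value is an illustrative placeholder:

    # database.yml (hypothetical example)
    my_db:
      db_type: mysql
      host: localhost
      username: root
      password: secret
      database: app_production
      bq_dataset: app_dataset
      timezone: Asia/Tokyo    # only read for MySQL connections

And a sketch of how the runner might be invoked, again with placeholder values; the bq_config keys mirror the lookups in generate_embulk_config and delete_table further down in this diff:

    require 'naginegi'

    bq_config = {
      'auth_method'   => 'json_key',    # assumed value; the code only interpolates it
      'json_keyfile'  => '/path/to/keyfile.json',
      'project_id'    => 'my-gcp-project',
      'service_email' => 'loader@my-gcp-project.iam.gserviceaccount.com',
      'schema_dir'    => '/var/tmp/schema',
      'config_dir'    => '/var/tmp/config'
    }

    runner = Naginegi::EmbulkRunner.new
    runner.generate_config(bq_config)   # writes one schema JSON and one Embulk YAML per table
    runner.run(bq_config, [], 1)        # all tables, up to one retry for failed tables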
@@ -0,0 +1,96 @@
+ require 'json'
+ require 'erb'
+ require 'google/cloud/bigquery'
+ require 'unindent'
+ require 'date'
+
+ module Naginegi
+   class BigQuery
+     CONTENTS = <<-EOS.unindent
+       in:
+         type: <%= db_type %>
+         user: <%= user %>
+         password: <%= password %>
+         database: <%= database %>
+         host: <%= host %>
+         query: |
+           <%= query %>
+         <%= options %>
+       out:
+         type: bigquery
+         auth_method: <%= auth_method %>
+         json_keyfile: <%= json_keyfile %>
+         project: <%= project %>
+         service_account_email: <%= service_account_email %>
+         dataset: <%= dataset %>
+         table: <%= table_name %>
+         schema_file: <%= schema_file %>
+         auto_create_table: true
+         path_prefix: <%= path_prefix %>
+         source_format: NEWLINE_DELIMITED_JSON
+         file_ext: .json.gz
+         delete_from_local_when_job_end: 1
+         formatter:
+           type: jsonl
+         encoders:
+         - {type: gzip}
+     EOS
+
+     def initialize(config)
+       @config = config.dup
+       @current_date = Date.today
+     end
+
+     def self.generate_schema(columns)
+       json_body = columns.map(&:to_json).join(",\n")
+       "[\n" + json_body + "\n]\n"
+     end
+
+     def self.generate_sql(table_config, columns)
+       columns = columns.map(&:converted_value)
+       sql = "SELECT #{columns.join(',')}"
+       sql << " FROM #{table_config.name}"
+       sql << " WHERE #{table_config.condition}" if table_config.condition
+       sql << "\n"
+       sql
+     end
+
+     def generate_embulk_config(db_name, db_config, table_config, columns)
+       db_type = db_config['db_type']
+       host = db_config['host']
+       user = db_config['username']
+       password = db_config['password']
+       database = db_config['database']
+       options = if db_type == 'mysql'
+                   "options: {useLegacyDatetimeCode: false, serverTimezone: #{db_config['timezone']}}"
+                 else
+                   ''
+                 end
+       query = Naginegi::BigQuery.generate_sql(table_config, columns)
+
+       auth_method = @config['auth_method']
+       json_keyfile = @config['json_keyfile']
+       project = @config['project_id']
+       service_account_email = @config['service_email']
+       dataset = db_config['bq_dataset']
+       table_name = actual_table_name(table_config.name, db_config['daily_snapshot'] || table_config.daily_snapshot)
+       schema_file = "#{@config['schema_dir']}/#{db_name}/#{table_config.name}.json"
+       path_prefix = "/var/tmp/embulk_#{db_name}_#{table_config.name}"
+
+       ERB.new(CONTENTS).result(binding)
+     end
+
+     def delete_table(dataset, table_name)
+       bq = Google::Cloud::Bigquery.new(
+         project: @config['project_id'],
+         keyfile: @config['json_keyfile']
+       )
+       bq.service.delete_table(dataset, table_name)
+     end
+
+     def actual_table_name(table_name, daily_snapshot)
+       return table_name unless daily_snapshot
+       table_name + @current_date.strftime('%Y%m%d')
+     end
+   end
+ end
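
To make the template concrete, here is what generate_sql and actual_table_name produce for a hypothetical MySQL users table; the names are illustrative, but the behavior follows directly from the methods above:

    columns = [Naginegi::MySQL::Column.new('id', 'int'),
               Naginegi::MySQL::Column.new('created_at', 'datetime')]
    table = Naginegi::MySQL::TableConfig.new('name' => 'users', 'condition' => 'id > 100')

    Naginegi::BigQuery.generate_sql(table, columns)
    # => "SELECT `id`,UNIX_TIMESTAMP(`created_at`) AS `created_at` FROM users WHERE id > 100\n"

    # With daily_snapshot enabled, the BigQuery table name gains a date suffix,
    # e.g. "users20170501" when @current_date is 2017-05-01.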
@@ -0,0 +1,72 @@
+ require 'logger'
+
+ module Naginegi
+   class Embulk
+     def initialize
+       @logger = Logger.new(STDOUT)
+       @logger.datetime_format = '%Y-%m-%d %H:%M:%S'
+     end
+
+     def run(db_configs, all_table_configs, bq_config, target_table_names = [])
+       error_tables = []
+       db_configs.keys.each do |db_name|
+         table_configs = select_table_configs(all_table_configs[db_name], target_table_names)
+         error_tables += run_by_database(
+           db_name,
+           table_configs,
+           bq_config,
+           db_configs[db_name]['bq_dataset']
+         )
+       end
+       error_tables
+     end
+
+     def select_table_configs(table_configs, target_table_names)
+       return table_configs if target_table_names.empty?
+       table_configs.select { |table_config| target_table_names.include?(table_config.name) }
+     end
+
+     private
+
+     def run_by_database(db_name, table_configs, bq_config, bq_dataset)
+       process_times = []
+       error_tables = []
+
+       bq_utility = Naginegi::BigQuery.new(bq_config)
+
+       table_configs.each do |table_config|
+         start_time = Time.now
+         @logger.info("table: #{table_config.name} - start")
+
+         begin
+           bq_utility.delete_table(bq_dataset, table_config.name)
+           @logger.info("#{table_config.name} is deleted")
+         rescue => e
+           @logger.warn(e.message)
+         end
+
+         cmd = "embulk run #{bq_config['config_dir']}/#{db_name}/#{table_config.name}.yml"
+         @logger.info("cmd: #{cmd}")
+
+         if system(cmd)
+           result = 'success'
+         else
+           result = 'error'
+           error_tables << table_config.name
+         end
+
+         process_time = "table: #{table_config.name} - result: #{result} #{format('%10.1f', Time.now - start_time)}sec"
+         @logger.info(process_time)
+
+         process_times << process_time
+       end
+
+       @logger.info('------------------------------------')
+       @logger.info("db_name: #{db_name}")
+
+       process_times.each { |process_time| @logger.info(process_time) }
+
+       error_tables
+     end
+   end
+ end
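
A quick sketch of select_table_configs, with hypothetical table names: an empty filter passes every config through, a non-empty one keeps only the named tables. Each surviving table is then loaded by shelling out to embulk run <config_dir>/<db_name>/<table>.yml as shown above.

    embulk = Naginegi::Embulk.new
    configs = [Naginegi::MySQL::TableConfig.new('name' => 'users'),
               Naginegi::MySQL::TableConfig.new('name' => 'events')]

    embulk.select_table_configs(configs, [])          # => both configs
    embulk.select_table_configs(configs, ['users'])   # => only the 'users' config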
@@ -0,0 +1,49 @@
+ module Naginegi
+   class EmbulkConfig
+     def generate_config(db_configs, bq_config)
+       bq_utility = BigQuery.new(bq_config)
+
+       db_configs.keys.each do |db_name|
+         db_config = db_configs[db_name]
+         table_configs = all_table_configs[db_name]
+         db_type = db_config['db_type']
+
+         case db_type
+         when 'mysql'
+           sql_client = MySQL::MySQLClient.new(db_config)
+         when 'postgresql'
+           sql_client = PostgreSQL::PgClient.new(db_config)
+         end
+
+         table_configs.each do |table_config|
+           write(
+             "#{bq_config['schema_dir']}/#{db_name}",
+             "#{table_config.name}.json",
+             sql_client.generate_bq_schema(table_config.name)
+           )
+           write(
+             "#{bq_config['config_dir']}/#{db_name}",
+             "#{table_config.name}.yml",
+             bq_utility.generate_embulk_config(
+               db_name,
+               db_config,
+               table_config,
+               sql_client.columns(table_config.name)
+             )
+           )
+         end
+       end
+     end
+
+     private
+
+     def write(directory, file_name, content)
+       FileUtils.mkdir_p(directory) unless FileTest.exist?(directory)
+       File.write("#{directory}/#{file_name}", content)
+     end
+
+     def all_table_configs
+       @all_table_configs ||= MySQL::TableConfig.generate_table_configs
+     end
+   end
+ end
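
For a hypothetical database my_db with a users table, generate_config leaves a layout like this on disk (the directory roots come from bq_config['schema_dir'] and bq_config['config_dir']):

    <schema_dir>/my_db/users.json   # BigQuery schema from generate_bq_schema
    <config_dir>/my_db/users.yml    # Embulk config from generate_embulk_config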
@@ -0,0 +1,119 @@
+ require 'mysql2-cs-bind'
+ require 'json'
+ require 'yaml'
+ require 'fileutils'
+ require 'naginegi/bigquery'
+
+ module Naginegi
+   module MySQL
+     class MySQLClient
+       COLUMN_SQL = <<-SQL.freeze
+         SELECT column_name, data_type
+         FROM INFORMATION_SCHEMA.COLUMNS
+         WHERE table_schema = ?
+           AND table_name = ?
+         ORDER BY ordinal_position
+       SQL
+
+       def initialize(database_config)
+         @database_config = database_config
+       end
+
+       def client
+         @client ||= Mysql2::Client.new(
+           host: @database_config['host'],
+           username: @database_config['username'],
+           password: @database_config['password'],
+           database: @database_config['database']
+         )
+       end
+
+       def generate_bq_schema(table_name)
+         infos = columns(table_name)
+         BigQuery.generate_schema(infos)
+       end
+
+       def columns(table_name)
+         rows = client.xquery(COLUMN_SQL, @database_config['database'], table_name)
+         rows.map { |row| Column.new(row['column_name'], row['data_type']) }
+       end
+     end
+
+     class TableConfig
+       attr_reader :name, :daily_snapshot, :condition
+
+       def initialize(config)
+         @name = config['name']
+         @daily_snapshot = config['daily_snapshot'] || false
+         @condition = config['condition']
+       end
+
+       def self.generate_table_configs(file_path = 'table.yml')
+         configs = YAML.load_file(file_path)
+         configs.each_with_object({}) do |(db, database_config), table_configs|
+           table_configs[db] = database_config['tables'].map { |config| TableConfig.new(config) }
+           table_configs
+         end
+       end
+
+       def ==(other)
+         instance_variables.all? do |v|
+           instance_variable_get(v) == other.instance_variable_get(v)
+         end
+       end
+     end
+
+     class Column
+       attr_reader :column_name, :data_type
+
+       TYPE_MAPPINGS = {
+         'int' => 'INT64',
+         'tinyint' => 'INT64',
+         'smallint' => 'INT64',
+         'mediumint' => 'INT64',
+         'bigint' => 'INT64',
+         'float' => 'FLOAT64',
+         'double' => 'FLOAT64',
+         'decimal' => 'FLOAT64',
+         'char' => 'STRING',
+         'varchar' => 'STRING',
+         'tinytext' => 'STRING',
+         'text' => 'STRING',
+         'date' => 'TIMESTAMP',
+         'datetime' => 'TIMESTAMP',
+         'timestamp' => 'TIMESTAMP'
+       }.freeze
+
+       def initialize(column_name, data_type)
+         @column_name = column_name
+         @data_type = data_type
+       end
+
+       def bigquery_data_type
+         TYPE_MAPPINGS[@data_type]
+       end
+
+       def converted_value
+         if bigquery_data_type == 'TIMESTAMP'
+           # convert to epoch seconds (time zone normalized to UTC)
+           "UNIX_TIMESTAMP(#{escaped_column_name}) AS #{escaped_column_name}"
+         elsif data_type == 'tinyint'
+           # cast so MySQL tinyint(1) is loaded as an integer, not a boolean
+           "CAST(#{escaped_column_name} AS signed) AS #{escaped_column_name}"
+         else
+           escaped_column_name
+         end
+       end
+
+       def to_json(*a)
+         { 'name' => @column_name, 'type' => bigquery_data_type }.to_json(*a)
+       end
+
+       private
+
+       def escaped_column_name
+         "`#{@column_name}`"
+       end
+     end
+   end
+ end
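
TableConfig.generate_table_configs expects table.yml to map each database name to a tables list whose entries carry name plus the optional daily_snapshot and condition keys read in the constructor. A minimal sketch with placeholder names:

    # table.yml (hypothetical example)
    my_db:
      tables:
        - name: users
        - name: events
          daily_snapshot: true
          condition: created_at >= '2017-01-01'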
@@ -0,0 +1,117 @@
+ require 'pg'
+ require 'json'
+ require 'yaml'
+ require 'fileutils'
+ require 'naginegi/bigquery'
+
+ module Naginegi
+   module PostgreSQL
+     class PgClient
+       COLUMN_SQL = <<-SQL.freeze
+         SELECT column_name, data_type
+         FROM INFORMATION_SCHEMA.COLUMNS
+         WHERE table_name = $1
+         ORDER BY ordinal_position
+       SQL
+
+       def initialize(db_config)
+         @db_config = db_config
+       end
+
+       def client
+         @client ||= PG::Connection.new(
+           host: @db_config['host'],
+           user: @db_config['username'],
+           password: @db_config['password'],
+           dbname: @db_config['database']
+         )
+       end
+
+       def generate_bq_schema(table_name)
+         infos = columns(table_name)
+         BigQuery.generate_schema(infos)
+       end
+
+       def columns(table_name)
+         rows = client.exec_params(COLUMN_SQL, [table_name])
+         rows.map { |row| Column.new(row['column_name'], row['data_type']) }
+       end
+     end
+
+     class TableConfig
+       attr_reader :name, :daily_snapshot, :condition
+
+       def initialize(config)
+         @name = config['name']
+         @daily_snapshot = config['daily_snapshot'] || false
+         @condition = config['condition']
+       end
+
+       def self.generate_table_configs(file_path = 'table.yml')
+         configs = YAML.load_file(file_path)
+         configs.each_with_object({}) do |(db, db_config), table_configs|
+           table_configs[db] = db_config['tables'].map { |config| TableConfig.new(config) }
+           table_configs
+         end
+       end
+
+       def ==(other)
+         instance_variables.all? do |v|
+           instance_variable_get(v) == other.instance_variable_get(v)
+         end
+       end
+     end
+
+     class Column
+       attr_reader :column_name, :data_type
+
+       TYPE_MAPPINGS = {
+         'smallint' => 'INT64',
+         'integer' => 'INT64',
+         'bigint' => 'INT64',
+         'smallserial' => 'INT64',
+         'serial' => 'INT64',
+         'bigserial' => 'INT64',
+         'decimal' => 'FLOAT64',
+         'numeric' => 'FLOAT64',
+         'real' => 'FLOAT64',
+         'double precision' => 'FLOAT64',
+         'character' => 'STRING',
+         'character varying' => 'STRING',
+         'text' => 'STRING',
+         'date' => 'TIMESTAMP',
+         'timestamp' => 'TIMESTAMP',
+         'timestamp with time zone' => 'TIMESTAMP',
+         'boolean' => 'BOOL'
+       }.freeze
+
+       def initialize(column_name, data_type)
+         @column_name = column_name
+         @data_type = data_type
+       end
+
+       def bigquery_data_type
+         TYPE_MAPPINGS[@data_type]
+       end
+
+       def converted_value
+         if bigquery_data_type == 'TIMESTAMP'
+           # convert to epoch seconds (time zone normalized to UTC)
+           "EXTRACT(EPOCH FROM #{escaped_column_name}) AS #{escaped_column_name}"
+         else
+           escaped_column_name
+         end
+       end
+
+       def to_json(*a)
+         { 'name' => @column_name, 'type' => bigquery_data_type }.to_json(*a)
+       end
+
+       private
+
+       def escaped_column_name
+         "\"#{@column_name}\""
+       end
+     end
+   end
+ end
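
The PostgreSQL Column mirrors its MySQL counterpart, but quotes identifiers with double quotes and converts timestamps via EXTRACT(EPOCH FROM ...). A small sketch with an illustrative column name:

    col = Naginegi::PostgreSQL::Column.new('created_at', 'timestamp with time zone')
    col.bigquery_data_type  # => "TIMESTAMP"
    col.converted_value     # => "EXTRACT(EPOCH FROM \"created_at\") AS \"created_at\""
    col.to_json             # => "{\"name\":\"created_at\",\"type\":\"TIMESTAMP\"}"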