naginegi 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.circleci/config.yml +41 -0
- data/.gitignore +41 -0
- data/.rubocop.yml +105 -0
- data/.travis.yml +7 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +120 -0
- data/LICENSE +21 -0
- data/README.md +189 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/naginegi.rb +57 -0
- data/lib/naginegi/bigquery.rb +96 -0
- data/lib/naginegi/embulk.rb +72 -0
- data/lib/naginegi/embulk_config.rb +49 -0
- data/lib/naginegi/mysql.rb +119 -0
- data/lib/naginegi/postgresql.rb +117 -0
- data/lib/naginegi/version.rb +3 -0
- data/lint.sh +54 -0
- data/naginegi.gemspec +34 -0
- metadata +203 -0
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+
+require "bundler/setup"
+require "naginegi"
+
+# You can add fixtures and/or initialization code here to make experimenting
+# with your gem easier. You can also use a different console, if you like.
+
+# (If you use this, don't forget to add pry to your Gemfile!)
+# require "pry"
+# Pry.start
+
+require "irb"
+IRB.start(__FILE__)
data/bin/setup
ADDED
data/lib/naginegi.rb
ADDED
@@ -0,0 +1,57 @@
+require 'naginegi/version'
+require 'naginegi/embulk_config'
+require 'naginegi/embulk'
+require 'naginegi/mysql'
+require 'naginegi/postgresql'
+require 'logger'
+
+module Naginegi
+  class EmbulkRunner
+    def initialize
+      @logger = Logger.new(STDOUT)
+      @logger.datetime_format = '%Y-%m-%d %H:%M:%S'
+    end
+
+    def generate_config(bq_config)
+      Naginegi::EmbulkConfig.new.generate_config(db_configs, bq_config)
+    end
+
+    def run(bq_config, target_table_names = [], retry_max = 0)
+      cmd = 'embulk --version'
+      unless system(cmd)
+        @logger.error('Cannot execute Embulk!!')
+        @logger.error('Cofirm Embulk install and environment')
+        return
+      end
+
+      error_tables = run_and_retry(bq_config, target_table_names, retry_max, 0)
+      error_tables.empty?
+    end
+
+    private
+
+    def run_and_retry(bq_config, target_table_names, retry_max, retry_count)
+      error_tables = Naginegi::Embulk.new.run(
+        db_configs,
+        table_configs,
+        bq_config,
+        target_table_names
+      )
+      if !error_tables.empty? && retry_count < retry_max
+        @logger.warn('------------------------------------')
+        @logger.warn("retry start -> #{retry_count + 1} time")
+        @logger.warn('------------------------------------')
+        error_tables = run_and_retry(bq_config, error_tables, retry_max, retry_count + 1)
+      end
+      error_tables
+    end
+
+    def db_configs
+      @db_configs ||= YAML.load_file('database.yml')
+    end
+
+    def table_configs
+      @table_configs ||= Naginegi::MySQL::TableConfig.generate_table_configs
+    end
+  end
+end
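For context, naginegi.rb is the entry point: EmbulkRunner#generate_config writes per-table Embulk YAML and BigQuery schema files, and #run executes them with optional retries, reading database.yml and table.yml from the working directory. A minimal driver script could look like the sketch below; it is not part of the gem, the bq_config keys simply mirror the ones read in bigquery.rb, embulk.rb and embulk_config.rb, and every concrete value is a placeholder.

require 'naginegi'

# Hypothetical configuration; only the keys are taken from this diff.
bq_config = {
  'auth_method'   => 'json_key',
  'json_keyfile'  => '/path/to/keyfile.json',
  'project_id'    => 'my-gcp-project',
  'service_email' => 'loader@my-gcp-project.iam.gserviceaccount.com',
  'schema_dir'    => 'schema',  # BigQuery schema JSON files are written here
  'config_dir'    => 'config'   # Embulk YAML configs are written here and run from here
}

runner = Naginegi::EmbulkRunner.new
runner.generate_config(bq_config)       # write schema/config files for every table
success = runner.run(bq_config, [], 1)  # load all tables, retrying failed ones once
abort('some tables failed to load') unless success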
data/lib/naginegi/bigquery.rb
ADDED
@@ -0,0 +1,96 @@
+require 'json'
+require 'erb'
+require 'google/cloud/bigquery'
+require 'unindent'
+require 'date'
+
+module Naginegi
+  class BigQuery
+    CONTENTS = <<-EOS.unindent
+      in:
+        type: <%= db_type %>
+        user: <%= user %>
+        password: <%= password %>
+        database: <%= database %>
+        host: <%= host %>
+        query: |
+          <%= query %>
+        <%= options %>
+      out:
+        type: bigquery
+        auth_method: <%= auth_method %>
+        json_keyfile: <%= json_keyfile %>
+        project: <%= project %>
+        service_account_email: <%= service_account_email %>
+        dataset: <%= dataset %>
+        table: <%= table_name %>
+        schema_file: <%= schema_file %>
+        auto_create_table: true
+        path_prefix: <%= path_prefix %>
+        source_format: NEWLINE_DELIMITED_JSON
+        file_ext: .json.gz
+        delete_from_local_when_job_end: 1
+        formatter:
+          type: jsonl
+        encoders:
+        - {type: gzip}
+    EOS
+
+    def initialize(config)
+      @config = config.dup
+      @current_date = Date.today
+    end
+
+    def self.generate_schema(columns)
+      json_body = columns.map(&:to_json).join(",\n")
+      "[\n" + json_body + "\n]\n"
+    end
+
+    def self.generate_sql(table_config, columns)
+      columns = columns.map(&:converted_value)
+      sql = "SELECT #{columns.join(',')}"
+      sql << " FROM #{table_config.name}"
+      sql << " WHERE #{table_config.condition}" if table_config.condition
+      sql << "\n"
+      sql
+    end
+
+    def generate_embulk_config(db_name, db_config, table_config, columns)
+      db_type = db_config['db_type']
+      host = db_config['host']
+      user = db_config['username']
+      password = db_config['password']
+      database = db_config['database']
+      options = if db_type == 'mysql'
+                  "options: {useLegacyDatetimeCode: false, serverTimezone: #{db_config['timezone']}}"
+                else
+                  ''
+                end
+      query = Naginegi::BigQuery.generate_sql(table_config, columns)
+
+      auth_method = @config['auth_method']
+      json_keyfile = @config['json_keyfile']
+      project = @config['project_id']
+      service_account_email = @config['service_email']
+      dataset = db_config['bq_dataset']
+      table_name = actual_table_name(table_config.name, db_config['daily_snapshot'] || table_config.daily_snapshot)
+      schema_file = "#{@config['schema_dir']}/#{db_name}/#{table_config.name}.json"
+      path_prefix = "/var/tmp/embulk_#{db_name}_#{table_config.name}"
+
+      ERB.new(CONTENTS).result(binding)
+    end
+
+    def delete_table(dataset, table_name)
+      bq = Google::Cloud::Bigquery.new(
+        project: @config['project_id'],
+        keyfile: @config['json_keyfile']
+      )
+      bq.service.delete_table(dataset, table_name)
+    end
+
+    def actual_table_name(table_name, daily_snapshot)
+      return table_name unless daily_snapshot
+      table_name + @current_date.strftime('%Y%m%d')
+    end
+  end
+end
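Both class methods are free of side effects, so they can be exercised on their own. The snippet below is an illustrative example, not part of the gem, using the MySQL::Column and MySQL::TableConfig classes defined later in this diff; the table and column names are invented.

require 'naginegi'

columns = [
  Naginegi::MySQL::Column.new('id', 'int'),
  Naginegi::MySQL::Column.new('created_at', 'datetime')
]

# Naginegi::BigQuery.generate_schema(columns) returns the schema file body:
# [
# {"name":"id","type":"INT64"},
# {"name":"created_at","type":"TIMESTAMP"}
# ]

table = Naginegi::MySQL::TableConfig.new('name' => 'users', 'condition' => 'id > 0')
Naginegi::BigQuery.generate_sql(table, columns)
# => "SELECT `id`,UNIX_TIMESTAMP(`created_at`) AS `created_at` FROM users WHERE id > 0\n"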
data/lib/naginegi/embulk.rb
ADDED
@@ -0,0 +1,72 @@
+require 'logger'
+
+module Naginegi
+  class Embulk
+    def initialize
+      @logger = Logger.new(STDOUT)
+      @logger.datetime_format = '%Y-%m-%d %H:%M:%S'
+    end
+
+    def run(db_configs, all_table_configs, bq_config, target_table_names = [])
+      error_tables = []
+      db_configs.keys.each do |db_name|
+        table_configs = select_table_configs(all_table_configs[db_name], target_table_names)
+        error_tables += run_by_database(
+          db_name,
+          table_configs,
+          bq_config,
+          db_configs[db_name]['bq_dataset']
+        )
+      end
+      error_tables
+    end
+
+    def select_table_configs(table_configs, target_table_names)
+      return table_configs if target_table_names.empty?
+      table_configs.select { |table_config| target_table_names.include?(table_config.name) }
+    end
+
+    private
+
+    def run_by_database(db_name, table_configs, bq_config, bq_dataset)
+      process_times = []
+      error_tables = []
+
+      bq_utility = Naginegi::BigQuery.new(bq_config)
+
+      table_configs.each do |table_config|
+        start_time = Time.now
+        @logger.info("table: #{table_config.name} - start")
+
+        begin
+          bq_utility.delete_table(bq_dataset, table_config.name)
+          @logger.info("#{table_config.name} is deleted")
+        rescue => e
+          @logger.warn(e.message)
+        end
+
+        cmd = "embulk run #{bq_config['config_dir']}/#{db_name}/#{table_config.name}.yml"
+        @logger.info("cmd: #{cmd}")
+
+        if system(cmd)
+          result = 'success'
+        else
+          result = 'error'
+          error_tables << table_config.name
+        end
+
+        process_time = "table: #{table_config.name} - result: #{result} #{format('%10.1f', Time.now - start_time)}sec"
+        @logger.info(process_time)
+
+        process_times << process_time
+      end
+
+      @logger.info('------------------------------------')
+      @logger.info("db_name: #{db_name}")
+
+      process_times.each { |process_time| @logger.info(process_time) }
+
+      error_tables
+    end
+  end
+end
data/lib/naginegi/embulk_config.rb
ADDED
@@ -0,0 +1,49 @@
+module Naginegi
+  class EmbulkConfig
+    def generate_config(db_configs, bq_config)
+      bq_utility = BigQuery.new(bq_config)
+
+      db_configs.keys.each do |db_name|
+        db_config = db_configs[db_name]
+        table_configs = all_table_configs[db_name]
+        db_type = db_config['db_type']
+
+        case db_type
+        when 'mysql'
+          sql_client = MySQL::MySQLClient.new(db_config)
+        when 'postgresql'
+          sql_client = PostgreSQL::PgClient.new(db_config)
+        end
+
+        table_configs.each do |table_config|
+          write(
+            "#{bq_config['schema_dir']}/#{db_name}",
+            "#{table_config.name}.json",
+            sql_client.generate_bq_schema(table_config.name)
+          )
+          write(
+            "#{bq_config['config_dir']}/#{db_name}",
+            "#{table_config.name}.yml",
+            bq_utility.generate_embulk_config(
+              db_name,
+              db_config,
+              table_config,
+              sql_client.columns(table_config.name)
+            )
+          )
+        end
+      end
+    end
+
+    private
+
+    def write(directory, file_name, content)
+      FileUtils.mkdir_p(directory) unless FileTest.exist?(directory)
+      File.write("#{directory}/#{file_name}", content)
+    end
+
+    def all_table_configs
+      @all_table_configs ||= MySQL::TableConfig.generate_table_configs
+    end
+  end
+end
data/lib/naginegi/mysql.rb
ADDED
@@ -0,0 +1,119 @@
+require 'mysql2-cs-bind'
+require 'json'
+require 'yaml'
+require 'fileutils'
+require 'naginegi/bigquery'
+
+module Naginegi
+  module MySQL
+    class MySQLClient
+      COLUMN_SQL = <<-SQL.freeze
+        SELECT column_name, data_type
+        FROM INFORMATION_SCHEMA.COLUMNS
+        WHERE table_schema = ?
+          AND table_name = ?
+        ORDER BY ordinal_position
+      SQL
+
+      def initialize(database_config)
+        @database_config = database_config
+      end
+
+      def client
+        @client ||= Mysql2::Client.new(
+          host: @database_config['host'],
+          username: @database_config['username'],
+          password: @database_config['password'],
+          database: @database_config['database']
+        )
+      end
+
+      def generate_bq_schema(table_name)
+        infos = columns(table_name)
+        BigQuery.generate_schema(infos)
+      end
+
+      def columns(table_name)
+        rows = client.xquery(COLUMN_SQL, @database_config['database'], table_name)
+        rows.map { |row| Column.new(row['column_name'], row['data_type']) }
+      end
+    end
+
+    class TableConfig
+      attr_reader :name, :daily_snapshot, :condition
+
+      def initialize(config)
+        @name = config['name']
+        @daily_snapshot = config['daily_snapshot'] || false
+        @condition = config['condition']
+      end
+
+      def self.generate_table_configs(file_path = 'table.yml')
+        configs = YAML.load_file(file_path)
+        configs.each_with_object({}) do |(db, database_config), table_configs|
+          table_configs[db] = database_config['tables'].map { |config| TableConfig.new(config) }
+          table_configs
+        end
+      end
+
+      def ==(other)
+        instance_variables.all? do |v|
+          instance_variable_get(v) == other.instance_variable_get(v)
+        end
+      end
+    end
+
+    class Column
+      attr_reader :column_name, :data_type
+
+      TYPE_MAPPINGS = {
+        'int' => 'INT64',
+        'tinyint' => 'INT64',
+        'smallint' => 'INT64',
+        'mediumint' => 'INT64',
+        'bigint' => 'INT64',
+        'float' => 'FLOAT64',
+        'double' => 'FLOAT64',
+        'decimal' => 'FLOAT64',
+        'char' => 'STRING',
+        'varchar' => 'STRING',
+        'tinytext' => 'STRING',
+        'text' => 'STRING',
+        'date' => 'TIMESTAMP',
+        'datetime' => 'TIMESTAMP',
+        'timestamp' => 'TIMESTAMP'
+      }.freeze
+
+      def initialize(column_name, data_type)
+        @column_name = column_name
+        @data_type = data_type
+      end
+
+      def bigquery_data_type
+        TYPE_MAPPINGS[@data_type]
+      end
+
+      def converted_value
+        if bigquery_data_type == 'TIMESTAMP'
+          # time zone translate to UTC
+          "UNIX_TIMESTAMP(#{escaped_column_name}) AS #{escaped_column_name}"
+        elsif data_type == 'tinyint'
+          # for MySQL tinyint(1) problem
+          "CAST(#{escaped_column_name} AS signed) AS #{escaped_column_name}"
+        else
+          escaped_column_name
+        end
+      end
+
+      def to_json(*a)
+        { 'name' => @column_name, 'type' => bigquery_data_type }.to_json(*a)
+      end
+
+      private
+
+      def escaped_column_name
+        "`#{@column_name}`"
+      end
+    end
+  end
+end
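TableConfig.generate_table_configs and EmbulkRunner#db_configs both read YAML from the working directory. The sketch below shows the shapes those files are expected to parse into, written as the Ruby hashes YAML.load_file would return; the keys are the ones accessed in this diff, and every concrete value is a placeholder.

# database.yml: one entry per logical database name.
db_configs = {
  'app_db' => {
    'db_type'    => 'mysql',          # or 'postgresql'
    'host'       => 'db.example.com',
    'username'   => 'loader',
    'password'   => 'secret',
    'database'   => 'app_production',
    'bq_dataset' => 'app_dataset',
    'timezone'   => 'Asia/Tokyo'      # only used to build the MySQL JDBC options
  }
}

# table.yml: keyed by the same database name, with a 'tables' list.
table_configs = {
  'app_db' => {
    'tables' => [
      { 'name' => 'users' },
      { 'name' => 'events', 'daily_snapshot' => true, 'condition' => "created_at < '2017-01-01'" }
    ]
  }
}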
data/lib/naginegi/postgresql.rb
ADDED
@@ -0,0 +1,117 @@
+require 'pg'
+require 'json'
+require 'yaml'
+require 'fileutils'
+require 'naginegi/bigquery'
+
+module Naginegi
+  module PostgreSQL
+    class PgClient
+      COLUMN_SQL = <<-SQL.freeze
+        SELECT column_name, data_type
+        FROM INFORMATION_SCHEMA.COLUMNS
+        WHERE table_name = $1
+        ORDER BY ordinal_position
+      SQL
+
+      def initialize(db_config)
+        @db_config = db_config
+      end
+
+      def client
+        @client ||= PG::Connection.new(
+          host: @db_config['host'],
+          user: @db_config['username'],
+          password: @db_config['password'],
+          dbname: @db_config['database']
+        )
+      end
+
+      def generate_bq_schema(table_name)
+        infos = columns(table_name)
+        BigQuery.generate_schema(infos)
+      end
+
+      def columns(table_name)
+        rows = client.exec_params(COLUMN_SQL, [table_name])
+        rows.map { |row| Column.new(row['column_name'], row['data_type']) }
+      end
+    end
+
+    class TableConfig
+      attr_reader :name, :daily_snapshot, :condition
+
+      def initialize(config)
+        @name = config['name']
+        @daily_snapshot = config['daily_snapshot'] || false
+        @condition = config['condition']
+      end
+
+      def self.generate_table_configs(file_path = 'table.yml')
+        configs = YAML.load_file(file_path)
+        configs.each_with_object({}) do |(db, db_config), table_configs|
+          table_configs[db] = db_config['tables'].map { |config| TableConfig.new(config) }
+          table_configs
+        end
+      end
+
+      def ==(other)
+        instance_variables.all? do |v|
+          instance_variable_get(v) == other.instance_variable_get(v)
+        end
+      end
+    end
+
+    class Column
+      attr_reader :column_name, :data_type
+
+      TYPE_MAPPINGS = {
+        'smallint' => 'INT64',
+        'integer' => 'INT64',
+        'bigint' => 'INT64',
+        'smallserial' => 'INT64',
+        'serial' => 'INT64',
+        'bigserial' => 'INT64',
+        'decimal' => 'FLOAT64',
+        'numeric' => 'FLOAT64',
+        'real' => 'FLOAT64',
+        'double precision' => 'FLOAT64',
+        'character' => 'STRING',
+        'character varying' => 'STRING',
+        'text' => 'STRING',
+        'date' => 'TIMESTAMP',
+        'timestamp' => 'TIMESTAMP',
+        'timestamp with time zone' => 'TIMESTAMP',
+        'boolean' => 'BOOL'
+      }.freeze
+
+      def initialize(column_name, data_type)
+        @column_name = column_name
+        @data_type = data_type
+      end
+
+      def bigquery_data_type
+        TYPE_MAPPINGS[@data_type]
+      end
+
+      def converted_value
+        if bigquery_data_type == 'TIMESTAMP'
+          # time zone translate to UTC
+          "EXTRACT(EPOCH FROM #{escaped_column_name}) AS #{escaped_column_name}"
+        else
+          escaped_column_name
+        end
+      end
+
+      def to_json(*a)
+        { 'name' => @column_name, 'type' => bigquery_data_type }.to_json(*a)
+      end
+
+      private
+
+      def escaped_column_name
+        "\"#{@column_name}\""
+      end
+    end
+  end
+end
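The PostgreSQL variant mirrors the MySQL one; the differences are the column query (parameterised with $1 and without a schema filter), double-quote identifier escaping, the EXTRACT(EPOCH ...) conversion for TIMESTAMP columns, and a BOOL mapping for boolean. An illustrative example with invented column names:

id   = Naginegi::PostgreSQL::Column.new('id', 'integer')
ts   = Naginegi::PostgreSQL::Column.new('signed_up_at', 'timestamp with time zone')
flag = Naginegi::PostgreSQL::Column.new('active', 'boolean')

id.converted_value        # => "\"id\""
ts.converted_value        # => "EXTRACT(EPOCH FROM \"signed_up_at\") AS \"signed_up_at\""
flag.converted_value      # => "\"active\""
flag.bigquery_data_type   # => "BOOL"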