data-exporter 1.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 29fcc31fa3466058bc785fe88e2e1acadec677d7
+   data.tar.gz: 8a41c38d5c0383ce63ae6290f88406e82f32c279
+ SHA512:
+   metadata.gz: 9ac800e453a9623a806dc87b87d7a8878d5ab78c59b36a7c9d84a46524c3b2567713fce70e3e05af5039bb205711c5f9ed2043f401f3e98ca3a6c7260601b892
+   data.tar.gz: d490349507e86a2f597e9ab0e58ed1fccb6d402516d3566be748b3abfd53bc583f244e73a60674b00cb94575442e3e7968f4b99979e0187ecf1982561e0e17ba
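These are the registry's SHA1 and SHA512 digests for the two members of the `.gem` package. A minimal verification sketch against a locally fetched copy (`gem fetch data-exporter --version 1.3.7`); a `.gem` file is a plain tar whose members include `metadata.gz` and `data.tar.gz`:

```ruby
require 'digest'
require 'rubygems/package'

# Walk the .gem (a tar archive) and digest the members the registry lists.
File.open('data-exporter-1.3.7.gem', 'rb') do |gem_file|
  Gem::Package::TarReader.new(gem_file) do |tar|
    tar.each do |entry|
      next unless %w(metadata.gz data.tar.gz).include?(entry.full_name)
      body = entry.read
      puts "#{entry.full_name}  sha1=#{Digest::SHA1.hexdigest(body)}"
      puts "#{entry.full_name}  sha512=#{Digest::SHA512.hexdigest(body)}"
    end
  end
end
```

The output should match the SHA1/SHA512 sections above.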
@@ -0,0 +1,40 @@
+ DataExporter
+ ------------
+ Export MySQL databases to S3 and import them back.
+
+ To export a MySQL database:
+
+ 1. Add the gem to your Gemfile.
+ 2. Create a YAML or ERB configuration file that `data-exporter` can read (a sketch of the ERB form follows this README):
+ ```yaml
+ export_dir: '/tmp'
+ backup_dir: '/backups'
+ backup_key: 'config/backup_key' # openssl encryption key file
+ mysql:
+   adapter: 'mysql2'
+   host: 'localhost'
+   database: 'centurion_development'
+   username: 'root'
+   password: ''
+ s3:
+   access_key_id: 'ACCESS_KEY_ID'
+   secret_access_key: 'SECRET_ACCESS_KEY'
+   bucket_name: 'socialcast_backups'
+   prefix: 'centurion_development'
+ ```
+
+ 3. Execute:
+ ```shell
+ bundle exec data-exporter export -f etc/data_exporter.yml --csv
+ ```
+
+ To import a MySQL database as a directory of CSV files:
+ ```shell
+ bundle exec data-exporter unpack -f etc/data_exporter.yml --csv
+ ```
+
+ To import a MySQL database backup via SFTP (`--sftp`):
+ ```shell
+ bundle exec data-exporter unpack -f etc/data_exporter.yml --csv --sftp
+ ```
@@ -0,0 +1,15 @@
+ #!/usr/bin/env rake
+
+ begin
+   require 'bundler/setup'
+ rescue LoadError
+   puts 'You must `gem install bundler` and `bundle install` to run rake tasks'
+ end
+
+ Bundler::GemHelper.install_tasks
+
+ require 'rspec/core/rake_task'
+
+ RSpec::Core::RakeTask.new(:spec)
+
+ task :default => :spec
@@ -0,0 +1,4 @@
+ #!/usr/bin/env ruby
+ $: << File.expand_path('../../lib/', __FILE__)
+ require 'data_exporter'
+ DataExporter::CLI.start
@@ -0,0 +1,21 @@
+ module DataExporter
+   require 'data_exporter/configuration'
+   require 'data_exporter/archive'
+   require 'data_exporter/actions'
+   require 'data_exporter/cli'
+
+   class << self
+     def configuration
+       @configuration ||= DataExporter::Configuration.new
+     end
+     alias :config :configuration
+
+     def configure(&block)
+       yield configuration
+     end
+
+     def database_connection
+       ActiveRecord::Base.establish_connection(configuration.database)
+     end
+   end
+ end
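For non-CLI use, the module above exposes a conventional configure block. A minimal sketch; `DataExporter::Configuration`'s own API is not part of this diff, so the `load` call and its `'config_file'` key are assumptions mirrored from the CLI code later in this diff:

```ruby
require 'data_exporter'

DataExporter.configure do |config|
  # `load` is what the CLI calls with its parsed Thor options; the exact
  # option keys accepted here are an assumption (configuration.rb not shown).
  config.load('config_file' => 'etc/data_exporter.yml')
end

DataExporter.config # => the same memoized object as DataExporter.configuration
```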
@@ -0,0 +1,321 @@
+ require 'active_record'
+ require 'csv'
+ require 'fileutils'
+ require 'open3'
+ require 'net/sftp'
+ require 'aws/s3'
+ require 'ostruct'
+ require 'tmpdir'
+
+ module DataExporter
+   IGNORED_TABLES = %w(schema_migrations checksums)
+
+   module Actions
+     # Each of these helpers returns an argv array for Open3.pipeline_start.
+     def decrypt(in_file_name = '-')
+       ['openssl', 'enc', '-d', '-aes-256-cbc', '-salt', '-pass', "file:#{config.backup_key}", '-in', in_file_name]
+     end
+
+     def expand(file_name = '-')
+       ['gunzip', file_name]
+     end
+
+     def unarchive(file_name = '-')
+       ['tar', '-xf', file_name]
+     end
+
+     def compress(file_name = '-')
+       ['gzip', '--to-stdout', file_name]
+     end
+
+     def encrypt(backup_key)
+       ['openssl', 'enc', '-aes-256-cbc', '-salt', '-pass', "file:#{backup_key}"]
+     end
+
+     def unpack(encrypted_file, unpack_dir)
+       case encrypted_file
+       when /\.tar\.gz\.enc\z/
+         unpack_encrypted_archive(encrypted_file, unpack_dir)
+       when /\.sql\.gz\.enc\z/
+         unpack_encrypted_file(encrypted_file, unpack_dir)
+       end
+     end
+
+     def export(backup_key, encrypted_file, archive_dir = nil)
+       case encrypted_file
+       when /\.tar\.gz\.enc\z/
+         export_encrypted_archive(backup_key, encrypted_file, archive_dir)
+       when /\.sql\.gz\.enc\z/
+         export_encrypted_file(backup_key, encrypted_file)
+       end
+     end
+
+     def config
+       DataExporter.config
+     end
+
+     def find_last_backup(prefix, suffix, backup_dir)
+       if config.sftp_enabled?
+         find_last_sftp_backup(prefix, suffix, backup_dir)
+       else
+         find_last_s3_backup(prefix, suffix, backup_dir)
+       end
+     end
+
+     def find_last_sftp_backup(prefix, suffix, backup_dir = '/')
+       backup_entry = nil
+       sftp.dir.glob(backup_dir, prefix + '*' + suffix) do |entry|
+         backup_entry ||= entry
+         if entry.attributes.mtime > backup_entry.attributes.mtime
+           backup_entry = entry
+         end
+       end
+       OpenStruct.new(:name => File.join(backup_dir, backup_entry.name), :mtime => backup_entry.attributes.mtime, :size => backup_entry.attributes.size) if backup_entry
+     rescue => e
+       log e.to_s
+       nil
+     end
+
+     def glob_escape(glob)
+       glob.gsub('.', '\\.').gsub('*', '.*')
+     end
+
+     def find_last_s3_backup(prefix, suffix, backup_dir = nil)
+       s3_backup = nil
+       s3_bucket = config.s3[:bucket]
+       s3_prefix = backup_dir ? File.join(backup_dir, prefix) : prefix
+       s3.buckets[s3_bucket].objects.with_prefix(s3_prefix).each do |s3_object|
+         next unless s3_object.key =~ /#{glob_escape(suffix)}\Z/
+         s3_backup = s3_object
+       end
+       OpenStruct.new(:name => s3_backup.key, :mtime => s3_backup.last_modified.to_i, :size => s3_backup.content_length, :io => s3_backup) if s3_backup
+     rescue => e
+       log e.to_s
+       nil
+     end
+
+     private
+
+     SFTP_MAX_TIMEOUT = 30
+
+     # Run the pipeline, capture combined stderr, and log each command's
+     # exit status; returns one Process::Status per stage.
+     def pipeline_with_log(*args)
+       opts = Hash === args.last ? args.pop.dup : {}
+       err_r, err_w = IO.pipe
+       threads = []
+       Open3.pipeline_start(*args, opts.merge(:err => err_w)) do |ts|
+         ts.each { |t| wait_unless_done(t.value) }
+         err_w.close
+         log(err_r.read)
+         ts.each_with_index { |t, i| log("executed '#{args[i].join(' ')}' - #{t.value}") }
+         threads += ts
+       end
+       threads.map(&:value)
+     end
+
+     # t.value has already reaped the process, so the extra Process.wait is a
+     # no-op guard; the resulting ECHILD is swallowed.
+     def wait_unless_done(process)
+       Process.wait(process.pid) if process && process.try(:pid)
+     rescue
+     end
+
+     def unpack_encrypted_file(encrypted_file, unpack_dir)
+       FileUtils.mkdir_p(unpack_dir)
+       out_file_name = File.join(unpack_dir, File.basename(encrypted_file).chomp('.gz.enc'))
+       status = pipeline_with_log(decrypt(encrypted_file), expand, :out => out_file_name)
+       raise SystemExit, "Problem unpacking #{encrypted_file}" unless status.all? { |x| x.success? }
+     end
+
+     def unpack_encrypted_archive(encrypted_archive, unpack_dir)
+       tmp_unpack_dir = Dir.mktmpdir('data_exporter')
+
+       status = Dir.chdir(tmp_unpack_dir) do
+         pipeline_with_log(decrypt(encrypted_archive), expand, unarchive)
+       end
+       raise SystemExit, "Problem unpacking #{encrypted_archive}" unless status.all? { |x| x.success? }
+
+       archive_unpack_dir = File.join(tmp_unpack_dir, config.archive_base_directory, '/')
+       FileUtils.rmtree(unpack_dir)
+       FileUtils.move(archive_unpack_dir, unpack_dir)
+     end
+
+     def export_encrypted_archive(backup_key, encrypted_archive, archive_dir = nil)
+       local_archive_name = File.join(config.export_dir, File.basename(encrypted_archive).chomp('.gz.enc'))
+       archive_dir ? archive_dir_csv_export(local_archive_name, archive_dir) : mysql_csv_export(local_archive_name)
+       pipeline_with_log(compress(local_archive_name), encrypt(backup_key), :out => encrypted_archive)
+       log "removing #{local_archive_name}"
+       FileUtils.rm(local_archive_name)
+     end
+
+     def export_encrypted_file(backup_key, encrypted_file)
+       pipeline_with_log(mysqldump_export, compress, encrypt(backup_key), :out => encrypted_file)
+     end
+
+     def download(remote_file_info, download_dir)
+       if config.sftp_enabled?
+         download_via_sftp(remote_file_info, download_dir)
+       else
+         download_via_s3(remote_file_info, download_dir)
+       end
+     end
+
+     def download_via_sftp(remote_file_info, local_dir)
+       local_file = File.join(local_dir, File.basename(remote_file_info.name))
+       sftp.download!(remote_file_info.name, local_file) do |event, downloader, *args|
+         case event
+         when :open then
+           # args[0] : file metadata
+           log "downloading #{args[0].remote} -> #{args[0].local} (#{args[0].size} bytes)"
+         when :close then
+           # args[0] : file metadata
+           log "finished #{args[0].remote}"
+         end
+       end
+       local_file
+     end
+
+     def download_via_s3(remote_file_info, local_dir)
+       local_file = File.join(local_dir, File.basename(remote_file_info.name))
+       log "downloading #{remote_file_info.name} to #{local_file}"
+       File.open(local_file, 'wb') do |file|
+         remote_file_info.io.read do |chunk|
+           file.write(chunk)
+         end
+       end
+       local_file
+     end
+
+     def upload(local_file, remote_dir)
+       if config.sftp_enabled?
+         upload_via_sftp(local_file, remote_dir)
+       else
+         upload_via_s3(local_file, remote_dir)
+       end
+     end
+
+     def upload_via_sftp(local_file, backup_dir = nil)
+       remote_file = backup_dir ? File.join(backup_dir, File.basename(local_file)) : File.basename(local_file)
+       sftp.mkdir(backup_dir) if backup_dir
+       sftp.upload!(local_file, remote_file) do |event, uploader, *args|
+         case event
+         when :open then
+           # args[0] : file metadata
+           log "uploading #{args[0].local} -> #{args[0].remote} (#{args[0].size} bytes)"
+         when :close then
+           # args[0] : file metadata
+           log "finished #{args[0].remote}"
+         end
+       end
+     end
+
+     def upload_via_s3(local_file, backup_dir = nil)
+       s3_bucket = config.s3[:bucket]
+       s3_file = backup_dir ? File.join(backup_dir, File.basename(local_file)) : File.basename(local_file)
+       log "uploading #{local_file} to s3://#{s3_bucket}/#{s3_file}"
+       s3.buckets[s3_bucket].objects[s3_file].write(File.open(local_file))
+     end
+
+     def archive_dir_csv_export(archive_name, archive_dir)
+       log "creating #{archive_name}"
+       DataExporter::Archive.open(archive_name) do |archive|
+         Dir.glob(File.join(archive_dir, '*')).each do |file_name|
+           archive_file_name = File.join(config.archive_base_directory, File.basename(file_name))
+           log "appending #{archive_file_name}"
+           archive.append(archive_file_name, file_name)
+         end
+       end
+     end
+
+     def mysql_csv_export(archive_name)
+       log "creating #{archive_name}"
+       DataExporter::Archive.open(archive_name) do |archive|
+         mysql_to_csv do |file_name|
+           archive_file_name = File.join(config.archive_base_directory, File.basename(file_name))
+           log "appending #{archive_file_name}"
+           archive.append(archive_file_name, file_name)
+           FileUtils.rm(file_name)
+         end
+       end
+     end
+
+     # NOTE: a database lock is necessary for consistency across tables.
+     # Dumps each table (minus PII columns) to paged CSV files, yielding
+     # each file to the caller as it is completed.
+     def mysql_to_csv(&block)
+       connection_pool = DataExporter::database_connection
+       connection_pool.connection.tables.each do |table|
+         next if IGNORED_TABLES.include?(table)
+         fields = connection_pool.connection.columns(table).map(&:name)
+         total = 0
+         connection_pool.connection.select_rows("SELECT COUNT(*) AS count FROM #{table};").each { |result| total = result[0] }
+         selected_fields = fields - config.pii_fields(table)
+         page = 1
+         block_size = 500_000
+         total_number_of_pages = (total.to_f / block_size.to_f).ceil
+         last_id = 0
+         quoted_selected_fields = selected_fields.map { |field| connection_pool.connection.quote_column_name(field) }
+         while page <= total_number_of_pages
+           csv_file = "#{File.join(config.export_dir, "#{table}_#{page}")}.csv"
+           CSV.open(csv_file, 'w') do |csv|
+             csv << selected_fields
+             results = connection_pool.connection.select_rows("SELECT #{quoted_selected_fields.join(',')} FROM #{table} #{"WHERE id > #{last_id} ORDER BY id ASC LIMIT #{block_size}" if fields.include?('id')};")
+             if results
+               results.each do |table_values|
+                 begin
+                   csv << table_values
+                 rescue => e
+                   STDERR.puts "skipping #{table}.id = #{table_values[0]} - #{e.to_s}"
+                 end
+               end
+               last_id = results.last[0] if fields.include?('id')
+             end
+           end
+           yield csv_file
+           page += 1
+         end
+       end
+     ensure
+       connection_pool.disconnect! if connection_pool
+     end
+
+     def mysqldump_export
+       [
+         config.mysqldump_path,
+         *config.mysqldump_options,
+         config.database[:host] ? "--host=#{config.database[:host]}" : nil,
+         config.database[:port] ? "--port=#{config.database[:port]}" : nil,
+         "--user=#{config.database[:username]}",
+         config.database[:password] ? "--password=#{config.database[:password]}" : nil,
+         config.database[:database]
+       ].compact
+     end
+
+     # No-op by default; overridden by including classes (e.g. CLI#log).
+     def log(*a); end
+
+     def https_proxy
+       ENV['https_proxy'] || ENV['HTTPS_PROXY']
+     end
+
+     def s3_options
+       @s3_options ||=
+         {
+           :access_key_id => config.s3[:access_key_id],
+           :secret_access_key => config.s3[:secret_access_key],
+           :proxy_uri => https_proxy,
+           :use_ssl => true
+         }
+     end
+
+     def s3
+       @s3 ||= AWS::S3.new(s3_options)
+     end
+
+     def sftp
+       @sftp ||= Net::SFTP.start(config.sftp[:host], config.sftp[:user], timeout: SFTP_MAX_TIMEOUT)
+     end
+
+     def redis
+       require 'redis'
+       @redis ||= Redis.new(:host => config.redis[:host], :port => config.redis[:port])
+     end
+   end
+ end
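The helpers at the top of this file are just argv arrays fed to `Open3.pipeline_start`. For a `.tar.gz.enc` backup, the unpack path boils down to three chained processes; a standalone sketch with the same stdlib (file and key paths hypothetical, and the logging/tempdir/move steps of `unpack_encrypted_archive` omitted):

```ruby
require 'open3'

# openssl -d | gunzip | tar -x, i.e. decrypt() | expand() | unarchive()
statuses = Open3.pipeline(
  ['openssl', 'enc', '-d', '-aes-256-cbc', '-salt',
   '-pass', 'file:config/backup_key', '-in', 'backup.tar.gz.enc'],
  ['gunzip', '-'],
  ['tar', '-xf', '-']
)
raise 'unpack failed' unless statuses.all?(&:success?)
```

`Open3.pipeline` returns one `Process::Status` per stage, which is the same success check `pipeline_with_log` performs.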
@@ -0,0 +1,25 @@
+ require 'archive/tar/minitar'
+
+ module DataExporter
+   class Archive
+     class << self
+       def open(archive_filename, &block)
+         File.open(archive_filename, 'w') do |file|
+           ::Archive::Tar::Minitar::Writer.open(file) do |writer|
+             yield self.new(writer)
+           end
+         end
+       end
+     end
+
+     def initialize(writer)
+       @writer = writer
+     end
+
+     def append(archive_filename, filename)
+       @writer.add_file(archive_filename, :mode => 0644, :mtime => Time.now) do |archive_file|
+         archive_file.write File.read(filename)
+       end
+     end
+   end
+ end
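A usage sketch for the wrapper above (paths hypothetical): the block form of `open` guarantees the tar writer and file are closed, and `append` stores a file under a name of your choosing inside the archive, as `mysql_csv_export` does per CSV page:

```ruby
require 'data_exporter/archive'

DataExporter::Archive.open('/tmp/example_db.csv.tar') do |archive|
  # first argument: path inside the tar; second: the file on disk to copy in
  archive.append('export/users_1.csv', '/tmp/users_1.csv')
end
```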
@@ -0,0 +1,154 @@
+ require 'thor'
+ require 'logger'
+ require 'time'
+
+ module DataExporter
+   class CLI < Thor
+     include DataExporter::Actions
+
+     namespace :data
+     class_option 'debug', :aliases => '-d', :desc => 'Turn on debug logging', :default => false
+     class_option 'csv', :desc => 'use csv archive format', :default => false
+     class_option 'quiet', :aliases => '-q', :desc => 'silence diagnostic information', :default => false
+     class_option 'config_file', :aliases => '-f', :desc => 'configuration file to load', :required => true
+     class_option 'preserve', :aliases => '-p', :desc => 'preserve unpack after download', :default => false
+     class_option 'mode', :desc => 'backup mode, corresponding to a section of the data_exporter.yml configuration file'
+
+     # Expose the *_task methods under the command names the README documents.
+     map 'unpack' => :unpack_task, 'export' => :export_task, 'status' => :status_task
+
+     desc 'unpack', 'unpack mysql database'
+     method_option 'date', :desc => 'unpack export for date'
+     method_option 'unpack_dir', :desc => 'directory to unpack export into'
+     def unpack_task
+       config.load(options.merge(:mysql_required => false))
+
+       remote_backup = find_last_backup(config.backup_prefix, backup_suffix, config.backup_dir)
+
+       abort no_backups_message unless remote_backup
+
+       FileUtils.mkdir_p(config.download_dir)
+       FileUtils.mkdir_p(File.dirname(config.unpack_dir))
+
+       begin
+         local_encrypted_archive = download(remote_backup, config.download_dir)
+         log "expanding #{local_encrypted_archive}"
+         unpack(local_encrypted_archive, config.unpack_dir)
+       ensure
+         if local_encrypted_archive && !options[:preserve]
+           log "removing #{local_encrypted_archive}"
+           FileUtils.rm(local_encrypted_archive)
+         end
+       end
+     end
+
+     desc 'export', 'export mysql database'
+     method_option 'pii_file', :desc => 'tables and columns labeled as pii'
+     method_option 'archive_dir', :desc => 'local directory to export'
+     method_option 'date', :desc => 'date of export'
+     def export_task
+       config.load(options.merge(:mysql_required => !options[:archive_dir]))
+       raise ArgumentError, '--csv required for --archive-dir' if options[:archive_dir] && !config.csv_enabled?
+
+       local_encrypted_archive_name = File.join(config.export_dir, encrypted_export_archive_name)
+       log "creating #{local_encrypted_archive_name}"
+       export(config.backup_key, local_encrypted_archive_name, options[:archive_dir])
+
+       begin
+         upload(local_encrypted_archive_name, config.backup_dir)
+       ensure
+         unless options[:preserve]
+           log "removing #{local_encrypted_archive_name}"
+           FileUtils.rm(local_encrypted_archive_name)
+         end
+       end
+     end
+
+     desc 'status', 'display status of current exported backups'
+     method_option 'date', :desc => 'date of export to check'
+     method_option 'redis_key_prefix', :desc => 'redis_key_prefix for monitoring keys'
+     def status_task
+       config.load(options)
+
+       remote_backup = find_last_backup(config.backup_prefix, backup_suffix, config.backup_dir)
+
+       abort no_backups_message unless remote_backup
+       log("last backup %s at %s (%s)" % [remote_backup.name, Time.at(remote_backup.mtime).utc.iso8601, bytes(remote_backup.size)])
+
+       update_redis_counters(remote_backup, options[:redis_key_prefix]) if options[:redis_key_prefix]
+     end
+
+     private
+
+     def logger
+       @logger ||= Logger.new(STDOUT)
+     end
+
+     def log(*a)
+       if options[:debug]
+         logger.info(*a)
+       else
+         say(*a) unless options[:quiet]
+       end
+     end
+
+     def export_date_format
+       (options[:date] ? Date.parse(options[:date]) : Time.now).strftime("%Y-%m-%d-%H-%M")
+     end
+
+     def export_base_name
+       "#{config.backup_prefix}_#{export_date_format}_db"
+     end
+
+     def export_archive_name
+       if config.csv_enabled?
+         [export_base_name, 'csv', 'tar'].join('.')
+       else
+         [export_base_name, 'sql'].join('.')
+       end
+     end
+
+     def encrypted_export_archive_name
+       [export_archive_name, 'gz', 'enc'].join('.')
+     end
+
+     def backup_suffix
+       suffix =
+         if config.csv_enabled?
+           ['csv', 'tar', 'gz', 'enc']
+         else
+           ['sql', 'gz', 'enc']
+         end
+       suffix.unshift "#{options[:date]}*_db" if options[:date]
+       suffix.join('.')
+     end
+
+     def no_backups_message
+       if config.sftp_enabled?
+         "No backups found in #{config.backup_dir} matching #{config.backup_prefix}.*#{backup_suffix}"
+       else
+         "No backups found in #{config.s3[:bucket]}/#{config.backup_dir} matching #{config.backup_prefix}.*#{backup_suffix}"
+       end
+     end
+
+     def bytes(size)
+       {GB: 30, MB: 20, KB: 10}.each do |prefix, pow|
+         b = size >> pow
+         return "#{b} #{prefix}" if b > 0
+       end
+       "#{size} B"
+     end
+
+     def update_redis_counters(remote_backup, redis_key_prefix)
+       counters = {}
+       if config.csv_enabled?
+         counters["#{redis_key_prefix}:mysql_csv_last_backup_timestamp"] = remote_backup.mtime
+         counters["#{redis_key_prefix}:mysql_csv_last_backup_size"] = remote_backup.size
+       else
+         counters["#{redis_key_prefix}:mysql_last_backup_timestamp"] = remote_backup.mtime
+         counters["#{redis_key_prefix}:mysql_last_backup_size"] = remote_backup.size
+       end
+
+       counters.each do |key, val|
+         log "setting redis #{key} to #{val}"
+         redis.set(key, val)
+       end
+     end
+   end
+ end
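Putting the private naming helpers together, a worked example of the object name an export produces and `find_last_backup` later matches (timestamp and prefix hypothetical):

```ruby
base = 'centurion_development_2024-01-02-03-04_db' # backup_prefix + export_date_format + '_db'

[base, 'csv', 'tar', 'gz', 'enc'].join('.') # --csv export
# => "centurion_development_2024-01-02-03-04_db.csv.tar.gz.enc"

[base, 'sql', 'gz', 'enc'].join('.')        # mysqldump export
# => "centurion_development_2024-01-02-03-04_db.sql.gz.enc"
```

`backup_suffix` rebuilds the same `.csv.tar.gz.enc` or `.sql.gz.enc` tail (optionally anchored to `--date`) when searching for the most recent backup.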