leda 0.0.1

@@ -0,0 +1,71 @@
+ require 'leda'
+
+ module Leda
+   ##
+   # Actually runs a dump or restore using the store info in a {Configuration}.
+   class Runner
+     attr_reader :current_env, :configuration
+
+     def initialize(current_env, configuration)
+       @current_env = current_env
+       @configuration = configuration
+     end
+
+     def directory(env, data_unit=nil, store=nil)
+       p = configuration.base_dir.join(env)
+       p = p.join(data_unit.name) if data_unit
+       p = p.join(store.name) if store
+       p
+     end
+
+     def relative_directory(env, data_unit=nil, store=nil)
+       directory(env, data_unit, store).
+         relative_path_from(configuration.project_root_dir)
+     end
+
+     ##
+     # Performs dumps for the configured stores. Can optionally be limited to
+     # one data unit and/or store type.
+     def dump(data_unit_name=nil, store_name=nil)
+       each_data_unit_store(data_unit_name, store_name).each do |data_unit, store|
+         dir = directory(@current_env, data_unit, store)
+         dir.mkpath
+         store.dump(dir)
+       end
+     end
+
+     def dump_relative_paths(data_unit_name=nil, store_name=nil)
+       each_data_unit_store(data_unit_name, store_name).flat_map do |data_unit, store|
+         relative_directory(@current_env, data_unit, store)
+       end
+     end
+
+     ##
+     # Performs restores for the configured stores. Can optionally be limited to
+     # one data unit and/or store type.
+     def restore_from(source_env, data_unit_name=nil, store_name=nil)
+       each_data_unit_store(data_unit_name, store_name).each do |data_unit, store|
+         store.restore_from(directory(source_env, data_unit, store))
+       end
+     end
+
+     private
+
+     def each_data_unit_store(data_unit_name=nil, store_name=nil)
+       Enumerator.new do |y|
+         yielded_any = false
+         configuration.data_units.each do |du|
+           if data_unit_name.nil? || du.name == data_unit_name
+             du.stores.each do |store|
+               if store_name.nil? || store.name == store_name
+                 yielded_any = true
+                 y << [du, store]
+               end
+             end
+           end
+         end
+         fail "No data configured that matches #{[data_unit_name, store_name].compact.join(':')}" unless yielded_any
+       end
+     end
+   end
+ end
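The Runner above resolves one directory per (environment, data unit, store) combination and delegates the actual work to each store. A minimal usage sketch, assuming a Leda::Configuration object whose construction is not part of this diff, with data unit and store names invented for illustration:

    # `config` is assumed to respond to #base_dir, #project_root_dir, and
    # #data_units, as the Runner requires; building it is not shown here.
    runner = Leda::Runner.new('development', config)

    runner.dump                         # dump every store in every data unit
    runner.dump('users', 'postgresql')  # limit to one data unit and store type
    runner.restore_from('staging')      # load staging's dump files into this env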
@@ -0,0 +1,53 @@
+ require 'leda'
+ require 'active_support/core_ext/string/inflections'
+
+ module Leda
+   ##
+   # Mix-in for defining the set of data needed from a particular backing store
+   # in a data unit. E.g., for a relational database it might be a set of tables.
+   #
+   # A store must define the following methods:
+   #
+   #     # Dump the configured data to the specified directory
+   #     # @param [Pathname]
+   #     def dump(directory); end
+   #
+   #     # Restore from the data found in the given directory
+   #     # @param [Pathname]
+   #     def restore_from(directory); end
+   module Store
+     attr_reader :options
+
+     def initialize(options={})
+       @options = options.dup
+     end
+
+     def name
+       Store.default_name(self.class)
+     end
+
+     def self.default_name(clazz)
+       clazz.name.demodulize.underscore
+     end
+
+     def self.registered_stores
+       @registered_stores ||= {}
+     end
+
+     def self.included(included_into)
+       register_store(included_into, default_name(included_into))
+     end
+
+     def self.register_store(store_class, name)
+       registered_stores[name.to_s] = store_class
+     end
+
+     def self.find(store_name)
+       registered_stores[store_name.to_s]
+     end
+   end
+ end
+
+ # XXX: temporary
+ require 'leda/stores/postgresql'
+ require 'leda/stores/elasticsearch'
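The module doc comment above spells out the two-method store contract, and self.included auto-registers any class that mixes Store in. A sketch of a custom store built against that contract; the FlatFile class and its :paths option are invented for illustration and are not part of the gem:

    require 'fileutils'
    require 'leda'

    module Leda
      module Stores
        # Illustrative only. Including Leda::Store registers this class
        # under its demodulized, underscored name: "flat_file".
        class FlatFile
          include Leda::Store

          # Dump the configured data to the specified directory
          def dump(directory)
            options[:paths].each do |path|
              FileUtils.cp(path, directory.join(File.basename(path)))
            end
          end

          # Restore from the data found in the given directory
          def restore_from(directory)
            options[:paths].each do |path|
              FileUtils.cp(directory.join(File.basename(path)), path)
            end
          end
        end
      end
    end

    Leda::Store.find('flat_file') # => Leda::Stores::FlatFile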
@@ -0,0 +1,191 @@
+ require 'leda'
+
+ require 'json'
+ require 'oj'
+
+ module Leda
+   module Stores
+     class Elasticsearch
+       include Leda::Store
+
+       attr_reader :indices
+
+       def initialize(*)
+         super
+
+         @indices = options[:indices] || options[:indexes]
+       end
+
+       def dump(directory)
+         Runner.new(directory, indices, es_client).dump
+       end
+
+       def restore_from(directory)
+         Runner.new(directory, indices, es_client).restore
+       end
+
+       private
+
+       def es_client
+         # TODO: make this configuration externalizable
+         ::Elasticsearch::Model.client
+       end
+
+       class Runner
+         attr_reader :directory, :indices, :es_client
+
+         def initialize(directory, indices, es_client)
+           @directory = directory
+           @indices = indices
+           @es_client = es_client
+         end
+
+         def dump
+           $stderr.puts "Exporting to #{echo_fn(directory)} ..."
+           indices.each do |index|
+             dump_index_metadata(index)
+             scan_all_records_into_bulk_format(index)
+           end
+           $stderr.puts "... export complete."
+         end
+
+         def restore
+           $stderr.puts "Importing from #{echo_fn(directory)} ..."
+           indices.each do |index|
+             replace_index(index)
+             bulk_load_records(index)
+           end
+           $stderr.puts "... import complete."
+         end
+
+         private
+
+         def echo_fn(pathname)
+           # TODO: an alternative
+           pathname.relative_path_from(Rails.root)
+         end
+
+         def mapping_filename(index)
+           directory.join("#{index}_mapping.json")
+         end
+
+         def settings_filename(index)
+           directory.join("#{index}_settings.json")
+         end
+
+         def bulk_records_filename(index)
+           directory.join("#{index}_bulk-records.json")
+         end
+
+         def dump_index_metadata(index)
+           dump_mapping(index)
+           dump_settings(index)
+         end
+
+         def dump_mapping(index)
+           fn = mapping_filename(index)
+           $stderr.puts "  - Dumping mapping for #{index} to #{echo_fn(fn)}"
+           mapping = es_client.indices.get_mapping index: index
+           fn.open('w') { |f| f.puts JSON.pretty_generate(mapping) }
+         end
+
+         def dump_settings(index)
+           fn = settings_filename(index)
+           $stderr.puts "  - Dumping settings for #{index} to #{echo_fn(fn)}"
+           settings = es_client.indices.get_settings index: index
+           fn.open('w') { |f| f.puts JSON.pretty_generate(settings) }
+         end
+
+         def scan_all_records_into_bulk_format(index)
+           fn = bulk_records_filename(index)
+           $stderr.puts "  - Dumping records for #{index} to #{echo_fn(fn)}"
+
+           # start the scroll with a search
+           results = es_client.search index: index, search_type: 'scan', scroll: '5m', size: 500
+           total_ct = results['hits']['total']
+
+           written_ct = 0
+           fn.open('w:utf-8') do |f|
+             while (results = es_client.scroll(scroll_id: results['_scroll_id'], scroll: '5m')) && !results['hits']['hits'].empty?
+               results['hits']['hits'].each do |hit|
+                 f.puts convert_to_bulk_index_rows(hit)
+               end
+               written_ct += results['hits']['hits'].size
+               $stderr.print "\r    #{written_ct} / #{total_ct} => %5.1f%% done" % (written_ct * 100.0 / total_ct)
+             end
+           end
+           $stderr.puts "\r    #{written_ct} / #{total_ct} => all done."
+         end
+
+         def convert_to_bulk_index_rows(hit)
+           [
+             Oj.dump({ "index" => hit.slice("_index", "_type", "_id") }),
+             Oj.dump(hit['_source'])
+           ]
+         end
+
+         def replace_index(index)
+           map_fn = mapping_filename(index)
+           $stderr.puts "  - Reading mapping from #{echo_fn(map_fn)}"
+           mappings = Oj.load(map_fn.read).values.first # assume only one index
+
+           set_fn = settings_filename(index)
+           $stderr.puts "  - Reading settings from #{echo_fn(set_fn)}"
+           settings = Oj.load(set_fn.read).values.first # assume only one index
+
+           body = {}.merge!(mappings).merge!(settings)
+
+           begin
+             $stderr.print "  - Deleting index #{index} ... "
+             es_client.indices.delete index: index
+             $stderr.puts "done"
+           rescue ::Elasticsearch::Transport::Transport::Errors::NotFound
+             $stderr.puts "not necessary"
+           end
+
+           $stderr.puts "  - Creating index #{index} using settings and mapping"
+           es_client.indices.create index: index, body: body
+         end
+
+         RESTORE_BATCH_DISPATCH_TRIGGER = (1 << 19) # stay below 1MB per request
+         # N.b.: assumes that each bulk op is two lines. This is true
+         # so long as they are all index ops.
+         BULK_LINES_PER_RECORD = 2
+
+         def bulk_load_records(index)
+           fn = bulk_records_filename(index)
+           $stderr.puts "  - Reading records for #{index} from #{echo_fn(fn)}"
+
+           total_ct = 0
+           fn.each_line { |l| total_ct += 1 }
+           total_ct /= BULK_LINES_PER_RECORD
+
+           batch = ""
+           batch_line_ct = 0
+           loaded_ct = 0
+           fn.each_line do |line|
+             batch_line_ct += 1
+             batch << line
+             if batch_line_ct % BULK_LINES_PER_RECORD == 0 && batch.size > RESTORE_BATCH_DISPATCH_TRIGGER
+               bulk_load_batch(batch)
+               loaded_ct += batch_line_ct / BULK_LINES_PER_RECORD
+               $stderr.print "\r    #{loaded_ct} / #{total_ct} => %5.1f%% done" % (loaded_ct * 100.0 / total_ct)
+               batch = ""
+               batch_line_ct = 0
+             end
+           end
+           unless batch.empty?
+             bulk_load_batch(batch)
+             loaded_ct += batch_line_ct / BULK_LINES_PER_RECORD
+           end
+           $stderr.puts "\r    #{loaded_ct} / #{total_ct} => all done."
+         end
+
+         def bulk_load_batch(batch)
+           es_client.bulk body: batch
+         end
+
+       end
+     end
+   end
+ end
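Each record lands in the bulk file as two NDJSON lines — an action row followed by the document source — which is what BULK_LINES_PER_RECORD encodes. A usage sketch for this store, assuming Elasticsearch::Model.client is already configured, with index names and the target directory invented:

    require 'pathname'
    require 'leda/stores/elasticsearch'

    store = Leda::Stores::Elasticsearch.new(indices: %w(articles comments))

    dir = Pathname.new('leda/development/content/elasticsearch')
    dir.mkpath                # the store expects the directory to exist
    store.dump(dir)           # writes *_mapping.json, *_settings.json, *_bulk-records.json
    store.restore_from(dir)   # drops, recreates, and bulk-loads each index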
@@ -0,0 +1,134 @@
+ require 'leda'
+
+ require 'tempfile'
+ require 'shellwords'
+
+ module Leda
+   module Stores
+     ##
+     # Store for PostgreSQL. Uses the PG command-line utilities to dump and restore.
+     #
+     # Options:
+     #
+     # * `:tables`. An array of table names to dump/restore. The tables will be
+     #   restored in the order given in the array.
+     class Postgresql
+       include Leda::Store
+
+       attr_reader :tables
+
+       def initialize(*args)
+         super
+
+         @tables = options[:tables]
+         @filter_executable = options[:filter]
+       end
+
+       def filename(directory)
+         directory.join('dump.psql')
+       end
+
+       def dump(directory)
+         pgenv
+
+         fn = filename(directory).to_s
+         $stderr.puts "Exporting to #{fn} ..."
+         dump_cmd = (['pg_dump', '-a', '-Fp', '-O', '-x'] + tables.flat_map { |t| ['-t', t] }).shelljoin
+
+         # TODO:
+         filter_cmd = nil
+         if @filter_executable
+           filter_cmd = "| #{@filter_executable}"
+         end
+
+         out_cmd = "> " + [fn].shelljoin
+         if system([dump_cmd, filter_cmd, out_cmd].compact.join(' '))
+           $stderr.puts "... export complete."
+         else
+           fail "Export failed."
+         end
+       end
+
+       def restore_from(directory)
+         pgenv
+
+         source_file = filename(directory)
+
+         unless source_file.exist?
+           fail "Expected provider dump not found: #{source_file}"
+         end
+
+         begin
+           $stderr.puts "Importing from #{source_file}"
+           IO.popen('psql -aq', 'w') do |psql|
+             psql.puts '\set ON_ERROR_STOP'
+             psql.puts "BEGIN;"
+             psql.puts "TRUNCATE #{tables.join(', ')} CASCADE;"
+             psql.puts source_file.read
+             psql.puts "COMMIT;"
+           end
+         rescue Errno::EPIPE
+           $stderr.puts "psql terminated early; check above for the reason"
+         end
+       end
+
+       private
+
+       def database_config
+         # TODO: make this agnostic
+         @database_config ||= ActiveRecord::Base.configurations[Rails.env].reverse_merge(
+           'host' => 'localhost'
+         )
+       end
+
+       ##
+       # Sets the libpq environment variables based on the current AR database config.
+       def pgenv
+         pgenv_values.each do |env_var, value|
+           ENV[env_var] = value.to_s if value
+         end
+         ENV['PGPASSFILE'] = temporary_pgpassfile
+       end
+
+       ##
+       # Computes, but does not set into the environment, the libpq env vars implied
+       # by the current AR-style config hash. Does not include the password, since
+       # libpq has no environment variable for the password itself.
+       def pgenv_values
+         @pgenv_values ||=
+           {
+             'host'     => 'PGHOST',
+             'port'     => 'PGPORT',
+             'username' => 'PGUSER',
+             'database' => 'PGDATABASE',
+           }.each_with_object({}) do |(param_name, env_var), values|
+             values[env_var] = database_config[param_name] if database_config[param_name]
+           end
+       end
+
+       ##
+       # Creates a temporary pgpass file based on the current AR-style config hash.
+       # Returns the path to the file. This file is automatically deleted when
+       # the interpreter shuts down, so don't share the path outside of the process
+       # and its children.
+       def temporary_pgpassfile
+         return @temporary_pgpassfile.path if @temporary_pgpassfile
+
+         pass = database_config['password']
+
+         # Tempfile.open with a block does not return the tempfile itself
+         begin
+           # must maintain a reference to the tempfile object so that it doesn't
+           # get deleted until we're done with it.
+           @temporary_pgpassfile = Tempfile.open('dc')
+           @temporary_pgpassfile.chmod(0600)
+           @temporary_pgpassfile.puts [pgenv_values['PGHOST'], pgenv_values['PGPORT'] || '*', pgenv_values['PGDATABASE'], pgenv_values['PGUSER'], pass].join(':')
+         ensure
+           @temporary_pgpassfile.close
+         end
+
+         @temporary_pgpassfile.path
+       end
+     end
+   end
+ end
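A comparable sketch for the PostgreSQL store. It must run inside a process where ActiveRecord::Base.configurations and Rails.env are available (e.g., a Rails console); the table names and target directory are invented:

    require 'pathname'
    require 'leda/stores/postgresql'

    store = Leda::Stores::Postgresql.new(tables: %w(users accounts))

    dir = Pathname.new('leda/development/core/postgresql')
    dir.mkpath
    store.dump(dir)           # shells out to pg_dump, writing dump.psql
    store.restore_from(dir)   # truncates the tables, then replays the dump via psql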