leda 0.0.1

@@ -0,0 +1,71 @@
+ require 'leda'
+
+ module Leda
+   ##
+   # Actually runs a dump or restore using the store info in a {Configuration}.
+   class Runner
+     attr_reader :current_env, :configuration
+
+     def initialize(current_env, configuration)
+       @current_env = current_env
+       @configuration = configuration
+     end
+
+     def directory(env, data_unit=nil, store=nil)
+       p = configuration.base_dir.join(env)
+       p = p.join(data_unit.name) if data_unit
+       p = p.join(store.name) if store
+       p
+     end
+
+     def relative_directory(env, data_unit=nil, store=nil)
+       directory(env, data_unit, store).
+         relative_path_from(configuration.project_root_dir)
+     end
+
+     ##
+     # Performs dumps for the configured stores. Can optionally be limited to
+     # one data unit and/or store type.
+     def dump(data_unit_name=nil, store_name=nil)
+       each_data_unit_store(data_unit_name, store_name).each do |data_unit, store|
+         dir = directory(@current_env, data_unit, store)
+         dir.mkpath
+         store.dump(dir)
+       end
+     end
+
+     def dump_relative_paths(data_unit_name=nil, store_name=nil)
+       each_data_unit_store(data_unit_name, store_name).flat_map do |data_unit, store|
+         relative_directory(@current_env, data_unit, store)
+       end
+     end
+
+     ##
+     # Performs restores for the configured stores. Can optionally be limited to
+     # one data unit and/or store type.
+     def restore_from(source_env, data_unit_name=nil, store_name=nil)
+       each_data_unit_store(data_unit_name, store_name).each do |data_unit, store|
+         store.restore_from(directory(source_env, data_unit, store))
+       end
+     end
+
+     private
+
+     def each_data_unit_store(data_unit_name=nil, store_name=nil)
+       Enumerator.new do |y|
+         yielded_any = false
+         configuration.data_units.each do |du|
+           if data_unit_name.nil? || du.name == data_unit_name
+             du.stores.each do |store|
+               if store_name.nil? || store.name == store_name
+                 yielded_any = true
+                 y << [du, store]
+               end
+             end
+           end
+         end
+         fail "No data configured that matches #{[data_unit_name, store_name].compact.join(':')}" unless yielded_any
+       end
+     end
+   end
+ end
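The Runner above resolves one directory per (environment, data unit, store) combination and delegates the actual work to each store. A minimal usage sketch, assuming a Leda::Configuration object whose construction is not part of this diff, with data unit and store names invented for illustration:

    # `config` is assumed to respond to #base_dir, #project_root_dir, and
    # #data_units, as the Runner requires; building it is not shown here.
    runner = Leda::Runner.new('development', config)

    runner.dump                         # dump every store in every data unit
    runner.dump('users', 'postgresql')  # limit to one data unit and store type
    runner.restore_from('staging')      # load staging's dump files into this env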
@@ -0,0 +1,53 @@
+ require 'leda'
+ require 'active_support/core_ext/string/inflections'
+
+ module Leda
+   ##
+   # Mix-in for defining the set of data needed from a particular backing store
+   # in a data unit. E.g., for a relational database it might be a set of tables.
+   #
+   # A store must define the following methods:
+   #
+   #     # Dump the configured data to the specified directory
+   #     # @param [Pathname]
+   #     def dump(directory); end
+   #
+   #     # Restore from the data found in the given directory
+   #     # @param [Pathname]
+   #     def restore_from(directory); end
+   module Store
+     attr_reader :options
+
+     def initialize(options={})
+       @options = options.dup
+     end
+
+     def name
+       Store.default_name(self.class)
+     end
+
+     def self.default_name(clazz)
+       clazz.name.demodulize.underscore
+     end
+
+     def self.registered_stores
+       @registered_stores ||= {}
+     end
+
+     def self.included(included_into)
+       register_store(included_into, default_name(included_into))
+     end
+
+     def self.register_store(store_class, name)
+       registered_stores[name.to_s] = store_class
+     end
+
+     def self.find(store_name)
+       registered_stores[store_name.to_s]
+     end
+   end
+ end
+
+ # XXX: temporary
+ require 'leda/stores/postgresql'
+ require 'leda/stores/elasticsearch'
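The module doc comment above spells out the two-method store contract, and self.included auto-registers any class that mixes Store in. A sketch of a custom store built against that contract; the FlatFile class and its :paths option are invented for illustration and are not part of the gem:

    require 'fileutils'
    require 'leda'

    module Leda
      module Stores
        # Illustrative only. Including Leda::Store registers this class
        # under its demodulized, underscored name: "flat_file".
        class FlatFile
          include Leda::Store

          # Dump the configured data to the specified directory
          def dump(directory)
            options[:paths].each do |path|
              FileUtils.cp(path, directory.join(File.basename(path)))
            end
          end

          # Restore from the data found in the given directory
          def restore_from(directory)
            options[:paths].each do |path|
              FileUtils.cp(directory.join(File.basename(path)), path)
            end
          end
        end
      end
    end

    Leda::Store.find('flat_file') # => Leda::Stores::FlatFile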
@@ -0,0 +1,191 @@
+ require 'leda'
+
+ require 'json'
+ require 'oj'
+
+ module Leda
+   module Stores
+     class Elasticsearch
+       include Leda::Store
+
+       attr_reader :indices
+
+       def initialize(*)
+         super
+
+         @indices = options[:indices] || options[:indexes]
+       end
+
+       def dump(directory)
+         Runner.new(directory, indices, es_client).dump
+       end
+
+       def restore_from(directory)
+         Runner.new(directory, indices, es_client).restore
+       end
+
+       private
+
+       def es_client
+         # TODO: make this configuration externalizable
+         ::Elasticsearch::Model.client
+       end
+
+       class Runner
+         attr_reader :directory, :indices, :es_client
+
+         def initialize(directory, indices, es_client)
+           @directory = directory
+           @indices = indices
+           @es_client = es_client
+         end
+
+         def dump
+           $stderr.puts "Exporting to #{echo_fn(directory)} ..."
+           indices.each do |index|
+             dump_index_metadata(index)
+             scan_all_records_into_bulk_format(index)
+           end
+           $stderr.puts "... export complete."
+         end
+
+         def restore
+           $stderr.puts "Importing from #{echo_fn(directory)} ..."
+           indices.each do |index|
+             replace_index(index)
+             bulk_load_records(index)
+           end
+           $stderr.puts "... import complete."
+         end
+
+         private
+
+         def echo_fn(pathname)
+           # TODO: an alternative
+           pathname.relative_path_from(Rails.root)
+         end
+
+         def mapping_filename(index)
+           directory.join("#{index}_mapping.json")
+         end
+
+         def settings_filename(index)
+           directory.join("#{index}_settings.json")
+         end
+
+         def bulk_records_filename(index)
+           directory.join("#{index}_bulk-records.json")
+         end
+
+         def dump_index_metadata(index)
+           dump_mapping(index)
+           dump_settings(index)
+         end
+
+         def dump_mapping(index)
+           fn = mapping_filename(index)
+           $stderr.puts "  - Dumping mapping for #{index} to #{echo_fn(fn)}"
+           mapping = es_client.indices.get_mapping index: index
+           fn.open('w') { |f| f.puts JSON.pretty_generate(mapping) }
+         end
+
+         def dump_settings(index)
+           fn = settings_filename(index)
+           $stderr.puts "  - Dumping settings for #{index} to #{echo_fn(fn)}"
+           settings = es_client.indices.get_settings index: index
+           fn.open('w') { |f| f.puts JSON.pretty_generate(settings) }
+         end
+
+         def scan_all_records_into_bulk_format(index)
+           fn = bulk_records_filename(index)
+           $stderr.puts "  - Dumping records for #{index} to #{echo_fn(fn)}"
+
+           # start the scroll with a search
+           results = es_client.search index: index, search_type: 'scan', scroll: '5m', size: 500
+           total_ct = results['hits']['total']
+
+           written_ct = 0
+           fn.open('w:utf-8') do |f|
+             while (results = es_client.scroll(scroll_id: results['_scroll_id'], scroll: '5m')) && !results['hits']['hits'].empty?
+               results['hits']['hits'].each do |hit|
+                 f.puts convert_to_bulk_index_rows(hit)
+               end
+               written_ct += results['hits']['hits'].size
+               $stderr.print "\r    #{written_ct} / #{total_ct} => %5.1f%% done" % (written_ct * 100.0 / total_ct)
+             end
+           end
+           $stderr.puts "\r    #{written_ct} / #{total_ct} => all done."
+         end
+
+         def convert_to_bulk_index_rows(hit)
+           [
+             Oj.dump({ "index" => hit.slice("_index", "_type", "_id") }),
+             Oj.dump(hit['_source'])
+           ]
+         end
+
+         def replace_index(index)
+           map_fn = mapping_filename(index)
+           $stderr.puts "  - Reading mapping from #{echo_fn(map_fn)}"
+           mappings = Oj.load(map_fn.read).values.first # assume only one index
+
+           set_fn = settings_filename(index)
+           $stderr.puts "  - Reading settings from #{echo_fn(set_fn)}"
+           settings = Oj.load(set_fn.read).values.first # assume only one index
+
+           body = {}.merge!(mappings).merge!(settings)
+
+           begin
+             $stderr.print "  - Deleting index #{index} ... "
+             es_client.indices.delete index: index
+             $stderr.puts "done"
+           rescue ::Elasticsearch::Transport::Transport::Errors::NotFound
+             $stderr.puts "not necessary"
+           end
+
+           $stderr.puts "  - Creating index #{index} using settings and mapping"
+           es_client.indices.create index: index, body: body
+         end
+
+         RESTORE_BATCH_DISPATCH_TRIGGER = (1 << 19) # stay below 1MB per request
+         # N.b.: assumes that each bulk op is two lines. This is true
+         # so long as they are all index ops.
+         BULK_LINES_PER_RECORD = 2
+
+         def bulk_load_records(index)
+           fn = bulk_records_filename(index)
+           $stderr.puts "  - Reading records for #{index} from #{echo_fn(fn)}"
+
+           total_ct = 0
+           fn.each_line { |l| total_ct += 1 }
+           total_ct /= BULK_LINES_PER_RECORD
+
+           batch = ""
+           batch_line_ct = 0
+           loaded_ct = 0
+           fn.each_line do |line|
+             batch_line_ct += 1
+             batch << line
+             if batch_line_ct % BULK_LINES_PER_RECORD == 0 && batch.size > RESTORE_BATCH_DISPATCH_TRIGGER
+               bulk_load_batch(batch)
+               loaded_ct += batch_line_ct / BULK_LINES_PER_RECORD
+               $stderr.print "\r    #{loaded_ct} / #{total_ct} => %5.1f%% done" % (loaded_ct * 100.0 / total_ct)
+               batch = ""
+               batch_line_ct = 0
+             end
+           end
+           unless batch.empty?
+             bulk_load_batch(batch)
+             loaded_ct += batch_line_ct / BULK_LINES_PER_RECORD
+           end
+           $stderr.puts "\r    #{loaded_ct} / #{total_ct} => all done."
+         end
+
+         def bulk_load_batch(batch)
+           es_client.bulk body: batch
+         end
+
+       end
+     end
+   end
+ end
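Each record lands in the bulk file as two NDJSON lines — an action row followed by the document source — which is what BULK_LINES_PER_RECORD encodes. A usage sketch for this store, assuming Elasticsearch::Model.client is already configured, with index names and the target directory invented:

    require 'pathname'
    require 'leda/stores/elasticsearch'

    store = Leda::Stores::Elasticsearch.new(indices: %w(articles comments))

    dir = Pathname.new('leda/development/content/elasticsearch')
    dir.mkpath                # the store expects the directory to exist
    store.dump(dir)           # writes *_mapping.json, *_settings.json, *_bulk-records.json
    store.restore_from(dir)   # drops, recreates, and bulk-loads each index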
@@ -0,0 +1,134 @@
+ require 'leda'
+
+ require 'tempfile'
+ require 'shellwords'
+
+ module Leda
+   module Stores
+     ##
+     # Store for PostgreSQL. Uses the PG command-line utilities to dump and restore.
+     #
+     # Options:
+     #
+     # * `:tables`. An array of table names to dump/restore. The tables will be
+     #   restored in the order given in the array.
+     class Postgresql
+       include Leda::Store
+
+       attr_reader :tables
+
+       def initialize(*args)
+         super
+
+         @tables = options[:tables]
+         @filter_executable = options[:filter]
+       end
+
+       def filename(directory)
+         directory.join('dump.psql')
+       end
+
+       def dump(directory)
+         pgenv
+
+         fn = filename(directory).to_s
+         $stderr.puts "Exporting to #{fn} ..."
+         dump_cmd = (['pg_dump', '-a', '-Fp', '-O', '-x'] + tables.flat_map { |t| ['-t', t] }).shelljoin
+
+         # TODO:
+         filter_cmd = nil
+         if @filter_executable
+           filter_cmd = "| #{@filter_executable}"
+         end
+
+         out_cmd = "> " + [fn].shelljoin
+         if system([dump_cmd, filter_cmd, out_cmd].compact.join(' '))
+           $stderr.puts "... export complete."
+         else
+           fail "Export failed."
+         end
+       end
+
+       def restore_from(directory)
+         pgenv
+
+         source_file = filename(directory)
+
+         unless source_file.exist?
+           fail "Expected provider dump not found: #{source_file}"
+         end
+
+         begin
+           $stderr.puts "Importing from #{source_file}"
+           IO.popen('psql -aq', 'w') do |psql|
+             psql.puts '\set ON_ERROR_STOP'
+             psql.puts "BEGIN;"
+             psql.puts "TRUNCATE #{tables.join(', ')} CASCADE;"
+             psql.puts source_file.read
+             psql.puts "COMMIT;"
+           end
+         rescue Errno::EPIPE
+           $stderr.puts "psql terminated early; check above for the reason"
+         end
+       end
+
+       private
+
+       def database_config
+         # TODO: make this agnostic
+         @database_config ||= ActiveRecord::Base.configurations[Rails.env].reverse_merge(
+           'host' => 'localhost'
+         )
+       end
+
+       ##
+       # Sets the libpq environment variables based on the current AR database config.
+       def pgenv
+         pgenv_values.each do |env_var, value|
+           ENV[env_var] = value.to_s if value
+         end
+         ENV['PGPASSFILE'] = temporary_pgpassfile
+       end
+
+       ##
+       # Computes, but does not set into the environment, the libpq env vars implied
+       # by the current AR-style config hash. Does not include the password, since
+       # libpq has no environment variable for the password itself.
+       def pgenv_values
+         @pgenv_values ||=
+           {
+             'host'     => 'PGHOST',
+             'port'     => 'PGPORT',
+             'username' => 'PGUSER',
+             'database' => 'PGDATABASE',
+           }.each_with_object({}) do |(param_name, env_var), values|
+             values[env_var] = database_config[param_name] if database_config[param_name]
+           end
+       end
+
+       ##
+       # Creates a temporary pgpass file based on the current AR-style config hash.
+       # Returns the path to the file. This file is automatically deleted when
+       # the interpreter shuts down, so don't share the path outside of the process
+       # and its children.
+       def temporary_pgpassfile
+         return @temporary_pgpassfile.path if @temporary_pgpassfile
+
+         pass = database_config['password']
+
+         # Tempfile.open with a block does not return the tempfile itself
+         begin
+           # must maintain a reference to the tempfile object so that it doesn't
+           # get deleted until we're done with it.
+           @temporary_pgpassfile = Tempfile.open('dc')
+           @temporary_pgpassfile.chmod(0600)
+           @temporary_pgpassfile.puts [pgenv_values['PGHOST'], pgenv_values['PGPORT'] || '*', pgenv_values['PGDATABASE'], pgenv_values['PGUSER'], pass].join(':')
+         ensure
+           @temporary_pgpassfile.close
+         end
+
+         @temporary_pgpassfile.path
+       end
+     end
+   end
+ end
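A comparable sketch for the PostgreSQL store. It must run inside a process where ActiveRecord::Base.configurations and Rails.env are available (e.g., a Rails console); the table names and target directory are invented:

    require 'pathname'
    require 'leda/stores/postgresql'

    store = Leda::Stores::Postgresql.new(tables: %w(users accounts))

    dir = Pathname.new('leda/development/core/postgresql')
    dir.mkpath
    store.dump(dir)           # shells out to pg_dump, writing dump.psql
    store.restore_from(dir)   # truncates the tables, then replays the dump via psql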