cassback 0.1.1
- checksums.yaml +7 -0
- data/bin/cassback.rb +168 -0
- data/lib/backuptool.rb +201 -0
- data/lib/cassandra.rb +153 -0
- data/lib/hadoop.rb +14 -0
- metadata +121 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: 619f294e60950a9f4122ebd6879c7e51a0e524d2
  data.tar.gz: f1e827c821dd8301ce03276ee5e87ca2c11e8092
SHA512:
  metadata.gz: 6ef3cbf11aeca4fb97dde85241807db0c159b475c09c7cd41f75464484dbe22566590df0e4f48a540096607b14fe8a2fe8dca269b43629144f6b5bf8c8130a5b
  data.tar.gz: 29ece32e3a289a3240822c9b27f3e7bd0d7fda87cf91838221343448d2056ff872f8ab056169624dde35261d153fb30aee0327f02746febd1bf06cd0e3fd3154
data/bin/cassback.rb
ADDED
@@ -0,0 +1,168 @@
#!/usr/bin/env ruby
require 'logger'
require 'optparse'
require 'yaml'

require_relative '../lib/hadoop.rb'
require_relative '../lib/cassandra.rb'
require_relative '../lib/backuptool.rb'

# This allows merging hashes that can themselves contain hashes (deep merge).
class ::Hash
  def deep_merge!(second)
    merger = proc { |_key, v1, v2| Hash === v1 && Hash === v2 ? v1.merge(v2, &merger) : Array === v1 && Array === v2 ? v1 | v2 : [:undefined, nil, :nil].include?(v2) ? v1 : v2 }
    merge!(second.to_h, &merger)
  end
end

# Create a Ruby logger with time/size rotation that logs both to file and console.
two_mb = 2 * 1024 * 1024
logger = Logger.new('| tee cassback.log', 'weekly', two_mb)

# Default action
action = nil

# Default config file
config_file = ''

# Default command line config (nested keys pre-created so the option callbacks below can assign into them)
command_line_config = {
  'cassandra' => {},
  'hadoop' => {},
}

# Default options
options = {
  'cassandra' => {
    'config' => '/etc/cassandra/conf/cassandra.yaml',
  },
  'hadoop' => {
    'hostname' => 'localhost',
    'port' => 14_000,
    'directory' => 'cassandra',
  },
  'restore' => {
    'destination' => 'cassandra',
  },
}

# If no argument is given on the command line, print the help
ARGV << '-h' if ARGV.empty?

# Parse command line options
parser = OptionParser.new do |opts|
  opts.banner = 'Usage: cassback.rb [options]'

  opts.separator ''
  opts.separator 'Configuration:'
  opts.on('-C', '--config CONFIGFILE', 'Configuration file for the application') do |v|
    config_file = v
  end

  opts.separator ''
  opts.separator 'Actions:'
  opts.on('-S', '--snapshot', 'creates a new snapshot and sends it to Hadoop') do |_v|
    action = 'new'
  end
  opts.on('-R', '--restore', 'restores a snapshot from Hadoop, needs a date and a destination') do |_v|
    action = 'restore'
  end
  opts.on('-L', '--list', 'list snapshots on Hadoop') do |_v|
    action = 'list'
  end
  opts.on('-F', '--flush', 'removes a backed-up snapshot from Hadoop, needs a date') do |_v|
    action = 'delete'
  end

  opts.separator ''
  opts.separator 'Action related:'
  opts.on('-n', '--node NODE', 'Cassandra server node (default is current host)') do |v|
    options['node'] = v
  end
  opts.on('-d', '--date DATE', 'snapshot date, like YYYY_MM_DD') do |v|
    options['date'] = v
  end
  opts.on('-t', '--destination DIR', 'local destination path for restore (default is cassandra)') do |v|
    options['restore']['destination'] = v
  end

  opts.separator ''
  opts.separator 'Hadoop (WebHDFS):'
  opts.on('-H', '--host HOSTNAME', 'Hostname (default is localhost)') do |v|
    command_line_config['hadoop']['hostname'] = v
  end
  opts.on('-P', '--port PORT', 'Port (default is 14000)') do |v|
    command_line_config['hadoop']['port'] = v
  end
  opts.on('-D', '--directory DIRECTORY', 'Directory where to store backups (default is cassandra)') do |v|
    command_line_config['hadoop']['directory'] = v
  end

  opts.separator ''
  opts.separator 'Cassandra:'
  # Short flag changed to -c to avoid clashing with -F (--flush) above.
  opts.on('-c', '--cassandra CONFIGFILE', 'Cassandra configuration file (default is /etc/cassandra/conf/cassandra.yaml)') do |v|
    command_line_config['cassandra']['config'] = v
  end

  opts.separator ''
  opts.separator 'Help:'
  opts.on('-h', '--help', 'Displays Help') do
    puts opts
    exit
  end
end
parser.parse!

# Read the configuration file if it exists
begin
  options.deep_merge!(YAML.load_file(config_file))
  logger.info("Using configuration file #{config_file}")
rescue
  logger.warn('Unable to read configuration file, continue with default settings')
ensure
  # Merge with command line settings.
  options.deep_merge!(command_line_config)
end

# Fail if no action specified
if action.nil?
  logger.error('No action given')
  exit(1)
end

begin
  # Create the Hadoop object
  hadoop = Hadoop.new(host: options['hadoop']['hostname'], port: options['hadoop']['port'], base_dir: options['hadoop']['directory'])

  # Create the Cassandra object
  cassandra = Cassandra.new(options['cassandra']['config'], logger)

  # Create the backup object
  bck = BackupTool.new(cassandra, hadoop, logger)

  # If no node specified, use the local node
  options['node'] = cassandra.node_name unless options.include? 'node'

  # New snapshot
  if action == 'new'
    bck.new_snapshot

  # Restore a snapshot
  elsif action == 'restore'
    raise('No date given') unless options.include? 'date'
    bck.restore_snapshot(options['node'], options['date'], options['restore']['destination'])

  # List snapshots
  elsif action == 'list'
    bck.list_snapshots(node: options['node'])

  # Delete a snapshot
  elsif action == 'delete'
    raise('No date given') unless options.include? 'date'
    bck.delete_snapshots(node: options['node'], date: options['date'])
  end

# In case of failure
rescue Exception => e
  logger.error(e.message)
  exit(1)
end

exit(0)
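The script loads an optional application config file given with -C, deep-merges it over the defaults, then applies command-line overrides on top. A minimal sketch of such a file, mirroring the keys of the options hash above (the gateway hostname and restore path are illustrative placeholders, not values shipped with the gem):

# Hypothetical cassback configuration file, passed with: cassback.rb -C config.yml -S
cassandra:
  config: /etc/cassandra/conf/cassandra.yaml
hadoop:
  hostname: hdfs-gateway.example.org   # placeholder WebHDFS/HttpFS gateway
  port: 14000
  directory: cassandra                 # base directory on HDFS
restore:
  destination: /var/lib/cassandra/restore   # placeholder local restore path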
data/lib/backuptool.rb
ADDED
@@ -0,0 +1,201 @@
require 'fileutils'
require 'set'
require 'table_print'

# Buffer size, used for downloads
BUFFER_SIZE = 10_000_000

# Directory where metadata is
META_DIR = 'cass_snap_metadata'.freeze

class BackupTool
  # Create a new BackupTool instance
  # * *Args* :
  #   - +cassandra+ -> Cassandra instance
  #   - +hadoop+ -> HDFS instance
  #   - +logger+ -> Logger
  def initialize(cassandra, hadoop, logger)
    @cassandra = cassandra
    @hadoop = hadoop
    @logger = logger

    @metadir = META_DIR
  end

  # Look for snapshots
  # * *Args* :
  #   - +node+ -> Cassandra node name
  #   - +date+ -> snapshot date
  def search_snapshots(node: 'ALL', date: 'ALL')
    result = []

    def get_snapshot_metadata(node, date)
      remote = @hadoop.base_dir + '/' + @metadir + '/' + @cassandra.cluster_name + '/' + node + '/cass_snap_' + date
      return @hadoop.read(remote).split("\n").to_set
    rescue Exception => e
      raise("Could not read metadata : #{e.message}")
    end

    def get_snapshots_node(node, date)
      result = []
      begin
        if date == 'ALL'
          ls = @hadoop.list("#{@hadoop.base_dir}/#{@metadir}/#{@cassandra.cluster_name}/#{node}")
          ls.each do |item|
            date = item['pathSuffix'].gsub('cass_snap_', '')
            metadata = get_snapshot_metadata(node, date)
            snapshot = CassandraSnapshot.new(@cassandra.cluster_name, node, date, metadata)
            result.push(snapshot)
          end
        else
          metadata = get_snapshot_metadata(node, date)
          snapshot = CassandraSnapshot.new(@cassandra.cluster_name, node, date, metadata)
          result.push(snapshot)
        end
      rescue Exception => e
        @logger.warn("Could not get snapshots for node #{node} : #{e.message}")
      end
      result
    end

    if node == 'ALL'
      begin
        ls = @hadoop.list("#{@hadoop.base_dir}/#{@metadir}/#{@cassandra.cluster_name}")
        ls.each do |item|
          n = item['pathSuffix']
          result += get_snapshots_node(n, date)
        end
      rescue Exception => e
        @logger.warn("Could not get snapshots for cluster #{@cassandra.cluster_name} : #{e.message}")
      end
    else
      result = get_snapshots_node(node, date)
    end

    result.sort
  end

  def list_snapshots(node: @cassandra.node_name)
    @logger.info('Listing available snapshots')
    snapshots = search_snapshots(node: node)
    tp(snapshots, 'cluster', 'node', 'date')
  end

  def new_snapshot
    @logger.info('Starting a new snapshot')
    snapshot = @cassandra.new_snapshot

    existing = search_snapshots(node: snapshot.node)
    last = if existing.empty?
             CassandraSnapshot.new(snapshot.cluster, snapshot.node, 'never')
           else
             existing[-1]
           end

    @logger.info('Uploading tables to Hadoop')
    files = snapshot.metadata - last.metadata
    @logger.info("#{files.length} files to upload")
    files.each do |file|
      @logger.info("Sending file #{file} to Hadoop")
      local = @cassandra.data_path + '/' + file
      remote = @hadoop.base_dir + '/' + snapshot.cluster + '/' + snapshot.node + '/' + file
      @logger.debug("#{local} => #{remote}")
      f = File.open(local, 'r')
      @hadoop.create(remote, f, overwrite: true)
      f.close
    end

    @logger.info('Sending metadata to Hadoop')
    remote = @hadoop.base_dir + '/' + @metadir + '/' + snapshot.cluster + '/' + snapshot.node + '/cass_snap_' + snapshot.date
    @logger.debug("metadata => #{remote}")
    @hadoop.create(remote, snapshot.metadata.to_a * "\n", overwrite: true)

    @cassandra.delete_snapshot(snapshot)
    @logger.info('Success !')
  end

  def delete_snapshots(node: @cassandra.node_name, date: 'ALL')
    snapshots = search_snapshots(node: node, date: date)
    if snapshots.empty?
      raise('No snapshot found for deletion')
    else
      snapshots.each do |snapshot|
        @logger.info("Deleting snapshot #{snapshot}")
        node_snapshots = search_snapshots(node: snapshot.node)
        merged_metadata = Set.new
        node_snapshots.each do |s|
          merged_metadata += s.metadata if s != snapshot
        end
        files = snapshot.metadata - merged_metadata
        @logger.info("#{files.length} files to delete")
        files.each do |file|
          @logger.info("Deleting file #{file}")
          remote = @hadoop.base_dir + '/' + snapshot.cluster + '/' + snapshot.node + '/' + file
          @logger.debug("DELETE => #{remote}")
          @hadoop.delete(remote)
        end
        @logger.info('Deleting metadata in Hadoop')
        remote = @hadoop.base_dir + '/' + @metadir + '/' + snapshot.cluster + '/' + snapshot.node + '/cass_snap_' + snapshot.date
        @logger.debug("DELETE => #{remote}")
        @hadoop.delete(remote)
      end
    end
  end

  # Download a file from HDFS, buffered way
  # * *Args* :
  #   - +remote+ -> HDFS path
  #   - +local+ -> local path
  def buffered_download(remote, local)
    @logger.debug("#{remote} => #{local}")

    # Create the destination directory if it does not exist
    path = File.dirname(local)
    FileUtils.mkdir_p(path) unless File.exist?(path)

    file = open(local, 'wb')

    offset = 0
    length = BUFFER_SIZE
    print '['
    while length == BUFFER_SIZE
      print '#'
      content = @hadoop.read(remote, offset: offset, length: BUFFER_SIZE)
      file.write(content)
      length = content.length
      offset += length
    end
    print "]\n"

    file.close
  end

  # Restore a snapshot from HDFS
  # * *Args* :
  #   - +node+ -> node where the snapshot comes from
  #   - +date+ -> snapshot date
  #   - +destination+ -> local directory where to restore
  def restore_snapshot(node, date, destination)
    # Search the snapshot matching node and date
    snapshots = search_snapshots(node: node, date: date)

    if snapshots.empty?
      raise('No snapshot found for restore')
    elsif snapshots.length > 1
      raise('More than one candidate snapshot to restore')
    else
      snapshot = snapshots[0]
      @logger.info("Restoring snapshot #{snapshot}")
      @logger.info("#{snapshot.metadata.length} files to restore")

      # For each file in metadata
      snapshot.metadata.each do |file|
        @logger.info("Restoring file #{file}")
        local = destination + '/' + file
        remote = @hadoop.base_dir + '/' + snapshot.cluster + '/' + snapshot.node + '/' + file
        # Download the file from hdfs
        buffered_download(remote, local)
      end
      @logger.info('Success !')
    end
  end
end
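For orientation, bin/cassback.rb wires these classes together roughly as follows. This is a minimal sketch, assuming a reachable WebHDFS/HttpFS gateway and nodetool on the PATH; the hostname is a placeholder.

require 'logger'
require_relative 'hadoop'
require_relative 'cassandra'
require_relative 'backuptool'

logger = Logger.new(STDOUT)

# Placeholder gateway; in the CLI these values come from the merged options hash.
hadoop    = Hadoop.new(host: 'hdfs-gateway.example.org', port: 14_000, base_dir: 'cassandra')
cassandra = Cassandra.new('/etc/cassandra/conf/cassandra.yaml', logger)
tool      = BackupTool.new(cassandra, hadoop, logger)

tool.new_snapshot                               # snapshot, then upload only files missing from the previous one
tool.list_snapshots(node: cassandra.node_name)  # print a table of available snapshots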
data/lib/cassandra.rb
ADDED
@@ -0,0 +1,153 @@
require 'set'
require 'socket'
require 'yaml'

class Cassandra
  attr_reader :data_path, :cluster_name, :node_name

  def initialize(config_file, logger)
    @logger = logger

    read_config_file(config_file)

    @node_name = Socket.gethostname

    @logger.info("Cassandra cluster name = #{@cluster_name}")
    @logger.info("Cassandra node name = #{@node_name}")
    @logger.info("Cassandra data path = #{@data_path}")
  end

  def read_config_file(config_file)
    config = YAML.load_file(config_file)
    if config.include? 'cluster_name'
      @cluster_name = config['cluster_name'].tr(' ', '_')
    else
      @logger.warn("Could not find cluster name in Cassandra config file #{config_file}")
      @cluster_name = 'noname_cassandra_cluster'
    end
    if config.include? 'data_file_directories'
      if config['data_file_directories'].length == 1
        @data_path = config['data_file_directories'][0]
      else
        # TODO : manage multiple data directories
        raise('This backup tool does not currently work with multiple data directories')
      end
    else
      raise('No data directory defined in config file')
    end
  rescue Exception => e
    raise("Could not parse Cassandra config file #{config_file} (#{e.message})")
  end

  private :read_config_file

  def nodetool_snapshot(name)
    @logger.debug("Starting a new Cassandra snapshot #{name}")
    begin
      success = system('nodetool', 'snapshot', '-t', name)
      if success
        @logger.debug('Cassandra Snapshot successful')
      else
        raise
      end
    rescue Exception => e
      raise("Error while snapshot command (#{e.message})")
    end
  end

  private :nodetool_snapshot

  def nodetool_clearsnapshot(name)
    @logger.debug("Deleting snapshot #{name} in Cassandra")
    begin
      success = system('nodetool', 'clearsnapshot', '-t', name)
      if success
        @logger.debug('Cassandra Snapshot deletion successful')
      else
        raise
      end
    rescue Exception => e
      raise("Error while clearsnapshot command (#{e.message})")
    end
  end

  private :nodetool_clearsnapshot

  def get_keyspaces_and_tables
    result = {}
    Dir.foreach(@data_path) do |keyspace|
      next if keyspace == '.' || keyspace == '..'
      result[keyspace] = []
      Dir.foreach(@data_path + '/' + keyspace) do |table|
        next if table == '.' || table == '..'
        result[keyspace].push(table)
      end
    end
    result
  end

  private :get_keyspaces_and_tables

  def build_metadata(name)
    result = Set.new
    ks = get_keyspaces_and_tables
    ks.each do |keyspace, tables|
      tables.each do |table|
        snapdir = @data_path + '/' + keyspace + '/' + table + '/snapshots/' + name
        next unless Dir.exist?(snapdir)
        Dir.foreach(snapdir) do |filename|
          next if filename == '.' || filename == '..'
          result.add(keyspace + '/' + table + '/snapshots/' + name + '/' + filename)
        end
      end
    end
    result
  end

  private :build_metadata

  def new_snapshot
    today = Time.new.strftime('%Y_%m_%d')
    snapname = 'cass_snap_' + today

    nodetool_snapshot(snapname)
    metadata = build_metadata(snapname)

    CassandraSnapshot.new(@cluster_name, @node_name, today, metadata)
  end

  def delete_snapshot(snapshot)
    snapname = 'cass_snap_' + snapshot.date
    nodetool_clearsnapshot(snapname)
  end
end

class CassandraSnapshot
  attr_reader :cluster, :node, :date, :metadata

  def initialize(cluster, node, date, metadata = nil)
    @cluster = cluster
    @node = node
    @date = date
    @metadata = if metadata.nil?
                  Set.new
                else
                  metadata
                end
  end

  def to_s
    "[#{@cluster}|#{@node}|#{@date}]"
  end

  def ==(other)
    @cluster == other.cluster && @node == other.node && @date == other.date
  end

  def <=>(other)
    c = @cluster <=> other.cluster
    n = @node <=> other.node
    d = @date <=> other.date
    c * 3 + n * 2 + d
  end
end
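read_config_file only consumes two keys of the standard cassandra.yaml and rejects configurations with more than one data directory. A minimal sketch of the relevant section, using the stock Cassandra defaults purely for illustration:

cluster_name: 'Test Cluster'
data_file_directories:
- /var/lib/cassandra/data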
data/lib/hadoop.rb
ADDED
@@ -0,0 +1,14 @@
require 'webhdfs'
require 'webhdfs/fileutils'

WebHDFS::ClientV1::REDIRECTED_OPERATIONS.delete('OPEN')

class Hadoop < WebHDFS::Client
  attr_reader :base_dir

  def initialize(host: 'localhost', port: 14_000, base_dir: '/')
    super(host, port)
    @kerberos = true
    @base_dir = base_dir
  end
end
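Hadoop is a thin subclass of WebHDFS::Client from the webhdfs gem; the list, read, create and delete calls made by BackupTool are inherited from that client. A minimal sketch of direct use, with a placeholder gateway hostname:

# Sketch only: assumes a WebHDFS/HttpFS gateway reachable at the placeholder host.
hdfs = Hadoop.new(host: 'hdfs-gateway.example.org', port: 14_000, base_dir: 'cassandra')

hdfs.create("#{hdfs.base_dir}/example.txt", 'hello', overwrite: true)  # upload a small file
puts hdfs.read("#{hdfs.base_dir}/example.txt")                         # read it back
hdfs.list(hdfs.base_dir).each { |e| puts e['pathSuffix'] }             # list the base directory
hdfs.delete("#{hdfs.base_dir}/example.txt")                            # clean up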
metadata
ADDED
@@ -0,0 +1,121 @@
--- !ruby/object:Gem::Specification
name: cassback
version: !ruby/object:Gem::Version
  version: 0.1.1
platform: ruby
authors:
- Vincent Van Hollebeke
- Bogdan Niculescu
autorequire:
bindir: bin
cert_chain: []
date: 2016-04-20 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  name: bundler
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.11'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.11'
- !ruby/object:Gem::Dependency
  name: rake
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '10.0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '10.0'
- !ruby/object:Gem::Dependency
  name: webhdfs
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '0.8'
    - - ">="
      - !ruby/object:Gem::Version
        version: 0.8.0
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '0.8'
    - - ">="
      - !ruby/object:Gem::Version
        version: 0.8.0
- !ruby/object:Gem::Dependency
  name: table_print
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.5'
    - - ">="
      - !ruby/object:Gem::Version
        version: 1.5.6
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.5'
    - - ">="
      - !ruby/object:Gem::Version
        version: 1.5.6
description: This is a tool that allows creating backups of Cassandra and pushing
  them into HDFS.
email:
- v.vanhollebeke@criteo.com
- b.niculescu@criteo.com
executables:
- cassback.rb
extensions: []
extra_rdoc_files: []
files:
- bin/cassback.rb
- lib/backuptool.rb
- lib/cassandra.rb
- lib/hadoop.rb
homepage: http://rubygems.org/gems/cassback
licenses:
- Apache2
metadata: {}
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 2.5.2
signing_key:
specification_version: 4
summary: Cassandra backup to HDFS.
test_files: []
has_rdoc: