wonderdog 0.1.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +2 -0
- data/.idea/encodings.xml +5 -0
- data/.idea/misc.xml +5 -0
- data/.idea/modules.xml +9 -0
- data/.idea/scopes/scope_settings.xml +5 -0
- data/.idea/vcs.xml +7 -0
- data/.idea/wonderdog.iml +41 -0
- data/Gemfile +1 -1
- data/bin/estool +22 -1
- data/bin/squirrel.rb +108 -0
- data/lib/wonderdog.rb +3 -0
- data/lib/wonderdog/hadoop_invocation_override.rb +4 -1
- data/lib/wonderdog/version.rb +1 -1
- data/pom.xml +1 -1
- data/spec/spec_helper.rb +1 -1
- data/spec/wonderdog/hadoop_invocation_override_spec.rb +1 -1
- data/squirrel/all_facets.rb +95 -0
- data/squirrel/change_es_index_settings.rb +19 -0
- data/squirrel/clear_es_caches.rb +30 -0
- data/squirrel/esbackup.rb +184 -0
- data/squirrel/esbackup_stripped.rb +153 -0
- data/squirrel/fields.sh +5 -0
- data/squirrel/getFields.rb +19 -0
- data/squirrel/replay.rb +219 -0
- data/squirrel/squirrel.rb +95 -0
- data/squirrel/warmer_interface.rb +59 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java +2 -2
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java +14 -2
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java +20 -5
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java +55 -26
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java +59 -22
- data/test/cardinality.rb +43 -0
- data/test/change_es_index_settings.rb +19 -0
- data/test/clear_es_caches.rb +30 -0
- data/test/config/mapping.yml +327 -0
- data/test/config/mappings.yml +328 -0
- data/test/count_check.txt +0 -0
- data/test/esbackup_stripped.rb +153 -0
- data/test/mapping.yml +327 -0
- data/test/medium_slow_queries +41 -0
- data/test/queries.txt +0 -0
- data/test/quick_test_slow_queries +4 -0
- data/test/run_pry.rb +3 -0
- data/test/some_slow_queries +53 -0
- data/test/warmer_interface.rb +64 -0
- data/test/warmindices.rb +65 -0
- data/wonderdog.gemspec +1 -1
- metadata +40 -7
@@ -0,0 +1,184 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Simple script to dump elasticsearch indexes as raw JSON

require 'tire'
require 'zlib'
require 'socket'
require 'pathname'
require 'configliere'
require 'multi_json'

Settings.use :commandline
# FIX: 'duplicate' is handled by the dispatcher below but was missing from the
# advertised command list shown in --help.
def Settings.available_commands() %w[ backup restore duplicate ] ; end
def Settings.usage() 'usage: esbackup command [ path ] [..--param=value..]' ; end
Settings.description = <<-DESC.gsub(/^ {2}/, '').chomp
  Simple backup and restore tool for Elasticsearch.

  # Example backup command
  $ esbackup backup -c localhost -p 9200 -b 100 -i my_index -q '{"query":{"term":{"field":"search_term"}}}'

  # Example restore command
  $ esbackup restore -c localhost -p 9200 -i my_index -m mapping_file.json

  CAVEAT: Due to stupid fucking restrictions in the gem we use (tire), if you would like to use a dynamic query
  when backing up you have to specify it like so: -q '{"query_method":["*args_to_method"]}'

  Available commands:
  #{Settings.available_commands.map{ |cmd| '  ' << cmd }.join("\n")}
DESC
Settings.define :host,       default: Socket.gethostname, flag: 'c', description: 'Host to connect to Elasticsearch'
Settings.define :port,       default: 9200,  flag: 'p', description: 'Port to connect to Elasticsearch'
Settings.define :batch_size, default: 10000, flag: 'b', description: 'Batch size'
Settings.define :index,      flag: 'i', description: 'Index to backup/restore', required: true
Settings.define :mappings,   flag: 'm', description: 'Dump mappings to or load mappings from provided file'
Settings.define :query,      flag: 'q', description: 'A JSON hash containing a query to apply to the command (backup only)'
# FIX: a comma was missing between `default: nil` and `flag: 'd'`, which made
# this whole file a syntax error.
Settings.define :dump_file,  default: nil, flag: 'd', description: 'The name of the JSON dump file, if not passed the index name will be the dump files name'
Settings.resolve!

Tire::Configuration.url "http://#{Settings[:host]}:#{Settings[:port]}"
class ESBackup

  # Dump an Elasticsearch index to a gzipped file of newline-delimited JSON,
  # optionally saving the index mapping alongside it.
  #
  # output_dir - destination directory ('' => current working directory)
  # options    - :index, :batch_size, :mappings (mapping dump file),
  #              :query (JSON string), :dump_file (basename of the dump)
  def initialize(output_dir, options = {})
    # (removed a no-op `options = options.merge()` from the original)
    @output_dir   = output_dir || ''
    @index        = options[:index]
    @batch_size   = options[:batch_size].to_i
    @mapping_file = options[:mappings]
    # A missing or unparseable query silently falls back to a full-index dump.
    @query        = MultiJson.load(options[:query]) rescue nil
    # Default the dump's basename to the index name.
    @dump_file    = options[:dump_file].nil? ? @index : options[:dump_file]
  end

  # Write the index's current mapping (as JSON) to @mapping_file.
  def dump_mapping
    index = Tire::Index.new @index
    File.open(@mapping_file, 'w'){ |f| f.puts index.mapping.to_json }
  end

  # Expand dir to an absolute path (relative paths are anchored at the
  # current working directory) and make sure the directory exists.
  def fullpath dir
    require 'fileutils' # FIX: FileUtils was used without ever being required
    basedir = dir.start_with?('/') ? dir : File.join(Dir.pwd, dir)
    FileUtils.mkdir_p(basedir)
    basedir
  end

  # Full path of the gzip file the dump is written to.
  def gz_output
    # FIX: previously used @index here, which silently ignored --dump_file.
    File.join(fullpath(@output_dir), @dump_file + '.gz')
  end

  # Build a Tire scan-search over @index, replaying @query (when present)
  # through tire's block DSL.
  def create_scanner
    scan_opts = { size: @batch_size }
    additional_query = @query
    Tire::Search::Scan.new(@index, scan_opts) do
      # tire's Scan accepts queries only through its block DSL, so the parsed
      # @query hash is replayed by send-ing each key as a DSL method call.
      query do
        additional_query.each_pair do |key, vals|
          case vals
          # Assumes each entry targets only one field at a time.
          when Hash then self.send(key.to_sym, *vals.to_a.flatten)
          when Array then self.send(key.to_sym, *vals)
          end
        end
      end if additional_query
    end
  end

  # Stream every matching document into the gzip file, stripping
  # search-result bookkeeping fields, and report the record count.
  def run
    dump_mapping if @mapping_file
    gz = Zlib::GzipWriter.open gz_output
    count = 0
    create_scanner.each do |document|
      document.each do |record|
        json_doc = record.to_hash.except(:type, :_index, :_explanation, :_score, :_version, :highlight, :sort).to_json
        gz.puts json_doc
        count += 1
      end
    end
    gz.close
    puts "#{@index} backup complete. #{count} records written"
  end
end
106
|
+
|
107
|
+
class ESRestore
  # Streams a gzipped newline-delimited JSON dump back into an
  # Elasticsearch index, bulk-indexing in batches.

  # input   - path to the .gz dump file
  # options - :index, :batch_size, :mappings (mapping JSON file)
  def initialize(input, options = {})
    @index        = options[:index]
    @batch_size   = options[:batch_size].to_i
    @gz_input     = Zlib::GzipReader.open(input)
    @mapping_file = options[:mappings]
  end

  # Create the target index (applying mappings from @mapping_file when
  # given) unless it already exists; returns the Tire::Index handle.
  def create_index
    target        = Tire::Index.new @index
    creation_opts = @mapping_file ? { mappings: MultiJson.load(File.read(@mapping_file)) } : {}
    target.create(creation_opts) unless target.exists?
    target
  end

  # Read every line of the dump, flushing @batch_size documents at a time
  # and any final partial batch after the loop.
  def run
    target  = create_index
    loaded  = 0
    pending = []
    @gz_input.each_line do |json|
      pending << MultiJson.load(json)
      loaded += 1
      next unless (loaded % @batch_size).zero?
      target.bulk_create pending
      puts "#{loaded} records loaded"
      pending.clear
    end
    @gz_input.close
    target.bulk_create pending unless pending.empty?
    puts "#{@index} restore complete with #{loaded} records loaded"
  end
end
140
|
+
|
141
|
+
class ESDup
  # Like ESRestore, but strips document ids from each record so
  # Elasticsearch assigns fresh ones — duplicating documents rather than
  # overwriting the originals.

  # input   - path to the .gz dump file
  # options - :index, :batch_size, :mappings (mapping JSON file)
  def initialize(input, options = {})
    @index        = options[:index]
    @batch_size   = options[:batch_size].to_i
    @gz_input     = Zlib::GzipReader.open(input)
    @mapping_file = options[:mappings]
  end

  # Create the target index (applying mappings from @mapping_file when
  # given) unless it already exists; returns the Tire::Index handle.
  def create_index
    target        = Tire::Index.new @index
    creation_opts = @mapping_file ? { mappings: MultiJson.load(File.read(@mapping_file)) } : {}
    target.create(creation_opts) unless target.exists?
    target
  end

  # Read every line of the dump, drop its "_id"/"id" keys, and bulk-index
  # @batch_size documents at a time (final partial batch after the loop).
  def run
    target  = create_index
    loaded  = 0
    pending = []
    @gz_input.each_line do |json|
      record = MultiJson.load(json)
      %w[_id id].each { |key| record.delete(key) }
      pending << record
      loaded += 1
      next unless (loaded % @batch_size).zero?
      target.bulk_create pending
      puts "#{loaded} records loaded"
      pending.clear
    end
    @gz_input.close
    target.bulk_create pending unless pending.empty?
    puts "#{@index} restore complete with #{loaded} records loaded"
  end
end
177
|
+
|
178
|
+
# Dispatch on the first positional argument; remaining positionals and all
# flags are handed to the chosen command class.
command  = Settings.rest.shift.to_s.to_sym
handlers = { restore: ESRestore, backup: ESBackup, duplicate: ESDup }
if (handler = handlers[command])
  handler.new(Settings.rest.shift, Settings.to_hash).run
else
  abort Settings.help("Must specify either backup, restore or duplicate.  Got <#{command}>")
end
@@ -0,0 +1,153 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# Simple script to dump elasticsearch indexes as raw JSON
|
4
|
+
|
5
|
+
require 'tire'
|
6
|
+
require 'zlib'
|
7
|
+
require 'socket'
|
8
|
+
require 'pathname'
|
9
|
+
require 'multi_json'
|
10
|
+
|
11
|
+
class ESBackup

  # Dump an Elasticsearch index to <output_dir>/<index>.gz as
  # newline-delimited JSON, optionally saving its mapping.
  #
  # output_dir - destination directory ('' => current working directory)
  # options    - :host, :port, :index, :batch_size, :mappings (mapping dump
  #              file), :query (JSON string), :dump_file
  def initialize(output_dir, options = {})
    Tire::Configuration.url "http://#{options[:host]}:#{options[:port]}"
    @output_dir = output_dir || ''
    @index = options[:index]
    @batch_size = options[:batch_size].to_i
    @mapping_file = options[:mappings]
    # A missing or unparseable query falls back to nil (full-index scan).
    if options[:query].nil?
      @query = nil
    else
      @query = MultiJson.load(options[:query]) rescue nil
    end
    # NOTE(review): @dump_file is computed here but gz_output below names the
    # file after @index, so the :dump_file option is never honored — confirm
    # which behavior is intended.
    if options[:dump_file].nil?
      @dump_file = @index
    else
      @dump_file = options[:dump_file]
    end
  end

  # Write the index's current mapping (as JSON) to @mapping_file.
  def dump_mapping
    index = Tire::Index.new @index
    File.open(@mapping_file, 'w'){ |f| f.puts index.mapping.to_json }
  end

  # Expand dir to an absolute path (relative paths anchored at the current
  # working directory) and make sure the directory exists.
  # NOTE(review): FileUtils is used but never required in this file —
  # presumably loaded transitively by a dependency; verify.
  def fullpath dir
    basedir = dir.start_with?('/') ? dir : File.join(Dir.pwd, dir)
    FileUtils.mkdir_p(basedir)
    basedir
  end

  # Full path of the gzip file the dump is written to.
  def gz_output
    File.join(fullpath(@output_dir), @index + '.gz')
  end

  # Build a Tire scan-search over @index, replaying @query (when present)
  # through tire's block DSL.
  def create_scanner
    scan_opts = { size: @batch_size }
    additional_query = @query
    Tire::Search::Scan.new(@index, scan_opts) do
      # tire's Scan accepts queries only through its block DSL, so the parsed
      # @query hash is replayed by send-ing each key as a DSL method call.
      query do
        additional_query.each_pair do |key, vals|
          case vals
          # Assumes each entry targets only one field at a time.
          when Hash then self.send(key.to_sym, *vals.to_a.flatten)
          when Array then self.send(key.to_sym, *vals)
          end
        end
      end if additional_query
    end
  end

  # Stream every matching document into the gzip file, stripping
  # search-result bookkeeping fields, and report the record count.
  def run
    dump_mapping if @mapping_file
    gz = Zlib::GzipWriter.open gz_output
    count = 0
    create_scanner.each do |document|
      document.each do |record|
        json_doc = record.to_hash.except(:type, :_index, :_explanation, :_score, :_version, :highlight, :sort).to_json
        gz.puts json_doc
        count += 1
      end
    end
    gz.close
    puts "#{@index} backup complete. #{count} records written"
  end
end
80
|
+
|
81
|
+
class ESRestore

  # Restore a gzipped newline-delimited JSON dump into an Elasticsearch index.
  #
  # input   - path to the .gz dump file
  # options - :host, :port, :index, :batch_size, :mappings (mapping JSON file)
  def initialize(input, options = {})
    Tire::Configuration.url "http://#{options[:host]}:#{options[:port]}"
    @index = options[:index]
    @batch_size = options[:batch_size].to_i
    @gz_input = Zlib::GzipReader.open(input)
    @mapping_file = options[:mappings]
  end

  # Create the target index (with mappings from @mapping_file when provided)
  # unless it already exists; returns the Tire::Index handle.
  def create_index
    index   = Tire::Index.new @index
    options = @mapping_file ? { mappings: MultiJson.load(File.read(@mapping_file)) } : {}
    index.create(options) unless index.exists?
    index
  end

  # Bulk-load the dump @batch_size documents at a time, flushing any final
  # partial batch after the loop.
  def run
    reindex = create_index
    count, documents = 0, []
    @gz_input.each_line do |json|
      documents << MultiJson.load(json)
      count += 1
      if count % @batch_size == 0
        reindex.bulk_create documents
        puts "#{count} records loaded"
        documents.clear
      end
    end
    @gz_input.close()
    reindex.bulk_create documents if not documents.empty?
    puts "#{@index} restore complete with #{count} records loaded"
  end
end
115
|
+
|
116
|
+
class ESDup

  # Like ESRestore, but strips "_id"/"id" from each record so Elasticsearch
  # assigns fresh ids — duplicating documents instead of overwriting them.
  #
  # input   - path to the .gz dump file
  # options - :host, :port, :index, :batch_size, :mappings (mapping JSON file)
  def initialize(input, options = {})
    Tire::Configuration.url "http://#{options[:host]}:#{options[:port]}"
    @index = options[:index]
    @batch_size = options[:batch_size].to_i
    @gz_input = Zlib::GzipReader.open(input)
    @mapping_file = options[:mappings]
  end

  # Create the target index (with mappings from @mapping_file when provided)
  # unless it already exists; returns the Tire::Index handle.
  def create_index
    index   = Tire::Index.new @index
    options = @mapping_file ? { mappings: MultiJson.load(File.read(@mapping_file)) } : {}
    index.create(options) unless index.exists?
    index
  end

  # Bulk-load the dump @batch_size documents at a time, deleting both string
  # id keys from every record; flushes any final partial batch after the loop.
  def run
    reindex = create_index
    count, documents = 0, []
    @gz_input.each_line do |json|
      line = MultiJson.load(json)
      line.delete("_id")
      line.delete("id")
      documents << line
      count += 1
      if count % @batch_size == 0
        reindex.bulk_create documents
        puts "#{count} records loaded"
        documents.clear
      end
    end
    @gz_input.close()
    reindex.bulk_create documents if not documents.empty?
    puts "#{@index} restore complete with #{count} records loaded"
  end
end
153
|
+
|
data/squirrel/fields.sh
ADDED
@@ -0,0 +1,5 @@
|
|
1
|
+
# For each field name of interest: dump the field's raw values from the JSON
# dump via getFields.rb (appending to <field>.txt), then print how many
# distinct values were seen (sort | uniq -c | sort -n | wc -l).
# NOTE(review): the ruby binary, script path, and dump path are hard-coded to
# one developer's machine — parameterize before reusing.
for foo in flight_id metric tb_h feature base_feature metric_feature cnt; do
echo $foo;
/home/maphysics/.rbenv/shims/ruby /home/maphysics/GitProjects/wonderdog/test/getFields.rb --dump=/home/maphysics/GitProjects/wonderdog/test/flight_count_20130405 --field=$foo >> $foo.txt ;
cat $foo.txt |sort | uniq -c |sort -n | wc -l;
done
@@ -0,0 +1,19 @@
|
|
1
|
+
require 'rubygems'
require 'configliere'
require 'json'
require 'multi_json'

# Command-line options: --dump=<file of newline-delimited JSON records> and
# --field=<key to extract from each record>.
Settings.use :commandline
Settings.use :config_block
Settings.define :dump
Settings.define :field
Settings.resolve!
# Print the value of `field` from each newline-delimited JSON record in the
# file at `dump`, one value per line (callers pipe this through
# `sort | uniq -c` to get counts — see fields.sh).
def get_value_counts(dump, field)
  # FIX: File.foreach streams the file and closes the handle when done;
  # the previous File.open(dump).each leaked the file descriptor.
  File.foreach(dump) do |line|
    record = MultiJson.load(line)
    puts record[field]
  end
end
|
19
|
+
# Entry point: print the chosen field's value for every record in the dump.
get_value_counts(Settings.dump, Settings.field)
data/squirrel/replay.rb
ADDED
@@ -0,0 +1,219 @@
|
|
1
|
+
#!/usr/bin/env ruby

require 'rubygems'
require 'json'
require 'time'


#require 'random'

########################################################################################################################
# This program is designed to read an elasticsearch log file and return      #
# information about how long a slow process took, run the query, and         #
# return information about how long it took to run the query again.          #
# Example command:                                                           #
# ruby ./replay.rb --logfile=/var/log/elasticsearch/patrick.log --port=9200 --host=localhost #
########################################################################################################################

########################################################################################################################
# Global variables for storing metadata                                      #
########################################################################################################################
# NOTE(review): these two top-level instance variables are never referenced
# again anywhere in this file — they look like dead state left over from an
# earlier revision; confirm before removing.
@slowlog_lines = []
@metadata_hash = {}
class ParseMetaData
  # Extracts metadata from a single Elasticsearch slowlog line by pairing the
  # offsets of '[' and ']' characters. Extracted values accumulate in the
  # #metaData hash (string keys).
  attr_accessor :metaData

  # metaString - one raw slowlog line
  # metaArray  - extra bracketed keys to capture, e.g. %w[took search_type]
  def initialize(metaString, metaArray = [])
    @metaString    = metaString
    @metaArray     = metaArray
    @metaData      = {}
    @bracket_pairs = get_bracket_pair_indexes
  end

  # Offsets of every '[' in the line.
  def get_left_bracket_indexes
    offsets_of('\[')
  end

  # Offsets of every ']' in the line.
  def get_right_bracket_indexes
    offsets_of('\]')
  end

  # [left, right] offset pairs in order of appearance.
  # NOTE(review): assumes brackets never nest; a '[' inside a bracketed value
  # would misalign the zip — confirm slowlog lines can't contain one.
  def get_bracket_pair_indexes
    get_left_bracket_indexes.zip(get_right_bracket_indexes)
  end

  # Capture the text between " source[" and the "], extra_source[" that
  # follows it, storing it under "query".
  def get_query
    from = offsets_of(' source\[').map { |i| i + 8 }
    upto = offsets_of('_source\[').map { |i| i - 9 }
    @metaData["query"] = @metaString[from[0]..upto[0]]
  end

  # When `meta` is immediately followed by '[', record the bracketed value
  # under that key; otherwise leave @metaData untouched.
  def find_meta_data(meta)
    after_key = offsets_of(meta).map { |i| i + meta.size }
    slot = get_left_bracket_indexes.index(after_key[0])
    return if slot.nil?
    left, right = @bracket_pairs[slot]
    @metaData[meta] = @metaString[left + 1..right - 1]
  end

  # Capture every key requested in @metaArray.
  def get_extra_meta_data
    @metaArray.each { |meta| find_meta_data(meta) }
  end

  # Positional fields: bracket pair 0 is the timestamp, 3 the node, 4 the index.
  # FIXME (carried over): make this dynamic rather than relying on the first
  # bracket groups always appearing in the same order.
  def get_basic_meta_data
    @metaData["timestamp"] = value_of_pair(0)
    @metaData["node"]      = value_of_pair(3)
    @metaData["index"]     = value_of_pair(4)
  end

  # Run the full extraction: positional fields, the query body, then any
  # extra keys requested at construction time.
  def get_meta_data
    get_basic_meta_data
    get_query
    get_extra_meta_data unless @metaArray == []
  end

  private

  # Start offsets of every match of `pattern` (a regexp source string).
  def offsets_of(pattern)
    @metaString.enum_for(:scan, Regexp.new(pattern)).map { Regexp.last_match.begin(0) }
  end

  # Text inside the nth bracket pair.
  def value_of_pair(n)
    left, right = @bracket_pairs[n]
    @metaString[left + 1..right - 1]
  end
end
93
|
+
|
94
|
+
# Parse one slowlog line into [query_source, metadata_hash].
#
# Returns nil (after printing a warning) when the line has no
# ", source[...], extra_source" section. NOTE(review): the caller in #run
# destructures the result without a nil check, so a non-matching slowlog line
# would crash downstream — confirm whether that can happen in practice.
def parse_logline(line, metaArray)
  # Guard: only lines that carry a query body can be replayed.
  # FIX: the original captured $1 into a local `query` that was never used;
  # the query is actually extracted by ParseMetaData below.
  unless line =~ %r{, source\[(.*)\], extra_source}
    warn("couldn't parse line")
    return
  end

  parser = ParseMetaData.new(line, metaArray)
  parser.get_meta_data

  return parser.metaData["query"], parser.metaData
end
109
|
+
|
110
|
+
########################################################################################################################
|
111
|
+
# Return the following info to stdout as tab delimited: #
|
112
|
+
# Current time #
|
113
|
+
# Original timestamp #
|
114
|
+
# Duration of query in log #
|
115
|
+
# Duration of re-ran query according to elastic search #
|
116
|
+
# Duration of re-ran query according to the wall clock #
|
117
|
+
# The meta captured from the logfile #
|
118
|
+
# A snippet of query #
|
119
|
+
# Extra source data from logfile #
|
120
|
+
########################################################################################################################
|
121
|
+
class Replay
  # Replays slow queries from an Elasticsearch slowlog against a live cluster
  # and reports original vs. replayed durations as tab-separated rows.
  #
  # logfile    - path to the elasticsearch slowlog file
  # host/port  - cluster to replay against
  # preference - [enabled_flag, value] pair for the ?preference= URL param
  # routing    - [enabled_flag, value] pair for the ?routing= URL param
  def initialize(logfile, host, port, preference, routing)
    @logfile    = logfile
    @host       = host
    @port       = port
    @preference = preference
    @routing    = routing
  end

  # Print the column header row.
  def header()
    puts "\n"
    puts %w[current_timestamp original_timestamp es_duration(ms) new_duration(ms) clock_time_duration(ms) node index query_fragment].join("\t")
  end

  # Emit one result row (or a "malformed" marker) for a replayed query.
  def output(query, data, malformed=false)
    query_fragment = query[0..49]
    if malformed
      puts "malformed"
      puts query_fragment
    else
      took = data['took'].to_s
      current_time = data['new_timestamp'].to_s
      original_timestamp = data['timestamp'].to_s
      es_duration = data['original_dur'].to_s
      new_duration = data['new_duration'].to_i.to_s
      node = data['node'].to_s
      index = data['index'].to_s
      # Re-print the header on roughly every tenth row so long runs stay readable.
      if Random.rand() < 0.1
        header
      end
      puts [current_time, original_timestamp, es_duration, took, new_duration, node, index, query_fragment].join("\t")
    end
  end

  # Build the curl command used to replay `query` against data['index'].
  #
  # FIXES vs. the original:
  #   * "?reference=" typo corrected to "?preference="
  #   * multiple params are joined with '&' instead of ','
  #   * routing-only URLs were missing the leading '?'
  #   * no stray '/' is appended after a query string any more
  def build_curl_command_string(query, data)
    params = []
    params << "preference=#{@preference[1]}" if @preference[0]
    params << "routing=#{@routing[1]}"       if @routing[0]
    uri = "#{@host}:#{@port}/#{data['index']}/_search"
    uri << "?#{params.join('&')}" unless params.empty?
    "curl -s -XGET '#{uri}' -d '#{query}'"
  end

  # Replay one query, timing it with the wall clock and merging the
  # cluster-reported timings into `data`. Returns total_took plus this
  # query's wall-clock duration in ms.
  def execute_query(total_took, query, data)
    # NOTE(review): String#index with '(\\\'.*?\\\')' searches for that
    # LITERAL substring (not a regex), so `.nil?` is effectively always true
    # and this guard almost always takes the first branch — confirm what the
    # malformed-query check was meant to do before tightening it.
    if query.include? " " or query.index('(\\\'.*?\\\')').nil?
      if data['search_type'] == "QUERY_THEN_FETCH"
        data['new_timestamp'] = Time.now
        data['new_start_time'] = Time.now.to_f * 1000
        cmd = build_curl_command_string(query, data)
        curl_result = `#{cmd}`
        data['new_end_time'] = Time.now.to_f * 1000
        data['new_duration'] = data['new_end_time'] - data['new_start_time']
        data['original_dur'] = data['took']
        data = data.merge(JSON.parse(curl_result))
        output(query, data)
      else
        puts "error don't know search type, please throw an exception here"
      end
    else
      puts "malformed query string"
      puts query
      output(query, data, true)
    end
    total_took + data['new_duration'].to_i
  end

  # Main loop: replay every slowlog.query line in @logfile and report the
  # cumulative slow-query time in minutes.
  def run
    sl_regex = Regexp.new(('(slowlog\\.query)'), Regexp::IGNORECASE)
    metaArray = %w[took took_millis types search_type total_shards]
    header
    total_took = 0
    File.readlines(@logfile).each do |line|
      if sl_regex.match(line)
        query, query_hash = parse_logline(line, metaArray)
        total_took = execute_query(total_took, query, query_hash)
      end
    end
    total_took /= 60000.0
    puts "All together the slow logs took: #{total_took}min"
  end
end
218
|
+
|
219
|
+
|