fluent-plugin-hekk_redshift 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +19 -0
- data/Gemfile +3 -0
- data/LICENSE +191 -0
- data/README.md +162 -0
- data/Rakefile +16 -0
- data/VERSION +1 -0
- data/fluent-plugin-hekk_redshift.gemspec +26 -0
- data/lib/fluent/plugin/out_hekk_redshift.rb +295 -0
- data/test/plugin/test_out_redshift.rb +526 -0
- data/test/test_helper.rb +8 -0
- metadata +155 -0
@@ -0,0 +1,295 @@
module Fluent


class HekkRedshiftOutput < BufferedOutput
  Fluent::Plugin.register_output('hekk_redshift', self)

  # ignore load-into-table errors (invalid data format)
  IGNORE_REDSHIFT_ERROR_REGEXP = /^ERROR: Load into table '[^']+' failed\./

  def initialize
    super
    require 'aws-sdk'
    require 'zlib'
    require 'time'
    require 'tempfile'
    require 'pathname' # for Pathname used in #write
    require 'pg'
    require 'json'
    require 'csv'
  end

  config_param :record_log_tag, :string, :default => 'log'
  # s3
  config_param :aws_key_id, :string
  config_param :aws_sec_key, :string
  config_param :s3_bucket, :string
  config_param :s3_endpoint, :string, :default => nil
  config_param :path, :string, :default => ''
  config_param :timestamp_key_format, :string, :default => 'year=%Y/month=%m/day=%d/hour=%H/%Y%m%d-%H%M'
  config_param :utc, :bool, :default => false
  # redshift
  config_param :redshift_host, :string
  config_param :redshift_port, :integer, :default => 5439
  config_param :redshift_dbname, :string
  config_param :redshift_user, :string
  config_param :redshift_password, :string
  config_param :redshift_tablename, :string
  config_param :redshift_copy_command_tablename, :string
  config_param :redshift_copy_command_columnname, :string
  config_param :redshift_schemaname, :string, :default => nil
  config_param :redshift_copy_base_options, :string, :default => 'FILLRECORD ACCEPTANYDATE TRUNCATECOLUMNS'
  config_param :redshift_copy_options, :string, :default => nil
  # file format
  config_param :file_type, :string, :default => nil # json, tsv, csv, msgpack
  config_param :delimiter, :string, :default => nil
  # for debug
  config_param :log_suffix, :string, :default => ''

  def configure(conf)
    super
    @path = "#{@path}/" unless @path.end_with?('/') # append trailing slash
    @path = @path[1..-1] if @path.start_with?('/')  # strip leading slash
    @utc = true if conf['utc']
    @db_conf = {
      host:     @redshift_host,
      port:     @redshift_port,
      dbname:   @redshift_dbname,
      user:     @redshift_user,
      password: @redshift_password
    }
    @delimiter = determine_delimiter(@file_type) if @delimiter.nil? or @delimiter.empty?
    $log.debug format_log("redshift file_type:#{@file_type} delimiter:'#{@delimiter}'")
    @copy_sql_template = "copy #{table_name_with_schema} from '%s' CREDENTIALS 'aws_access_key_id=#{@aws_key_id};aws_secret_access_key=%s' delimiter '#{@delimiter}' GZIP ESCAPE #{@redshift_copy_base_options} #{@redshift_copy_options};"
    @insert_sql_template = "insert into #{@redshift_copy_command_tablename}(#{@redshift_copy_command_columnname}) values('%s');"
  end
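
  # Illustration with hypothetical values (not part of the gem source): with
  # redshift_tablename "access_log", redshift_copy_command_tablename
  # "copy_commands" and redshift_copy_command_columnname "command", the two
  # templates above expand at write time to roughly:
  #
  #   copy access_log from 's3://my-bucket/logs/...00.gz'
  #     CREDENTIALS 'aws_access_key_id=...;aws_secret_access_key=...'
  #     delimiter '\t' GZIP ESCAPE FILLRECORD ACCEPTANYDATE TRUNCATECOLUMNS ;
  #
  #   insert into copy_commands(command)
  #     values('copy access_log from \'s3://...\' CREDENTIALS ...;');
  #
  # Note that #write never executes the COPY itself: it quote-escapes the
  # COPY text and INSERTs it into the queue table, presumably for a separate
  # consumer to pick up and run.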

  def start
    super
    # init s3 conf
    options = {
      :access_key_id     => @aws_key_id,
      :secret_access_key => @aws_sec_key
    }
    options[:s3_endpoint] = @s3_endpoint if @s3_endpoint
    @s3 = AWS::S3.new(options)
    @bucket = @s3.buckets[@s3_bucket]
  end

  def format(tag, time, record)
    if json?
      record.to_msgpack
    elsif msgpack?
      { @record_log_tag => record }.to_msgpack
    else
      "#{record[@record_log_tag]}\n"
    end
  end
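
  # Illustration (hypothetical record, with the default record_log_tag 'log'):
  #   file_type json    -> the whole record hash is buffered as msgpack, and
  #                        record['log'] is expected to hold a JSON string;
  #   file_type msgpack -> the record is wrapped as { 'log' => record } first;
  #   otherwise (tsv/csv) only record['log'] plus a trailing newline is
  #   buffered as raw text.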

  def write(chunk)
    $log.debug format_log('start creating gz.')

    # create a gz file
    tmp = Tempfile.new('s3-')
    tmp =
      if json? || msgpack?
        create_gz_file_from_structured_data(tmp, chunk, @delimiter)
      else
        create_gz_file_from_flat_data(tmp, chunk)
      end

    # no valid data -> skip this chunk
    unless tmp
      $log.debug format_log('received no valid data. ')
      return false # for debug
    end

    # build a time-formatted, collision-free s3 key
    s3path = create_s3path(@bucket, @path)

    # upload gz to s3
    @bucket.objects[s3path].write(Pathname.new(tmp.path),
                                  :acl => :bucket_owner_full_control)

    # close and delete the temp file
    tmp.close!

    # build the COPY command for the gz on s3 and queue it via INSERT
    s3_uri = "s3://#{@s3_bucket}/#{s3path}"
    copy_sql = @copy_sql_template % [s3_uri, @aws_sec_key]
    sql = @insert_sql_template % [copy_sql.gsub(/'/, "\\\\'")]
    insert_sql_to_redshift(sql)
    true # for debug
  end

  protected

  def format_log(message)
    (@log_suffix and not @log_suffix.empty?) ? "#{message} #{@log_suffix}" : message
  end

  private

  def json?
    @file_type == 'json'
  end

  def msgpack?
    @file_type == 'msgpack'
  end

  def create_gz_file_from_flat_data(dst_file, chunk)
    gzw = nil
    begin
      gzw = Zlib::GzipWriter.new(dst_file)
      chunk.write_to(gzw)
    ensure
      gzw.close rescue nil if gzw
    end
    dst_file
  end

  def create_gz_file_from_structured_data(dst_file, chunk, delimiter)
    # fetch the table definition from redshift
    redshift_table_columns = fetch_table_columns
    if redshift_table_columns.nil?
      raise 'failed to fetch the redshift table definition.'
    elsif redshift_table_columns.empty?
      $log.warn format_log("no table on redshift. table_name=#{table_name_with_schema}")
      return nil
    end

    # convert each json/msgpack record to a delimited text line
    gzw = nil
    begin
      gzw = Zlib::GzipWriter.new(dst_file)
      chunk.msgpack_each do |record|
        begin
          hash = json? ? json_to_hash(record[@record_log_tag]) : record[@record_log_tag]
          tsv_text = hash_to_table_text(redshift_table_columns, hash, delimiter)
          gzw.write(tsv_text) if tsv_text and not tsv_text.empty?
        rescue
          if json?
            $log.error format_log("failed to create table text from json. text=(#{record[@record_log_tag]})"), :error => $!.to_s
          else
            $log.error format_log("failed to create table text from msgpack. text=(#{record[@record_log_tag]})"), :error => $!.to_s
          end
          $log.error_backtrace
        end
      end
      return nil unless gzw.pos > 0
    ensure
      gzw.close rescue nil if gzw
    end
    dst_file
  end

  def determine_delimiter(file_type)
    case file_type
    when 'json', 'msgpack', 'tsv'
      "\t"
    when 'csv'
      ','
    else
      raise Fluent::ConfigError, "Invalid file_type:#{file_type}."
    end
  end

  def fetch_table_columns
    conn = PG.connect(@db_conf)
    begin
      columns = nil
      conn.exec(fetch_columns_sql_with_schema) do |result|
        columns = result.collect { |row| row['column_name'] }
      end
      columns
    ensure
      conn.close rescue nil
    end
  end

  def fetch_columns_sql_with_schema
    @fetch_columns_sql ||= if @redshift_schemaname
                             "select column_name from INFORMATION_SCHEMA.COLUMNS where table_schema = '#{@redshift_schemaname}' and table_name = '#{@redshift_tablename}' order by ordinal_position;"
                           else
                             "select column_name from INFORMATION_SCHEMA.COLUMNS where table_name = '#{@redshift_tablename}' order by ordinal_position;"
                           end
  end

  def json_to_hash(json_text)
    return nil if json_text.to_s.empty?

    JSON.parse(json_text)
  rescue => e
    $log.warn format_log('failed to parse json. '), :error => e.to_s
  end

  def hash_to_table_text(redshift_table_columns, hash, delimiter)
    return '' unless hash

    # extract values from the hash in table-column order
    val_list = redshift_table_columns.collect do |cn|
      val = hash[cn]
      val = JSON.generate(val) if val.kind_of?(Hash) or val.kind_of?(Array)
      if val.to_s.empty?
        nil
      else
        val.to_s
      end
    end

    if val_list.all? { |v| v.nil? or v.empty? }
      $log.warn format_log("no data match for table columns on redshift. data=#{hash} table_columns=#{redshift_table_columns}")
      return ''
    end

    generate_line_with_delimiter(val_list, delimiter)
  end
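
  # Illustration (hypothetical table with columns id, name, meta):
  #   hash_to_table_text(%w[id name meta],
  #                      { 'id' => 1, 'meta' => { 'k' => 'v' }, 'extra' => 'x' },
  #                      "\t")
  #   # => "1\t\t{\"k\":\"v\"}\n"
  # Values come out in table-column order, a missing column ('name') becomes
  # an empty field, nested hashes/arrays are re-serialized as JSON, and keys
  # that are not table columns ('extra') are silently dropped.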

  def generate_line_with_delimiter(val_list, delimiter)
    val_list = val_list.collect do |val|
      if val.nil? or val.empty?
        ''
      else
        val.gsub(/\\/, "\\\\\\").gsub(/\t/, "\\\t").gsub(/\n/, "\\\n") # escape tab, newline and backslash
      end
    end
    val_list.join(delimiter) + "\n"
  end
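
  # Illustration of the escaping (matching the COPY command's ESCAPE option):
  #   generate_line_with_delimiter(["a\tb", "c\\d", nil], "\t")
  #   # => "a\\\tb\tc\\\\d\t\n"
  # Tabs and newlines inside a value are prefixed with a backslash and
  # literal backslashes are doubled, so field boundaries stay unambiguous.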

  def create_s3path(bucket, path)
    timestamp_key = @utc ? Time.now.utc.strftime(@timestamp_key_format) : Time.now.strftime(@timestamp_key_format)
    i = 0
    begin
      suffix = "_#{'%02d' % i}"
      s3path = "#{path}#{timestamp_key}#{suffix}.gz"
      i += 1
    end while bucket.objects[s3path].exists?
    s3path
  end
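
  # Illustration (hypothetical: path 'logs/', utc true, the default
  # timestamp_key_format, first flush at 2013-01-05 12:30 UTC):
  #   logs/year=2013/month=01/day=05/hour=12/20130105-1230_00.gz
  # The '_%02d' suffix increments until an unused key is found, so chunks
  # flushed within the same minute do not overwrite each other.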

  def table_name_with_schema
    @table_name_with_schema ||= if @redshift_schemaname
                                  "#{@redshift_schemaname}.#{@redshift_tablename}"
                                else
                                  @redshift_tablename
                                end
  end

  def insert_sql_to_redshift(sql)
    $log.debug format_log('start inserting copy command.')
    conn = nil
    begin
      conn = PG.connect(@db_conf)
      conn.exec(sql)
      $log.info format_log('completed inserting to redshift.')
    rescue PG::Error => e
      $log.error format_log('failed to insert copy command into redshift.'), :error => e.to_s
      raise e unless e.to_s =~ IGNORE_REDSHIFT_ERROR_REGEXP
      return false # for debug
    ensure
      conn.close rescue nil if conn
    end
  end

end


end
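
A minimal fluentd configuration sketch for this plugin (hypothetical values; the parameter names come from the config_param declarations above, and only the parameters declared without defaults are required):

<match your.tag.**>
  type hekk_redshift
  # s3 staging
  aws_key_id YOUR_AWS_KEY_ID
  aws_sec_key YOUR_AWS_SECRET_KEY
  s3_bucket your-bucket
  path logs
  # redshift
  redshift_host your-cluster.example.redshift.amazonaws.com
  redshift_dbname your_db
  redshift_user your_user
  redshift_password your_password
  redshift_tablename your_table
  redshift_copy_command_tablename copy_commands
  redshift_copy_command_columnname command
  # file format
  file_type json
</match>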