RubyGems - fluent-plugin-redshift - Versions diffs - 0.1.0 → 0.1.1 - Mend

fluent-plugin-redshift 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

checksums.yaml +4 -4
data/README.md +14 -2
data/VERSION +1 -1
data/lib/fluent/plugin/out_redshift.rb +116 -25
data/test/plugin/test_out_redshift.rb +33 -3
metadata +3 -3

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 208e99381f5503be7e6af6fa3ad54fc2cbeb9bea
-  data.tar.gz: ed2707d372c126d420fd907bc409b5d0fede132d
+  metadata.gz: 6ff3e820b9cccac040efc73a11cf261f4a307dda
+  data.tar.gz: f00f6cc70b5cf75ac399ba65cc25f773d02123db
 SHA512:
-  metadata.gz: f8f93c9b5bb3ba860a3c6900c1889453f9417a36bce39def88e82b5ca4bc7dd8a936b6cf805d246317daafd9f2cab168fa7d890a61bbad7b719b14c40321954b
-  data.tar.gz: d19b4d4751c71d293493cce89cf93224115c762a32a00e363ce774ffdb40798598f656a6e7de126e0b13012011c311a52bef86804f785ec490b52ad67755876b
+  metadata.gz: 096462829885b50f9cd01852843c80fd8096d028d8e9c0f45160fc2be4f3f66e43f7e28cf4b137c79f46bbbaad8f042693ee72e9b00f793e16a2dd280d1e7b9b
+  data.tar.gz: 739976ff4077c4ce34c51ee311dcdc10dd186f9f45ab39414292f8f49b9d87d2ad2dec5f2540f3f465bf6604932e05c115d2da5250b619236df580f7840ebf29

data/README.md CHANGED

@@ -19,10 +19,14 @@ Format:
         # s3 (for copying data to redshift)
         aws_key_id YOUR_AWS_KEY_ID
         aws_sec_key YOUR_AWS_SECRET_KEY
+        ## or Use IAM Role instead of credentials.
+        aws_iam_role arn:aws:iam::ACCOUNT_ID:role/ROLE_NAME
         s3_bucket YOUR_S3_BUCKET
         s3_endpoint YOUR_S3_BUCKET_END_POINT
         path YOUR_S3_PATH
         timestamp_key_format year=%Y/month=%m/day=%d/hour=%H/%Y%m%d-%H%M
+        s3_server_side_encryption S3_SERVER_SIDE_ENCRYPTION
         # redshift
         redshift_host YOUR_AMAZON_REDSHIFT_CLUSTER_END_POINT
@@ -32,6 +36,7 @@ Format:
         redshift_password YOUR_AMAZON_REDSHIFT_CLUSTER_PASSWORD
         redshift_schemaname YOUR_AMAZON_REDSHIFT_CLUSTER_TARGET_SCHEMA_NAME
         redshift_tablename YOUR_AMAZON_REDSHIFT_CLUSTER_TARGET_TABLE_NAME
+        redshift_copy_columns COLUMNS_FOR_COPY
         file_type [tsv|csv|json|msgpack]
         # buffer
@@ -64,6 +69,7 @@ Example (watch and upload json formatted apache log):
         s3_endpoint s3.amazonaws.com
         path path/on/s3/apache_json_log/
         timestamp_key_format year=%Y/month=%m/day=%d/hour=%H/%Y%m%d-%H%M
+        s3_server_side_encryption aes256
         # redshift
         redshift_host xxx-yyy-zzz.xxxxxxxxxx.us-east-1.redshift.amazonaws.com
@@ -86,9 +92,11 @@ Example (watch and upload json formatted apache log):
 + `type` (required) : The value must be `redshift`.
-+ `aws_key_id` (required) : AWS access key id to access s3 bucket.
++ `aws_key_id` : AWS access key id to access s3 bucket.
++ `aws_sec_key` : AWS secret key id to access s3 bucket.
-+ `aws_sec_key` (required) : AWS secret key id to access s3 bucket.
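++ `aws_iam_role` : ARN of the AWS IAM Role (e.g. `arn:aws:iam::ACCOUNT_ID:role/ROLE_NAME`) used to access the s3 bucket and copy into redshift. Use it instead of `aws_key_id` and `aws_sec_key`.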
 + `s3_bucket` (required) : s3 bucket name. S3 bucket must be same as the region of your Redshift cluster.
@@ -105,6 +113,8 @@ Example (watch and upload json formatted apache log):
   hapyrus-example/apache_json_log/year=2013/month=03/day=05/hour=12/20130305_1230_00.gz
 </pre>
++ `s3_server_side_encryption` : S3 Server-Side Encryption (Only aes256 is supported)
 + `redshift_host` (required) : the end point(or hostname) of your Amazon Redshift cluster.
 + `redshift_port` (required) : port number.
@@ -121,6 +131,8 @@ Example (watch and upload json formatted apache log):
 + `redshift_connect_timeout` : maximum time to wait for connection to succeed.
++ `redshift_copy_columns` : columns for copying. Value needs to be comma-separated like `id,name,age`
 + `file_type` : file format of the source data.  `csv`, `tsv`, `msgpack` or `json` are available.
 + `delimiter` : delimiter of the source data. This option will be ignored if `file_type` is specified.

data/VERSION CHANGED

@@ -1 +1 @@
-0.1.0
+0.1.1

data/lib/fluent/plugin/out_redshift.rb CHANGED

@@ -22,34 +22,81 @@ class RedshiftOutput < BufferedOutput
   config_param :record_log_tag, :string, :default => 'log'
   # s3
-  config_param :aws_key_id, :string, :secret => true
-  config_param :aws_sec_key, :string, :secret => true
-  config_param :s3_bucket, :string
-  config_param :s3_endpoint, :string, :default => nil
-  config_param :path, :string, :default => ""
-  config_param :timestamp_key_format, :string, :default => 'year=%Y/month=%m/day=%d/hour=%H/%Y%m%d-%H%M'
+  config_param :aws_key_id, :string, :secret => true, :default => nil,
+               :desc => "AWS access key id to access s3 bucket."
+  config_param :aws_sec_key, :string, :secret => true, :default => nil,
+               :desc => "AWS secret key id to access s3 bucket."
+  config_param :aws_iam_role, :string, :secret => true, :default => nil,
+               :desc => "AWS IAM Role to access s3 bucket."
+  config_param :s3_bucket, :string,
+               :desc => <<-DESC
+S3 bucket name.
+S3 bucket must be same as the region of your Redshift cluster.
+DESC
+  config_param :s3_endpoint, :string, :default => nil,
+               :desc => "S3 endpoint."
+  config_param :path, :string, :default => "",
+               :desc => "S3 path to input."
+  config_param :timestamp_key_format, :string, :default => 'year=%Y/month=%m/day=%d/hour=%H/%Y%m%d-%H%M',
+               :desc => <<-DESC
+The format of the object keys.
+It can include date-format directives.
+DESC
   config_param :utc, :bool, :default => false
+  config_param :s3_server_side_encryption, :string, :default => nil,
+               :desc => "S3 Server-Side Encryption (Only aes256 is supported)."
   # redshift
-  config_param :redshift_host, :string
-  config_param :redshift_port, :integer, :default => 5439
-  config_param :redshift_dbname, :string
-  config_param :redshift_user, :string
-  config_param :redshift_password, :string, :secret => true
-  config_param :redshift_tablename, :string
-  config_param :redshift_schemaname, :string, :default => nil
+  config_param :redshift_host, :string,
+               :desc => "The end point(or hostname) of your Amazon Redshift cluster."
+  config_param :redshift_port, :integer, :default => 5439,
+               :desc => "Port number."
+  config_param :redshift_dbname, :string,
+               :desc => "Database name."
+  config_param :redshift_user, :string,
+               :desc => "User name."
+  config_param :redshift_password, :string, :secret => true,
+               :desc => "Password for the user name."
+  config_param :redshift_tablename, :string,
+               :desc => "Table name to store data."
+  config_param :redshift_schemaname, :string, :default => nil,
+               :desc => <<-DESC
+Schema name to store data. By default, this option is not
+Set and find table without schema as your own search_path.
+DESC
   config_param :redshift_copy_base_options, :string , :default => "FILLRECORD ACCEPTANYDATE TRUNCATECOLUMNS"
   config_param :redshift_copy_options, :string , :default => nil
-  config_param :redshift_connect_timeout, :integer, :default => 10
+  config_param :redshift_connect_timeout, :integer, :default => 10,
+               :desc => "Maximum time to wait for connection to succeed."
+  config_param :redshift_copy_columns, :string, :default => nil,
+               :desc => <<-DESC
+Columns for copying.
+Value needs to be comma-separated like id,name,age
+DESC
   # file format
-  config_param :file_type, :string, :default => nil  # json, tsv, csv, msgpack
-  config_param :delimiter, :string, :default => nil
+  config_param :file_type, :string, :default => nil,
+               :desc => "File format of the source data. csv, tsv, msgpack or json are available."
+  config_param :delimiter, :string, :default => nil,
+               :desc => <<-DESC
+Delimiter of the source data.
+This option will be ignored if file_type is specified.
+DESC
   # maintenance
-  config_param :maintenance_file_path, :string, :default => nil
+  config_param :maintenance_file_path, :string, :default => nil,
+               :desc => <<-DESC
+Path of maintenance file. plugin skip processing and keep retrying
+during a file existing in this file path.
+To avoid data loss due to too many retries caused by long mainenance,
+setting retry_limit and retry_wait is recommended.
+DESC
   # for debug
   config_param :log_suffix, :string, :default => ''
   def configure(conf)
     super
+    if !check_credentials
+      raise ConfigError, "aws_key_id and aws_sec_key is required. or, use aws_iam_role instead."
+    end
     @path = "#{@path}/" unless @path.end_with?('/') # append last slash
     @path = @path[1..-1] if @path.start_with?('/')  # remove head slash
     @utc = true if conf['utc']
@@ -64,17 +111,26 @@ class RedshiftOutput < BufferedOutput
     @delimiter = determine_delimiter(@file_type) if @delimiter.nil? or @delimiter.empty?
     $log.debug format_log("redshift file_type:#{@file_type} delimiter:'#{@delimiter}'")
     @table_name_with_schema = [@redshift_schemaname, @redshift_tablename].compact.join('.')
-    @copy_sql_template = "copy #{@table_name_with_schema} from '%s' CREDENTIALS 'aws_access_key_id=#{@aws_key_id};aws_secret_access_key=%s' delimiter '#{@delimiter}' GZIP ESCAPE #{@redshift_copy_base_options} #{@redshift_copy_options};"
+    @redshift_copy_columns = if !@redshift_copy_columns.to_s.empty?
+                               @redshift_copy_columns.split(/[,\s]+/)
+                             else
+                               nil
+                             end
+    @copy_sql_template = build_redshift_copy_sql_template
     @maintenance_monitor = MaintenanceMonitor.new(@maintenance_file_path)
+    @s3_server_side_encryption = @s3_server_side_encryption.to_sym if s3_server_side_encryption
   end
   def start
     super
     # init s3 conf
-    options = {
-      :access_key_id     => @aws_key_id,
-      :secret_access_key => @aws_sec_key
-    }
+    options = {}
+    if @aws_key_id && @aws_sec_key
+      options = {
+        :access_key_id     => @aws_key_id,
+        :secret_access_key => @aws_sec_key
+      }
+    end
     options[:s3_endpoint] = @s3_endpoint if @s3_endpoint
     @s3 = AWS::S3.new(options)
     @bucket = @s3.buckets[@s3_bucket]
@@ -115,14 +171,15 @@ class RedshiftOutput < BufferedOutput
     # upload gz to s3
     @bucket.objects[s3path].write(Pathname.new(tmp.path),
-                                  :acl => :bucket_owner_full_control)
+                                  :acl => :bucket_owner_full_control,
+                                  :server_side_encryption => @s3_server_side_encryption)
     # close temp file
     tmp.close!
     # copy gz on s3 to redshift
     s3_uri = "s3://#{@s3_bucket}/#{s3path}"
-    sql = @copy_sql_template % [s3_uri, @aws_sec_key]
+    sql = @copy_sql_template % s3_uri
     $log.debug format_log("start copying. s3_uri=#{s3_uri}")
     begin
@@ -146,6 +203,21 @@ class RedshiftOutput < BufferedOutput
   private
+  def build_redshift_copy_sql_template
+    copy_columns = if @redshift_copy_columns
+                     "(#{@redshift_copy_columns.join(",")})"
+                   else
+                     ''
+                   end
+    credentials = if @aws_key_id && @aws_sec_key
+                    "CREDENTIALS 'aws_access_key_id=#{@aws_key_id};aws_secret_access_key=#{@aws_sec_key}'"
+                  else
+                    "CREDENTIALS 'aws_iam_role=#{@aws_iam_role}'"
+                  end
+   "copy #{@table_name_with_schema}#{copy_columns} from '%s' #{credentials} delimiter '#{@delimiter}' GZIP ESCAPE #{@redshift_copy_base_options} #{@redshift_copy_options};"
+  end
   def json?
     @file_type == 'json'
   end
@@ -175,6 +247,14 @@ class RedshiftOutput < BufferedOutput
       return nil
     end
+    if @redshift_copy_columns
+      unknown_colmns = @redshift_copy_columns - redshift_table_columns
+      unless unknown_colmns.empty?
+        raise Fluent::ConfigError, "missing columns included in redshift_copy_columns - missing columns:\"#{unknown_colmns.join(',')}\""
+      end
+      redshift_table_columns = @redshift_copy_columns
+    end
     # convert json to tsv format text
     gzw = nil
     begin
@@ -186,7 +266,8 @@ class RedshiftOutput < BufferedOutput
           tsv_text = hash_to_table_text(redshift_table_columns, hash, delimiter)
           gzw.write(tsv_text) if tsv_text and not tsv_text.empty?
         rescue => e
-          $log.error format_log("failed to create table text from #{@file_type}. text=(#{record[@record_log_tag]})"), :error=>e.to_s
+          text = record.is_a?(Hash) ? record[@record_log_tag] : record
+          $log.error format_log("failed to create table text from #{@file_type}. text=(#{text})"), :error=>e.to_s
           $log.error_backtrace
         end
       end
@@ -261,6 +342,16 @@ class RedshiftOutput < BufferedOutput
     s3path
   end
+  def check_credentials
+    if @aws_key_id && @aws_sec_key
+      true
+    elsif @aws_iam_role
+      true
+    else
+      false
+    end
+  end
   class RedshiftError < StandardError
     def initialize(msg)
       case msg

data/test/plugin/test_out_redshift.rb CHANGED

@@ -20,6 +20,7 @@ class RedshiftOutputTest < Test::Unit::TestCase
   CONFIG_BASE= %[
     aws_key_id test_key_id
     aws_sec_key test_sec_key
+    aws_iam_role test_iam_role
     s3_bucket test_bucket
     path log
     redshift_host test_host
@@ -96,6 +97,7 @@ class RedshiftOutputTest < Test::Unit::TestCase
     d = create_driver(CONFIG_CSV)
     assert_equal "test_key_id", d.instance.aws_key_id
     assert_equal "test_sec_key", d.instance.aws_sec_key
+    assert_equal "test_iam_role", d.instance.aws_iam_role
     assert_equal "test_bucket", d.instance.s3_bucket
     assert_equal "log/", d.instance.path
     assert_equal "test_host", d.instance.redshift_host
@@ -111,6 +113,7 @@ class RedshiftOutputTest < Test::Unit::TestCase
     assert_equal ",", d.instance.delimiter
     assert_equal true, d.instance.utc
     assert_equal MAINTENANCE_FILE_PATH_FOR_TEST, d.instance.maintenance_file_path
+    assert_equal nil, d.instance.redshift_copy_columns
   end
   def test_configure_with_schemaname
     d = create_driver(CONFIG_JSON_WITH_SCHEMA)
@@ -168,6 +171,15 @@ class RedshiftOutputTest < Test::Unit::TestCase
     d = create_driver(CONFIG_CSV.gsub(/ *log_suffix *.+$/, ''))
     assert_equal "", d.instance.log_suffix
   end
+  def test_configure_redshift_copy_columns
+    d = create_driver(CONFIG_CSV + "\n  redshift_copy_columns id,name, age created_at")
+    assert_equal %w(id name age created_at), d.instance.redshift_copy_columns
+    assert_match /^copy test_table\(id,name,age,created_at\) from/, d.instance.instance_variable_get("@copy_sql_template")
+  end
+  def test_configure_s3_server_side_encryption
+    d = create_driver(CONFIG_CSV + "\n  s3_server_side_encryption aes256")
+    assert_equal :aes256, d.instance.s3_server_side_encryption
+  end
   def emit_csv(d)
     d.emit(RECORD_CSV_A, DEFAULT_TIME)
@@ -231,9 +243,9 @@ class RedshiftOutputTest < Test::Unit::TestCase
       end
     copy_query_regex =
       if schema_name
-        /\Acopy #{schema_name}.#{table_name} from/
+        /\Acopy #{schema_name}.#{table_name}(\(.+\))? from/
       else
-        /\Acopy #{table_name} from/
+        /\Acopy #{table_name}(\(.+\))? from/
       end
     flexmock(Fluent::RedshiftOutput::RedshiftConnection).new_instances do |conn|
@@ -274,7 +286,8 @@ class RedshiftOutputTest < Test::Unit::TestCase
         }
         assert_equal expected_data, data
       },
-      :acl => :bucket_owner_full_control
+      :acl => :bucket_owner_full_control,
+      :server_side_encryption => nil
     ).and_return { true }
     # create mock of s3 object collection
@@ -476,6 +489,23 @@ class RedshiftOutputTest < Test::Unit::TestCase
     }
   end
+  def test_write_with_json_with_copy_columns
+    setup_mocks(%[val_a\tval_b\n])
+    setup_tempfile_mock_to_be_closed
+    d_json = create_driver(CONFIG_JSON + "\n  redshift_copy_columns key_a,key_b")
+    emit_json(d_json)
+    assert_equal true, d_json.run
+  end
+  def test_write_with_json_uknown_columns_in_copy_columns
+    setup_mocks("")
+    d_json = create_driver(CONFIG_JSON + "\n  redshift_copy_columns key_a,key_z")
+    emit_json(d_json)
+    assert_raise(Fluent::ConfigError, "missing columns included in redshift_copy_columns - missing columns:\"key_z\"") {
+      d_json.run
+    }
+  end
   def test_write_with_json_fetch_column_with_schema
     setup_mocks(%[val_a\tval_b\t\\N\t\\N\t\\N\t\\N\t\\N\t\\N\n\\N\t\\N\tval_c\tval_d\t\\N\t\\N\t\\N\t\\N\n],
                schema_name: 'test_schema')

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: fluent-plugin-redshift
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.1
 platform: ruby
 authors:
 - Masashi Miyazaki
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-08-11 00:00:00.000000000 Z
+date: 2016-07-06 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: fluentd
@@ -135,7 +135,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.4.6
+rubygems_version: 2.0.14.1
 signing_key:
 specification_version: 4
 summary: Amazon Redshift output plugin for Fluentd