fluent-plugin-redshift 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 208e99381f5503be7e6af6fa3ad54fc2cbeb9bea
4
- data.tar.gz: ed2707d372c126d420fd907bc409b5d0fede132d
3
+ metadata.gz: 6ff3e820b9cccac040efc73a11cf261f4a307dda
4
+ data.tar.gz: f00f6cc70b5cf75ac399ba65cc25f773d02123db
5
5
  SHA512:
6
- metadata.gz: f8f93c9b5bb3ba860a3c6900c1889453f9417a36bce39def88e82b5ca4bc7dd8a936b6cf805d246317daafd9f2cab168fa7d890a61bbad7b719b14c40321954b
7
- data.tar.gz: d19b4d4751c71d293493cce89cf93224115c762a32a00e363ce774ffdb40798598f656a6e7de126e0b13012011c311a52bef86804f785ec490b52ad67755876b
6
+ metadata.gz: 096462829885b50f9cd01852843c80fd8096d028d8e9c0f45160fc2be4f3f66e43f7e28cf4b137c79f46bbbaad8f042693ee72e9b00f793e16a2dd280d1e7b9b
7
+ data.tar.gz: 739976ff4077c4ce34c51ee311dcdc10dd186f9f45ab39414292f8f49b9d87d2ad2dec5f2540f3f465bf6604932e05c115d2da5250b619236df580f7840ebf29
data/README.md CHANGED
@@ -19,10 +19,14 @@ Format:
19
19
  # s3 (for copying data to redshift)
20
20
  aws_key_id YOUR_AWS_KEY_ID
21
21
  aws_sec_key YOUR_AWS_SECRET_KEY
22
+ ## or use IAM Role instead of credentials.
23
+ aws_iam_role arn:aws:iam::ACCOUNT_ID:role/ROLE_NAME
24
+
22
25
  s3_bucket YOUR_S3_BUCKET
23
26
  s3_endpoint YOUR_S3_BUCKET_END_POINT
24
27
  path YOUR_S3_PATH
25
28
  timestamp_key_format year=%Y/month=%m/day=%d/hour=%H/%Y%m%d-%H%M
29
+ s3_server_side_encryption S3_SERVER_SIDE_ENCRYPTION
26
30
 
27
31
  # redshift
28
32
  redshift_host YOUR_AMAZON_REDSHIFT_CLUSTER_END_POINT
@@ -32,6 +36,7 @@ Format:
32
36
  redshift_password YOUR_AMAZON_REDSHIFT_CLUSTER_PASSWORD
33
37
  redshift_schemaname YOUR_AMAZON_REDSHIFT_CLUSTER_TARGET_SCHEMA_NAME
34
38
  redshift_tablename YOUR_AMAZON_REDSHIFT_CLUSTER_TARGET_TABLE_NAME
39
+ redshift_copy_columns COLUMNS_FOR_COPY
35
40
  file_type [tsv|csv|json|msgpack]
36
41
 
37
42
  # buffer
@@ -64,6 +69,7 @@ Example (watch and upload json formatted apache log):
64
69
  s3_endpoint s3.amazonaws.com
65
70
  path path/on/s3/apache_json_log/
66
71
  timestamp_key_format year=%Y/month=%m/day=%d/hour=%H/%Y%m%d-%H%M
72
+ s3_server_side_encryption aes256
67
73
 
68
74
  # redshift
69
75
  redshift_host xxx-yyy-zzz.xxxxxxxxxx.us-east-1.redshift.amazonaws.com
@@ -86,9 +92,11 @@ Example (watch and upload json formatted apache log):
86
92
 
87
93
  + `type` (required) : The value must be `redshift`.
88
94
 
89
- + `aws_key_id` (required) : AWS access key id to access s3 bucket.
95
+ + `aws_key_id` : AWS access key id to access s3 bucket.
96
+
97
+ + `aws_sec_key` : AWS secret key id to access s3 bucket.
90
98
 
91
- + `aws_sec_key` (required) : AWS secret key id to access s3 bucket.
99
+ + `aws_iam_role` : AWS IAM Role name to access s3 bucket and copy into redshift.
92
100
 
93
101
  + `s3_bucket` (required) : s3 bucket name. S3 bucket must be same as the region of your Redshift cluster.
94
102
 
@@ -105,6 +113,8 @@ Example (watch and upload json formatted apache log):
105
113
  hapyrus-example/apache_json_log/year=2013/month=03/day=05/hour=12/20130305_1230_00.gz
106
114
  </pre>
107
115
 
116
+ + `s3_server_side_encryption` : S3 Server-Side Encryption (Only aes256 is supported)
117
+
108
118
  + `redshift_host` (required) : the end point(or hostname) of your Amazon Redshift cluster.
109
119
 
110
120
  + `redshift_port` (required) : port number.
@@ -121,6 +131,8 @@ Example (watch and upload json formatted apache log):
121
131
 
122
132
  + `redshift_connect_timeout` : maximum time to wait for connection to succeed.
123
133
 
134
+ + `redshift_copy_columns` : columns for copying. Value needs to be comma-separated like `id,name,age`
135
+
124
136
  + `file_type` : file format of the source data. `csv`, `tsv`, `msgpack` or `json` are available.
125
137
 
126
138
  + `delimiter` : delimiter of the source data. This option will be ignored if `file_type` is specified.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.0
1
+ 0.1.1
@@ -22,34 +22,81 @@ class RedshiftOutput < BufferedOutput
22
22
 
23
23
  config_param :record_log_tag, :string, :default => 'log'
24
24
  # s3
25
- config_param :aws_key_id, :string, :secret => true
26
- config_param :aws_sec_key, :string, :secret => true
27
- config_param :s3_bucket, :string
28
- config_param :s3_endpoint, :string, :default => nil
29
- config_param :path, :string, :default => ""
30
- config_param :timestamp_key_format, :string, :default => 'year=%Y/month=%m/day=%d/hour=%H/%Y%m%d-%H%M'
25
+ config_param :aws_key_id, :string, :secret => true, :default => nil,
26
+ :desc => "AWS access key id to access s3 bucket."
27
+ config_param :aws_sec_key, :string, :secret => true, :default => nil,
28
+ :desc => "AWS secret key id to access s3 bucket."
29
+ config_param :aws_iam_role, :string, :secret => true, :default => nil,
30
+ :desc => "AWS IAM Role to access s3 bucket."
31
+ config_param :s3_bucket, :string,
32
+ :desc => <<-DESC
33
+ S3 bucket name.
34
+ S3 bucket must be same as the region of your Redshift cluster.
35
+
36
+ DESC
37
+ config_param :s3_endpoint, :string, :default => nil,
38
+ :desc => "S3 endpoint."
39
+ config_param :path, :string, :default => "",
40
+ :desc => "S3 path to input."
41
+ config_param :timestamp_key_format, :string, :default => 'year=%Y/month=%m/day=%d/hour=%H/%Y%m%d-%H%M',
42
+ :desc => <<-DESC
43
+ The format of the object keys.
44
+ It can include date-format directives.
45
+ DESC
31
46
  config_param :utc, :bool, :default => false
47
+ config_param :s3_server_side_encryption, :string, :default => nil,
48
+ :desc => "S3 Server-Side Encryption (Only aes256 is supported)."
32
49
  # redshift
33
- config_param :redshift_host, :string
34
- config_param :redshift_port, :integer, :default => 5439
35
- config_param :redshift_dbname, :string
36
- config_param :redshift_user, :string
37
- config_param :redshift_password, :string, :secret => true
38
- config_param :redshift_tablename, :string
39
- config_param :redshift_schemaname, :string, :default => nil
50
+ config_param :redshift_host, :string,
51
+ :desc => "The end point(or hostname) of your Amazon Redshift cluster."
52
+ config_param :redshift_port, :integer, :default => 5439,
53
+ :desc => "Port number."
54
+ config_param :redshift_dbname, :string,
55
+ :desc => "Database name."
56
+ config_param :redshift_user, :string,
57
+ :desc => "User name."
58
+ config_param :redshift_password, :string, :secret => true,
59
+ :desc => "Password for the user name."
60
+ config_param :redshift_tablename, :string,
61
+ :desc => "Table name to store data."
62
+ config_param :redshift_schemaname, :string, :default => nil,
63
+ :desc => <<-DESC
64
+ Schema name to store data. By default, this option is not
65
+ Set and find table without schema as your own search_path.
66
+ DESC
40
67
  config_param :redshift_copy_base_options, :string , :default => "FILLRECORD ACCEPTANYDATE TRUNCATECOLUMNS"
41
68
  config_param :redshift_copy_options, :string , :default => nil
42
- config_param :redshift_connect_timeout, :integer, :default => 10
69
+ config_param :redshift_connect_timeout, :integer, :default => 10,
70
+ :desc => "Maximum time to wait for connection to succeed."
71
+ config_param :redshift_copy_columns, :string, :default => nil,
72
+ :desc => <<-DESC
73
+ Columns for copying.
74
+ Value needs to be comma-separated like id,name,age
75
+ DESC
43
76
  # file format
44
- config_param :file_type, :string, :default => nil # json, tsv, csv, msgpack
45
- config_param :delimiter, :string, :default => nil
77
+ config_param :file_type, :string, :default => nil,
78
+ :desc => "File format of the source data. csv, tsv, msgpack or json are available."
79
+ config_param :delimiter, :string, :default => nil,
80
+ :desc => <<-DESC
81
+ Delimiter of the source data.
82
+ This option will be ignored if file_type is specified.
83
+ DESC
46
84
  # maintenance
47
- config_param :maintenance_file_path, :string, :default => nil
85
+ config_param :maintenance_file_path, :string, :default => nil,
86
+ :desc => <<-DESC
87
+ Path of maintenance file. plugin skip processing and keep retrying
88
+ during a file existing in this file path.
89
+ To avoid data loss due to too many retries caused by long mainenance,
90
+ setting retry_limit and retry_wait is recommended.
91
+ DESC
48
92
  # for debug
49
93
  config_param :log_suffix, :string, :default => ''
50
94
 
51
95
  def configure(conf)
52
96
  super
97
+ if !check_credentials
98
+ raise ConfigError, "aws_key_id and aws_sec_key is required. or, use aws_iam_role instead."
99
+ end
53
100
  @path = "#{@path}/" unless @path.end_with?('/') # append last slash
54
101
  @path = @path[1..-1] if @path.start_with?('/') # remove head slash
55
102
  @utc = true if conf['utc']
@@ -64,17 +111,26 @@ class RedshiftOutput < BufferedOutput
64
111
  @delimiter = determine_delimiter(@file_type) if @delimiter.nil? or @delimiter.empty?
65
112
  $log.debug format_log("redshift file_type:#{@file_type} delimiter:'#{@delimiter}'")
66
113
  @table_name_with_schema = [@redshift_schemaname, @redshift_tablename].compact.join('.')
67
- @copy_sql_template = "copy #{@table_name_with_schema} from '%s' CREDENTIALS 'aws_access_key_id=#{@aws_key_id};aws_secret_access_key=%s' delimiter '#{@delimiter}' GZIP ESCAPE #{@redshift_copy_base_options} #{@redshift_copy_options};"
114
+ @redshift_copy_columns = if !@redshift_copy_columns.to_s.empty?
115
+ @redshift_copy_columns.split(/[,\s]+/)
116
+ else
117
+ nil
118
+ end
119
+ @copy_sql_template = build_redshift_copy_sql_template
68
120
  @maintenance_monitor = MaintenanceMonitor.new(@maintenance_file_path)
121
+ @s3_server_side_encryption = @s3_server_side_encryption.to_sym if s3_server_side_encryption
69
122
  end
70
123
 
71
124
  def start
72
125
  super
73
126
  # init s3 conf
74
- options = {
75
- :access_key_id => @aws_key_id,
76
- :secret_access_key => @aws_sec_key
77
- }
127
+ options = {}
128
+ if @aws_key_id && @aws_sec_key
129
+ options = {
130
+ :access_key_id => @aws_key_id,
131
+ :secret_access_key => @aws_sec_key
132
+ }
133
+ end
78
134
  options[:s3_endpoint] = @s3_endpoint if @s3_endpoint
79
135
  @s3 = AWS::S3.new(options)
80
136
  @bucket = @s3.buckets[@s3_bucket]
@@ -115,14 +171,15 @@ class RedshiftOutput < BufferedOutput
115
171
 
116
172
  # upload gz to s3
117
173
  @bucket.objects[s3path].write(Pathname.new(tmp.path),
118
- :acl => :bucket_owner_full_control)
174
+ :acl => :bucket_owner_full_control,
175
+ :server_side_encryption => @s3_server_side_encryption)
119
176
 
120
177
  # close temp file
121
178
  tmp.close!
122
179
 
123
180
  # copy gz on s3 to redshift
124
181
  s3_uri = "s3://#{@s3_bucket}/#{s3path}"
125
- sql = @copy_sql_template % [s3_uri, @aws_sec_key]
182
+ sql = @copy_sql_template % s3_uri
126
183
  $log.debug format_log("start copying. s3_uri=#{s3_uri}")
127
184
 
128
185
  begin
@@ -146,6 +203,21 @@ class RedshiftOutput < BufferedOutput
146
203
 
147
204
  private
148
205
 
206
+ def build_redshift_copy_sql_template
207
+ copy_columns = if @redshift_copy_columns
208
+ "(#{@redshift_copy_columns.join(",")})"
209
+ else
210
+ ''
211
+ end
212
+ credentials = if @aws_key_id && @aws_sec_key
213
+ "CREDENTIALS 'aws_access_key_id=#{@aws_key_id};aws_secret_access_key=#{@aws_sec_key}'"
214
+ else
215
+ "CREDENTIALS 'aws_iam_role=#{@aws_iam_role}'"
216
+ end
217
+ "copy #{@table_name_with_schema}#{copy_columns} from '%s' #{credentials} delimiter '#{@delimiter}' GZIP ESCAPE #{@redshift_copy_base_options} #{@redshift_copy_options};"
218
+ end
219
+
220
+
149
221
  def json?
150
222
  @file_type == 'json'
151
223
  end
@@ -175,6 +247,14 @@ class RedshiftOutput < BufferedOutput
175
247
  return nil
176
248
  end
177
249
 
250
+ if @redshift_copy_columns
251
+ unknown_colmns = @redshift_copy_columns - redshift_table_columns
252
+ unless unknown_colmns.empty?
253
+ raise Fluent::ConfigError, "missing columns included in redshift_copy_columns - missing columns:\"#{unknown_colmns.join(',')}\""
254
+ end
255
+ redshift_table_columns = @redshift_copy_columns
256
+ end
257
+
178
258
  # convert json to tsv format text
179
259
  gzw = nil
180
260
  begin
@@ -186,7 +266,8 @@ class RedshiftOutput < BufferedOutput
186
266
  tsv_text = hash_to_table_text(redshift_table_columns, hash, delimiter)
187
267
  gzw.write(tsv_text) if tsv_text and not tsv_text.empty?
188
268
  rescue => e
189
- $log.error format_log("failed to create table text from #{@file_type}. text=(#{record[@record_log_tag]})"), :error=>e.to_s
269
+ text = record.is_a?(Hash) ? record[@record_log_tag] : record
270
+ $log.error format_log("failed to create table text from #{@file_type}. text=(#{text})"), :error=>e.to_s
190
271
  $log.error_backtrace
191
272
  end
192
273
  end
@@ -261,6 +342,16 @@ class RedshiftOutput < BufferedOutput
261
342
  s3path
262
343
  end
263
344
 
345
+ def check_credentials
346
+ if @aws_key_id && @aws_sec_key
347
+ true
348
+ elsif @aws_iam_role
349
+ true
350
+ else
351
+ false
352
+ end
353
+ end
354
+
264
355
  class RedshiftError < StandardError
265
356
  def initialize(msg)
266
357
  case msg
@@ -20,6 +20,7 @@ class RedshiftOutputTest < Test::Unit::TestCase
20
20
  CONFIG_BASE= %[
21
21
  aws_key_id test_key_id
22
22
  aws_sec_key test_sec_key
23
+ aws_iam_role test_iam_role
23
24
  s3_bucket test_bucket
24
25
  path log
25
26
  redshift_host test_host
@@ -96,6 +97,7 @@ class RedshiftOutputTest < Test::Unit::TestCase
96
97
  d = create_driver(CONFIG_CSV)
97
98
  assert_equal "test_key_id", d.instance.aws_key_id
98
99
  assert_equal "test_sec_key", d.instance.aws_sec_key
100
+ assert_equal "test_iam_role", d.instance.aws_iam_role
99
101
  assert_equal "test_bucket", d.instance.s3_bucket
100
102
  assert_equal "log/", d.instance.path
101
103
  assert_equal "test_host", d.instance.redshift_host
@@ -111,6 +113,7 @@ class RedshiftOutputTest < Test::Unit::TestCase
111
113
  assert_equal ",", d.instance.delimiter
112
114
  assert_equal true, d.instance.utc
113
115
  assert_equal MAINTENANCE_FILE_PATH_FOR_TEST, d.instance.maintenance_file_path
116
+ assert_equal nil, d.instance.redshift_copy_columns
114
117
  end
115
118
  def test_configure_with_schemaname
116
119
  d = create_driver(CONFIG_JSON_WITH_SCHEMA)
@@ -168,6 +171,15 @@ class RedshiftOutputTest < Test::Unit::TestCase
168
171
  d = create_driver(CONFIG_CSV.gsub(/ *log_suffix *.+$/, ''))
169
172
  assert_equal "", d.instance.log_suffix
170
173
  end
174
+ def test_configure_redshift_copy_columns
175
+ d = create_driver(CONFIG_CSV + "\n redshift_copy_columns id,name, age created_at")
176
+ assert_equal %w(id name age created_at), d.instance.redshift_copy_columns
177
+ assert_match /^copy test_table\(id,name,age,created_at\) from/, d.instance.instance_variable_get("@copy_sql_template")
178
+ end
179
+ def test_configure_s3_server_side_encryption
180
+ d = create_driver(CONFIG_CSV + "\n s3_server_side_encryption aes256")
181
+ assert_equal :aes256, d.instance.s3_server_side_encryption
182
+ end
171
183
 
172
184
  def emit_csv(d)
173
185
  d.emit(RECORD_CSV_A, DEFAULT_TIME)
@@ -231,9 +243,9 @@ class RedshiftOutputTest < Test::Unit::TestCase
231
243
  end
232
244
  copy_query_regex =
233
245
  if schema_name
234
- /\Acopy #{schema_name}.#{table_name} from/
246
+ /\Acopy #{schema_name}.#{table_name}(\(.+\))? from/
235
247
  else
236
- /\Acopy #{table_name} from/
248
+ /\Acopy #{table_name}(\(.+\))? from/
237
249
  end
238
250
 
239
251
  flexmock(Fluent::RedshiftOutput::RedshiftConnection).new_instances do |conn|
@@ -274,7 +286,8 @@ class RedshiftOutputTest < Test::Unit::TestCase
274
286
  }
275
287
  assert_equal expected_data, data
276
288
  },
277
- :acl => :bucket_owner_full_control
289
+ :acl => :bucket_owner_full_control,
290
+ :server_side_encryption => nil
278
291
  ).and_return { true }
279
292
 
280
293
  # create mock of s3 object collection
@@ -476,6 +489,23 @@ class RedshiftOutputTest < Test::Unit::TestCase
476
489
  }
477
490
  end
478
491
 
492
+ def test_write_with_json_with_copy_columns
493
+ setup_mocks(%[val_a\tval_b\n])
494
+ setup_tempfile_mock_to_be_closed
495
+ d_json = create_driver(CONFIG_JSON + "\n redshift_copy_columns key_a,key_b")
496
+ emit_json(d_json)
497
+ assert_equal true, d_json.run
498
+ end
499
+
500
+ def test_write_with_json_uknown_columns_in_copy_columns
501
+ setup_mocks("")
502
+ d_json = create_driver(CONFIG_JSON + "\n redshift_copy_columns key_a,key_z")
503
+ emit_json(d_json)
504
+ assert_raise(Fluent::ConfigError, "missing columns included in redshift_copy_columns - missing columns:\"key_z\"") {
505
+ d_json.run
506
+ }
507
+ end
508
+
479
509
  def test_write_with_json_fetch_column_with_schema
480
510
  setup_mocks(%[val_a\tval_b\t\\N\t\\N\t\\N\t\\N\t\\N\t\\N\n\\N\t\\N\tval_c\tval_d\t\\N\t\\N\t\\N\t\\N\n],
481
511
  schema_name: 'test_schema')
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fluent-plugin-redshift
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Masashi Miyazaki
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-08-11 00:00:00.000000000 Z
11
+ date: 2016-07-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: fluentd
@@ -135,7 +135,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
135
135
  version: '0'
136
136
  requirements: []
137
137
  rubyforge_project:
138
- rubygems_version: 2.4.6
138
+ rubygems_version: 2.0.14.1
139
139
  signing_key:
140
140
  specification_version: 4
141
141
  summary: Amazon Redshift output plugin for Fluentd