fluent-plugin-redshift 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +14 -2
- data/VERSION +1 -1
- data/lib/fluent/plugin/out_redshift.rb +116 -25
- data/test/plugin/test_out_redshift.rb +33 -3
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6ff3e820b9cccac040efc73a11cf261f4a307dda
|
4
|
+
data.tar.gz: f00f6cc70b5cf75ac399ba65cc25f773d02123db
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 096462829885b50f9cd01852843c80fd8096d028d8e9c0f45160fc2be4f3f66e43f7e28cf4b137c79f46bbbaad8f042693ee72e9b00f793e16a2dd280d1e7b9b
|
7
|
+
data.tar.gz: 739976ff4077c4ce34c51ee311dcdc10dd186f9f45ab39414292f8f49b9d87d2ad2dec5f2540f3f465bf6604932e05c115d2da5250b619236df580f7840ebf29
|
data/README.md
CHANGED
@@ -19,10 +19,14 @@ Format:
|
|
19
19
|
# s3 (for copying data to redshift)
|
20
20
|
aws_key_id YOUR_AWS_KEY_ID
|
21
21
|
aws_sec_key YOUR_AWS_SECRET_KEY
|
22
|
+
## or Use IAM Role instead of credentials.
|
23
|
+
aws_iam_role arn:aws:iam::ACCOUNT_ID:role/ROLE_NAME
|
24
|
+
|
22
25
|
s3_bucket YOUR_S3_BUCKET
|
23
26
|
s3_endpoint YOUR_S3_BUCKET_END_POINT
|
24
27
|
path YOUR_S3_PATH
|
25
28
|
timestamp_key_format year=%Y/month=%m/day=%d/hour=%H/%Y%m%d-%H%M
|
29
|
+
s3_server_side_encryption S3_SERVER_SIDE_ENCRYPTION
|
26
30
|
|
27
31
|
# redshift
|
28
32
|
redshift_host YOUR_AMAZON_REDSHIFT_CLUSTER_END_POINT
|
@@ -32,6 +36,7 @@ Format:
|
|
32
36
|
redshift_password YOUR_AMAZON_REDSHIFT_CLUSTER_PASSWORD
|
33
37
|
redshift_schemaname YOUR_AMAZON_REDSHIFT_CLUSTER_TARGET_SCHEMA_NAME
|
34
38
|
redshift_tablename YOUR_AMAZON_REDSHIFT_CLUSTER_TARGET_TABLE_NAME
|
39
|
+
redshift_copy_columns COLUMNS_FOR_COPY
|
35
40
|
file_type [tsv|csv|json|msgpack]
|
36
41
|
|
37
42
|
# buffer
|
@@ -64,6 +69,7 @@ Example (watch and upload json formatted apache log):
|
|
64
69
|
s3_endpoint s3.amazonaws.com
|
65
70
|
path path/on/s3/apache_json_log/
|
66
71
|
timestamp_key_format year=%Y/month=%m/day=%d/hour=%H/%Y%m%d-%H%M
|
72
|
+
s3_server_side_encryption aes256
|
67
73
|
|
68
74
|
# redshift
|
69
75
|
redshift_host xxx-yyy-zzz.xxxxxxxxxx.us-east-1.redshift.amazonaws.com
|
@@ -86,9 +92,11 @@ Example (watch and upload json formatted apache log):
|
|
86
92
|
|
87
93
|
+ `type` (required) : The value must be `redshift`.
|
88
94
|
|
89
|
-
+ `aws_key_id`
|
95
|
+
+ `aws_key_id` : AWS access key id to access s3 bucket.
|
96
|
+
|
97
|
+
+ `aws_sec_key` : AWS secret key id to access s3 bucket.
|
90
98
|
|
91
|
-
+ `
|
99
|
+
+ `aws_iam_role` : AWS IAM Role name to access s3 bucket and copy into redshift.
|
92
100
|
|
93
101
|
+ `s3_bucket` (required) : s3 bucket name. S3 bucket must be same as the region of your Redshift cluster.
|
94
102
|
|
@@ -105,6 +113,8 @@ Example (watch and upload json formatted apache log):
|
|
105
113
|
hapyrus-example/apache_json_log/year=2013/month=03/day=05/hour=12/20130305_1230_00.gz
|
106
114
|
</pre>
|
107
115
|
|
116
|
+
+ `s3_server_side_encryption` : S3 Server-Side Encryption (Only aes256 is supported)
|
117
|
+
|
108
118
|
+ `redshift_host` (required) : the end point(or hostname) of your Amazon Redshift cluster.
|
109
119
|
|
110
120
|
+ `redshift_port` (required) : port number.
|
@@ -121,6 +131,8 @@ Example (watch and upload json formatted apache log):
|
|
121
131
|
|
122
132
|
+ `redshift_connect_timeout` : maximum time to wait for connection to succeed.
|
123
133
|
|
134
|
+
+ `redshift_copy_columns` : columns for copying. Value needs to be comma-separated like `id,name,age`
|
135
|
+
|
124
136
|
+ `file_type` : file format of the source data. `csv`, `tsv`, `msgpack` or `json` are available.
|
125
137
|
|
126
138
|
+ `delimiter` : delimiter of the source data. This option will be ignored if `file_type` is specified.
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.0
|
1
|
+
0.1.1
|
data/lib/fluent/plugin/out_redshift.rb
CHANGED
@@ -22,34 +22,81 @@ class RedshiftOutput < BufferedOutput
|
|
22
22
|
|
23
23
|
config_param :record_log_tag, :string, :default => 'log'
|
24
24
|
# s3
|
25
|
-
config_param :aws_key_id, :string, :secret => true
|
26
|
-
|
27
|
-
config_param :
|
28
|
-
|
29
|
-
config_param :
|
30
|
-
|
25
|
+
config_param :aws_key_id, :string, :secret => true, :default => nil,
|
26
|
+
:desc => "AWS access key id to access s3 bucket."
|
27
|
+
config_param :aws_sec_key, :string, :secret => true, :default => nil,
|
28
|
+
:desc => "AWS secret key id to access s3 bucket."
|
29
|
+
config_param :aws_iam_role, :string, :secret => true, :default => nil,
|
30
|
+
:desc => "AWS IAM Role to access s3 bucket."
|
31
|
+
config_param :s3_bucket, :string,
|
32
|
+
:desc => <<-DESC
|
33
|
+
S3 bucket name.
|
34
|
+
S3 bucket must be same as the region of your Redshift cluster.
|
35
|
+
|
36
|
+
DESC
|
37
|
+
config_param :s3_endpoint, :string, :default => nil,
|
38
|
+
:desc => "S3 endpoint."
|
39
|
+
config_param :path, :string, :default => "",
|
40
|
+
:desc => "S3 path to input."
|
41
|
+
config_param :timestamp_key_format, :string, :default => 'year=%Y/month=%m/day=%d/hour=%H/%Y%m%d-%H%M',
|
42
|
+
:desc => <<-DESC
|
43
|
+
The format of the object keys.
|
44
|
+
It can include date-format directives.
|
45
|
+
DESC
|
31
46
|
config_param :utc, :bool, :default => false
|
47
|
+
config_param :s3_server_side_encryption, :string, :default => nil,
|
48
|
+
:desc => "S3 Server-Side Encryption (Only aes256 is supported)."
|
32
49
|
# redshift
|
33
|
-
config_param :redshift_host, :string
|
34
|
-
|
35
|
-
config_param :
|
36
|
-
|
37
|
-
config_param :
|
38
|
-
|
39
|
-
config_param :
|
50
|
+
config_param :redshift_host, :string,
|
51
|
+
:desc => "The end point(or hostname) of your Amazon Redshift cluster."
|
52
|
+
config_param :redshift_port, :integer, :default => 5439,
|
53
|
+
:desc => "Port number."
|
54
|
+
config_param :redshift_dbname, :string,
|
55
|
+
:desc => "Database name."
|
56
|
+
config_param :redshift_user, :string,
|
57
|
+
:desc => "User name."
|
58
|
+
config_param :redshift_password, :string, :secret => true,
|
59
|
+
:desc => "Password for the user name."
|
60
|
+
config_param :redshift_tablename, :string,
|
61
|
+
:desc => "Table name to store data."
|
62
|
+
config_param :redshift_schemaname, :string, :default => nil,
|
63
|
+
:desc => <<-DESC
|
64
|
+
Schema name to store data. By default, this option is not
|
65
|
+
Set and find table without schema as your own search_path.
|
66
|
+
DESC
|
40
67
|
config_param :redshift_copy_base_options, :string , :default => "FILLRECORD ACCEPTANYDATE TRUNCATECOLUMNS"
|
41
68
|
config_param :redshift_copy_options, :string , :default => nil
|
42
|
-
config_param :redshift_connect_timeout, :integer, :default => 10
|
69
|
+
config_param :redshift_connect_timeout, :integer, :default => 10,
|
70
|
+
:desc => "Maximum time to wait for connection to succeed."
|
71
|
+
config_param :redshift_copy_columns, :string, :default => nil,
|
72
|
+
:desc => <<-DESC
|
73
|
+
Columns for copying.
|
74
|
+
Value needs to be comma-separated like id,name,age
|
75
|
+
DESC
|
43
76
|
# file format
|
44
|
-
config_param :file_type, :string, :default => nil
|
45
|
-
|
77
|
+
config_param :file_type, :string, :default => nil,
|
78
|
+
:desc => "File format of the source data. csv, tsv, msgpack or json are available."
|
79
|
+
config_param :delimiter, :string, :default => nil,
|
80
|
+
:desc => <<-DESC
|
81
|
+
Delimiter of the source data.
|
82
|
+
This option will be ignored if file_type is specified.
|
83
|
+
DESC
|
46
84
|
# maintenance
|
47
|
-
config_param :maintenance_file_path, :string, :default => nil
|
85
|
+
config_param :maintenance_file_path, :string, :default => nil,
|
86
|
+
:desc => <<-DESC
|
87
|
+
Path of maintenance file. plugin skip processing and keep retrying
|
88
|
+
during a file existing in this file path.
|
89
|
+
To avoid data loss due to too many retries caused by long mainenance,
|
90
|
+
setting retry_limit and retry_wait is recommended.
|
91
|
+
DESC
|
48
92
|
# for debug
|
49
93
|
config_param :log_suffix, :string, :default => ''
|
50
94
|
|
51
95
|
def configure(conf)
|
52
96
|
super
|
97
|
+
if !check_credentials
|
98
|
+
raise ConfigError, "aws_key_id and aws_sec_key is required. or, use aws_iam_role instead."
|
99
|
+
end
|
53
100
|
@path = "#{@path}/" unless @path.end_with?('/') # append last slash
|
54
101
|
@path = @path[1..-1] if @path.start_with?('/') # remove head slash
|
55
102
|
@utc = true if conf['utc']
|
@@ -64,17 +111,26 @@ class RedshiftOutput < BufferedOutput
|
|
64
111
|
@delimiter = determine_delimiter(@file_type) if @delimiter.nil? or @delimiter.empty?
|
65
112
|
$log.debug format_log("redshift file_type:#{@file_type} delimiter:'#{@delimiter}'")
|
66
113
|
@table_name_with_schema = [@redshift_schemaname, @redshift_tablename].compact.join('.')
|
67
|
-
@
|
114
|
+
@redshift_copy_columns = if !@redshift_copy_columns.to_s.empty?
|
115
|
+
@redshift_copy_columns.split(/[,\s]+/)
|
116
|
+
else
|
117
|
+
nil
|
118
|
+
end
|
119
|
+
@copy_sql_template = build_redshift_copy_sql_template
|
68
120
|
@maintenance_monitor = MaintenanceMonitor.new(@maintenance_file_path)
|
121
|
+
@s3_server_side_encryption = @s3_server_side_encryption.to_sym if s3_server_side_encryption
|
69
122
|
end
|
70
123
|
|
71
124
|
def start
|
72
125
|
super
|
73
126
|
# init s3 conf
|
74
|
-
options = {
|
75
|
-
|
76
|
-
|
77
|
-
|
127
|
+
options = {}
|
128
|
+
if @aws_key_id && @aws_sec_key
|
129
|
+
options = {
|
130
|
+
:access_key_id => @aws_key_id,
|
131
|
+
:secret_access_key => @aws_sec_key
|
132
|
+
}
|
133
|
+
end
|
78
134
|
options[:s3_endpoint] = @s3_endpoint if @s3_endpoint
|
79
135
|
@s3 = AWS::S3.new(options)
|
80
136
|
@bucket = @s3.buckets[@s3_bucket]
|
@@ -115,14 +171,15 @@ class RedshiftOutput < BufferedOutput
|
|
115
171
|
|
116
172
|
# upload gz to s3
|
117
173
|
@bucket.objects[s3path].write(Pathname.new(tmp.path),
|
118
|
-
:acl => :bucket_owner_full_control
|
174
|
+
:acl => :bucket_owner_full_control,
|
175
|
+
:server_side_encryption => @s3_server_side_encryption)
|
119
176
|
|
120
177
|
# close temp file
|
121
178
|
tmp.close!
|
122
179
|
|
123
180
|
# copy gz on s3 to redshift
|
124
181
|
s3_uri = "s3://#{@s3_bucket}/#{s3path}"
|
125
|
-
sql = @copy_sql_template %
|
182
|
+
sql = @copy_sql_template % s3_uri
|
126
183
|
$log.debug format_log("start copying. s3_uri=#{s3_uri}")
|
127
184
|
|
128
185
|
begin
|
@@ -146,6 +203,21 @@ class RedshiftOutput < BufferedOutput
|
|
146
203
|
|
147
204
|
private
|
148
205
|
|
206
|
+
def build_redshift_copy_sql_template
|
207
|
+
copy_columns = if @redshift_copy_columns
|
208
|
+
"(#{@redshift_copy_columns.join(",")})"
|
209
|
+
else
|
210
|
+
''
|
211
|
+
end
|
212
|
+
credentials = if @aws_key_id && @aws_sec_key
|
213
|
+
"CREDENTIALS 'aws_access_key_id=#{@aws_key_id};aws_secret_access_key=#{@aws_sec_key}'"
|
214
|
+
else
|
215
|
+
"CREDENTIALS 'aws_iam_role=#{@aws_iam_role}'"
|
216
|
+
end
|
217
|
+
"copy #{@table_name_with_schema}#{copy_columns} from '%s' #{credentials} delimiter '#{@delimiter}' GZIP ESCAPE #{@redshift_copy_base_options} #{@redshift_copy_options};"
|
218
|
+
end
|
219
|
+
|
220
|
+
|
149
221
|
def json?
|
150
222
|
@file_type == 'json'
|
151
223
|
end
|
@@ -175,6 +247,14 @@ class RedshiftOutput < BufferedOutput
|
|
175
247
|
return nil
|
176
248
|
end
|
177
249
|
|
250
|
+
if @redshift_copy_columns
|
251
|
+
unknown_colmns = @redshift_copy_columns - redshift_table_columns
|
252
|
+
unless unknown_colmns.empty?
|
253
|
+
raise Fluent::ConfigError, "missing columns included in redshift_copy_columns - missing columns:\"#{unknown_colmns.join(',')}\""
|
254
|
+
end
|
255
|
+
redshift_table_columns = @redshift_copy_columns
|
256
|
+
end
|
257
|
+
|
178
258
|
# convert json to tsv format text
|
179
259
|
gzw = nil
|
180
260
|
begin
|
@@ -186,7 +266,8 @@ class RedshiftOutput < BufferedOutput
|
|
186
266
|
tsv_text = hash_to_table_text(redshift_table_columns, hash, delimiter)
|
187
267
|
gzw.write(tsv_text) if tsv_text and not tsv_text.empty?
|
188
268
|
rescue => e
|
189
|
-
|
269
|
+
text = record.is_a?(Hash) ? record[@record_log_tag] : record
|
270
|
+
$log.error format_log("failed to create table text from #{@file_type}. text=(#{text})"), :error=>e.to_s
|
190
271
|
$log.error_backtrace
|
191
272
|
end
|
192
273
|
end
|
@@ -261,6 +342,16 @@ class RedshiftOutput < BufferedOutput
|
|
261
342
|
s3path
|
262
343
|
end
|
263
344
|
|
345
|
+
def check_credentials
|
346
|
+
if @aws_key_id && @aws_sec_key
|
347
|
+
true
|
348
|
+
elsif @aws_iam_role
|
349
|
+
true
|
350
|
+
else
|
351
|
+
false
|
352
|
+
end
|
353
|
+
end
|
354
|
+
|
264
355
|
class RedshiftError < StandardError
|
265
356
|
def initialize(msg)
|
266
357
|
case msg
|
data/test/plugin/test_out_redshift.rb
CHANGED
@@ -20,6 +20,7 @@ class RedshiftOutputTest < Test::Unit::TestCase
|
|
20
20
|
CONFIG_BASE= %[
|
21
21
|
aws_key_id test_key_id
|
22
22
|
aws_sec_key test_sec_key
|
23
|
+
aws_iam_role test_iam_role
|
23
24
|
s3_bucket test_bucket
|
24
25
|
path log
|
25
26
|
redshift_host test_host
|
@@ -96,6 +97,7 @@ class RedshiftOutputTest < Test::Unit::TestCase
|
|
96
97
|
d = create_driver(CONFIG_CSV)
|
97
98
|
assert_equal "test_key_id", d.instance.aws_key_id
|
98
99
|
assert_equal "test_sec_key", d.instance.aws_sec_key
|
100
|
+
assert_equal "test_iam_role", d.instance.aws_iam_role
|
99
101
|
assert_equal "test_bucket", d.instance.s3_bucket
|
100
102
|
assert_equal "log/", d.instance.path
|
101
103
|
assert_equal "test_host", d.instance.redshift_host
|
@@ -111,6 +113,7 @@ class RedshiftOutputTest < Test::Unit::TestCase
|
|
111
113
|
assert_equal ",", d.instance.delimiter
|
112
114
|
assert_equal true, d.instance.utc
|
113
115
|
assert_equal MAINTENANCE_FILE_PATH_FOR_TEST, d.instance.maintenance_file_path
|
116
|
+
assert_equal nil, d.instance.redshift_copy_columns
|
114
117
|
end
|
115
118
|
def test_configure_with_schemaname
|
116
119
|
d = create_driver(CONFIG_JSON_WITH_SCHEMA)
|
@@ -168,6 +171,15 @@ class RedshiftOutputTest < Test::Unit::TestCase
|
|
168
171
|
d = create_driver(CONFIG_CSV.gsub(/ *log_suffix *.+$/, ''))
|
169
172
|
assert_equal "", d.instance.log_suffix
|
170
173
|
end
|
174
|
+
def test_configure_redshift_copy_columns
|
175
|
+
d = create_driver(CONFIG_CSV + "\n redshift_copy_columns id,name, age created_at")
|
176
|
+
assert_equal %w(id name age created_at), d.instance.redshift_copy_columns
|
177
|
+
assert_match /^copy test_table\(id,name,age,created_at\) from/, d.instance.instance_variable_get("@copy_sql_template")
|
178
|
+
end
|
179
|
+
def test_configure_s3_server_side_encryption
|
180
|
+
d = create_driver(CONFIG_CSV + "\n s3_server_side_encryption aes256")
|
181
|
+
assert_equal :aes256, d.instance.s3_server_side_encryption
|
182
|
+
end
|
171
183
|
|
172
184
|
def emit_csv(d)
|
173
185
|
d.emit(RECORD_CSV_A, DEFAULT_TIME)
|
@@ -231,9 +243,9 @@ class RedshiftOutputTest < Test::Unit::TestCase
|
|
231
243
|
end
|
232
244
|
copy_query_regex =
|
233
245
|
if schema_name
|
234
|
-
/\Acopy #{schema_name}.#{table_name} from/
|
246
|
+
/\Acopy #{schema_name}.#{table_name}(\(.+\))? from/
|
235
247
|
else
|
236
|
-
/\Acopy #{table_name} from/
|
248
|
+
/\Acopy #{table_name}(\(.+\))? from/
|
237
249
|
end
|
238
250
|
|
239
251
|
flexmock(Fluent::RedshiftOutput::RedshiftConnection).new_instances do |conn|
|
@@ -274,7 +286,8 @@ class RedshiftOutputTest < Test::Unit::TestCase
|
|
274
286
|
}
|
275
287
|
assert_equal expected_data, data
|
276
288
|
},
|
277
|
-
:acl => :bucket_owner_full_control
|
289
|
+
:acl => :bucket_owner_full_control,
|
290
|
+
:server_side_encryption => nil
|
278
291
|
).and_return { true }
|
279
292
|
|
280
293
|
# create mock of s3 object collection
|
@@ -476,6 +489,23 @@ class RedshiftOutputTest < Test::Unit::TestCase
|
|
476
489
|
}
|
477
490
|
end
|
478
491
|
|
492
|
+
def test_write_with_json_with_copy_columns
|
493
|
+
setup_mocks(%[val_a\tval_b\n])
|
494
|
+
setup_tempfile_mock_to_be_closed
|
495
|
+
d_json = create_driver(CONFIG_JSON + "\n redshift_copy_columns key_a,key_b")
|
496
|
+
emit_json(d_json)
|
497
|
+
assert_equal true, d_json.run
|
498
|
+
end
|
499
|
+
|
500
|
+
def test_write_with_json_uknown_columns_in_copy_columns
|
501
|
+
setup_mocks("")
|
502
|
+
d_json = create_driver(CONFIG_JSON + "\n redshift_copy_columns key_a,key_z")
|
503
|
+
emit_json(d_json)
|
504
|
+
assert_raise(Fluent::ConfigError, "missing columns included in redshift_copy_columns - missing columns:\"key_z\"") {
|
505
|
+
d_json.run
|
506
|
+
}
|
507
|
+
end
|
508
|
+
|
479
509
|
def test_write_with_json_fetch_column_with_schema
|
480
510
|
setup_mocks(%[val_a\tval_b\t\\N\t\\N\t\\N\t\\N\t\\N\t\\N\n\\N\t\\N\tval_c\tval_d\t\\N\t\\N\t\\N\t\\N\n],
|
481
511
|
schema_name: 'test_schema')
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fluent-plugin-redshift
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Masashi Miyazaki
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-07-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: fluentd
|
@@ -135,7 +135,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
135
135
|
version: '0'
|
136
136
|
requirements: []
|
137
137
|
rubyforge_project:
|
138
|
-
rubygems_version: 2.
|
138
|
+
rubygems_version: 2.0.14.1
|
139
139
|
signing_key:
|
140
140
|
specification_version: 4
|
141
141
|
summary: Amazon Redshift output plugin for Fluentd
|