fluent-plugin-redshift 0.0.5 → 0.0.6
This diff shows the changes between two publicly released versions of this package, as they appear in the public registry that hosts it. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/VERSION +1 -1
- data/lib/fluent/plugin/out_redshift.rb +114 -38
- data/test/plugin/test_out_redshift.rb +50 -101
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: ae21e38c3e70d5c36c0c52bcf3e38183756e4534
+  data.tar.gz: a2ebf5bc56f51b9c5f4dbd55fcf07499d9bc9bd8
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 064a44a6ed9086b1aef44eded321e984a9ccb8afc530e46b03ad99a77b2a2b384c24900ceea220f92c96871c1db18c9503910aff50ccc2b36d042e3d6077d80d
+  data.tar.gz: c85eb3832be2f660f8974b535e95430a51c2a13556a97d3f417eec816a05eb78c015ba079080847a46f48b3f6350cbddd3a06bae12464bcdcba4a26776e28390
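Both checksum families change because every release rewrites the gem's metadata.gz and data.tar.gz members. A minimal verification sketch (not part of the gem), assuming the .gem archive has already been unpacked with `tar -xf fluent-plugin-redshift-0.0.6.gem` so that metadata.gz, data.tar.gz and checksums.yaml.gz sit in the working directory:

require 'digest'
require 'yaml'
require 'zlib'

# checksums.yaml.gz holds the same SHA1/SHA512 map shown in the diff above.
checksums = YAML.load(Zlib::GzipReader.open('checksums.yaml.gz', &:read))

%w[metadata.gz data.tar.gz].each do |member|
  actual = Digest::SHA512.file(member).hexdigest
  status = actual == checksums['SHA512'][member] ? 'OK' : 'MISMATCH'
  puts "#{member}: #{status}"
end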
data/VERSION
CHANGED
@@ -1 +1 @@
-0.0.5
+0.0.6
data/lib/fluent/plugin/out_redshift.rb
CHANGED
@@ -59,7 +59,8 @@ class RedshiftOutput < BufferedOutput
     }
     @delimiter = determine_delimiter(@file_type) if @delimiter.nil? or @delimiter.empty?
     $log.debug format_log("redshift file_type:#{@file_type} delimiter:'#{@delimiter}'")
-    @
+    @table_name_with_schema = [@redshift_schemaname, @redshift_tablename].compact.join('.')
+    @copy_sql_template = "copy #{@table_name_with_schema} from '%s' CREDENTIALS 'aws_access_key_id=#{@aws_key_id};aws_secret_access_key=%s' delimiter '#{@delimiter}' GZIP ESCAPE #{@redshift_copy_base_options} #{@redshift_copy_options};"
   end

   def start
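configure now precomputes both the schema-qualified table name ([schema, table].compact.join('.') simply drops a nil schema) and a COPY statement template whose two %s slots are filled per upload. An illustrative expansion (all values below are placeholders, not taken from the gem):

# Hypothetical values, mirroring the template built in configure above.
table_name_with_schema = ['public', 'apache_log'].compact.join('.')
copy_sql_template = "copy #{table_name_with_schema} from '%s' CREDENTIALS " \
                    "'aws_access_key_id=AKIAEXAMPLE;aws_secret_access_key=%s' delimiter ',' GZIP ESCAPE ;"

sql = copy_sql_template % ['s3://my-bucket/logs/20150805.gz', 'SECRET_KEY']
# => "copy public.apache_log from 's3://my-bucket/logs/20150805.gz' CREDENTIALS ..."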
@@ -72,6 +73,7 @@ class RedshiftOutput < BufferedOutput
     options[:s3_endpoint] = @s3_endpoint if @s3_endpoint
     @s3 = AWS::S3.new(options)
     @bucket = @s3.buckets[@s3_bucket]
+    @redshift_connection = RedshiftConnection.new(@db_conf)
   end

   def format(tag, time, record)
@@ -118,25 +120,26 @@ class RedshiftOutput < BufferedOutput
     $log.debug format_log("start copying. s3_uri=#{s3_uri}")

     begin
-
-      conn.exec(sql)
+      @redshift_connection.exec(sql)
       $log.info format_log("completed copying to redshift. s3_uri=#{s3_uri}")
-    rescue
-
-
-
-
-
+    rescue RedshiftError => e
+      if e.to_s =~ IGNORE_REDSHIFT_ERROR_REGEXP
+        $log.error format_log("failed to copy data into redshift due to load error. s3_uri=#{s3_uri}"), :error=>e.to_s
+        return false # for debug
+      end
+      raise e
     end
     true # for debug
   end

   protected
+
   def format_log(message)
     (@log_suffix and not @log_suffix.empty?) ? "#{message} #{@log_suffix}" : message
   end

   private
+
   def json?
     @file_type == 'json'
   end
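The rewritten rescue distinguishes permanent load failures from transient ones: a load error (bad rows) is logged and swallowed so fluentd does not retry the same broken chunk forever, while anything else is re-raised so the buffer retries. A condensed sketch of that flow; IGNORE_REDSHIFT_ERROR_REGEXP is defined elsewhere in out_redshift.rb, so the pattern below is only an assumption:

# Sketch only; this regexp is an assumed stand-in for the plugin's constant,
# and log_load_error is a hypothetical logging helper.
IGNORE_REDSHIFT_ERROR_REGEXP = /^ERROR:  Load into table '[^']+' failed\./

def try_copy(sql)
  @redshift_connection.exec(sql)
  true                   # chunk committed
rescue RedshiftError => e
  if e.to_s =~ IGNORE_REDSHIFT_ERROR_REGEXP
    log_load_error(e)    # bad data: returning instead of raising commits the chunk
    false
  else
    raise e              # transient error: fluentd will retry the chunk
  end
end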
@@ -158,11 +161,11 @@ class RedshiftOutput < BufferedOutput

   def create_gz_file_from_structured_data(dst_file, chunk, delimiter)
     # fetch the table definition from redshift
-    redshift_table_columns = fetch_table_columns
+    redshift_table_columns = @redshift_connection.fetch_table_columns(@redshift_tablename, @redshift_schemaname)
     if redshift_table_columns == nil
       raise "failed to fetch the redshift table definition."
     elsif redshift_table_columns.empty?
-      $log.warn format_log("no table on redshift. table_name=#{table_name_with_schema}")
+      $log.warn format_log("no table on redshift. table_name=#{@table_name_with_schema}")
       return nil
     end

@@ -204,27 +207,6 @@ class RedshiftOutput < BufferedOutput
     end
   end

-  def fetch_table_columns
-    begin
-      columns = nil
-      conn = PG.connect(@db_conf)
-      conn.exec(fetch_columns_sql_with_schema) do |result|
-        columns = result.collect{|row| row['column_name']}
-      end
-      columns
-    ensure
-      conn.close rescue nil if conn
-    end
-  end
-
-  def fetch_columns_sql_with_schema
-    @fetch_columns_sql ||= if @redshift_schemaname
-      "select column_name from INFORMATION_SCHEMA.COLUMNS where table_schema = '#{@redshift_schemaname}' and table_name = '#{@redshift_tablename}' order by ordinal_position;"
-    else
-      "select column_name from INFORMATION_SCHEMA.COLUMNS where table_name = '#{@redshift_tablename}' order by ordinal_position;"
-    end
-  end
-
   def json_to_hash(json_text)
     return nil if json_text.to_s.empty?

@@ -279,14 +261,108 @@ class RedshiftOutput < BufferedOutput
     s3path
   end

-
-
-
-
-
-
+  class RedshiftError < StandardError
+    def initialize(msg)
+      case msg
+      when PG::Error
+        @pg_error = msg
+        super(msg.to_s)
+        set_backtrace(msg.backtrace)
+      else
+        super
+      end
+    end
+
+    attr_accessor :pg_error
+  end
+
+  class RedshiftConnection
+    REDSHIFT_CONNECT_TIMEOUT = 10.0 # 10sec
+
+    def initialize(db_conf)
+      @db_conf = db_conf
+      @connection = nil
+    end
+
+    attr_reader :db_conf
+
+    def fetch_table_columns(table_name, schema_name)
+      columns = nil
+      exec(fetch_columns_sql(table_name, schema_name)) do |result|
+        columns = result.collect{|row| row['column_name']}
+      end
+      columns
+    end
+
+    def exec(sql, &block)
+      conn = @connection
+      conn = create_redshift_connection if conn.nil?
+      if block
+        conn.exec(sql) {|result| block.call(result)}
+      else
+        conn.exec(sql)
+      end
+    rescue PG::Error => e
+      raise RedshiftError.new(e)
+    ensure
+      conn.close if conn && @connection.nil?
+    end
+
+    def connect_start
+      @connection = create_redshift_connection
+    end
+
+    def close
+      @connection.close rescue nil if @connection
+      @connection = nil
+    end
+
+    private
+
+    def create_redshift_connection
+      hostaddr = IPSocket.getaddress(db_conf[:host])
+      db_conf[:hostaddr] = hostaddr
+
+      conn = PG::Connection.connect_start(db_conf)
+      raise RedshiftError.new("Unable to create a new connection.") unless conn
+      if conn.status == PG::CONNECTION_BAD
+        raise RedshiftError.new("Connection failed: %s" % [ conn.error_message ])
+      end
+
+      socket = conn.socket_io
+      poll_status = PG::PGRES_POLLING_WRITING
+      until poll_status == PG::PGRES_POLLING_OK || poll_status == PG::PGRES_POLLING_FAILED
+        case poll_status
+        when PG::PGRES_POLLING_READING
+          IO.select([socket], nil, nil, REDSHIFT_CONNECT_TIMEOUT) or
+            raise RedshiftError.new("Asynchronous connection timed out!(READING)")
+        when PG::PGRES_POLLING_WRITING
+          IO.select(nil, [socket], nil, REDSHIFT_CONNECT_TIMEOUT) or
+            raise RedshiftError.new("Asynchronous connection timed out!(WRITING)")
+        end
+        poll_status = conn.connect_poll
+      end
+
+      unless conn.status == PG::CONNECTION_OK
+        raise RedshiftError, ("Connect failed: %s" % [conn.error_message.to_s.lines.uniq.join(" ")])
+      end
+
+      conn
+    rescue => e
+      conn.close rescue nil if conn
+      raise RedshiftError.new(e) if e.kind_of?(PG::Error)
+      raise e
+    end
+
+    def fetch_columns_sql(table_name, schema_name = nil)
+      sql = "select column_name from INFORMATION_SCHEMA.COLUMNS where table_name = '#{table_name}'"
+      sql << " and table_schema = '#{schema_name}'" if schema_name
+      sql << " order by ordinal_position;"
+      sql
+    end
   end
 end


+
 end
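RedshiftConnection replaces the old blocking PG.connect with libpq's asynchronous connect API, so a hung Redshift endpoint can stall a flush for at most REDSHIFT_CONNECT_TIMEOUT seconds instead of indefinitely; resolving the host to :hostaddr up front also keeps DNS lookup out of the non-blocking phase. A standalone sketch of the same pattern (connection parameters are placeholders):

require 'pg'
require 'socket'

TIMEOUT = 10.0
conf = { host: 'example.redshift.amazonaws.com', port: 5439,
         dbname: 'dev', user: 'fluent', password: 'secret' }
conf[:hostaddr] = IPSocket.getaddress(conf[:host]) # resolve DNS before connecting

conn = PG::Connection.connect_start(conf)
status = PG::PGRES_POLLING_WRITING
until status == PG::PGRES_POLLING_OK || status == PG::PGRES_POLLING_FAILED
  reading = (status == PG::PGRES_POLLING_READING)
  ready = IO.select(reading ? [conn.socket_io] : nil,
                    reading ? nil : [conn.socket_io], nil, TIMEOUT)
  raise "connect timed out after #{TIMEOUT}s" unless ready
  status = conn.connect_poll # advance the libpq handshake one step
end
raise conn.error_message unless conn.status == PG::CONNECTION_OK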
data/test/plugin/test_out_redshift.rb
CHANGED
@@ -12,6 +12,7 @@ class RedshiftOutputTest < Test::Unit::TestCase
     require 'pg'
     require 'csv'
     Fluent::Test.setup
+    PG::Error.module_eval { attr_accessor :result}
   end

   CONFIG_BASE= %[
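The module_eval line reopens PG::Error for the whole test run: under the stock pg gem, #result is read-only (libpq populates it when a query fails), so making it writable lets the tests fabricate realistic load-error objects without a live connection:

# Only works after the attr_accessor patch above; the stock pg gem
# provides no PG::Error#result= writer.
error = PG::Error.new("ERROR: Load into table 'test_table' failed. Check 'stl_load_errors' system table for details.")
error.result = error.message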
@@ -211,54 +212,45 @@ class RedshiftOutputTest < Test::Unit::TestCase
     d_msgpack.run
   end

-
-
-
-
-
-
+  def setup_redshift_connection_mock(options = {})
+    options ||= {}
+    column_names = options[:column_names] || ['key_a', 'key_b', 'key_c', 'key_d', 'key_e', 'key_f', 'key_g', 'key_h']
+    schema_name = options[:schema_name]
+    table_name = options[:table_name] || 'test_table'
+    exec_sql_proc = options[:exec_sql_proc]

-
-    if
-      /\Aselect column_name from INFORMATION_SCHEMA.COLUMNS where
+    column_list_query_regex =
+      if schema_name
+        /\Aselect column_name from INFORMATION_SCHEMA.COLUMNS where table_name = '#{table_name}' and table_schema = '#{schema_name}'/
       else
-        /\Aselect column_name from INFORMATION_SCHEMA.COLUMNS where table_name = '#{
+        /\Aselect column_name from INFORMATION_SCHEMA.COLUMNS where table_name = '#{table_name}'/
       end
-
-
-
-    if @target_schema
-      /\Acopy #{@target_schema}.#{@target_table} from/
+    copy_query_regex =
+      if schema_name
+        /\Acopy #{schema_name}.#{table_name} from/
       else
-        /\Acopy #{
+        /\Acopy #{table_name} from/
       end
-    end

-
-
-    if
-
+    flexmock(Fluent::RedshiftOutput::RedshiftConnection).new_instances do |conn|
+      conn.should_receive(:exec).and_return do |sql, block|
+        if exec_sql_proc
+          exec_sql_proc.call(sql, block)
+        elsif block
+          if sql =~ column_list_query_regex
+            block.call column_names.collect{|key| {'column_name' => key}}
+          else
+            block.call []
+          end
         else
-
-
-
-
-
-          error.result = "ERROR: Load into table '#{@target_table}' failed. Check 'stl_load_errors' system table for details."
-          raise error
+          unless sql =~ copy_query_regex
+            error = PG::Error.new("ERROR: Load into table '#{@target_table}' failed. Check 'stl_load_errors' system table for details.")
+            error.result = "ERROR: Load into table '#{@target_table}' failed. Check 'stl_load_errors' system table for details."
+            raise Fluent::RedshiftOutput::RedshiftError.new(error)
+          end
         end
       end
     end
-
-    def close
-    end
-  end
-
-  def setup_pg_mock
-    # create mock of PG
-    def PG.connect(dbinfo)
-      return PGConnectionMock.new
-    end
   end

   def setup_s3_mock(expected_data)
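The helper above replaces the removed PGConnectionMock/setup_pg_mock pair: instead of redefining PG.connect globally, it uses flexmock's new_instances so that the RedshiftConnection the plugin builds in #start receives a stubbed #exec (flexmock passes a method's block through as the last argument, which is why the stub takes |sql, block|). A small illustration of that feature, assuming the flexmock test_unit integration these tests already load:

require 'flexmock/test_unit'

# Every RedshiftConnection created after this point gets the stubbed #exec.
flexmock(Fluent::RedshiftOutput::RedshiftConnection).new_instances do |conn|
  conn.should_receive(:exec).and_return {|sql, block| block.call([]) if block }
end

conn = Fluent::RedshiftOutput::RedshiftConnection.new({})
conn.exec('select 1') {|rows| p rows } # prints [] -- no database involved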
@@ -305,9 +297,10 @@ class RedshiftOutputTest < Test::Unit::TestCase
     flexmock(Tempfile).new_instances.should_receive(:close!).at_least.once
   end

-  def setup_mocks(expected_data)
-
-    setup_s3_mock(expected_data)
+  def setup_mocks(expected_data, options = {})
+    setup_redshift_connection_mock(options)
+    setup_s3_mock(expected_data)
+  end

   def test_write_with_csv
     setup_mocks(%[val_a,val_b,val_c,val_d\nval_e,val_f,val_g,val_h\n])
@@ -421,38 +414,21 @@ class RedshiftOutputTest < Test::Unit::TestCase
   end

   def test_write_redshift_connection_error
-
-
-      def initialize(return_keys=[]); end
-      def exec(sql)
-        raise PG::Error, "redshift connection error"
-      end
-      def close; end
-    end.new
-    end
-    setup_s3_mock(%[val_a,val_b,val_c,val_d\nval_e,val_f,val_g,val_h\n])
-
+    setup_mocks(%[val_a,val_b,val_c,val_d\nval_e,val_f,val_g,val_h\n],
+                exec_sql_proc: Proc.new {|sql, block| raise Fluent::RedshiftOutput::RedshiftError, "redshift connection error" })
     d_csv = create_driver
     emit_csv(d_csv)
-    assert_raise(
+    assert_raise(Fluent::RedshiftOutput::RedshiftError) {
       d_csv.run
     }
   end

   def test_write_redshift_load_error
-
-
-
-
-
-      error = PG::Error.new("ERROR: Load into table 'apache_log' failed. Check 'stl_load_errors' system table for details.")
-      error.result = "ERROR: Load into table 'apache_log' failed. Check 'stl_load_errors' system table for details."
-      raise error
-      end
-      def close; end
-    end.new
-    end
-    setup_s3_mock(%[val_a,val_b,val_c,val_d\nval_e,val_f,val_g,val_h\n])
+    setup_mocks(%[val_a,val_b,val_c,val_d\nval_e,val_f,val_g,val_h\n],
+                exec_sql_proc: Proc.new {|sql, block|
+                  msg = "ERROR: Load into table 'apache_log' failed. Check 'stl_load_errors' system table for details."
+                  raise Fluent::RedshiftOutput::RedshiftError.new(msg)
+                })

     d_csv = create_driver
     emit_csv(d_csv)
@@ -460,36 +436,19 @@ class RedshiftOutputTest < Test::Unit::TestCase
   end

   def test_write_with_json_redshift_connection_error
-
-
-      def initialize(return_keys=[]); end
-      def exec(sql, &block)
-        error = PG::Error.new("redshift connection error")
-        raise error
-      end
-      def close; end
-    end.new
-    end
-    setup_s3_mock(%[val_a,val_b,val_c,val_d\nval_e,val_f,val_g,val_h\n])
+    setup_mocks(%[val_a,val_b,val_c,val_d\nval_e,val_f,val_g,val_h\n],
+                exec_sql_proc: Proc.new {|sql, block| raise Fluent::RedshiftOutput::RedshiftError.new("redshift connection error")})

     d_json = create_driver(CONFIG_JSON)
     emit_json(d_json)
-    assert_raise(
+    assert_raise(Fluent::RedshiftOutput::RedshiftError) {
       d_json.run
     }
   end

   def test_write_with_json_no_table_on_redshift
-
-
-      def initialize(return_keys=[]); end
-      def exec(sql, &block)
-        yield [] if block_given?
-      end
-      def close; end
-    end.new
-    end
-    setup_s3_mock(%[val_a,val_b,val_c,val_d\nval_e,val_f,val_g,val_h\n])
+    setup_mocks(%[val_a,val_b,val_c,val_d\nval_e,val_f,val_g,val_h\n],
+                exec_sql_proc: Proc.new {|sql, block| block.call [] if block })

     d_json = create_driver(CONFIG_JSON)
     emit_json(d_json)
@@ -497,15 +456,7 @@ class RedshiftOutputTest < Test::Unit::TestCase
   end

   def test_write_with_json_failed_to_get_columns
-
-    return Class.new do
-      def initialize(return_keys=[]); end
-      def exec(sql, &block)
-      end
-      def close; end
-    end.new
-    end
-    setup_s3_mock("")
+    setup_mocks("", exec_sql_proc: Proc.new {|sql, block| nil})

     d_json = create_driver(CONFIG_JSON)
     emit_json(d_json)
@@ -515,10 +466,8 @@ class RedshiftOutputTest < Test::Unit::TestCase
   end

   def test_write_with_json_fetch_column_with_schema
-
-
-    end
-    setup_s3_mock(%[val_a\tval_b\t\t\t\t\t\t\n\t\tval_c\tval_d\t\t\t\t\n])
+    setup_mocks(%[val_a\tval_b\t\t\t\t\t\t\n\t\tval_c\tval_d\t\t\t\t\n],
+                schema_name: 'test_schema')
     d_json = create_driver(CONFIG_JSON_WITH_SCHEMA)
     emit_json(d_json)
     assert_equal true, d_json.run
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: fluent-plugin-redshift
 version: !ruby/object:Gem::Version
-  version: 0.0.5
+  version: 0.0.6
 platform: ruby
 authors:
 - Masashi Miyazaki
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-05
+date: 2015-08-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: fluentd