fluent-plugin-redshift 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.1
1
+ 0.0.2
@@ -4,6 +4,9 @@ module Fluent
4
4
  class RedshiftOutput < BufferedOutput
5
5
  Fluent::Plugin.register_output('redshift', self)
6
6
 
7
+ # ignore load table error. (invalid data format)
8
+ IGNORE_REDSHIFT_ERROR_REGEXP = /^ERROR: Load into table '[^']+' failed\./
9
+
7
10
  def initialize
8
11
  super
9
12
  require 'aws-sdk'
@@ -34,6 +37,8 @@ class RedshiftOutput < BufferedOutput
34
37
  # file format
35
38
  config_param :file_type, :string, :default => nil # json, tsv, csv
36
39
  config_param :delimiter, :string, :default => nil
40
+ # for debug
41
+ config_param :log_suffix, :string, :default => ''
37
42
 
38
43
  def configure(conf)
39
44
  super
@@ -48,8 +53,8 @@ class RedshiftOutput < BufferedOutput
48
53
  password:@redshift_password
49
54
  }
50
55
  @delimiter = determine_delimiter(@file_type) if @delimiter.nil? or @delimiter.empty?
51
- $log.debug "redshift file_type:#{@file_type} delimiter:'#{@delimiter}'"
52
- @copy_sql_template = "copy #{@redshift_tablename} from '%s' CREDENTIALS 'aws_access_key_id=#{@aws_key_id};aws_secret_access_key=%s' delimiter '#{@delimiter}' REMOVEQUOTES GZIP;"
56
+ $log.debug format_log("redshift file_type:#{@file_type} delimiter:'#{@delimiter}'")
57
+ @copy_sql_template = "copy #{@redshift_tablename} from '%s' CREDENTIALS 'aws_access_key_id=#{@aws_key_id};aws_secret_access_key=%s' delimiter '#{@delimiter}' GZIP TRUNCATECOLUMNS ESCAPE FILLRECORD ACCEPTANYDATE;"
53
58
  end
54
59
 
55
60
  def start
@@ -69,6 +74,8 @@ class RedshiftOutput < BufferedOutput
69
74
  end
70
75
 
71
76
  def write(chunk)
77
+ $log.debug format_log("start creating gz.")
78
+
72
79
  # create a gz file
73
80
  tmp = Tempfile.new("s3-")
74
81
  tmp = (json?) ? create_gz_file_from_json(tmp, chunk, @delimiter)
@@ -76,8 +83,8 @@ class RedshiftOutput < BufferedOutput
76
83
 
77
84
  # no data -> skip
78
85
  unless tmp
79
- $log.debug "received no valid data. "
80
- return
86
+ $log.debug format_log("received no valid data. ")
87
+ return false # for debug
81
88
  end
82
89
 
83
90
  # create a file path with time format
@@ -89,18 +96,25 @@ class RedshiftOutput < BufferedOutput
89
96
  # copy gz on s3 to redshift
90
97
  s3_uri = "s3://#{@s3_bucket}/#{s3path}"
91
98
  sql = @copy_sql_template % [s3_uri, @aws_sec_key]
92
- $log.debug "start copying. s3_uri=#{s3_uri}"
99
+ $log.debug format_log("start copying. s3_uri=#{s3_uri}")
93
100
  conn = nil
94
101
  begin
95
102
  conn = PG.connect(@db_conf)
96
103
  conn.exec(sql)
97
- $log.info "completed copying to redshift. s3_uri=#{s3_uri}"
104
+ $log.info format_log("completed copying to redshift. s3_uri=#{s3_uri}")
98
105
  rescue PG::Error => e
99
- $log.error "failed to copy data into redshift. sql=#{s3_uri}", :error=>e.to_s
100
- raise e if e.result.nil? # retry if connection errors
106
+ $log.error format_log("failed to copy data into redshift. s3_uri=#{s3_uri}"), :error=>e.to_s
107
+ raise e unless e.to_s =~ IGNORE_REDSHIFT_ERROR_REGEXP
108
+ return false # for debug
101
109
  ensure
102
110
  conn.close rescue nil if conn
103
111
  end
112
+ true # for debug
113
+ end
114
+
115
+ protected
116
+ def format_log(message)
117
+ (@log_suffix and not @log_suffix.empty?) ? "#{message} #{@log_suffix}" : message
104
118
  end
105
119
 
106
120
  private
@@ -125,27 +139,24 @@ class RedshiftOutput < BufferedOutput
125
139
  if redshift_table_columns == nil
126
140
  raise "failed to fetch the redshift table definition."
127
141
  elsif redshift_table_columns.empty?
128
- $log.warn "no table on redshift. table_name=#{@redshift_tablename}"
142
+ $log.warn format_log("no table on redshift. table_name=#{@redshift_tablename}")
129
143
  return nil
130
144
  end
131
145
 
132
146
  # convert json to tsv format text
133
- table_texts = ""
134
- chunk.msgpack_each do |record|
135
- begin
136
- table_texts << json_to_table_text(redshift_table_columns, record[@record_log_tag], delimiter)
137
- rescue => e
138
- $log.error "failed to create table text from json. text=(#{record[@record_log_tag]})", :error=>$!.to_s
139
- $log.error_backtrace
140
- end
141
- end
142
- return nil if table_texts.empty?
143
-
144
- # create gz
145
147
  gzw = nil
146
148
  begin
147
149
  gzw = Zlib::GzipWriter.new(dst_file)
148
- gzw.write(table_texts)
150
+ chunk.msgpack_each do |record|
151
+ begin
152
+ tsv_text = json_to_table_text(redshift_table_columns, record[@record_log_tag], delimiter)
153
+ gzw.write(tsv_text) if tsv_text and not tsv_text.empty?
154
+ rescue => e
155
+ $log.error format_log("failed to create table text from json. text=(#{record[@record_log_tag]})"), :error=>$!.to_s
156
+ $log.error_backtrace
157
+ end
158
+ end
159
+ return nil unless gzw.pos > 0
149
160
  ensure
150
161
  gzw.close rescue nil if gzw
151
162
  end
@@ -185,7 +196,7 @@ class RedshiftOutput < BufferedOutput
185
196
  begin
186
197
  json_obj = JSON.parse(json_text)
187
198
  rescue => e
188
- $log.warn "failed to parse json. ", :error=>e.to_s
199
+ $log.warn format_log("failed to parse json. "), :error=>e.to_s
189
200
  return ""
190
201
  end
191
202
  return "" unless json_obj
@@ -198,19 +209,22 @@ class RedshiftOutput < BufferedOutput
198
209
  val.to_s unless val.nil?
199
210
  end
200
211
  if val_list.all?{|v| v.nil? or v.empty?}
201
- $log.warn "no data match for table columns on redshift. json_text=#{json_text} table_columns=#{redshift_table_columns}"
212
+ $log.warn format_log("no data match for table columns on redshift. json_text=#{json_text} table_columns=#{redshift_table_columns}")
202
213
  return ""
203
214
  end
204
215
 
205
- # generate tsv text
206
- begin
207
- CSV.generate(:col_sep=>delimiter, :quote_char => '"') do |row|
208
- row << val_list # inlude new line
216
+ generate_line_with_delimiter(val_list, delimiter)
217
+ end
218
+
219
+ def generate_line_with_delimiter(val_list, delimiter)
220
+ val_list = val_list.collect do |val|
221
+ if val.nil? or val.empty?
222
+ ""
223
+ else
224
+ val.gsub(/\\/, "\\\\\\").gsub(/\t/, "\\\t").gsub(/\n/, "\\\n") # escape tab, newline and backslash
209
225
  end
210
- rescue => e
211
- $log.debug "failed to generate csv val_list:#{val_list} delimiter:(#{delimiter})"
212
- raise e
213
226
  end
227
+ val_list.join(delimiter) + "\n"
214
228
  end
215
229
 
216
230
  def create_s3path(bucket, path)
@@ -26,6 +26,7 @@ class RedshiftOutputTest < Test::Unit::TestCase
26
26
  redshift_tablename test_table
27
27
  buffer_type memory
28
28
  utc
29
+ log_suffix id:5 host:localhost
29
30
  ]
30
31
  CONFIG_CSV= %[
31
32
  #{CONFIG_BASE}
@@ -127,6 +128,10 @@ class RedshiftOutputTest < Test::Unit::TestCase
127
128
  assert_equal "pipe", d4.instance.file_type
128
129
  assert_equal "|", d4.instance.delimiter
129
130
  end
131
+ def test_configure_no_log_suffix
132
+ d = create_driver(CONFIG_CSV.gsub(/ *log_suffix *.+$/, ''))
133
+ assert_equal "", d.instance.log_suffix
134
+ end
130
135
 
131
136
  def emit_csv(d)
132
137
  d.emit(RECORD_CSV_A, DEFAULT_TIME)
@@ -231,30 +236,38 @@ class RedshiftOutputTest < Test::Unit::TestCase
231
236
  setup_mocks(%[val_a,val_b,val_c,val_d\nval_e,val_f,val_g,val_h\n])
232
237
  d_csv = create_driver
233
238
  emit_csv(d_csv)
234
- d_csv.run
239
+ assert_equal true, d_csv.run
235
240
  end
236
241
 
237
242
  def test_write_with_json
238
243
  setup_mocks(%[val_a\tval_b\t\t\t\t\t\t\n\t\tval_c\tval_d\t\t\t\t\n])
239
244
  d_json = create_driver(CONFIG_JSON)
240
245
  emit_json(d_json)
241
- d_json.run
246
+ assert_equal true, d_json.run
242
247
  end
243
248
 
244
249
  def test_write_with_json_hash_value
245
- setup_mocks("val_a\t\"{\"\"foo\"\":\"\"var\"\"}\"\t\t\t\t\t\t\n\t\tval_c\tval_d\t\t\t\t\n")
250
+ setup_mocks("val_a\t{\"foo\":\"var\"}\t\t\t\t\t\t\n\t\tval_c\tval_d\t\t\t\t\n")
246
251
  d_json = create_driver(CONFIG_JSON)
247
252
  d_json.emit({"log" => %[{"key_a" : "val_a", "key_b" : {"foo" : "var"}}]} , DEFAULT_TIME)
248
253
  d_json.emit(RECORD_JSON_B, DEFAULT_TIME)
249
- d_json.run
254
+ assert_equal true, d_json.run
250
255
  end
251
256
 
252
257
  def test_write_with_json_array_value
253
- setup_mocks("val_a\t\"[\"\"foo\"\",\"\"var\"\"]\"\t\t\t\t\t\t\n\t\tval_c\tval_d\t\t\t\t\n")
258
+ setup_mocks("val_a\t[\"foo\",\"var\"]\t\t\t\t\t\t\n\t\tval_c\tval_d\t\t\t\t\n")
254
259
  d_json = create_driver(CONFIG_JSON)
255
260
  d_json.emit({"log" => %[{"key_a" : "val_a", "key_b" : ["foo", "var"]}]} , DEFAULT_TIME)
256
261
  d_json.emit(RECORD_JSON_B, DEFAULT_TIME)
257
- d_json.run
262
+ assert_equal true, d_json.run
263
+ end
264
+
265
+ def test_write_with_json_including_tab_newline_quote
266
+ setup_mocks("val_a_with_\\\t_tab_\\\n_newline\tval_b_with_\\\\_quote\t\t\t\t\t\t\n\t\tval_c\tval_d\t\t\t\t\n")
267
+ d_json = create_driver(CONFIG_JSON)
268
+ d_json.emit({"log" => %[{"key_a" : "val_a_with_\\t_tab_\\n_newline", "key_b" : "val_b_with_\\\\_quote"}]} , DEFAULT_TIME)
269
+ d_json.emit(RECORD_JSON_B, DEFAULT_TIME)
270
+ assert_equal true, d_json.run
258
271
  end
259
272
 
260
273
  def test_write_with_json_no_data
@@ -262,7 +275,7 @@ class RedshiftOutputTest < Test::Unit::TestCase
262
275
  d_json = create_driver(CONFIG_JSON)
263
276
  d_json.emit("", DEFAULT_TIME)
264
277
  d_json.emit("", DEFAULT_TIME)
265
- d_json.run
278
+ assert_equal false, d_json.run
266
279
  end
267
280
 
268
281
  def test_write_with_json_invalid_one_line
@@ -270,7 +283,7 @@ class RedshiftOutputTest < Test::Unit::TestCase
270
283
  d_json = create_driver(CONFIG_JSON)
271
284
  d_json.emit({"log" => %[}}]}, DEFAULT_TIME)
272
285
  d_json.emit(RECORD_JSON_B, DEFAULT_TIME)
273
- d_json.run
286
+ assert_equal true, d_json.run
274
287
  end
275
288
 
276
289
  def test_write_with_json_no_available_data
@@ -278,7 +291,7 @@ class RedshiftOutputTest < Test::Unit::TestCase
278
291
  d_json = create_driver(CONFIG_JSON)
279
292
  d_json.emit(RECORD_JSON_A, DEFAULT_TIME)
280
293
  d_json.emit({"log" => %[{"key_o" : "val_o", "key_p" : "val_p"}]}, DEFAULT_TIME)
281
- d_json.run
294
+ assert_equal true, d_json.run
282
295
  end
283
296
 
284
297
  def test_write_redshift_connection_error
@@ -300,14 +313,14 @@ class RedshiftOutputTest < Test::Unit::TestCase
300
313
  }
301
314
  end
302
315
 
303
- def test_write_redshift_logic_error
316
+ def test_write_redshift_load_error
304
317
  PG::Error.module_eval { attr_accessor :result}
305
318
  def PG.connect(dbinfo)
306
319
  return Class.new do
307
320
  def initialize(return_keys=[]); end
308
321
  def exec(sql)
309
- error = PG::Error.new("redshift logic error")
310
- error.result = "logic error"
322
+ error = PG::Error.new("ERROR: Load into table 'apache_log' failed. Check 'stl_load_errors' system table for details.")
323
+ error.result = "ERROR: Load into table 'apache_log' failed. Check 'stl_load_errors' system table for details."
311
324
  raise error
312
325
  end
313
326
  def close; end
@@ -317,9 +330,7 @@ class RedshiftOutputTest < Test::Unit::TestCase
317
330
 
318
331
  d_csv = create_driver
319
332
  emit_csv(d_csv)
320
- assert_nothing_raised {
321
- d_csv.run
322
- }
333
+ assert_equal false, d_csv.run
323
334
  end
324
335
 
325
336
  def test_write_with_json_redshift_connection_error
@@ -356,9 +367,7 @@ class RedshiftOutputTest < Test::Unit::TestCase
356
367
 
357
368
  d_json = create_driver(CONFIG_JSON)
358
369
  emit_json(d_json)
359
- assert_nothing_raised {
360
- d_json.run
361
- }
370
+ assert_equal false, d_json.run
362
371
  end
363
372
 
364
373
  def test_write_with_json_failed_to_get_columns
@@ -379,17 +388,4 @@ class RedshiftOutputTest < Test::Unit::TestCase
379
388
  }
380
389
  end
381
390
 
382
- def test_write_with_json_failed_to_generate_tsv
383
- flexmock(CSV).should_receive(:generate).with_any_args.
384
- and_return {
385
- raise "failed to generate tsv."
386
- }
387
- setup_s3_mock("")
388
-
389
- d_json = create_driver(CONFIG_JSON)
390
- emit_json(d_json)
391
- assert_nothing_raised {
392
- d_json.run
393
- }
394
- end
395
391
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fluent-plugin-redshift
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-03-07 00:00:00.000000000 Z
12
+ date: 2013-06-06 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: fluentd