fluent-plugin-hekk_redshift 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,295 @@
1
+ module Fluent
2
+
3
+ class HekkRedshiftOutput < BufferedOutput
4
+ Fluent::Plugin.register_output('hekk_redshift', self)
5
+
6
+ # ignore load table error. (invalid data format)
7
+ IGNORE_REDSHIFT_ERROR_REGEXP = /^ERROR: Load into table '[^']+' failed\./
8
+
9
# Creates the output plugin instance and loads the libraries it relies on.
#
# The requires are issued here rather than at file load time, so the gems are
# only needed once an instance is actually created.
def initialize
  super
  require 'aws-sdk'
  require 'zlib'
  require 'time'
  require 'tempfile'
  require 'pathname' # the S3 upload wraps the temp file path in a Pathname
  require 'pg'
  require 'json'
  require 'csv'
end
19
+
20
# Key of the record field that carries the log payload.
config_param :record_log_tag, :string, default: 'log'

# --- S3 ---
config_param :aws_key_id, :string
config_param :aws_sec_key, :string
config_param :s3_bucket, :string
config_param :s3_endpoint, :string, default: nil
config_param :path, :string, default: ''
config_param :timestamp_key_format, :string, default: 'year=%Y/month=%m/day=%d/hour=%H/%Y%m%d-%H%M'
config_param :utc, :bool, default: false

# --- Redshift ---
config_param :redshift_host, :string
config_param :redshift_port, :integer, default: 5439
config_param :redshift_dbname, :string
config_param :redshift_user, :string
config_param :redshift_password, :string
config_param :redshift_tablename, :string
config_param :redshift_copy_command_tablename, :string
config_param :redshift_copy_command_columnname, :string
config_param :redshift_schemaname, :string, default: nil
config_param :redshift_copy_base_options, :string, default: 'FILLRECORD ACCEPTANYDATE TRUNCATECOLUMNS'
config_param :redshift_copy_options, :string, default: nil

# --- File format ---
config_param :file_type, :string, default: nil # json, tsv, csv, msgpack
config_param :delimiter, :string, default: nil

# --- Debugging ---
# Text appended to every log message emitted by this plugin.
config_param :log_suffix, :string, default: ''
46
+
47
# Post-processes the plugin configuration.
#
# Normalizes @path to "dir/" form (trailing slash, no leading slash), resolves
# the `utc` flag, collects the PG connection settings, picks the field
# delimiter and pre-renders the SQL templates that are filled in per chunk.
#
# @param conf [#[]] raw configuration; only conf['utc'] is read directly here
# @raise [Fluent::ConfigError] when no delimiter is configured and file_type
#   is not one of the known types
def configure(conf)
  super
  @path = "#{@path}/" unless @path.end_with?('/') # append last slash
  @path = @path[1..-1] if @path.start_with?('/') # remove head slash

  # Any present `utc` value used to force UTC, which meant `utc false` still
  # enabled it (the string "false" is truthy). Keep forcing UTC for a bare
  # flag or other values, but let an explicit false/no keep the parsed value.
  raw_utc = conf['utc']
  @utc = true if raw_utc && !%w[false no].include?(raw_utc.to_s.strip.downcase)

  @db_conf = {
    host: @redshift_host,
    port: @redshift_port,
    dbname: @redshift_dbname,
    user: @redshift_user,
    password: @redshift_password
  }
  @delimiter = determine_delimiter(@file_type) if @delimiter.nil? || @delimiter.empty?
  $log.debug format_log("redshift file_type:#{@file_type} delimiter:'#{@delimiter}'")
  # The two %s placeholders are the S3 URI and the AWS secret key.
  @copy_sql_template = "copy #{table_name_with_schema} from '%s' CREDENTIALS 'aws_access_key_id=#{@aws_key_id};aws_secret_access_key=%s' delimiter '#{@delimiter}' GZIP ESCAPE #{@redshift_copy_base_options} #{@redshift_copy_options};"
  # The %s placeholder is the (quote-escaped) copy statement.
  @insert_sql_template = "insert into #{@redshift_copy_command_tablename}(#{@redshift_copy_command_columnname}) values('%s');"
end
64
+
65
# Starts the plugin: builds the S3 client from the configured credentials
# (plus the optional endpoint) and looks up the target bucket.
def start
  super
  s3_options = { access_key_id: @aws_key_id, secret_access_key: @aws_sec_key }
  s3_options[:s3_endpoint] = @s3_endpoint if @s3_endpoint
  @s3 = AWS::S3.new(s3_options)
  @bucket = @s3.buckets[@s3_bucket]
end
76
+
77
# Serializes one event for buffering.
#
# - json:    the record is packed as-is.
# - msgpack: the record is wrapped under @record_log_tag, then packed.
# - other:   the @record_log_tag field is emitted as one text line.
#
# @param tag [Object] unused
# @param time [Object] unused
# @param record [#[], #to_msgpack] the event record
# @return [String] the serialized event
def format(tag, time, record)
  return record.to_msgpack if json?
  return { @record_log_tag => record }.to_msgpack if msgpack?

  "#{record[@record_log_tag]}\n"
end
86
+
87
# Flushes one buffer chunk: gzips it into a temp file, uploads that file to
# S3 and stores the corresponding COPY statement in Redshift.
#
# The temp file is always closed and unlinked, including when the chunk
# yields no data or when the upload raises.
#
# @param chunk [Object] buffer chunk handed to the gz helpers
# @return [Boolean] false when the chunk produced no data, true otherwise
#   (the return value is only meant for debugging)
def write(chunk)
  $log.debug format_log('start creating gz.')

  # Keep a handle on the Tempfile itself: the gz helpers may return nil, and
  # the file still has to be removed in that case.
  tmp = Tempfile.new('s3-')
  s3path = nil
  begin
    gz_file =
      if json? || msgpack?
        create_gz_file_from_structured_data(tmp, chunk, @delimiter)
      else
        create_gz_file_from_flat_data(tmp, chunk)
      end

    # no data -> skip
    unless gz_file
      $log.debug format_log('received no valid data. ')
      return false # for debug
    end

    # create a file path with time format
    s3path = create_s3path(@bucket, @path)

    # upload gz to s3
    @bucket.objects[s3path].write(Pathname.new(gz_file.path),
                                  :acl => :bucket_owner_full_control)
  ensure
    # close and unlink the temp file on every exit path
    tmp.close!
  end

  # copy gz on s3 to redshift
  s3_uri = "s3://#{@s3_bucket}/#{s3path}"
  copy_sql = @copy_sql_template % [s3_uri, @aws_sec_key]
  # "\\\\'" is the replacement text \\' which gsub renders as \' (escaped quote)
  sql = @insert_sql_template % [copy_sql.gsub(/'/, "\\\\'")]
  insert_sql_to_redshift(sql)
  true # for debug
end
122
+
123
+ protected
124
# Appends the configured log suffix to a message.
#
# @param message [String] the log text
# @return [String] "message suffix" when @log_suffix is set and non-empty,
#   otherwise the message itself
def format_log(message)
  suffix = @log_suffix
  return message unless suffix
  return message if suffix.empty?

  "#{message} #{suffix}"
end
127
+
128
+ private
129
# @return [Boolean] true when the configured file_type is 'json'
def json?
  'json' == @file_type
end
132
+
133
# @return [Boolean] true when the configured file_type is 'msgpack'
def msgpack?
  'msgpack' == @file_type
end
136
+
137
# Gzips the raw chunk contents into dst_file.
#
# @param dst_file [IO] destination the gzip stream is written to
# @param chunk [#write_to] buffer chunk that streams itself into an IO
# @return [IO] dst_file
def create_gz_file_from_flat_data(dst_file, chunk)
  writer = Zlib::GzipWriter.new(dst_file)
  begin
    chunk.write_to(writer)
  ensure
    # errors while closing the gzip stream are deliberately ignored
    begin
      writer.close
    rescue StandardError
      nil
    end
  end
  dst_file
end
147
+
148
# Converts a chunk of structured (json / msgpack) records into delimited text
# rows ordered by the Redshift table's columns, gzipped into dst_file.
#
# Records that fail to convert are logged and skipped.
#
# @param dst_file [IO] destination the gzip stream is written to
# @param chunk [#msgpack_each] buffer chunk yielding decoded records
# @param delimiter [String] field delimiter for the generated rows
# @return [IO, nil] dst_file, or nil when the table has no columns or no row
#   was written
# @raise [RuntimeError] when the table definition could not be fetched
def create_gz_file_from_structured_data(dst_file, chunk, delimiter)
  # fetch the table definition from redshift
  columns = fetch_table_columns
  raise 'failed to fetch the redshift table definition.' if columns.nil?

  if columns.empty?
    $log.warn format_log("no table on redshift. table_name=#{table_name_with_schema}")
    return nil
  end

  # convert each record to a delimited text row
  writer = nil
  begin
    writer = Zlib::GzipWriter.new(dst_file)
    chunk.msgpack_each do |record|
      begin
        payload = record[@record_log_tag]
        row = json? ? json_to_hash(payload) : payload
        line = hash_to_table_text(columns, row, delimiter)
        writer.write(line) if line && !line.empty?
      rescue
        message =
          if json?
            "failed to create table text from json. text=(#{record[@record_log_tag]})"
          else
            "failed to create table text from msgpack. text=(#{record[@record_log_tag]})"
          end
        $log.error format_log(message), :error=>$!.to_s

        $log.error_backtrace
      end
    end
    # nothing written -> report "no data" to the caller
    return nil if writer.pos <= 0
  ensure
    if writer
      # errors while closing the gzip stream are deliberately ignored
      begin
        writer.close
      rescue StandardError
        nil
      end
    end
  end
  dst_file
end
183
+
184
# Picks the default field delimiter for a file type.
#
# @param file_type [String, nil] configured file type
# @return [String] tab for json / msgpack / tsv, comma for csv
# @raise [Fluent::ConfigError] for any other file type
def determine_delimiter(file_type)
  return "\t" if %w[json msgpack tsv].include?(file_type)
  return ',' if 'csv' == file_type

  raise Fluent::ConfigError, "Invalid file_type:#{file_type}."
end
194
+
195
# Reads the target table's column names from Redshift.
#
# Opens a dedicated connection for the query and always closes it again.
#
# @return [Array<String>, nil] the 'column_name' value of every result row,
#   in query order
def fetch_table_columns
  connection = PG.connect(@db_conf)
  begin
    names = nil
    connection.exec(fetch_columns_sql_with_schema) do |result|
      names = result.map { |row| row['column_name'] }
    end
    names
  ensure
    # errors while closing the connection are deliberately ignored
    begin
      connection.close
    rescue StandardError
      nil
    end
  end
end
207
+
208
# Builds (once) the query listing the target table's columns in ordinal
# order, restricted to the configured schema when one is set.
#
# @return [String] the memoized SQL text
def fetch_columns_sql_with_schema
  return @fetch_columns_sql if @fetch_columns_sql

  @fetch_columns_sql =
    if @redshift_schemaname
      "select column_name from INFORMATION_SCHEMA.COLUMNS where table_schema = '#{@redshift_schemaname}' and table_name = '#{@redshift_tablename}' order by ordinal_position;"
    else
      "select column_name from INFORMATION_SCHEMA.COLUMNS where table_name = '#{@redshift_tablename}' order by ordinal_position;"
    end
end
215
+
216
# Parses a JSON document.
#
# @param json_text [String, nil] the JSON text
# @return [Object, nil] the parsed value; nil when the input is blank or
#   cannot be parsed (a warning is logged in the latter case)
def json_to_hash(json_text)
  return nil if json_text.to_s.empty?

  JSON.parse(json_text)
rescue => e
  $log.warn format_log('failed to parse json. '), :error => e.to_s
  # Return nil explicitly: otherwise the logger's return value would be
  # handed to the caller and could be mistaken for a parsed record.
  nil
end
223
+
224
# Renders one record as a delimited text row following the table's column
# order.
#
# Nested Hash / Array values are serialized as JSON; values whose string form
# is empty become empty cells.
#
# @param redshift_table_columns [Array<String>] column names in table order
# @param hash [#[], nil] the record, looked up by column name
# @param delimiter [String] field delimiter
# @return [String] the row text, or '' when hash is nil/false or no column
#   has a value (the latter is logged as a warning)
def hash_to_table_text(redshift_table_columns, hash, delimiter)
  return '' unless hash

  # extract values from hash
  cells = redshift_table_columns.map do |column|
    value = hash[column]
    value = JSON.generate(value) if value.kind_of?(Hash) || value.kind_of?(Array)
    text = value.to_s
    text.empty? ? nil : text
  end

  if cells.all? { |cell| cell.nil? || cell.empty? }
    $log.warn format_log("no data match for table columns on redshift. data=#{hash} table_columns=#{redshift_table_columns}")
    return ''
  end

  generate_line_with_delimiter(cells, delimiter)
end
246
+
247
# Joins values into one newline-terminated row, backslash-escaping every
# character that would otherwise break the row.
#
# Backslash, tab and newline are always escaped; the active delimiter is
# escaped as well, so values containing it (e.g. a comma when the delimiter
# is ',') no longer split into extra fields.
#
# @param val_list [Array<String, nil>] cell values; nil / '' become empty cells
# @param delimiter [String] field delimiter
# @return [String] the row text including the trailing newline
def generate_line_with_delimiter(val_list, delimiter)
  specials = ["\\", "\t", "\n"]
  unless delimiter.nil? || delimiter.empty? || specials.include?(delimiter)
    specials << delimiter
  end
  pattern = Regexp.union(specials)

  cells = val_list.map do |val|
    if val.nil? || val.empty?
      ''
    else
      # block form: the replacement is taken literally (no \1-style expansion)
      val.gsub(pattern) { |char| "\\#{char}" }
    end
  end
  cells.join(delimiter) + "\n"
end
257
+
258
# Finds an unused S3 object key of the form
# "<path><timestamp>_<NN>.gz", trying NN = 00, 01, ... in order.
#
# @param bucket [#objects] bucket whose objects[key] responds to exists?
# @param path [String] key prefix
# @return [String] the first key for which no object exists yet
def create_s3path(bucket, path)
  now = @utc ? Time.now.utc : Time.now
  timestamp_key = now.strftime(@timestamp_key_format)

  index = 0
  loop do
    # two-digit, zero-padded sequence number
    candidate = "#{path}#{timestamp_key}_#{index.to_s.rjust(2, '0')}.gz"
    return candidate unless bucket.objects[candidate].exists?

    index += 1
  end
end
268
+
269
# Returns (and memoizes) the target table name, qualified with the schema
# when one is configured.
#
# @return [String] "schema.table" or just "table"
def table_name_with_schema
  return @table_name_with_schema if @table_name_with_schema

  @table_name_with_schema =
    @redshift_schemaname ? "#{@redshift_schemaname}.#{@redshift_tablename}" : @redshift_tablename
end
276
+
277
# Executes the given statement on Redshift over a dedicated connection.
#
# PG errors are logged; those matching IGNORE_REDSHIFT_ERROR_REGEXP are
# swallowed, all others are re-raised. The connection is always closed.
#
# @param sql [String] statement to execute
# @return [Object] false when an ignorable error occurred (for debugging);
#   otherwise the result of the final info log call
# @raise [PG::Error] for errors that are not ignorable
def insert_sql_to_redshift(sql)
  $log.debug format_log('start inserting copy command.')
  connection = nil
  begin
    connection = PG.connect(@db_conf)
    connection.exec(sql)
    $log.info format_log('completed inserting to redshift.')
  rescue PG::Error => error
    $log.error format_log('failed to insert copy command into redshift.'), :error=>error.to_s
    raise error if error.to_s !~ IGNORE_REDSHIFT_ERROR_REGEXP

    false # for debug
  ensure
    if connection
      # errors while closing the connection are deliberately ignored
      begin
        connection.close
      rescue StandardError
        nil
      end
    end
  end
end
292
+
293
+ end
294
+
295
+ end