fluent-plugin-datahub 0.0.1

@@ -0,0 +1,73 @@
+ require "json"
+
+ class RecordField
+
+   def initialize(name, type)
+     @name = name
+     @type = type
+   end
+
+   def get_name()
+     return @name
+   end
+
+   def get_type()
+     return @type
+   end
+
+   def to_json(*a)
+     field_map = {}
+     field_map["name"] = @name
+     field_map["type"] = @type
+     return field_map.to_json(*a)
+   end
+
+ end
+
+ class RecordSchema
+   def initialize()
+     @fields = []
+     @encoding = nil
+     @fields_map = {}
+   end
+
+   def setEncoding(encoding)
+     if ["US-ASCII", "ASCII-8BIT", "UTF-8", "ISO-8859-1", "Shift_JIS", "EUC-JP", "Windows-31J", "BINARY", "CP932", "eucJP"].include?(encoding)
+       @encoding = encoding
+     else
+       raise "Unsupported encoding type [" + encoding.to_s + "]."
+     end
+   end
+
+   def get_encoding
+     return @encoding
+   end
+
+   def add_field(field)
+     @fields.push(field)
+     @fields_map[field.get_name] = field
+   end
+
+   # Look up a field by name via the name-to-field map (O(1))
+   def get_field(name)
+     return @fields_map[name]
+   end
+
+   def get_fields()
+     return @fields
+   end
+
+   def to_json(*a)
+     tuple = {}
+     tuple["fields"] = @fields
+     tuple.to_json(*a)
+   end
+ end
+
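A minimal usage sketch of the two classes above (field names here are made up for illustration): build a schema, look up a field, and serialize it.

    schema = RecordSchema.new
    schema.add_field(RecordField.new("uid", "BIGINT"))
    schema.add_field(RecordField.new("name", "STRING"))

    schema.get_field("uid").get_type  # => "BIGINT"
    schema.to_json                    # => JSON along the lines of
                                      #    {"fields":[{"name":"uid","type":"BIGINT"},{"name":"name","type":"STRING"}]}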
@@ -0,0 +1,13 @@
+ require_relative "datahub-http-client"
+ require_relative "datahub-topic"
+
+ class DatahubShard
+   attr_accessor :shard_id
+   attr_accessor :state
+   attr_accessor :begin_key
+   attr_accessor :end_key
+   attr_accessor :right_shard_id
+   attr_accessor :left_shard_id
+   attr_accessor :parent_shard_ids
+   attr_accessor :closed_time
+ end
@@ -0,0 +1,73 @@
+ require "date"
+ require_relative "datahub-http-client"
+ require_relative "datahub-project"
+ require_relative "datahub-shard"
+ require_relative "datahub-put-record-result"
+
+ class DatahubTopic
+   attr_accessor :shard_count
+   attr_accessor :lifecycle
+   attr_accessor :record_type
+   attr_accessor :record_schema
+   attr_accessor :comment
+   attr_accessor :create_time
+   attr_accessor :last_modify_time
+
+   def initialize(datahub_http_client, project_name, topic_name)
+     @client = datahub_http_client
+     @project_name = project_name
+     @topic_name = topic_name
+   end
+
+   def list_shards()
+     result_map = @client.list_shards(@project_name, @topic_name)
+     shard_array = result_map["Shards"]
+
+     shards = []
+     for i in 0...shard_array.size
+       shard = DatahubShard.new
+
+       shard_map = shard_array[i]
+       shard.begin_key = shard_map["BeginKey"]
+       shard.end_key = shard_map["EndKey"]
+       shard.left_shard_id = shard_map["LeftShardId"]
+       shard.parent_shard_ids = shard_map["ParentShardIds"]
+       shard.right_shard_id = shard_map["RightShardId"]
+       shard.shard_id = shard_map["ShardId"]
+       shard.state = shard_map["State"]
+
+       shards.push(shard)
+     end
+
+     return shards
+   end
+
+   # Default offset is the current time in epoch milliseconds
+   def get_cursor(shard_id, offset=DateTime.now.strftime('%Q'), type="OLDEST")
+     result_map = @client.get_shard_cursor(@project_name, @topic_name, shard_id, offset, type)
+     return result_map["Cursor"]
+   end
+
+   # Write records to the topic; any failures are collected into a PutRecordResult
+   def write_data(record_entities)
+     put_record_result = PutRecordResult.new
+     result_map = @client.write_data_to_topic(@project_name, @topic_name, record_entities)
+
+     if result_map["FailedRecordCount"] > 0
+       put_record_result.failed_record_count = result_map["FailedRecordCount"]
+       for i in 0...result_map["FailedRecords"].size
+         result_error = result_map["FailedRecords"][i]
+         put_record_result.failed_record_index.push(result_error["Index"])
+         error_entity = {}
+         error_entity["error_code"] = result_error["ErrorCode"]
+         error_entity["error_message"] = result_error["ErrorMessage"]
+         put_record_result.failed_record_error.push(error_entity)
+         put_record_result.failed_record_list.push(record_entities[result_error["Index"]])
+       end
+     end
+     return put_record_result
+   end
+
+   def read_data(shard_id, cursor, count)
+     @client.read_data_from_shard_with_cursor(@project_name, @topic_name, shard_id, cursor, count)
+   end
+
+ end
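A hedged end-to-end sketch of this topic API, reusing the DatahubClient construction and the get_project/get_topic calls that appear in the plugin below (endpoint, credentials, and names are placeholders):

    client = DatahubClient.new("https://dh-endpoint.example.com", "access-id", "access-key")
    topic  = client.get_project("some_project").get_topic("some_topic")

    shard  = topic.list_shards.find { |s| s.state == "ACTIVE" }
    cursor = topic.get_cursor(shard.shard_id)         # defaults to type "OLDEST"
    puts topic.read_data(shard.shard_id, cursor, 10)  # read up to 10 records from that cursor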
@@ -0,0 +1,402 @@
+ require "fileutils"
+ require_relative "datahub/datahub-client"
+
+ module Fluent
+   class DatahubOutput < BufferedOutput
+     Fluent::Plugin.register_output('datahub', self)
+
+     # datahub access id
+     config_param :access_id, :string
+
+     # datahub access key
+     config_param :access_key, :string
+
+     # datahub service endpoint
+     config_param :endpoint, :string
+
+     # datahub project name
+     config_param :project_name, :string
+
+     # datahub topic name
+     config_param :topic_name, :string
+
+     # Number of retries (the default of -1 disables retrying)
+     config_param :retry_times, :integer, :default => -1
+
+     # Retry interval: seconds to wait before the next attempt
+     config_param :retry_interval, :integer, :default => 3
+
+     # Columns to write: a subset or all of the topic's columns.
+     # Defaults to an empty array, meaning all fields in the topic's schema order.
+     # The configured order need not match the schema, but every column must exist in it.
+     config_param :column_names, :array, :default => []
+
+     # Keys used to read values from the source record before writing to DataHub.
+     # Defaults to an empty array, in which case column_names double as the source keys.
+     config_param :source_keys, :array, :default => []
+
+     # Whether to keep writing when dirty data is encountered.
+     # When enabled, dirty_data_file must be set.
+     config_param :dirty_data_continue, :bool, :default => false
+
+     # Path of the dirty-data file; required when dirty_data_continue is enabled.
+     # Note: the file is split into .part1 and .part2 -
+     # .part1 holds the older dirty data, .part2 the newer.
+     config_param :dirty_data_file, :string, :default => ""
+
+     # Maximum size of the dirty-data file; currently only a soft reference value
+     config_param :dirty_data_file_max_size, :integer, :required => false, :default => 50024000
+
+     # Write to this specific shard_id only
+     config_param :shard_id, :string, :required => false, :default => ""
+
+     # Hash the values of these fields; the hash decides which shard a record lands on
+     config_param :shard_keys, :array, :required => false, :default => []
+
+     # fluentd's built-in retry count; defaults to 0 here because
+     # fluentd-level retries could rewrite (duplicate) data
+     config_param :retry_limit, :integer, :default => 0
+
+     # Records per write to DataHub; defaults to 100,
+     # and a single request must not exceed 3 MB
+     config_param :put_data_batch_size, :integer, :default => 100
+
+     # Encoding of the record data (validated by RecordSchema#setEncoding)
+     config_param :data_encoding, :string, :default => nil
+
+     # Internal use only, not a config option:
+     # round-robin cursor for dispatching records across shards
+     attr_accessor :shard_cursor
+
+     # Lock serializing writes to the dirty-data file
+     @@file_lock = Mutex.new
+
+     def configure(conf)
+       super
+       @client = DatahubClient.new(@endpoint, @access_id, @access_key)
+       @datahub_project = @client.get_project(@project_name)
+       @datahub_topic = @datahub_project.get_topic(@topic_name)
+
+       @shards = get_active_shard
+       @shard_count = @shards.size
+
+       @logger = log
+       @shard_cursor = 0
+
+       # Hard cap: a single put to DataHub may not exceed 3000 records
+       @put_data_max_size = 3000
+
+       @target_source_column_map = {}
+
+       # Validate parameters up front
+       check_params
+     end
+
+     def check_params
+       schema = @datahub_topic.record_schema
+       if @data_encoding != nil
+         schema.setEncoding(@data_encoding)
+       end
+
+       fields = schema.get_fields
+
+       # Ensure every configured column exists in the topic schema
+       if @column_names.size > 0
+         for i in 0...@column_names.size do
+           column_name = @column_names[i]
+           column_index = find_column_index(fields, column_name)
+           if column_index == -1
+             @logger.error "Column: " + column_name + " not found, please check your config"
+             raise "Column: " + column_name + " not found, please check your config"
+           end
+         end
+       end
+
+       # With no explicit source_keys, the column names double as source keys
+       if @source_keys.size == 0
+         @source_keys = @column_names
+       end
+
+       if @source_keys.size > 0 and @column_names.size != @source_keys.size
+         @logger.error "source_keys' size must equal column_names' size, please check your config"
+         raise "source_keys' size must equal column_names' size, please check your config"
+       else
+         for i in 0...@column_names.size do
+           @target_source_column_map[@column_names[i]] = @source_keys[i]
+         end
+       end
+
+       if @shard_count < 1
+         raise "there must be at least 1 active shard!"
+       end
+
+       # dirty_data_continue requires a dirty-data file
+       if @dirty_data_continue
+         if @dirty_data_file.to_s.chomp.length == 0
+           raise "Dirty data file path can not be empty"
+         end
+       end
+
+       # Ensure every shard key exists in the schema
+       if @shard_keys.size > 0
+         for i in 0...@shard_keys.size
+           shard_key = @shard_keys[i]
+           shard_key_index = find_column_index(fields, shard_key)
+           if shard_key_index == -1
+             @logger.error "Shard key: " + shard_key + " not found in schema, please check your config"
+             raise "Shard key: " + shard_key + " not found in schema, please check your config"
+           end
+         end
+       end
+     end
+
+     # Find the real index of a column in the topic schema.
+     # Returns -1 if the column is not found.
+     def find_column_index(fields, column_name)
+       for i in 0...fields.size do
+         name = fields[i].get_name
+         if name == column_name
+           return i
+         end
+       end
+       return -1
+     end
+
+     def start
+       super
+     end
+
+     def shutdown
+       super
+     end
+
+     # Buffer events as msgpack; write() decodes them with msgpack_each
+     def format(tag, time, record)
+       [tag, time, record].to_msgpack
+     end
+
+     def write(chunk)
+       record_entities = []
+       schema = @datahub_topic.record_schema
+
+       chunk.msgpack_each do |tag, time, record|
+         entity = RecordEntity.new(schema)
+         convert_success = record_to_entity(entity, record)
+         if convert_success
+           entity.set_shard_id(get_shard_id(record))
+           record_entities.push(entity)
+         end
+         # Flush once the configured batch size (or the hard 3000-record cap) is reached
+         if record_entities.size >= @put_data_batch_size or record_entities.size >= @put_data_max_size
+           write_data_with_retry(record_entities)
+           record_entities.clear
+         end
+       end
+
+       # Flush whatever is left over
+       if record_entities.size > 0
+         write_data_with_retry(record_entities)
+       end
+     end
+
+     # Write to DataHub, retrying up to @retry_times
+     def write_data_with_retry(record_entities)
+       tmp_retry_times = @retry_times
+       put_result = nil
+       while true
+         begin
+           put_result = @datahub_topic.write_data(record_entities)
+         rescue => e
+           @logger.warn "Put " + record_entities.size.to_s + " records to datahub failed, message = " + e.message
+           if tmp_retry_times > 0
+             sleep @retry_interval
+             @logger.warn "Now retry(" + (@retry_times - tmp_retry_times + 1).to_s + ")..."
+             tmp_retry_times -= 1
+             next
+           else
+             if !@dirty_data_continue
+               @logger.error "Dirty data found, exit process now."
+               puts "Dirty data found, exit process now."
+               raise "try to exit!"
+             else
+               # Retries disabled or exhausted: dump the whole batch to the dirty-data file
+               for i in 0...record_entities.size
+                 record_entity = record_entities[i]
+                 @logger.error "Put record: " + record_entity.get_columns_map.to_s + " failed, " + e.message
+                 write_as_dirty_data(record_entity.get_columns_map)
+               end
+               break
+             end
+           end
+         end
+
+         if put_result != nil and put_result.failed_record_count > 0
+           if tmp_retry_times > 0
+             # Retry only the failed records, honoring retry_times
+             @logger.warn "Put " + put_result.failed_record_count.to_s + " records to datahub failed, total " + record_entities.size.to_s
+             sleep @retry_interval
+             @logger.warn "Now retry(" + (@retry_times - tmp_retry_times + 1).to_s + ")..."
+             tmp_retry_times -= 1
+             record_entities = put_result.failed_record_list
+
+             # When writing round-robin and a shard has gone inactive
+             # (error_code == "InvalidShardOperation"), refresh the shard list once
+             # and re-dispatch the failed records
+             fresh_shard_flag = false
+             if @shard_id.empty? and @shard_keys.size == 0
+               for i in 0...put_result.failed_record_count
+                 error_entity = put_result.failed_record_error[i]
+                 if error_entity["error_code"] == "InvalidShardOperation"
+                   unless fresh_shard_flag
+                     @shards = get_active_shard
+                     @shard_count = @shards.size
+                     fresh_shard_flag = true
+                   end
+                   # Round-robin mode, so get_shard_id ignores its argument here
+                   record_entities[i].set_shard_id(get_shard_id(record_entities[i]))
+                 end
+               end
+             end
+           else
+             if !@dirty_data_continue
+               @logger.error "Dirty data found, exit process now."
+               puts "Dirty data found, exit process now."
+               raise "try to exit!"
+             else
+               # Retries disabled or exhausted: dump the failed records to the dirty-data file
+               for i in 0...put_result.failed_record_count
+                 record_entity = put_result.failed_record_list[i]
+                 @logger.error "Put record: " + record_entity.get_columns_map.to_s + " failed, " + put_result.failed_record_error[i].to_s
+                 write_as_dirty_data(record_entity.get_columns_map)
+               end
+               break
+             end
+           end
+         else
+           @logger.info "Put data to datahub success, total " + record_entities.size.to_s
+           break
+         end
+       end
+     end
+
+     # Convert a fluentd record into a DataHub record entity.
+     # Returns false (after dirty-data handling) if any column fails to convert.
+     def record_to_entity(entity, record)
+       schema = entity.get_schema
+       @column_names.each do |column|
+         begin
+           source_key = @target_source_column_map[column]
+           if record.has_key?(source_key)
+             field = schema.get_field(column)
+             if field == nil
+               raise "Unknown column name of data"
+             else
+               field_type = field.get_type
+               if field_type == "BIGINT"
+                 entity.setBigInt(column, record[source_key])
+               elsif field_type == "DOUBLE"
+                 entity.setDouble(column, record[source_key])
+               elsif field_type == "BOOLEAN"
+                 entity.setBoolean(column, record[source_key])
+               elsif field_type == "STRING"
+                 entity.setString(column, record[source_key])
+               elsif field_type == "TIMESTAMP"
+                 entity.setTimeStamp(column, record[source_key])
+               else
+                 raise "Unknown schema type of data"
+               end
+             end
+           end
+         rescue => e
+           @logger.error "Parse data: " + column + "[" + record[source_key].to_s + "] failed, " + e.message
+           if !@dirty_data_continue
+             @logger.error "Dirty data found, exit process now."
+             puts "Dirty data found, exit process now."
+             raise "try to exit!"
+           else
+             # Tolerated dirty data goes straight to the dirty-data file
+             write_as_dirty_data(record)
+           end
+           return false
+         end
+       end
+       return true
+     end
+
+     # Dirty-data file handling: append the record as JSON under the file lock,
+     # rotating .part2 (newer data) into .part1 (older data) once .part2
+     # exceeds half of dirty_data_file_max_size
+     def write_as_dirty_data(record)
+       dirty_file_part1_name = @dirty_data_file + ".part1"
+       dirty_file_part2_name = @dirty_data_file + ".part2"
+
+       @@file_lock.synchronize {
+         dirty_file_part2 = File.open(dirty_file_part2_name, "a+")
+         dirty_file_part2.puts(record.to_json)
+         dirty_file_part2.close
+         if File.size(dirty_file_part2_name) > @dirty_data_file_max_size / 2
+           FileUtils.mv(dirty_file_part2_name, dirty_file_part1_name)
+         end
+       }
+     end
+
+     # Pick the shard_id to write to: a fixed shard if shard_id is set,
+     # a hash of the shard_keys' values if shard_keys is set,
+     # otherwise round-robin over the active shards
+     def get_shard_id(record)
+       if @shard_id != nil and !@shard_id.empty?
+         return @shard_id
+       elsif @shard_keys != nil and @shard_keys.size > 0
+         # Hash dispatch
+         hash_string = ""
+         for i in 0...@shard_keys.size
+           shard_key = @shard_keys[i]
+           source_key = @target_source_column_map[shard_key]
+           if record[source_key] != nil
+             hash_string += record[source_key].to_s + ","
+           end
+         end
+         hashed_value = hash_code(hash_string)
+         index = hashed_value % @shard_count
+         return @shards[index].shard_id
+       else
+         # Round-robin dispatch
+         idx = @shard_cursor % @shard_count
+         @shard_cursor = idx + 1
+         return @shards[idx].shard_id
+       end
+     end
+
+     # Produce the same hash code as Java's String.hashCode(),
+     # wrapping to a signed 32-bit integer at every step;
+     # e.g. hash_code("abc") == 96354, matching "abc".hashCode() in Java
+     def hash_code(str)
+       str.each_char.reduce(0) do |result, char|
+         [((result << 5) - result) + char.ord].pack('L').unpack('l').first
+       end
+     end
+
+     # Fetch the shards currently in ACTIVE state
+     def get_active_shard
+       all_shards = @datahub_topic.list_shards
+       active_shards = []
+       all_shards.each do |shard|
+         if shard.state == "ACTIVE"
+           active_shards.push(shard)
+         end
+       end
+       return active_shards
+     end
+
+   end
+ end
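A hedged fluentd configuration sketch for this output plugin (every value is a placeholder; column_names and source_keys must match your topic schema and record keys, as enforced by check_params above):

    <match datahub.**>
      type datahub
      endpoint https://dh-endpoint.example.com
      access_id your-access-id
      access_key your-access-key
      project_name your_project
      topic_name your_topic
      column_names ["uid", "name"]
      source_keys ["uid", "user_name"]
      shard_keys ["uid"]
      retry_times 3
      retry_interval 3
      dirty_data_continue true
      dirty_data_file /var/log/td-agent/datahub.dirty
    </match>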