fluent-plugin-datahub 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,73 @@
+ require "json"
+
+ class RecordField
+
+   def initialize(name, type)
+     @name = name
+     @type = type
+   end
+
+   def get_name()
+     return @name
+   end
+
+   def get_type()
+     return @type
+   end
+
+   def to_json(*a)
+     field_map = {}
+     field_map["name"] = @name
+     field_map["type"] = @type
+     return field_map.to_json(*a)
+   end
+
+ end
+
+ class RecordSchema
+   def initialize()
+     @fields = []
+
+     @encoding = nil
+
+     @fields_map = {}
+   end
+
+   def setEncoding(encoding)
+     if ["US-ASCII", "ASCII-8BIT", "UTF-8", "ISO-8859-1", "Shift_JIS", "EUC-JP", "Windows-31J", "BINARY", "CP932", "eucJP"].include?(encoding)
+       @encoding = encoding
+     else
+       raise "Unsupported encoding type [" + encoding.to_s + "]."
+     end
+   end
+
+   def get_encoding
+     return @encoding
+   end
+
+   def add_field(field)
+     @fields.push(field)
+     @fields_map[field.get_name] = field
+   end
+
+   def get_field(name)
+     # Constant-time lookup via the name-to-field map maintained by add_field
+     return @fields_map[name]
+   end
+
+   def get_fields()
+     return @fields
+   end
+
+   def to_json(*a)
+     tuple = {}
+     tuple["fields"] = @fields
+     tuple.to_json(*a)
+   end
+ end
+
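
For orientation, a minimal usage sketch of the two classes above; the field names and types here are invented placeholders:

    schema = RecordSchema.new
    schema.add_field(RecordField.new("uid", "BIGINT"))
    schema.add_field(RecordField.new("msg", "STRING"))

    schema.get_field("uid").get_type  # => "BIGINT"
    puts schema.to_json               # => {"fields":[{"name":"uid","type":"BIGINT"},{"name":"msg","type":"STRING"}]}
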
@@ -0,0 +1,13 @@
+ require_relative "datahub-http-client"
+ require_relative "datahub-topic"
+
+ class DatahubShard
+   attr_accessor :shard_id
+   attr_accessor :state
+   attr_accessor :begin_key
+   attr_accessor :end_key
+   attr_accessor :right_shard_id
+   attr_accessor :left_shard_id
+   attr_accessor :parent_shard_ids
+   attr_accessor :closed_time
+ end
@@ -0,0 +1,73 @@
+ require "date"
+ require_relative "datahub-http-client"
+ require_relative "datahub-project"
+ require_relative "datahub-shard"
+ require_relative "datahub-put-record-result"
+
+ class DatahubTopic
+   attr_accessor :shard_count
+   attr_accessor :lifecycle
+   attr_accessor :record_type
+   attr_accessor :record_schema
+   attr_accessor :comment
+   attr_accessor :create_time
+   attr_accessor :last_modify_time
+
+   def initialize(datahub_http_client, project_name, topic_name)
+     @client = datahub_http_client
+     @project_name = project_name
+     @topic_name = topic_name
+   end
+
+   def list_shards()
+     result_map = @client.list_shards(@project_name, @topic_name)
+     shard_array = result_map["Shards"]
+
+     shards = []
+
+     for i in 0...shard_array.size
+       shard = DatahubShard.new
+
+       shard_map = shard_array[i]
+       shard.begin_key = shard_map["BeginKey"]
+       shard.end_key = shard_map["EndKey"]
+       shard.left_shard_id = shard_map["LeftShardId"]
+       shard.parent_shard_ids = shard_map["ParentShardIds"]
+       shard.right_shard_id = shard_map["RightShardId"]
+       shard.shard_id = shard_map["ShardId"]
+       shard.state = shard_map["State"]
+
+       shards.push(shard)
+     end
+
+     return shards
+   end
+
+   # Fetch a cursor for the shard; the offset defaults to the current time in
+   # epoch milliseconds and the cursor type to "OLDEST".
+   def get_cursor(shard_id, offset=DateTime.now.strftime('%Q'), type="OLDEST")
+     result_map = @client.get_shard_cursor(@project_name, @topic_name, shard_id, offset, type)
+     return result_map["Cursor"]
+   end
+
+   def write_data(record_entities)
+     put_record_result = PutRecordResult.new
+     result_map = @client.write_data_to_topic(@project_name, @topic_name, record_entities)
+
+     if result_map["FailedRecordCount"] > 0
+       put_record_result.failed_record_count = result_map["FailedRecordCount"]
+       for i in 0...result_map["FailedRecords"].size
+         result_error = result_map["FailedRecords"][i]
+         put_record_result.failed_record_index.push(result_error["Index"])
+         error_entity = {}
+         error_entity["error_code"] = result_error["ErrorCode"]
+         error_entity["error_message"] = result_error["ErrorMessage"]
+         put_record_result.failed_record_error.push(error_entity)
+         put_record_result.failed_record_list.push(record_entities[result_error["Index"]])
+       end
+     end
+     return put_record_result
+   end
+
+   def read_data(shard_id, cursor, count)
+     @client.read_data_from_shard_with_cursor(@project_name, @topic_name, shard_id, cursor, count)
+   end
+
+ end
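
A hedged end-to-end sketch of DatahubTopic, assuming the DatahubClient construction used by the output plugin below; the endpoint, credentials, and names are placeholders:

    client = DatahubClient.new("http://dh-cn-hangzhou.aliyuncs.com", "your_access_id", "your_access_key")
    topic  = client.get_project("your_project").get_topic("your_topic")

    shard  = topic.list_shards.find { |s| s.state == "ACTIVE" }
    cursor = topic.get_cursor(shard.shard_id)              # defaults: current time, "OLDEST"
    data   = topic.read_data(shard.shard_id, cursor, 10)   # raw result map from the client

    result = topic.write_data(record_entities)             # record_entities: an array of RecordEntity
    failed = result.failed_record_count.to_i               # may be unset when nothing failed
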
@@ -0,0 +1,402 @@
+ require "fileutils"
+
+ require_relative "datahub/datahub-client"
+
+ module Fluent
+   class DatahubOutput < BufferedOutput
+     Fluent::Plugin.register_output('datahub', self)
+
+     # datahub access id
+     config_param :access_id, :string
+
+     # datahub access key
+     config_param :access_key, :string
+
+     # datahub service endpoint
+     config_param :endpoint, :string
+
+     # datahub project name
+     config_param :project_name, :string
+
+     # datahub topic name
+     config_param :topic_name, :string
+
+     # Number of retries; -1 means retry without limit
+     config_param :retry_times, :integer, :default => -1
+
+     # Retry interval, i.e. the delay before the next retry, in seconds
+     config_param :retry_interval, :integer, :default => 3
+
+     # Columns to submit: either all of the topic's columns or a subset.
+     # Defaults to an empty array, meaning all fields are submitted in topic order.
+     # The configured columns need not preserve the topic's order, but each one
+     # must exist in the topic's schema.
+     config_param :column_names, :array, :default => []
+
+     # Source keys to collect from: each record is read by these keys and written to datahub.
+     # Defaults to an empty array, in which case column_names is used to read the record.
+     config_param :source_keys, :array, :default => []
+
+     # Whether to keep writing when dirty data is encountered.
+     # When enabled, dirty_data_file must be specified.
+     config_param :dirty_data_continue, :bool, :default => false
+
+     # Dirty data file name; required when dirty_data_continue is enabled.
+     # Note: the dirty data file is split into two parts, .part1 and .part2;
+     # part1 holds the older dirty data and part2 the newer.
+     config_param :dirty_data_file, :string, :default => ""
+
+     # Maximum size of the dirty data file; keeps the file below this bound.
+     # Currently the value is only advisory.
+     config_param :dirty_data_file_max_size, :integer, :required => false, :default => 50024000
+
+     # Write to the specified shard_id
+     config_param :shard_id, :string, :required => false, :default => ""
+
+     # Hash the values of the given fields and route each record to a shard by that hash
+     config_param :shard_keys, :array, :required => false, :default => []
+
+     # fluentd's built-in retry count; defaults to 0 here because retrying may rewrite data
+     config_param :retry_limit, :integer, :default => 0
+
+     # How many records to write to datahub per request; defaults to 100,
+     # and a single request must not exceed 3MB
+     config_param :put_data_batch_size, :integer, :default => 100
+
+     config_param :data_encoding, :string, :default => nil
+
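+     # For illustration only (not in the original source): a hypothetical fluentd
+     # match section wiring up the parameters above; every value below is a
+     # placeholder.
+     #
+     #   <match your.tag.**>
+     #     @type datahub
+     #     access_id    your_access_id
+     #     access_key   your_access_key
+     #     endpoint     http://dh-cn-hangzhou.aliyuncs.com
+     #     project_name your_project
+     #     topic_name   your_topic
+     #     column_names ["uid", "msg"]
+     #     shard_keys   ["uid"]
+     #     dirty_data_continue true
+     #     dirty_data_file     /tmp/datahub_dirty.data
+     #   </match>
+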
+     # Used internally, not exposed as configuration:
+     # the cursor for round-robin shard dispatch
+     attr_accessor :shard_cursor
+
+     # Lock guarding dirty-data file writes
+     @@file_lock = Mutex.new
+
+     def configure(conf)
+       super
+       @client = DatahubClient.new(@endpoint, @access_id, @access_key)
+       @datahub_project = @client.get_project(@project_name)
+       @datahub_topic = @datahub_project.get_topic(@topic_name)
+
+       @shards = get_active_shard
+       @shard_count = @shards.size
+
+       @logger = log
+       @shard_cursor = 0
+
+       # A single put to datahub is capped at 3000 records
+       @put_data_max_size = 3000
+
+       @target_source_column_map = {}
+
+       # Validate the parameters up front
+       check_params
+     end
+
+     def check_params
+       schema = @datahub_topic.record_schema
+       if @data_encoding != nil
+         schema.setEncoding(@data_encoding)
+       end
+
+       fields = schema.get_fields
+
+       # Ensure every configured column exists in the topic
+       if @column_names.size > 0
+         for i in 0...@column_names.size do
+           column_name = @column_names[i]
+           column_index = find_column_index(fields, column_name)
+           if column_index == -1
+             @logger.error "Column: " + column_name + " not found, please check your config"
+             raise "Column: " + column_name + " not found, please check your config"
+           end
+         end
+       end
+
+       if @source_keys.size == 0
+         @source_keys = @column_names
+       end
+
+       if @source_keys.size > 0 and @column_names.size != @source_keys.size
+         @logger.error "source_keys's size must be equal to column_names's size, please check your config"
+         raise "source_keys's size must be equal to column_names's size, please check your config"
+       else
+         for i in 0...@column_names.size do
+           @target_source_column_map[@column_names[i]] = @source_keys[i]
+         end
+       end
+
+       if @shard_count < 1
+         raise "there must be at least 1 active shard!"
+       end
+
+       # When dirty_data_continue is set, a dirty data file must be given
+       if @dirty_data_continue
+         if @dirty_data_file.to_s.chomp.length == 0
+           raise "Dirty data file path can not be empty"
+         end
+       end
+
+       # Check that every shard key exists in the schema
+       if @shard_keys.size > 0
+         for i in 0...@shard_keys.size
+           shard_key = @shard_keys[i]
+           shard_key_index = find_column_index(fields, shard_key)
+           if shard_key_index == -1
+             @logger.error "Shard key: " + shard_key + " not found in schema, please check your config"
+             raise "Shard key: " + shard_key + " not found in schema, please check your config"
+           end
+         end
+       end
+     end
+
+     # Find the real index of a column in the topic's schema;
+     # returns -1 if not found
+     def find_column_index(fields, column_name)
+       for i in 0...fields.size do
+         name = fields[i].get_name
+         if name == column_name
+           return i
+         end
+       end
+       return -1
+     end
+
+     def start
+       super
+     end
+
+     def shutdown
+       super
+     end
+
+     # Serialize events as msgpack; write below consumes them with msgpack_each.
+     # (The original source also defined a JSON-based format here, which this
+     # definition silently shadowed; the dead duplicate has been removed.)
+     def format(tag, time, record)
+       [tag, time, record].to_msgpack
+     end
+
+     def write(chunk)
+       record_entities = []
+       schema = @datahub_topic.record_schema
+
+       chunk.msgpack_each do |tag, time, record|
+         entity = RecordEntity.new(schema)
+         convert_success = record_to_entity(entity, record)
+         entity.set_shard_id(get_shard_id(record))
+         if convert_success
+           record_entities.push(entity)
+         end
+         # Flush a batch once it reaches the hard cap or the configured batch size
+         if record_entities.size >= @put_data_max_size
+           write_data_with_retry(record_entities)
+           record_entities.clear
+         elsif record_entities.size >= @put_data_batch_size
+           write_data_with_retry(record_entities)
+           record_entities.clear
+         end
+       end
+
+       if record_entities.size > 0
+         write_data_with_retry(record_entities)
+       end
+     end
+
+     # Write data to datahub, retrying up to retry_times
+     def write_data_with_retry(record_entities)
+       tmp_retry_times = @retry_times
+       put_result = nil
+       while true
+         begin
+           put_result = @datahub_topic.write_data(record_entities)
+         rescue => e
+           @logger.warn "Put records to datahub failed, total " + record_entities.size.to_s + ", message = " + e.message
+           if tmp_retry_times > 0
+             sleep @retry_interval
+             @logger.warn "Now retry(" + (@retry_times - tmp_retry_times + 1).to_s + ")..."
+             tmp_retry_times -= 1
+             next
+           else
+             if !@dirty_data_continue
+               @logger.error "Dirty data found, exit process now."
+               puts "Dirty data found, exit process now."
+               raise "try to exit!"
+             else
+               # No retry / retries exhausted: write the records to the dirty data file.
+               # The whole request failed, so log the exception message rather than
+               # per-record errors (put_result is nil on this path).
+               for i in 0...record_entities.size
+                 record_entity = record_entities[i]
+                 @logger.error "Put record: " + record_entity.get_columns_map.to_s + " failed, " + e.message
+                 write_as_dirty_data(record_entity.get_columns_map)
+               end
+               break
+             end
+           end
+         end
+
+         if put_result != nil and put_result.failed_record_count > 0
+           if tmp_retry_times > 0
+             # Retry according to retry_times
+             @logger.warn "Put " + put_result.failed_record_count.to_s + " records to datahub failed, total " + record_entities.size.to_s
+             sleep @retry_interval
+             @logger.warn "Now retry(" + (@retry_times - tmp_retry_times + 1).to_s + ")..."
+             tmp_retry_times -= 1
+             record_entities = put_result.failed_record_list
+
+             # Under round-robin dispatch, if a shard is no longer active
+             # (error_code == "InvalidShardOperation"), refresh the shard list
+             fresh_shard_flag = false
+             if @shard_id.empty? and @shard_keys.size == 0
+               for i in 0...put_result.failed_record_count
+                 error_entity = put_result.failed_record_error[i]
+                 if error_entity["error_code"] == "InvalidShardOperation"
+                   unless fresh_shard_flag
+                     @shards = get_active_shard
+                     @shard_count = @shards.size
+                     fresh_shard_flag = true
+                   end
+                   record_entities[i].set_shard_id(get_shard_id(record_entities[i]))
+                 end
+               end
+             end
+           else
+             if !@dirty_data_continue
+               @logger.error "Dirty data found, exit process now."
+               puts "Dirty data found, exit process now."
+               raise "try to exit!"
+             else
+               # No retry / retries exhausted: write failed records to the dirty data file
+               for i in 0...put_result.failed_record_count
+                 record_entity = put_result.failed_record_list[i]
+                 @logger.error "Put record: " + record_entity.get_columns_map.to_s + " failed, " + put_result.failed_record_error[i].to_s
+                 write_as_dirty_data(record_entity.get_columns_map)
+               end
+               break
+             end
+           end
+         else
+           @logger.info "Put data to datahub success, total " + record_entities.size.to_s
+           break
+         end
+       end
+     end
+
+     # Convert a record to a RecordEntity
+     def record_to_entity(entity, record)
+       schema = entity.get_schema
+       @column_names.each do |column|
+         begin
+           source_key = @target_source_column_map[column]
+           if record.has_key?(source_key)
+             field = schema.get_field(column)
+             if field == nil
+               raise "Unknown column name of data"
+             else
+               field_type = field.get_type
+               if field_type == "BIGINT"
+                 entity.setBigInt(column, record[source_key])
+               elsif field_type == "DOUBLE"
+                 entity.setDouble(column, record[source_key])
+               elsif field_type == "BOOLEAN"
+                 entity.setBoolean(column, record[source_key])
+               elsif field_type == "STRING"
+                 entity.setString(column, record[source_key])
+               elsif field_type == "TIMESTAMP"
+                 entity.setTimeStamp(column, record[source_key])
+               else
+                 raise "Unknown schema type of data"
+               end
+             end
+           end
+         rescue => e
+           @logger.error "Parse data: " + column + "[" + record[source_key].to_s + "] failed, " + e.message
+           if !@dirty_data_continue
+             @logger.error "Dirty data found, exit process now."
+             puts "Dirty data found, exit process now."
+             raise "try to exit!"
+           else
+             # Skipped bad data goes straight to the dirty data file
+             write_as_dirty_data(record)
+           end
+           return false
+         end
+       end
+       return true
+     end
+
+
+     # Dirty data file handling
+     def write_as_dirty_data(record)
+       dirty_file_part1_name = @dirty_data_file + ".part1"
+       dirty_file_part2_name = @dirty_data_file + ".part2"
+
+       # Write under the file lock
+       @@file_lock.synchronize {
+         dirty_file_part2 = File.open(dirty_file_part2_name, "a+")
+         dirty_file_part2.puts(record.to_json)
+         dirty_file_part2.close
+         if File.size(dirty_file_part2_name) > @dirty_data_file_max_size / 2
+           # .part1 and .part2 hold the data separately:
+           # older data rolls into part1, new data goes to part2
+           FileUtils.mv(dirty_file_part2_name, dirty_file_part1_name)
+         end
+       }
+     end
+
+     # Pick the shard_id to write to
+     def get_shard_id(record)
+       if @shard_id != nil and !@shard_id.empty?
+         return @shard_id
+       elsif @shard_keys != nil and @shard_keys.size > 0
+         # Hash dispatch: route by the hash of the shard key values
+         hash_string = ""
+         for i in 0...@shard_keys.size
+           shard_key = @shard_keys[i]
+           source_key = @target_source_column_map[shard_key]
+           if record[source_key] != nil
+             hash_string += record[source_key].to_s + ","
+           end
+         end
+         hashed_value = hash_code(hash_string)
+         # Ruby's % is non-negative for a positive divisor, so this is a
+         # valid index even when hashed_value is negative
+         index = hashed_value % @shard_count
+         return @shards[index].shard_id
+       else
+         # Round-robin dispatch
+         idx = @shard_cursor % @shard_count
+         @shard_cursor = idx + 1
+         shard_id = @shards[idx].shard_id
+         return shard_id
+       end
+     end
+
+     # Produce the same hashcode as Java's String.hashCode,
+     # wrapping to signed 32 bits at each step
+     def hash_code(str)
+       str.each_char.reduce(0) do |result, char|
+         [((result << 5) - result) + char.ord].pack('L').unpack('l').first
+       end
+     end
+
+     # Fetch the shards in ACTIVE state
+     def get_active_shard
+       all_shards = @datahub_topic.list_shards
+       active_shards = []
+       all_shards.each do |shard|
+         if shard.state == "ACTIVE"
+           active_shards.push(shard)
+         end
+       end
+
+       return active_shards
+     end
+
+   end
+ end
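
The Java-compatible hash above can be checked in isolation; this standalone sketch copies hash_code verbatim (96354 is what Java's "abc".hashCode() returns):

    def hash_code(str)
      str.each_char.reduce(0) do |result, char|
        [((result << 5) - result) + char.ord].pack('L').unpack('l').first
      end
    end

    puts hash_code("abc")      # => 96354, matching Java
    puts hash_code("abc") % 4  # => 2, a valid shard index; Ruby's % never goes negative here
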