logstash-output-datahub 1.0.0 → 1.0.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
-   metadata.gz: 41b554cbbdc8d2ea64dd05f8284660a499ce1346
-   data.tar.gz: df5877d7d4039a857c1fcaa2e5ba195ca7d0e7e7
+   metadata.gz: e846682c134462b56c1ee792a34143797b6b34ce
+   data.tar.gz: a36cd441f580ffd6763fc41cae69de4c7924f05b
  SHA512:
-   metadata.gz: c7fa8a628c8953db4af20fc7938879884484d092248c3e6bbbf476f785bec4509b305c55428f4b25b2603e572a10013143a35feb215248b17f4d8232a285fdf6
-   data.tar.gz: 59a9bb7aea19acea1f829b1891f9702141368764d2351cada3ecdb9e7471f1df82340ba5ba41f97b84ad31a36ff182479d753bff02bd41b4fcf1b6648824c99f
+   metadata.gz: eb309d8008b270ac7a8b485c57d48e2d85e3e548ec4d8d963fa4b318b7250b54f07a4cc26ce4ef67a18b80dda43ca4f589282765b2ab87de333093cc1189205c
+   data.tar.gz: b48d2a40c2677e0c262383765970fe63396bff623bd94449f95b1ebf04dbc6a0a5a902af4ccca2c01ebabafad1d5e9bd38efec8344ffdf0a9e73320b85f7f09b
data/README.md CHANGED
@@ -71,8 +71,6 @@ output {
      topic_name => ""
      #shard_id => "0"
      #shard_keys => ["thread_id"]
-     batch_size => 10
-     batch_timeout => 5
      dirty_data_continue => true
      dirty_data_file => "/Users/ph0ly/trash/dirty.data"
      dirty_data_file_max_size => 1000
@@ -89,8 +87,6 @@ project_name(Required): DataHub project name
  topic_name(Required): DataHub topic name
  retry_times(Optional): number of retries; -1 retries forever, 0 never retries, >0 retries the given number of times
  retry_interval(Optional): interval before the next retry, in seconds
- batch_size(Optional): batch commit size; a commit is triggered once @batch_size records have accumulated, default 100
- batch_timeout(Optional): batch commit timeout; when little data is arriving, a commit is triggered after the timeout, default 5 seconds
  shard_keys(Optional): array type; names of the fields used to route data to shards. The plugin hashes the values of these fields and uses the hash to pick a shard for each record. Note: if neither shard_keys nor shard_id is specified, shards are written in round-robin order by default
  shard_id(Optional): write all data to the specified shard. Note: if neither shard_keys nor shard_id is specified, shards are written in round-robin order by default
  dirty_data_continue(Optional): whether to keep running when dirty data is encountered, default false. If true, dirty records are skipped and processing continues. When enabled, @dirty_data_file must be specified
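For reference, a minimal 1.0.1 output block matching the updated README (batch_size and batch_timeout are gone in this release); the endpoint, credentials, project/topic names, and file path below are placeholders, not values taken from this gem:

output {
    datahub {
        access_id => "your_access_id"
        access_key => "your_access_key"
        endpoint => "your_datahub_endpoint"
        project_name => "example_project"
        topic_name => "example_topic"
        #shard_id => "0"
        #shard_keys => ["thread_id"]
        dirty_data_continue => true
        dirty_data_file => "/tmp/dirty.data"
        dirty_data_file_max_size => 1000
    }
}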
@@ -1,341 +1,357 @@
- #
- #Licensed to the Apache Software Foundation (ASF) under one
- #or more contributor license agreements. See the NOTICE file
- #distributed with this work for additional information
- #regarding copyright ownership. The ASF licenses this file
- #to you under the Apache License, Version 2.0 (the
- #"License"); you may not use this file except in compliance
- #with the License. You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- #Unless required by applicable law or agreed to in writing,
- #software distributed under the License is distributed on an
- #"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- #KIND, either express or implied. See the License for the
- #specific language governing permissions and limitations
- #under the License.
- #
- require "logstash/outputs/base"
- require "logstash/namespace"
- require "logstash/environment"
- require "fileutils"
- require "thread"
-
- jar_path=File.expand_path(File.join(File.dirname(__FILE__), "../../.."))
- LogStash::Environment.load_runtime_jars! File.join(jar_path, "vendor")
-
- # Datahub output plugin
- class LogStash::Outputs::Datahub < LogStash::Outputs::Base
-   declare_threadsafe!
-
-   config_name "datahub"
-
-   # datahub access id
-   config :access_id, :validate => :string, :required => true
-
-   # datahub access key
-   config :access_key, :validate => :string, :required => true
-
-   # datahub service endpoint
-   config :endpoint, :validate => :string, :required => true
-
-   # datahub project name
-   config :project_name, :validate => :string, :required => true
-
-   # datahub topic name
-   config :topic_name, :validate => :string, :required => true
-
-   # Number of retries: -1 retries forever, 0 never retries, >0 retries the given number of times
-   config :retry_times, :validate => :number, :required => false, :default => -1
-
-   # Retry interval: seconds to wait before the next retry
-   config :retry_interval, :validate => :number, :required => false, :default => 5
-
-   # Hash the values of the given fields; the hash decides which shard each record goes to
-   config :shard_keys, :validate => :array, :required => false, :default => []
-
-   # Write all data to this specific shard
-   config :shard_id, :validate => :string, :required => false, :default => ""
-
-   # # Column names to submit; a subset or all of the topic's columns may be configured
-   # # Defaults to an empty array, meaning all fields are submitted in topic order
-   # # Note: the columns need not be in order, but every configured field must exist in the topic schema
-   # config :column_names, :validate => :array, :required => false, :default => []
-
-   # Whether to keep writing when dirty data is encountered
-   # When enabled, @dirty_data_file must be specified
-   config :dirty_data_continue, :validate => :boolean, :required => false, :default => false
-
-   # Dirty data file name; required when @dirty_data_continue is enabled
-   # Note: the dirty data file is split into .part1 and .part2; part1 holds the older dirty data, part2 the newer data
-   config :dirty_data_file, :validate => :string, :required => false
-
-   # Maximum size of the dirty data file; the file is kept below this value (currently only a rough bound)
-   config :dirty_data_file_max_size, :validate => :number, :required => false, :default => 50024000
-
-   # Compression method for data transfer; deflate and lz4 are currently supported
-   config :compress_method, :validate => :string, :required => false, :default => ""
-
-   # Used internally, not exposed as a config option
-   # Cursor for distributing records across shards
-   attr_accessor :shard_cursor
-
-   # Shard cursor lock
-   @@shard_lock = Mutex.new
-
-   # File write lock
-   @@file_lock = Mutex.new
-
-   DatahubPackage = com.aliyun.datahub
-
-   public
-   def register
-     begin
-       @account = DatahubPackage.auth.AliyunAccount::new(@access_id, @access_key)
-       @conf = DatahubPackage.DatahubConfiguration::new(@account, @endpoint)
-       if @compress_method == "deflate" || @compress_method == "lz4"
-         @compression_format = DatahubPackage.model.compress.CompressionFormat.fromValue(@compress_method)
-         @conf.setCompressionFormat(@compression_format)
-       end
-
-       @client = DatahubPackage.DatahubClient::new(@conf)
-       @project = DatahubPackage.wrapper.Project::Builder.build(@project_name, @client)
-       @topic = @project.getTopic(@topic_name)
-       @shard_cursor = 0
-
-       @shards = get_active_shards(@topic.listShard())
-       @shard_count = @shards.size()
-
-       result = @client.getTopic(@project_name, @topic_name)
-       @schema = result.getRecordSchema()
-       fields = @schema.getFields()
-       @columns_size = fields.size
-       @columns = []
-       for i in 0...@columns_size
-         @columns.push(fields[i].getName())
-       end
-
-       # Validate parameters up front
-       check_params()
-
-       if @shard_count == 0
-         @logger.error "No active shard available, please check"
-         raise "No active shard available, please check"
-       end
-
-       @logger.info "Init datahub success!"
-     rescue => e
-       @logger.error "Init failed!" + e.message + " " + e.backtrace.inspect.to_s
-       raise e
-     end
-   end # def register
-
-   def check_params()
-     # If shard_id is configured, check that the shard exists and is active
-     if !@shard_id.empty?
-       valid = false
-       for i in 0...@shards.size
-         shard_entry = @shards[i]
-         if shard_entry.getShardId() == @shard_id && shard_entry.getState() == DatahubPackage.model.ShardState::ACTIVE
-           valid = true
-         end
-       end
-       if (!valid)
-         @logger.error "Config shard_id not exists or state not active, check your config"
-         raise "Config shard_id not exists or state not active, check your config"
-       end
-     end
-
-     # Check that the shard_keys fields exist in the schema
-     if @shard_keys.size > 0
-       for i in 0...@shard_keys.size
-         shard_key = @shard_keys[i]
-         if !@schema.containsField(shard_key)
-           @logger.error "Config shard_keys contains one or one more unknown field, check your config"
-           raise "Config shard_keys contains one or one more unknown field, check your config"
-         end
-       end
-     end
-
-     # If dirty_data_continue is enabled, a dirty data file must be specified
-     if @dirty_data_continue
-       if @dirty_data_file.to_s.chomp.length == 0
-         raise "Dirty data file path can not be empty"
-       end
-     end
-
-   end
-
-   # Check the data and set it into the entry
-   # If parsing fails, the record goes to the dirty data file
-   def check_and_set_data(entry, field_type, index, event_map, column_name)
-     data = event_map[column_name]
-     begin
-       if field_type == DatahubPackage.common.data.FieldType::STRING
-         entry.setString(index, data.to_s)
-       elsif field_type == DatahubPackage.common.data.FieldType::BIGINT
-         entry.setBigint(index, java.lang.Long.parseLong(data.to_s))
-       elsif field_type == DatahubPackage.common.data.FieldType::DOUBLE
-         entry.setDouble(index, java.lang.Double.parseDouble(data.to_s))
-       elsif field_type == DatahubPackage.common.data.FieldType::BOOLEAN
-         entry.setBoolean(index, java.lang.Boolean.parseBoolean(data.to_s))
-       elsif field_type == DatahubPackage.common.data.FieldType::TIMESTAMP
-         entry.setTimeStamp(index, java.lang.Long.parseLong(data.to_s))
-       else
-         raise "Unknown schema type of data"
-       end
-       return true
-     rescue => e
-       @logger.error "Parse data: " + column_name + "[" + data + "] failed, " + e.message
-       # The data format is invalid; the config decides whether to keep running
-       if !@dirty_data_continue
-         @logger.error "Dirty data found, exit process now."
-         puts "Dirty data found, exit process now."
-         Process.exit(1)
-       # Ignored bad records go straight to the dirty data file
-       else
-         write_as_dirty_data(event_map)
-       end
-       return false
-     end
-   end
-
-   # Dirty data file handling
-   def write_as_dirty_data(event_amp)
-     dirty_file_part1_name = @dirty_data_file + ".part1"
-     dirty_file_part2_name = @dirty_data_file + ".part2"
-
-     # Write under the lock
-     @@file_lock.synchronize {
-       dirty_file_part2 = File.open(dirty_file_part2_name, "a+")
-       dirty_file_part2.puts(event_amp.to_s)
-       dirty_file_part2.close
-       if File.size(dirty_file_part2_name) > @dirty_data_file_max_size / 2
-         # .part1 and .part2 hold the data separately:
-         # older data goes to part1, newer data to part2
-         FileUtils.mv(dirty_file_part2_name, dirty_file_part1_name)
-       end
-     }
-   end
-
-   def get_active_shards(shards)
-     active_shards = []
-     for i in 0...shards.size
-       entry = shards.get(i)
-       if entry.getState() == DatahubPackage.model.ShardState::ACTIVE
-         active_shards.push(entry)
-       end
-     end
-     return active_shards
-   end
-
-   def get_next_shard_id()
-     if !@shard_id.empty?
-       return @shard_id
-     # Otherwise write to shards in round-robin order
-     else
-       idx = 0
-       @@shard_lock.synchronize {
-         idx = @shard_cursor % @shard_count
-         @shard_cursor = idx + 1
-       }
-       shard_id = @shards[idx].getShardId()
-       return shard_id
-     end
-   end
-
-   def multi_receive(event_list)
-     begin
-       entries = []
-       shard_id = get_next_shard_id()
-
-       event_list.each do |event|
-         if event == LogStash::SHUTDOWN
-           return
-         end
-         event_map = event.to_hash
-
-         entry = DatahubPackage.model.RecordEntry::new(@schema)
-         #entry.putAttribute("srcId", event_map["host"].to_s)
-         #entry.putAttribute("ts", event_map["@timestamp"].to_s)
-         #entry.putAttribute("version", event_map["@version"].to_s)
-         #entry.putAttribute("srcType", "log")
-
-         for i in 0...@columns_size do
-           value = event_map[@columns[i]]
-           if value != nil
-             entry.set(i, value)
-           end
-         end
-
-         if @shard_keys.size > 0
-           hash_string = ""
-           for i in 0...@shard_keys.size
-             shard_key = @shard_keys[i]
-             if event_map[shard_key] != nil
-               hash_string += event_map[shard_key].to_s + ","
-             end
-           end
-           hashed_value = java.lang.String.new(hash_string).hashCode()
-           entry.setPartitionKey(hashed_value)
-         else
-           entry.setShardId(shard_id)
-         end
-         entries.push(entry)
-       end
-
-       # puts "total: " + entries.size.to_s
-
-       # Only submit when the list is non-empty
-       if entries.size > 0
-         put_result = @client.putRecords(@project_name, @topic_name, entries)
-         if put_result.getFailedRecordCount() > 0
-           @logger.info "Put " + put_result.getFailedRecordCount().to_s + " records to datahub failed, total " + entries.size().to_s
-           sleep @retry_interval
-           entries = put_result.getFailedRecords()
-           @logger.info "write to datahub, failed: " + entries.size.to_s
-         else
-           @logger.info "Put data to datahub success, total " + entries.size().to_s
-         end
-       end
-
-     rescue DatahubPackage.exception.DatahubServiceException => e
-       @logger.error "Flush data exception: " + e.message #+ " " + e.backtrace.inspect.to_s
-       # Shard state changed; reload the shards
-       if e.getErrorCode() == "InvalidShardOperation"
-         @shards = get_active_shards(@topic.listShard())
-         @shard_count = @shards.size()
-
-         if @shard_count == 0
-           @logger.error "No active shard available, please check"
-         end
-       elsif e.getErrorCode() == nil
-         sleep @retry_interval
-       end
-       retry
-     rescue => e
-       @logger.error "Flush data exception: " + e.message + " " + e.backtrace.inspect.to_s
-
-       # Retry forever
-       if @retry_times < 0
-         @logger.warn "Now retry..."
-         # puts "Now retry..."
-         sleep @retry_interval
-         retry
-       # Retry attempts exhausted
-       elsif @retry_times == 0
-         @logger.error "Retry not work, now exit"
-         Process.exit(1)
-       # Keep retrying
-       elsif @retry_times > 0
-         @logger.warn "Now retry..."
-         # puts "Now retry..."
-         sleep @retry_interval
-         @retry_times -= 1
-         retry
-       end
-     end
-   end # def multi_receive
-
- end # class LogStash::Outputs::Datahub
+ #
+ #Licensed to the Apache Software Foundation (ASF) under one
+ #or more contributor license agreements. See the NOTICE file
+ #distributed with this work for additional information
+ #regarding copyright ownership. The ASF licenses this file
+ #to you under the Apache License, Version 2.0 (the
+ #"License"); you may not use this file except in compliance
+ #with the License. You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ #Unless required by applicable law or agreed to in writing,
+ #software distributed under the License is distributed on an
+ #"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ #KIND, either express or implied. See the License for the
+ #specific language governing permissions and limitations
+ #under the License.
+ #
+ require "logstash/outputs/base"
+ require "logstash/namespace"
+ require "logstash/environment"
+ require "fileutils"
+ require "thread"
+
+ jar_path=File.expand_path(File.join(File.dirname(__FILE__), "../../.."))
+ LogStash::Environment.load_runtime_jars! File.join(jar_path, "vendor")
+
+ # Datahub output plugin
+ class LogStash::Outputs::Datahub < LogStash::Outputs::Base
+   declare_threadsafe!
+
+   config_name "datahub"
+
+   # datahub access id
+   config :access_id, :validate => :string, :required => true
+
+   # datahub access key
+   config :access_key, :validate => :string, :required => true
+
+   # datahub service endpoint
+   config :endpoint, :validate => :string, :required => true
+
+   # datahub project name
+   config :project_name, :validate => :string, :required => true
+
+   # datahub topic name
+   config :topic_name, :validate => :string, :required => true
+
+   # Number of retries: -1 retries forever, 0 never retries, >0 retries the given number of times
+   config :retry_times, :validate => :number, :required => false, :default => -1
+
+   # Retry interval: seconds to wait before the next retry
+   config :retry_interval, :validate => :number, :required => false, :default => 5
+
+   # Hash the values of the given fields; the hash decides which shard each record goes to
+   config :shard_keys, :validate => :array, :required => false, :default => []
+
+   # Write all data to this specific shard
+   config :shard_id, :validate => :string, :required => false, :default => ""
+
+   # # Column names to submit; a subset or all of the topic's columns may be configured
+   # # Defaults to an empty array, meaning all fields are submitted in topic order
+   # # Note: the columns need not be in order, but every configured field must exist in the topic schema
+   # config :column_names, :validate => :array, :required => false, :default => []
+
+   # Whether to keep writing when dirty data is encountered
+   # When enabled, @dirty_data_file must be specified
+   config :dirty_data_continue, :validate => :boolean, :required => false, :default => false
+
+   # Dirty data file name; required when @dirty_data_continue is enabled
+   # Note: the dirty data file is split into .part1 and .part2; part1 holds the older dirty data, part2 the newer data
+   config :dirty_data_file, :validate => :string, :required => false
+
+   # Maximum size of the dirty data file; the file is kept below this value (currently only a rough bound)
+   config :dirty_data_file_max_size, :validate => :number, :required => false, :default => 50024000
+
+   # Compression method for data transfer; deflate and lz4 are currently supported
+   config :compress_method, :validate => :string, :required => false, :default => ""
+
+   # Used internally, not exposed as a config option
+   # Cursor for distributing records across shards
+   attr_accessor :shard_cursor
+
+   # Shard cursor lock
+   @@shard_lock = Mutex.new
+
+   # File write lock
+   @@file_lock = Mutex.new
+
+   DatahubPackage = com.aliyun.datahub
+
+   public
+   def register
+     begin
+       @account = DatahubPackage.auth.AliyunAccount::new(@access_id, @access_key)
+       @conf = DatahubPackage.DatahubConfiguration::new(@account, @endpoint)
+       if @compress_method == "deflate" || @compress_method == "lz4"
+         @compression_format = DatahubPackage.model.compress.CompressionFormat.fromValue(@compress_method)
+         @conf.setCompressionFormat(@compression_format)
+       end
+
+       @client = DatahubPackage.DatahubClient::new(@conf)
+       @project = DatahubPackage.wrapper.Project::Builder.build(@project_name, @client)
+       @topic = @project.getTopic(@topic_name)
+       @shard_cursor = 0
+
+       @shards = get_active_shards(@topic.listShard())
+       @shard_count = @shards.size()
+
+       result = @client.getTopic(@project_name, @topic_name)
+       @schema = result.getRecordSchema()
+       fields = @schema.getFields()
+       @columns_size = fields.size
+       @columnnames = []
+       for i in 0...@columns_size
+         @columnnames.push(fields[i].getName())
+       end
+       @columntypes = []
+       for i in 0...@columns_size
+         @columntypes.push(fields[i].getType())
+       end
+
+       # Validate parameters up front
+       check_params()
+
+       if @shard_count == 0
+         @logger.error "No active shard available, please check"
+         raise "No active shard available, please check"
+       end
+
+       @logger.info "Init datahub success!"
+     rescue => e
+       @logger.error "Init failed!" + e.message + " " + e.backtrace.inspect.to_s
+       raise e
+     end
+   end # def register
+
+   def check_params()
+     # If shard_id is configured, check that the shard exists and is active
+     if !@shard_id.empty?
+       valid = false
+       for i in 0...@shards.size
+         shard_entry = @shards[i]
+         if shard_entry.getShardId() == @shard_id && shard_entry.getState() == DatahubPackage.model.ShardState::ACTIVE
+           valid = true
+         end
+       end
+       if (!valid)
+         @logger.error "Config shard_id not exists or state not active, check your config"
+         raise "Config shard_id not exists or state not active, check your config"
+       end
+     end
+
+     # Check that the shard_keys fields exist in the schema
+     if @shard_keys.size > 0
+       for i in 0...@shard_keys.size
+         shard_key = @shard_keys[i]
+         if !@schema.containsField(shard_key)
+           @logger.error "Config shard_keys contains one or one more unknown field, check your config"
+           raise "Config shard_keys contains one or one more unknown field, check your config"
+         end
+       end
+     end
+
+     # If dirty_data_continue is enabled, a dirty data file must be specified
+     if @dirty_data_continue
+       if @dirty_data_file.to_s.chomp.length == 0
+         raise "Dirty data file path can not be empty"
+       end
+     end
+
+   end
+
+   # Check the data and set it into the entry
+   # If parsing fails, the record goes to the dirty data file
+   def check_and_set_data(entry, field_type, index, event_map, column_name)
+     data = event_map[column_name]
+     begin
+       if field_type == DatahubPackage.common.data.FieldType::STRING
+         entry.setString(index, data.to_s)
+       elsif field_type == DatahubPackage.common.data.FieldType::BIGINT
+         entry.setBigint(index, java.lang.Long.parseLong(data.to_s))
+       elsif field_type == DatahubPackage.common.data.FieldType::DOUBLE
+         entry.setDouble(index, java.lang.Double.parseDouble(data.to_s))
+       elsif field_type == DatahubPackage.common.data.FieldType::BOOLEAN
+         entry.setBoolean(index, java.lang.Boolean.parseBoolean(data.to_s))
+       elsif field_type == DatahubPackage.common.data.FieldType::TIMESTAMP
+         entry.setTimeStamp(index, java.lang.Long.parseLong(data.to_s))
+       else
+         raise "Unknown schema type of data"
+       end
+       return true
+     rescue => e
+       @logger.error "Parse data: " + column_name + "[" + data + "] failed, " + e.message
+       # The data format is invalid; the config decides whether to keep running
+       if !@dirty_data_continue
+         @logger.error "Dirty data found, exit process now."
+         puts "Dirty data found, exit process now."
+         Process.exit(1)
+       # Ignored bad records go straight to the dirty data file
+       else
+         write_as_dirty_data(event_map)
+       end
+       return false
+     end
+   end
+
+   # Dirty data file handling
+   def write_as_dirty_data(event_amp)
+     dirty_file_part1_name = @dirty_data_file + ".part1"
+     dirty_file_part2_name = @dirty_data_file + ".part2"
+
+     # Write under the lock
+     @@file_lock.synchronize {
+       dirty_file_part2 = File.open(dirty_file_part2_name, "a+")
+       dirty_file_part2.puts(event_amp.to_s)
+       dirty_file_part2.close
+       if File.size(dirty_file_part2_name) > @dirty_data_file_max_size / 2
+         # .part1 and .part2 hold the data separately:
+         # older data goes to part1, newer data to part2
+         FileUtils.mv(dirty_file_part2_name, dirty_file_part1_name)
+       end
+     }
+   end
+
+   def get_active_shards(shards)
+     active_shards = []
+     for i in 0...shards.size
+       entry = shards.get(i)
+       if entry.getState() == DatahubPackage.model.ShardState::ACTIVE
+         active_shards.push(entry)
+       end
+     end
+     return active_shards
+   end
+
+   def get_next_shard_id()
+     if !@shard_id.empty?
+       return @shard_id
+     # Otherwise write to shards in round-robin order
+     else
+       idx = 0
+       @@shard_lock.synchronize {
+         idx = @shard_cursor % @shard_count
+         @shard_cursor = idx + 1
+       }
+       shard_id = @shards[idx].getShardId()
+       return shard_id
+     end
+   end
+
+   def multi_receive(event_list)
+     retry_count = 0
+     begin
+       entries = []
+       shard_id = get_next_shard_id()
+
+       event_list.each do |event|
+         if event == LogStash::SHUTDOWN
+           return
+         end
+         event_map = event.to_hash
+
+         entry = DatahubPackage.model.RecordEntry::new(@schema)
+         entry.putAttribute("srcId", event_map["host"].to_s)
+         entry.putAttribute("ts", event_map["@timestamp"].to_s)
+         entry.putAttribute("version", event_map["@version"].to_s)
+         entry.putAttribute("srcType", "log")
+
+         is_data_valid = false
+         for i in 0...@columns_size do
+           column_name = @columnnames[i]
+           column_type = @columntypes[i]
+           value = event_map[column_name]
+           if value != nil
+             is_data_valid = check_and_set_data(entry, column_type, i, event_map, column_name)
+             break if !is_data_valid
+           end
+         end
+
+         if is_data_valid
+           if @shard_keys.size > 0
+             hash_string = ""
+             for i in 0...@shard_keys.size
+               shard_key = @shard_keys[i]
+               if event_map[shard_key] != nil
+                 hash_string += event_map[shard_key].to_s + ","
+               end
+             end
+             hashed_value = java.lang.String.new(hash_string).hashCode()
+             entry.setPartitionKey(hashed_value)
+           else
+             entry.setShardId(shard_id)
+           end
+           entries.push(entry)
+         end
+       end
+
+       # puts "total: " + entries.size.to_s
+
+       # Only submit when the list is non-empty
+       if entries.size > 0
+         put_result = @client.putRecords(@project_name, @topic_name, entries)
+         if put_result.getFailedRecordCount() > 0
+           @logger.info "Put " + put_result.getFailedRecordCount().to_s + " records to datahub failed, total " + entries.size().to_s
+           sleep @retry_interval
+           entries = put_result.getFailedRecords()
+           raise "Write to datahub failed: " + entries.size.to_s
+         else
+           @logger.info "Put data to datahub success, total " + entries.size().to_s
+         end
+       end
+
+     rescue DatahubPackage.exception.DatahubServiceException => e
+       @logger.error "Flush data exception: " + e.message #+ " " + e.backtrace.inspect.to_s
+       # Shard state changed; reload the shards
+       if e.getErrorCode() == "InvalidShardOperation"
+         @shards = get_active_shards(@topic.listShard())
+         @shard_count = @shards.size()
+
+         if @shard_count == 0
+           @logger.error "No active shard available, please check"
+         end
+       elsif e.getErrorCode() == nil
+         sleep @retry_interval
+       end
+       retry_count += 1
+       @logger.warn "Now retry: " + retry_count.to_s
+       retry
+     rescue => e
+       @logger.error "Flush data exception: " + e.message + " " + e.backtrace.inspect.to_s
+
+       # Retry forever
+       if @retry_times < 0
+         retry_count += 1
+         @logger.warn "Now retry: " + retry_count.to_s
+         # puts "Now retry..."
+         sleep @retry_interval
+         retry
+       elsif @retry_times == 0
+         @logger.error "Retry not work, now exit"
+         Process.exit(1)
+       # Keep retrying
+       elsif @retry_times > 0
+         retry_count += 1
+         if retry_count > @retry_times
+           @logger.warn "Retry over: " + @retry_times.to_s
+           Process.exit(1)
+         end
+         @logger.warn "Now retry..."
+         sleep @retry_interval
+         retry
+       end
+     end
+   end # def multi_receive
+
+ end # class LogStash::Outputs::Datahub
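Beyond dropping batch_size/batch_timeout, the main behavioral change in 1.0.1 is that multi_receive now validates each column via check_and_set_data, resends only the failed records, and counts retries against retry_times instead of decrementing it. A minimal standalone Ruby sketch of that bounded-retry pattern follows; put_records here is a hypothetical stand-in for @client.putRecords, and the sketch re-raises where the plugin itself exits the process:

# Illustrative sketch only; not the plugin's actual API.
def send_with_retry(entries, retry_times, retry_interval)
  retry_count = 0
  begin
    failed = put_records(entries)                  # assume it returns the records that failed
    raise "Write to datahub failed: #{failed.size}" if failed.size > 0
  rescue => e
    retry_count += 1
    # retry_times < 0 retries forever; otherwise give up once the count exceeds retry_times
    raise e if retry_times >= 0 && retry_count > retry_times
    sleep retry_interval
    entries = failed if failed && !failed.empty?   # resend only the failed records
    retry
  end
end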
@@ -1,6 +1,6 @@
  Gem::Specification.new do |s|
    s.name = 'logstash-output-datahub'
-   s.version = "1.0.0"
+   s.version = "1.0.1"
    s.licenses = ["Apache License (2.0)"]
    s.summary = "This aliyun-datahub output plugin."
    s.description = "This gem is a logstash plugin required to be installed on top of the Logstash core pipeline using $LS_HOME/bin/plugin install gemname. This gem is not a stand-alone program"
@@ -19,7 +19,7 @@ Gem::Specification.new do |s|

    # Gem dependencies
    s.add_runtime_dependency 'stud'
-   s.add_runtime_dependency "logstash-core", ">= 2.0.0", "< 3.0.0"
+   s.add_runtime_dependency "logstash-core", ">= 2.0.0"
    s.add_runtime_dependency "logstash-codec-plain"
    s.add_development_dependency "logstash-devutils"
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: logstash-output-datahub
  version: !ruby/object:Gem::Version
-   version: 1.0.0
+   version: 1.0.1
  platform: ruby
  authors:
  - Aliyun
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2016-09-20 00:00:00.000000000 Z
+ date: 2017-06-14 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: stud
@@ -31,9 +31,6 @@ dependencies:
      - - ">="
        - !ruby/object:Gem::Version
          version: 2.0.0
-     - - "<"
-       - !ruby/object:Gem::Version
-         version: 3.0.0
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
@@ -41,9 +38,6 @@ dependencies:
      - - ">="
        - !ruby/object:Gem::Version
          version: 2.0.0
-     - - "<"
-       - !ruby/object:Gem::Version
-         version: 3.0.0
  - !ruby/object:Gem::Dependency
    name: logstash-codec-plain
    requirement: !ruby/object:Gem::Requirement
@@ -126,7 +120,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
        version: '0'
  requirements: []
  rubyforge_project:
- rubygems_version: 2.4.5.1
+ rubygems_version: 2.6.10
  signing_key:
  specification_version: 4
  summary: This aliyun-datahub output plugin.