logstash-output-datahub 1.0.0 → 1.0.1

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 41b554cbbdc8d2ea64dd05f8284660a499ce1346
- data.tar.gz: df5877d7d4039a857c1fcaa2e5ba195ca7d0e7e7
+ metadata.gz: e846682c134462b56c1ee792a34143797b6b34ce
+ data.tar.gz: a36cd441f580ffd6763fc41cae69de4c7924f05b
  SHA512:
- metadata.gz: c7fa8a628c8953db4af20fc7938879884484d092248c3e6bbbf476f785bec4509b305c55428f4b25b2603e572a10013143a35feb215248b17f4d8232a285fdf6
- data.tar.gz: 59a9bb7aea19acea1f829b1891f9702141368764d2351cada3ecdb9e7471f1df82340ba5ba41f97b84ad31a36ff182479d753bff02bd41b4fcf1b6648824c99f
+ metadata.gz: eb309d8008b270ac7a8b485c57d48e2d85e3e548ec4d8d963fa4b318b7250b54f07a4cc26ce4ef67a18b80dda43ca4f589282765b2ab87de333093cc1189205c
+ data.tar.gz: b48d2a40c2677e0c262383765970fe63396bff623bd94449f95b1ebf04dbc6a0a5a902af4ccca2c01ebabafad1d5e9bd38efec8344ffdf0a9e73320b85f7f09b
data/README.md CHANGED
@@ -71,8 +71,6 @@ output {
  topic_name => ""
  #shard_id => "0"
  #shard_keys => ["thread_id"]
- batch_size => 10
- batch_timeout => 5
  dirty_data_continue => true
  dirty_data_file => "/Users/ph0ly/trash/dirty.data"
  dirty_data_file_max_size => 1000
@@ -89,8 +87,6 @@ project_name(Required): datahub project name
  topic_name(Required): datahub topic name
  retry_times(Optional): number of retries; -1 retries forever, 0 never retries, >0 retries at most that many times
  retry_interval(Optional): interval before the next retry, in seconds
- batch_size(Optional): batch commit size; a commit is triggered once @batch_size records have accumulated, default 100
- batch_timeout(Optional): batch commit timeout; when little data is arriving, a commit is triggered after this many seconds, default 5
  shard_keys(Optional): array of field names; the plugin hashes the values of these fields to decide which shard each record is written to. Note: if neither shard_keys nor shard_id is set, shards are used in round-robin order by default
  shard_id(Optional): write all data to the specified shard. Note: if neither shard_keys nor shard_id is set, shards are used in round-robin order by default
  dirty_data_continue(Optional): whether to keep running when dirty data is encountered, default false; if true, dirty records are ignored and processing continues. When this switch is enabled, the @dirty_data_file file must be specified
@@ -1,341 +1,357 @@
- #
- #Licensed to the Apache Software Foundation (ASF) under one
- #or more contributor license agreements. See the NOTICE file
- #distributed with this work for additional information
- #regarding copyright ownership. The ASF licenses this file
- #to you under the Apache License, Version 2.0 (the
- #"License"); you may not use this file except in compliance
- #with the License. You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- #Unless required by applicable law or agreed to in writing,
- #software distributed under the License is distributed on an
- #"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- #KIND, either express or implied. See the License for the
- #specific language governing permissions and limitations
- #under the License.
- #
- require "logstash/outputs/base"
- require "logstash/namespace"
- require "logstash/environment"
- require "fileutils"
- require "thread"
-
- jar_path=File.expand_path(File.join(File.dirname(__FILE__), "../../.."))
- LogStash::Environment.load_runtime_jars! File.join(jar_path, "vendor")
-
- # Datahub output plugin
- class LogStash::Outputs::Datahub < LogStash::Outputs::Base
- declare_threadsafe!
-
- config_name "datahub"
-
- # datahub access id
- config :access_id, :validate => :string, :required => true
-
- # datahub access key
- config :access_key, :validate => :string, :required => true
-
- # datahub service endpoint
- config :endpoint, :validate => :string, :required => true
-
- # datahub project name
- config :project_name, :validate => :string, :required => true
-
- # datahub topic name
- config :topic_name, :validate => :string, :required => true
-
- # Number of retries: -1 retries forever, 0 never retries, >0 retries a limited number of times
- config :retry_times, :validate => :number, :required => false, :default => -1
-
- # Retry interval before the next attempt, in seconds
- config :retry_interval, :validate => :number, :required => false, :default => 5
-
- # Hash the values of the given fields; the hash decides which shard a record is written to
- config :shard_keys, :validate => :array, :required => false, :default => []
-
- # Write all data to the specified shard
- config :shard_id, :validate => :string, :required => false, :default => ""
-
- # # Column names to submit; users may configure a subset or all of the topic's columns
- # # Defaults to an empty array, meaning all fields are submitted in topic order
- # # Note: columns need not be listed in schema order, but each one must exist in the topic schema
- # config :column_names, :validate => :array, :required => false, :default => []
-
- # Whether to keep writing when dirty data is encountered
- # When this switch is enabled, @dirty_data_file must be specified
- config :dirty_data_continue, :validate => :boolean, :required => false, :default => false
-
- # Dirty-data file name; must be specified when @dirty_data_continue is enabled
- # Note: the dirty-data file is split into .part1 and .part2; part1 holds the older dirty data, part2 the newer data
- config :dirty_data_file, :validate => :string, :required => false
-
- # Maximum size of the dirty-data file; the file is kept below this size, currently only a reference value
- config :dirty_data_file_max_size, :validate => :number, :required => false, :default => 50024000
-
- # Compression method for data transfer; deflate and lz4 are currently supported
- config :compress_method, :validate => :string, :required => false, :default => ""
-
- # Used internally, not exposed as a config option
- # Cursor for distributing records across shards
- attr_accessor :shard_cursor
-
- # Shard cursor lock
- @@shard_lock = Mutex.new
-
- # File write lock
- @@file_lock = Mutex.new
-
- DatahubPackage = com.aliyun.datahub
-
- public
- def register
- begin
- @account = DatahubPackage.auth.AliyunAccount::new(@access_id, @access_key)
- @conf = DatahubPackage.DatahubConfiguration::new(@account, @endpoint)
- if @compress_method == "deflate" || @compress_method == "lz4"
- @compression_format = DatahubPackage.model.compress.CompressionFormat.fromValue(@compress_method)
- @conf.setCompressionFormat(@compression_format)
- end
-
- @client = DatahubPackage.DatahubClient::new(@conf)
- @project = DatahubPackage.wrapper.Project::Builder.build(@project_name, @client)
- @topic = @project.getTopic(@topic_name)
- @shard_cursor = 0
-
- @shards = get_active_shards(@topic.listShard())
- @shard_count = @shards.size()
-
- result = @client.getTopic(@project_name, @topic_name)
- @schema = result.getRecordSchema()
- fields = @schema.getFields()
- @columns_size = fields.size
- @columns = []
- for i in 0...@columns_size
- @columns.push(fields[i].getName())
- end
-
- # Validate parameters up front
- check_params()
-
- if @shard_count == 0
- @logger.error "No active shard available, please check"
- raise "No active shard available, please check"
- end
-
- @logger.info "Init datahub success!"
- rescue => e
- @logger.error "Init failed!" + e.message + " " + e.backtrace.inspect.to_s
- raise e
- end
- end # def register
-
- def check_params()
- # If shard_id is configured, check that the shard is usable
- if !@shard_id.empty?
- valid = false
- for i in 0...@shards.size
- shard_entry = @shards[i]
- if shard_entry.getShardId() == @shard_id && shard_entry.getState() == DatahubPackage.model.ShardState::ACTIVE
- valid = true
- end
- end
- if (!valid)
- @logger.error "Config shard_id not exists or state not active, check your config"
- raise "Config shard_id not exists or state not active, check your config"
- end
- end
-
- # Check that the shard_keys fields are valid
- if @shard_keys.size > 0
- for i in 0...@shard_keys.size
- shard_key = @shard_keys[i]
- if !@schema.containsField(shard_key)
- @logger.error "Config shard_keys contains one or one more unknown field, check your config"
- raise "Config shard_keys contains one or one more unknown field, check your config"
- end
- end
- end
-
- # If dirty_data_continue is enabled, a dirty-data file must be specified
- if @dirty_data_continue
- if @dirty_data_file.to_s.chomp.length == 0
- raise "Dirty data file path can not be empty"
- end
- end
-
- end
-
- # Check the data and set it into the entry
- # If parsing fails, the record is written to the dirty-data file
- def check_and_set_data(entry, field_type, index, event_map, column_name)
- data = event_map[column_name]
- begin
- if field_type == DatahubPackage.common.data.FieldType::STRING
- entry.setString(index, data.to_s)
- elsif field_type == DatahubPackage.common.data.FieldType::BIGINT
- entry.setBigint(index, java.lang.Long.parseLong(data.to_s))
- elsif field_type == DatahubPackage.common.data.FieldType::DOUBLE
- entry.setDouble(index, java.lang.Double.parseDouble(data.to_s))
- elsif field_type == DatahubPackage.common.data.FieldType::BOOLEAN
- entry.setBoolean(index, java.lang.Boolean.parseBoolean(data.to_s))
- elsif field_type == DatahubPackage.common.data.FieldType::TIMESTAMP
- entry.setTimeStamp(index, java.lang.Long.parseLong(data.to_s))
- else
- raise "Unknown schema type of data"
- end
- return true
- rescue => e
- @logger.error "Parse data: " + column_name + "[" + data + "] failed, " + e.message
- # Data format error; the config decides whether to keep running
- if !@dirty_data_continue
- @logger.error "Dirty data found, exit process now."
- puts "Dirty data found, exit process now."
- Process.exit(1)
- # Ignored bad records are written straight to the dirty-data file
- else
- write_as_dirty_data(event_map)
- end
- return false
- end
- end
-
- # Dirty-data file handling
- def write_as_dirty_data(event_amp)
- dirty_file_part1_name = @dirty_data_file + ".part1"
- dirty_file_part2_name = @dirty_data_file + ".part2"
-
- # Write while holding the lock
- @@file_lock.synchronize {
- dirty_file_part2 = File.open(dirty_file_part2_name, "a+")
- dirty_file_part2.puts(event_amp.to_s)
- dirty_file_part2.close
- if File.size(dirty_file_part2_name) > @dirty_data_file_max_size / 2
- # .part1 and .part2 store the data separately:
- # older data goes to part1, newer data to part2
- FileUtils.mv(dirty_file_part2_name, dirty_file_part1_name)
- end
- }
- end
-
- def get_active_shards(shards)
- active_shards = []
- for i in 0...shards.size
- entry = shards.get(i)
- if entry.getState() == DatahubPackage.model.ShardState::ACTIVE
- active_shards.push(entry)
- end
- end
- return active_shards
- end
-
- def get_next_shard_id()
- if !@shard_id.empty?
- return @shard_id
- # Otherwise write to shards in round-robin order
- else
- idx = 0
- @@shard_lock.synchronize {
- idx = @shard_cursor % @shard_count
- @shard_cursor = idx + 1
- }
- shard_id = @shards[idx].getShardId()
- return shard_id
- end
- end
-
- def multi_receive(event_list)
- begin
- entries = []
- shard_id = get_next_shard_id()
-
- event_list.each do |event|
- if event == LogStash::SHUTDOWN
- return
- end
- event_map = event.to_hash
-
- entry = DatahubPackage.model.RecordEntry::new(@schema)
- #entry.putAttribute("srcId", event_map["host"].to_s)
- #entry.putAttribute("ts", event_map["@timestamp"].to_s)
- #entry.putAttribute("version", event_map["@version"].to_s)
- #entry.putAttribute("srcType", "log")
-
- for i in 0...@columns_size do
- value = event_map[@columns[i]]
- if value != nil
- entry.set(i, value)
- end
- end
-
- if @shard_keys.size > 0
- hash_string = ""
- for i in 0...@shard_keys.size
- shard_key = @shard_keys[i]
- if event_map[shard_key] != nil
- hash_string += event_map[shard_key].to_s + ","
- end
- end
- hashed_value = java.lang.String.new(hash_string).hashCode()
- entry.setPartitionKey(hashed_value)
- else
- entry.setShardId(shard_id)
- end
- entries.push(entry)
- end
-
- # puts "total: " + entries.size.to_s
-
- # Only submit when the list is non-empty
- if entries.size > 0
- put_result = @client.putRecords(@project_name, @topic_name, entries)
- if put_result.getFailedRecordCount() > 0
- @logger.info "Put " + put_result.getFailedRecordCount().to_s + " records to datahub failed, total " + entries.size().to_s
- sleep @retry_interval
- entries = put_result.getFailedRecords()
- @logger.info "write to datahub, failed: " + entries.size.to_s
- else
- @logger.info "Put data to datahub success, total " + entries.size().to_s
- end
- end
-
- rescue DatahubPackage.exception.DatahubServiceException => e
- @logger.error "Flush data exception: " + e.message #+ " " + e.backtrace.inspect.to_s
- # Shard states changed; reload the shards
- if e.getErrorCode() == "InvalidShardOperation"
- @shards = get_active_shards(@topic.listShard())
- @shard_count = @shards.size()
-
- if @shard_count == 0
- @logger.error "No active shard available, please check"
- end
- elsif e.getErrorCode() == nil
- sleep @retry_interval
- end
- retry
- rescue => e
- @logger.error "Flush data exception: " + e.message + " " + e.backtrace.inspect.to_s
-
- # Unlimited retries
- if @retry_times < 0
- @logger.warn "Now retry..."
- # puts "Now retry..."
- sleep @retry_interval
- retry
- # Retries exhausted
- elsif @retry_times == 0
- @logger.error "Retry not work, now exit"
- Process.exit(1)
- # Keep retrying
- elsif @retry_times > 0
- @logger.warn "Now retry..."
- # puts "Now retry..."
- sleep @retry_interval
- @retry_times -= 1
- retry
- end
- end
- end # def multi_receive
-
- end # class LogStash::Outputs::Datahub
+ #
+ #Licensed to the Apache Software Foundation (ASF) under one
+ #or more contributor license agreements. See the NOTICE file
+ #distributed with this work for additional information
+ #regarding copyright ownership. The ASF licenses this file
+ #to you under the Apache License, Version 2.0 (the
+ #"License"); you may not use this file except in compliance
+ #with the License. You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ #Unless required by applicable law or agreed to in writing,
+ #software distributed under the License is distributed on an
+ #"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ #KIND, either express or implied. See the License for the
+ #specific language governing permissions and limitations
+ #under the License.
+ #
+ require "logstash/outputs/base"
+ require "logstash/namespace"
+ require "logstash/environment"
+ require "fileutils"
+ require "thread"
+
+ jar_path=File.expand_path(File.join(File.dirname(__FILE__), "../../.."))
+ LogStash::Environment.load_runtime_jars! File.join(jar_path, "vendor")
+
+ # Datahub output plugin
+ class LogStash::Outputs::Datahub < LogStash::Outputs::Base
+ declare_threadsafe!
+
+ config_name "datahub"
+
+ # datahub access id
+ config :access_id, :validate => :string, :required => true
+
+ # datahub access key
+ config :access_key, :validate => :string, :required => true
+
+ # datahub service endpoint
+ config :endpoint, :validate => :string, :required => true
+
+ # datahub project name
+ config :project_name, :validate => :string, :required => true
+
+ # datahub topic name
+ config :topic_name, :validate => :string, :required => true
+
+ # Number of retries: -1 retries forever, 0 never retries, >0 retries a limited number of times
+ config :retry_times, :validate => :number, :required => false, :default => -1
+
+ # Retry interval before the next attempt, in seconds
+ config :retry_interval, :validate => :number, :required => false, :default => 5
+
+ # Hash the values of the given fields; the hash decides which shard a record is written to
+ config :shard_keys, :validate => :array, :required => false, :default => []
+
+ # Write all data to the specified shard
+ config :shard_id, :validate => :string, :required => false, :default => ""
+
+ # # Column names to submit; users may configure a subset or all of the topic's columns
+ # # Defaults to an empty array, meaning all fields are submitted in topic order
+ # # Note: columns need not be listed in schema order, but each one must exist in the topic schema
+ # config :column_names, :validate => :array, :required => false, :default => []
+
+ # Whether to keep writing when dirty data is encountered
+ # When this switch is enabled, @dirty_data_file must be specified
+ config :dirty_data_continue, :validate => :boolean, :required => false, :default => false
+
+ # Dirty-data file name; must be specified when @dirty_data_continue is enabled
+ # Note: the dirty-data file is split into .part1 and .part2; part1 holds the older dirty data, part2 the newer data
+ config :dirty_data_file, :validate => :string, :required => false
+
+ # Maximum size of the dirty-data file; the file is kept below this size, currently only a reference value
+ config :dirty_data_file_max_size, :validate => :number, :required => false, :default => 50024000
+
+ # Compression method for data transfer; deflate and lz4 are currently supported
+ config :compress_method, :validate => :string, :required => false, :default => ""
+
+ # Used internally, not exposed as a config option
+ # Cursor for distributing records across shards
+ attr_accessor :shard_cursor
+
+ # Shard cursor lock
+ @@shard_lock = Mutex.new
+
+ # File write lock
+ @@file_lock = Mutex.new
+
+ DatahubPackage = com.aliyun.datahub
+
+ public
+ def register
+ begin
+ @account = DatahubPackage.auth.AliyunAccount::new(@access_id, @access_key)
+ @conf = DatahubPackage.DatahubConfiguration::new(@account, @endpoint)
+ if @compress_method == "deflate" || @compress_method == "lz4"
+ @compression_format = DatahubPackage.model.compress.CompressionFormat.fromValue(@compress_method)
+ @conf.setCompressionFormat(@compression_format)
+ end
+
+ @client = DatahubPackage.DatahubClient::new(@conf)
+ @project = DatahubPackage.wrapper.Project::Builder.build(@project_name, @client)
+ @topic = @project.getTopic(@topic_name)
+ @shard_cursor = 0
+
+ @shards = get_active_shards(@topic.listShard())
+ @shard_count = @shards.size()
+
+ result = @client.getTopic(@project_name, @topic_name)
+ @schema = result.getRecordSchema()
+ fields = @schema.getFields()
+ @columns_size = fields.size
+ @columnnames = []
+ for i in 0...@columns_size
+ @columnnames.push(fields[i].getName())
+ end
+ @columntypes = []
+ for i in 0...@columns_size
+ @columntypes.push(fields[i].getType())
+ end
+
+ # Validate parameters up front
+ check_params()
+
+ if @shard_count == 0
+ @logger.error "No active shard available, please check"
+ raise "No active shard available, please check"
+ end
+
+ @logger.info "Init datahub success!"
+ rescue => e
+ @logger.error "Init failed!" + e.message + " " + e.backtrace.inspect.to_s
+ raise e
+ end
+ end # def register
+
+ def check_params()
+ # If shard_id is configured, check that the shard is usable
+ if !@shard_id.empty?
+ valid = false
+ for i in 0...@shards.size
+ shard_entry = @shards[i]
+ if shard_entry.getShardId() == @shard_id && shard_entry.getState() == DatahubPackage.model.ShardState::ACTIVE
+ valid = true
+ end
+ end
+ if (!valid)
+ @logger.error "Config shard_id not exists or state not active, check your config"
+ raise "Config shard_id not exists or state not active, check your config"
+ end
+ end
+
+ # Check that the shard_keys fields are valid
+ if @shard_keys.size > 0
+ for i in 0...@shard_keys.size
+ shard_key = @shard_keys[i]
+ if !@schema.containsField(shard_key)
+ @logger.error "Config shard_keys contains one or one more unknown field, check your config"
+ raise "Config shard_keys contains one or one more unknown field, check your config"
+ end
+ end
+ end
+
+ # If dirty_data_continue is enabled, a dirty-data file must be specified
+ if @dirty_data_continue
+ if @dirty_data_file.to_s.chomp.length == 0
+ raise "Dirty data file path can not be empty"
+ end
+ end
+
+ end
+
+ # Check the data and set it into the entry
+ # If parsing fails, the record is written to the dirty-data file
+ def check_and_set_data(entry, field_type, index, event_map, column_name)
+ data = event_map[column_name]
+ begin
+ if field_type == DatahubPackage.common.data.FieldType::STRING
+ entry.setString(index, data.to_s)
+ elsif field_type == DatahubPackage.common.data.FieldType::BIGINT
+ entry.setBigint(index, java.lang.Long.parseLong(data.to_s))
+ elsif field_type == DatahubPackage.common.data.FieldType::DOUBLE
+ entry.setDouble(index, java.lang.Double.parseDouble(data.to_s))
+ elsif field_type == DatahubPackage.common.data.FieldType::BOOLEAN
+ entry.setBoolean(index, java.lang.Boolean.parseBoolean(data.to_s))
+ elsif field_type == DatahubPackage.common.data.FieldType::TIMESTAMP
+ entry.setTimeStamp(index, java.lang.Long.parseLong(data.to_s))
+ else
+ raise "Unknown schema type of data"
+ end
+ return true
+ rescue => e
+ @logger.error "Parse data: " + column_name + "[" + data + "] failed, " + e.message
+ # Data format error; the config decides whether to keep running
+ if !@dirty_data_continue
+ @logger.error "Dirty data found, exit process now."
+ puts "Dirty data found, exit process now."
+ Process.exit(1)
+ # Ignored bad records are written straight to the dirty-data file
+ else
+ write_as_dirty_data(event_map)
+ end
+ return false
+ end
+ end
+
+ # Dirty-data file handling
+ def write_as_dirty_data(event_amp)
+ dirty_file_part1_name = @dirty_data_file + ".part1"
+ dirty_file_part2_name = @dirty_data_file + ".part2"
+
+ # Write while holding the lock
+ @@file_lock.synchronize {
+ dirty_file_part2 = File.open(dirty_file_part2_name, "a+")
+ dirty_file_part2.puts(event_amp.to_s)
+ dirty_file_part2.close
+ if File.size(dirty_file_part2_name) > @dirty_data_file_max_size / 2
+ # .part1 and .part2 store the data separately:
+ # older data goes to part1, newer data to part2
+ FileUtils.mv(dirty_file_part2_name, dirty_file_part1_name)
+ end
+ }
+ end
+
+ def get_active_shards(shards)
+ active_shards = []
+ for i in 0...shards.size
+ entry = shards.get(i)
+ if entry.getState() == DatahubPackage.model.ShardState::ACTIVE
+ active_shards.push(entry)
+ end
+ end
+ return active_shards
+ end
+
+ def get_next_shard_id()
+ if !@shard_id.empty?
+ return @shard_id
+ # Otherwise write to shards in round-robin order
+ else
+ idx = 0
+ @@shard_lock.synchronize {
+ idx = @shard_cursor % @shard_count
+ @shard_cursor = idx + 1
+ }
+ shard_id = @shards[idx].getShardId()
+ return shard_id
+ end
+ end
+
+ def multi_receive(event_list)
+ retry_count = 0
+ begin
+ entries = []
+ shard_id = get_next_shard_id()
+
+ event_list.each do |event|
+ if event == LogStash::SHUTDOWN
+ return
+ end
+ event_map = event.to_hash
+
+ entry = DatahubPackage.model.RecordEntry::new(@schema)
+ entry.putAttribute("srcId", event_map["host"].to_s)
+ entry.putAttribute("ts", event_map["@timestamp"].to_s)
+ entry.putAttribute("version", event_map["@version"].to_s)
+ entry.putAttribute("srcType", "log")
+
+ is_data_valid = false
+ for i in 0...@columns_size do
+ column_name = @columnnames[i]
+ column_type = @columntypes[i]
+ value = event_map[column_name]
+ if value != nil
+ is_data_valid = check_and_set_data(entry, column_type, i, event_map, column_name)
+ break if !is_data_valid
+ end
+ end
+
+ if is_data_valid
+ if @shard_keys.size > 0
+ hash_string = ""
+ for i in 0...@shard_keys.size
+ shard_key = @shard_keys[i]
+ if event_map[shard_key] != nil
+ hash_string += event_map[shard_key].to_s + ","
+ end
+ end
+ hashed_value = java.lang.String.new(hash_string).hashCode()
+ entry.setPartitionKey(hashed_value)
+ else
+ entry.setShardId(shard_id)
+ end
+ entries.push(entry)
+ end
+ end
+
+ # puts "total: " + entries.size.to_s
+
+ # Only submit when the list is non-empty
+ if entries.size > 0
+ put_result = @client.putRecords(@project_name, @topic_name, entries)
+ if put_result.getFailedRecordCount() > 0
+ @logger.info "Put " + put_result.getFailedRecordCount().to_s + " records to datahub failed, total " + entries.size().to_s
+ sleep @retry_interval
+ entries = put_result.getFailedRecords()
+ raise "Write to datahub failed: " + entries.size.to_s
+ else
+ @logger.info "Put data to datahub success, total " + entries.size().to_s
+ end
+ end
+
+ rescue DatahubPackage.exception.DatahubServiceException => e
+ @logger.error "Flush data exception: " + e.message #+ " " + e.backtrace.inspect.to_s
+ # Shard states changed; reload the shards
+ if e.getErrorCode() == "InvalidShardOperation"
+ @shards = get_active_shards(@topic.listShard())
+ @shard_count = @shards.size()
+
+ if @shard_count == 0
+ @logger.error "No active shard available, please check"
+ end
+ elsif e.getErrorCode() == nil
+ sleep @retry_interval
+ end
+ retry_count += 1
+ @logger.warn "Now retry: " + retry_count.to_s
+ retry
+ rescue => e
+ @logger.error "Flush data exception: " + e.message + " " + e.backtrace.inspect.to_s
+
+ # Unlimited retries
+ if @retry_times < 0
+ retry_count += 1
+ @logger.warn "Now retry: " + retry_count.to_s
+ # puts "Now retry..."
+ sleep @retry_interval
+ retry
+ elsif @retry_times == 0
+ @logger.error "Retry not work, now exit"
+ Process.exit(1)
+ # Keep retrying
+ elsif @retry_times > 0
+ retry_count += 1
+ if retry_count > @retry_times
+ @logger.warn "Retry over: " + @retry_times.to_s
+ Process.exit(1)
+ end
+ @logger.warn "Now retry..."
+ sleep @retry_interval
+ retry
+ end
+ end
+ end # def multi_receive
+
+ end # class LogStash::Outputs::Datahub
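
The functional heart of the 1.0.1 rewrite of multi_receive is its retry handling: a partial putRecords failure now raises instead of only being logged, and the rescue branches count attempts so that a positive retry_times actually bounds the number of retries. Below is a minimal standalone sketch of that pattern, not the plugin itself; send_batch is a hypothetical stand-in for the putRecords call plus the failed-record check:

    # Sketch of the bounded-retry behaviour introduced in 1.0.1 (illustrative only).
    # retry_times < 0 retries forever, 0 exits immediately on failure,
    # > 0 allows at most retry_times further attempts before giving up.
    def write_with_retries(entries, retry_times, retry_interval)
      retry_count = 0
      begin
        send_batch(entries)                    # hypothetical; raises when any records fail
      rescue => e
        warn "Flush data exception: #{e.message}"
        if retry_times < 0
          retry_count += 1
          sleep retry_interval
          retry
        elsif retry_times == 0
          Process.exit(1)
        else
          retry_count += 1
          Process.exit(1) if retry_count > retry_times
          sleep retry_interval
          retry
        end
      end
    end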
@@ -1,6 +1,6 @@
  Gem::Specification.new do |s|
  s.name = 'logstash-output-datahub'
- s.version = "1.0.0"
+ s.version = "1.0.1"
  s.licenses = ["Apache License (2.0)"]
  s.summary = "This aliyun-datahub output plugin."
  s.description = "This gem is a logstash plugin required to be installed on top of the Logstash core pipeline using $LS_HOME/bin/plugin install gemname. This gem is not a stand-alone program"
@@ -19,7 +19,7 @@ Gem::Specification.new do |s|

  # Gem dependencies
  s.add_runtime_dependency 'stud'
- s.add_runtime_dependency "logstash-core", ">= 2.0.0", "< 3.0.0"
+ s.add_runtime_dependency "logstash-core", ">= 2.0.0"
  s.add_runtime_dependency "logstash-codec-plain"
  s.add_development_dependency "logstash-devutils"
  end
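
Dropping the "< 3.0.0" cap on logstash-core is what lets 1.0.1 be declared against Logstash releases beyond the 2.x line. Installation itself is unchanged and follows the form given in the gem description (newer Logstash versions ship the same tool as bin/logstash-plugin):

    $LS_HOME/bin/plugin install logstash-output-datahub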
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: logstash-output-datahub
  version: !ruby/object:Gem::Version
- version: 1.0.0
+ version: 1.0.1
  platform: ruby
  authors:
  - Aliyun
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2016-09-20 00:00:00.000000000 Z
+ date: 2017-06-14 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: stud
@@ -31,9 +31,6 @@ dependencies:
  - - ">="
  - !ruby/object:Gem::Version
  version: 2.0.0
- - - "<"
- - !ruby/object:Gem::Version
- version: 3.0.0
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
@@ -41,9 +38,6 @@ dependencies:
  - - ">="
  - !ruby/object:Gem::Version
  version: 2.0.0
- - - "<"
- - !ruby/object:Gem::Version
- version: 3.0.0
  - !ruby/object:Gem::Dependency
  name: logstash-codec-plain
  requirement: !ruby/object:Gem::Requirement
@@ -126,7 +120,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  version: '0'
  requirements: []
  rubyforge_project:
- rubygems_version: 2.4.5.1
+ rubygems_version: 2.6.10
  signing_key:
  specification_version: 4
  summary: This aliyun-datahub output plugin.