fluentd-plugin-aliyun-odps 0.0.1

@@ -0,0 +1,64 @@
+ #
+ # Licensed to the Apache Software Foundation (ASF) under one
+ # or more contributor license agreements. See the NOTICE file
+ # distributed with this work for additional information
+ # regarding copyright ownership. The ASF licenses this file
+ # to you under the Apache License, Version 2.0 (the
+ # "License"); you may not use this file except in compliance
+ # with the License. You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing,
+ # software distributed under the License is distributed on an
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ # KIND, either express or implied. See the License for the
+ # specific language governing permissions and limitations
+ # under the License.
+ #
+ module OdpsDatahub
+   $ODPS_BIGINT = "bigint"
+   $ODPS_DOUBLE = "double"
+   $ODPS_BOOLEAN = "boolean"
+   $ODPS_DATETIME = "datetime"
+   $ODPS_STRING = "string"
+
+   # A single table column: name, type, and zero-based position.
+   class OdpsTableColumn
+     attr_reader :mName, :mType, :mIdx
+     def initialize(name, type, idx)
+       @mName = name
+       @mType = type
+       @mIdx = idx
+     end
+   end
+
+   # Table schema, optionally built from a parsed JSON object of the
+   # form {"columns" => [{"name" => ..., "type" => ...}, ...]}.
+   class OdpsTableSchema
+     attr_accessor :mCols
+     def initialize(jsonobj = nil)
+       @mCols = Array.new
+       unless jsonobj.nil?
+         columns = jsonobj["columns"]
+         columns.each do |object|
+           appendColumn(object["name"], object["type"])
+         end
+       end
+     end
+
+     def getColumnCount
+       @mCols.size
+     end
+
+     def getColumnType(idx)
+       if idx < 0 or idx >= @mCols.size
+         raise "idx out of range"
+       end
+       @mCols.at(idx).mType
+     end
+
+     def appendColumn(name, type)
+       col = OdpsTableColumn.new(name, type, @mCols.size)
+       @mCols.push(col)
+     end
+   end
+ end
+
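For reference, a short sketch of how these classes are driven from a parsed JSON schema. The column names here are illustrative, not part of the gem:

    require 'json'

    json = JSON.parse('{"columns": [{"name": "uid",   "type": "string"},
                                    {"name": "score", "type": "double"}]}')
    schema = OdpsDatahub::OdpsTableSchema.new(json)
    schema.getColumnCount              # => 2
    schema.getColumnType(1)            # => "double"
    schema.appendColumn("ts", $ODPS_DATETIME)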
@@ -0,0 +1,57 @@
+ #
+ # Licensed to the Apache Software Foundation (ASF) under one
+ # or more contributor license agreements. See the NOTICE file
+ # distributed with this work for additional information
+ # regarding copyright ownership. The ASF licenses this file
+ # to you under the Apache License, Version 2.0 (the
+ # "License"); you may not use this file except in compliance
+ # with the License. You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing,
+ # software distributed under the License is distributed on an
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ # KIND, either express or implied. See the License for the
+ # specific language governing permissions and limitations
+ # under the License.
+ #
+ require 'rexml/document'
+
+ module OdpsDatahub
+   class XmlTemplate
+     def self.getTaskXml(taskName, sqlString)
+       task_template = %{<SQL>
+         <Name>#{taskName}</Name>
+         <Comment/>
+         <Config>
+           <Property>
+             <Name>settings</Name>
+             <Value>{"odps.sql.udf.strict.mode": "true"}</Value>
+           </Property>
+         </Config>
+         <Query><![CDATA[#{sqlString}]]></Query>
+       </SQL>
+       }
+       return task_template
+     end
+
+     def self.getJobXml(name, comment, priority, taskStr, runMode)
+       job_template = %{<?xml version="1.0" encoding="utf-8"?>
+       <Instance>
+         <Job>
+           <Name>#{name}</Name>
+           <Comment>#{comment}</Comment>
+           <Priority>#{priority}</Priority>
+           <Tasks>
+             #{taskStr}
+           </Tasks>
+           <DAG>
+             <RunMode>#{runMode}</RunMode>
+           </DAG>
+         </Job>
+       </Instance>}
+       return job_template
+     end
+   end
+ end
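A brief sketch of how the two builders nest; the names, priority, and run-mode values here are illustrative:

    task_xml = OdpsDatahub::XmlTemplate.getTaskXml("sql_task_1", "SELECT 1;")
    job_xml  = OdpsDatahub::XmlTemplate.getJobXml("job_1", "demo", 9, task_xml, "Sequence")
    # job_xml is a complete <Instance> document with the task embedded in <Tasks>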
@@ -0,0 +1,21 @@
+ ### Generated by rprotoc. DO NOT EDIT!
+ ### <proto file: xstream_pack.proto>
+ #
+ # package apsara.odps.tunnel.proto;
+ #
+ # message XStreamPack
+ # {
+ #     required bytes pack_data = 1;
+ #     optional bytes pack_meta = 2;
+ # }
+ require 'protobuf'
+ require 'protobuf/message'
+ require 'protobuf/enum'
+
+
+ module OdpsDatahub
+   class XStreamPack < ::Protobuf::Message
+     required :bytes, :pack_data, 1
+     optional :bytes, :pack_meta, 2
+   end
+ end
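A minimal round-trip sketch with the ruby-protobuf runtime this generated class relies on; the payload bytes are illustrative, and the encode/decode calls are the gem's generic Message API rather than anything specific to this plugin:

    pack = OdpsDatahub::XStreamPack.new(:pack_data => "records...", :pack_meta => "meta")
    bytes = pack.encode                               # wire-format string
    decoded = OdpsDatahub::XStreamPack.decode(bytes)
    decoded.pack_data                                 # => "records..."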
@@ -0,0 +1,8 @@
+
+ package apsara.odps.tunnel.proto;
+
+ message XStreamPack
+ {
+     required bytes pack_data = 1;
+     optional bytes pack_meta = 2;
+ }
@@ -0,0 +1,373 @@
+ #
+ # Licensed to the Apache Software Foundation (ASF) under one
+ # or more contributor license agreements. See the NOTICE file
+ # distributed with this work for additional information
+ # regarding copyright ownership. The ASF licenses this file
+ # to you under the Apache License, Version 2.0 (the
+ # "License"); you may not use this file except in compliance
+ # with the License. You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing,
+ # software distributed under the License is distributed on an
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ # KIND, either express or implied. See the License for the
+ # specific language governing permissions and limitations
+ # under the License.
+ #
+ module Fluent
+   class ODPSOutput < Fluent::BufferedOutput
+     Fluent::Plugin.register_output('aliyun_odps', self)
+     @@txt = nil
+
+     def initialize
+       super
+       require 'time'
+       require_relative 'stream_client'
+       @compressor = nil
+     end
+
+     config_param :path, :string, :default => ""
+     config_param :aliyun_access_id, :string, :default => nil
+     config_param :aliyun_access_key, :string, :default => nil, :secret => true
+     config_param :aliyun_odps_endpoint, :string, :default => nil
+     config_param :aliyun_odps_hub_endpoint, :string, :default => nil
+     config_param :project, :string, :default => nil
+     config_param :format, :string, :default => 'out_file'
+     # optional tag prefix stripped by format_tag below
+     config_param :remove_tag_prefix, :string, :default => nil
+
+     attr_accessor :tables
+
+     unless method_defined?(:log)
+       define_method(:log) { $log }
+     end
+
+     # TODO: Merge SQLInput's TableElement
+     class TableElement
+       include Configurable
+
+       config_param :table, :string, :default => nil
+       config_param :fields, :string, :default => nil
+       config_param :partition, :string, :default => nil
+       config_param :num_retries, :integer, :default => 5
+       config_param :shard_number, :integer, :default => 1
+       config_param :thread_number, :integer, :default => 1
+       config_param :time_format, :string, :default => nil
+       config_param :record_batch_size, :integer, :default => 10
+       config_param :time_out, :integer, :default => 300
+       attr_accessor :partitionList
+       attr_reader :client
+       attr_reader :writer
+       attr_reader :pattern
+       attr_reader :log
+
+       def initialize(pattern, log)
+         super()
+         @pattern = MatchPattern.create(pattern)
+         @log = log
+         @writer = Array.new
+       end
+
+       # Build the per-record formatter: project the configured fields
+       # out of each record, in order.
+       def configure(conf)
+         super
+         @format_proc = Proc.new { |record|
+           values = []
+           @fields.split(',').each { |key|
+             unless record.has_key?(key)
+               @log.warn "table #{@table}: record has no key matching field #{key}"
+             end
+             values << record[key]
+           }
+           values
+         }
+       end
+
+       def init(config)
+         odpsConfig = OdpsDatahub::OdpsConfig.new(config[:aliyun_access_id],
+                                                  config[:aliyun_access_key],
+                                                  config[:aliyun_odps_endpoint],
+                                                  config[:aliyun_odps_hub_endpoint],
+                                                  config[:project])
+         if @record_batch_size <= 0
+           raise "table #{@table}: record_batch_size must be greater than 0"
+         end
+         begin
+           @client = OdpsDatahub::StreamClient.new(odpsConfig, config[:project], @table)
+           @client.loadShard(@shard_number)
+           allLoaded = false
+           loadtime = 0
+           while !allLoaded do
+             count = 0
+             # getShardStatus returns JSON like
+             # [{"ShardId": "0", "State": "loaded"}, {"ShardId": "1", "State": "loaded"}]
+             @client.getShardStatus.each { |shard|
+               if shard["State"] != "loaded"
+                 sleep(5)
+                 loadtime += 5
+                 break
+               else
+                 count += 1
+               end
+             }
+             if count == @shard_number
+               allLoaded = true
+               @log.info "All shards are loaded successfully"
+             end
+             if loadtime >= 300
+               raise "Load shard timeout"
+             end
+           end
+           for i in 0..@thread_number-1
+             @writer[i] = @client.createStreamArrayWriter()
+           end
+           # Cache the existing partitions as "k1=v1,k2=v2" strings so that
+           # import can tell which partitions still need to be created.
+           partitionMaps = @client.getPartitionList
+           @partitionList = []
+           for map in partitionMaps do
+             partitionName = ''
+             map.each { |k, v|
+               partitionName += k + "=" + v + ","
+             }
+             @partitionList << partitionName.chomp(",")
+           end
+         rescue => e
+           raise "loadShard failed, " + e.message
+         end
+       end
+
+       # Format each event in the chunk and bucket it by target partition.
+       def import(chunk)
+         records = []
+         partitions = Hash.new
+         chunk.msgpack_each { |tag, time, data|
+           begin
+             # a partition spec was configured
+             if @partition && !@partition.empty?
+               # the spec contains ${...} placeholders, e.g.
+               # "pt=${ctime.strftime('%Y%m%d')}" or "pt=${key}"
+               if @partition.include? "=${"
+                 partition_arrays = @partition.split(',')
+                 partition_name = ''
+                 i = 1
+                 for p in partition_arrays do
+                   # time-formatted placeholder: the source key sits between
+                   # "${" and ".strftime", the format string between the quotes
+                   if p.include? "strftime"
+                     key = p[p.index("{")+1, p.index(".strftime")-1-p.index("{")]
+                     partition_column = p[0, p.index("=")]
+                     timeFormat = p[p.index("(")+2, p.index(")")-3-p.index("(")]
+                     if data.has_key?(key)
+                       if time_format == nil
+                         partition_value = Time.parse(data[key]).strftime(timeFormat)
+                       else
+                         partition_value = Time.strptime(data[key], time_format).strftime(timeFormat)
+                       end
+                       if i == 1
+                         partition_name += partition_column + "=" + partition_value
+                       else
+                         partition_name += "," + partition_column + "=" + partition_value
+                       end
+                     else
+                       raise "partition has no corresponding source key or the partition expression is wrong: " + data.to_s
+                     end
+                   else
+                     # plain placeholder: the source key sits between "${" and "}"
+                     key = p[p.index("{")+1, p.index("}")-1-p.index("{")]
+                     partition_column = p[0, p.index("=")]
+                     if data.has_key?(key)
+                       partition_value = data[key].to_s
+                       if i == 1
+                         partition_name += partition_column + "=" + partition_value
+                       else
+                         partition_name += "," + partition_column + "=" + partition_value
+                       end
+                     else
+                       raise "partition has no corresponding source key or the partition expression is wrong: " + data.to_s
+                     end
+                   end
+                   i += 1
+                 end
+               else
+                 # static partition spec
+                 partition_name = @partition
+               end
+               if partitions[partition_name] == nil
+                 partitions[partition_name] = []
+               end
+               partitions[partition_name] << @format_proc.call(data)
+             else
+               records << @format_proc.call(data)
+             end
+           rescue => e
+             raise "Failed to format the data: " + e.message
+           end
+         }
+
+         begin
+           # fan the chunk out across thread_number writer threads
+           sendThread = Array.new
+           if @partition && !@partition.empty?
+             partitions.each { |k, v|
+               @log.info k
+               # create the partition if it does not exist yet
+               unless @partitionList.include?(k)
+                 @client.addPartition(k)
+                 @partitionList << k
+                 @log.info "add partition " + k
+               end
+             }
+             for thread in 0..@thread_number-1
+               sendThread[thread] = Thread.start(thread) do |threadId|
+                 retryTime = @num_retries
+                 begin
+                   partitions.each { |k, v|
+                     # each thread sends v.size/thread_number records;
+                     # the last thread also takes the remainder
+                     sendCount = v.size / @thread_number
+                     restCount = 0
+                     if threadId == @thread_number-1
+                       restCount = v.size % @thread_number
+                     end
+                     @writer[threadId].write(v[sendCount*threadId, sendCount+restCount], k)
+                     @log.info "Successfully imported #{sendCount+restCount} records to partition #{k}, table #{@table} at threadId #{threadId}"
+                   }
+                 rescue => e
+                   if retryTime > 0
+                     @log.info "Failed to write, retrying in 2s. Error at threadId #{threadId}: #{e.message}"
+                     sleep(2)
+                     retryTime -= 1
+                     retry
+                   else
+                     raise e
+                   end
+                 end
+               end
+             end
+           else
+             @log.info "#{records.size} records to be sent"
+             for thread in 0..@thread_number-1
+               sendThread[thread] = Thread.start(thread) do |threadId|
+                 retryTime = @num_retries
+                 # send sendCount records starting at sendCount*threadId
+                 # (plus the remainder on the last thread)
+                 sendCount = records.size / @thread_number
+                 restCount = 0
+                 if threadId == @thread_number-1
+                   restCount = records.size % @thread_number
+                 end
+                 begin
+                   @writer[threadId].write(records[sendCount*threadId, sendCount+restCount])
+                   @log.info "Successfully imported #{sendCount+restCount} records to table #{@table} at threadId #{threadId}"
+                 rescue => e
+                   if retryTime > 0
+                     @log.info "Failed to write, retrying in 2s. Error at threadId #{threadId}: #{e.message}"
+                     sleep(2)
+                     retryTime -= 1
+                     retry
+                   else
+                     raise e
+                   end
+                 end
+               end
+             end
+           end
+           for thread in 0..@thread_number-1
+             sendThread[thread].join
+           end
+         rescue => e
+           # re-raise so Fluentd's own retry mechanism takes over
+           raise "write records failed, " + e.message
+         end
+       end
+
+       def close()
+         @client.loadShard(0)
+       end
+     end
+
+     # This method is called before starting.
+     # 'conf' is a Hash that includes configuration parameters.
+     # If the configuration is invalid, raise Fluent::ConfigError.
+     def configure(conf)
+       super
+       # You can also refer to raw parameters via conf[name].
+       @tables = []
+       conf.elements.select { |e|
+         e.name == 'table'
+       }.each { |e|
+         te = TableElement.new(e.arg, log)
+         te.configure(e)
+         if e.arg.empty?
+           log.warn "no table definition"
+         else
+           @tables << te
+         end
+       }
+       if @tables.empty?
+         raise ConfigError, "There is no <table>. At least one <table> is required"
+       end
+     end
+
+     # This method is called when starting.
+     # Open sockets or files here.
+     def start
+       super
+       config = {
+         :aliyun_access_id => @aliyun_access_id,
+         :aliyun_access_key => @aliyun_access_key,
+         :project => @project,
+         :aliyun_odps_endpoint => @aliyun_odps_endpoint,
+         :aliyun_odps_hub_endpoint => @aliyun_odps_hub_endpoint,
+       }
+       # initialize each table object
+       @tables.each { |te|
+         te.init(config)
+       }
+       log.info "the number of table objects is #{@tables.size}"
+     end
+
+     # This method is called when shutting down.
+     # Shutdown the thread and close sockets or files here.
+     def shutdown
+       super
+       @tables.each do |te|
+         te.close()
+       end
+     end
+
+     # This method is called every flush interval. Write the buffer chunk
+     # to files or databases here.
+     # 'chunk' is a buffer chunk that includes multiple formatted
+     # events. You can use 'data = chunk.read' to get all events and
+     # 'chunk.open {|io| ... }' to get IO objects.
+     #
+     # NOTE! This method is called by an internal thread, not Fluentd's main
+     # thread, so IO waits don't affect other plugins.
+     def write(chunk)
+       # pick the table whose pattern matches the chunk key
+       @tables.each { |table|
+         if table.pattern.match(chunk.key)
+           log.info "Begin to import the data, the matched table is " + chunk.key
+           return table.import(chunk)
+         end
+       }
+     end
+
+     def emit(tag, es, chain)
+       super(tag, es, chain, format_tag(tag))
+     end
+
+     # This method is called when an event reaches Fluentd.
+     # Serialize the event so import can replay it with msgpack_each.
+     def format(tag, time, record)
+       [tag, time, record].to_msgpack
+     end
+
+     def format_tag(tag)
+       if @remove_tag_prefix
+         tag.gsub(@remove_tag_prefix, '')
+       else
+         tag
+       end
+     end
+   end
+ end
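Putting the pieces together, a minimal Fluentd configuration sketch for this output, using the parameters declared above. The access keys, endpoints, project, table, and field names are illustrative placeholders, not defaults shipped by the plugin:

    <match odps.**>
      type aliyun_odps
      aliyun_access_id         YOUR_ACCESS_ID
      aliyun_access_key        YOUR_ACCESS_KEY
      aliyun_odps_endpoint     http://service.odps.aliyun.com/api
      aliyun_odps_hub_endpoint http://dh.odps.aliyun.com
      project                  your_project
      buffer_chunk_limit 2m
      buffer_queue_limit 128
      flush_interval 5s
      <table odps.demo_table>
        table         demo_table
        fields        id,name,ctime
        partition     pt=${ctime.strftime('%Y%m%d')}
        time_format   %Y-%m-%d %H:%M:%S
        shard_number  1
        thread_number 1
      </table>
    </match>

The <table> argument (here odps.demo_table) is the tag pattern matched against chunk keys in write; the partition placeholder follows the "${key.strftime('FORMAT')}" form parsed in import, so each record's ctime field selects its target partition.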