fluent-plugin-aliyun-odps 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,64 @@
+ #
+ #Licensed to the Apache Software Foundation (ASF) under one
+ #or more contributor license agreements. See the NOTICE file
+ #distributed with this work for additional information
+ #regarding copyright ownership. The ASF licenses this file
+ #to you under the Apache License, Version 2.0 (the
+ #"License"); you may not use this file except in compliance
+ #with the License. You may obtain a copy of the License at
+ #
+ #  http://www.apache.org/licenses/LICENSE-2.0
+ #
+ #Unless required by applicable law or agreed to in writing,
+ #software distributed under the License is distributed on an
+ #"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ #KIND, either express or implied. See the License for the
+ #specific language governing permissions and limitations
+ #under the License.
+ #
+ module OdpsDatahub
+   # ODPS column type names
+   $ODPS_BIGINT = "bigint"
+   $ODPS_DOUBLE = "double"
+   $ODPS_BOOLEAN = "boolean"
+   $ODPS_DATETIME = "datetime"
+   $ODPS_STRING = "string"
+ 
+   # A single table column: name, type and positional index.
+   class OdpsTableColumn
+     attr_reader :mName, :mType, :mIdx
+     def initialize(name, type, idx)
+       @mName = name
+       @mType = type
+       @mIdx = idx
+     end
+   end
+ 
+   # An ordered list of columns, optionally built from a JSON object
+   # of the form {"columns" => [{"name" => ..., "type" => ...}, ...]}.
+   class OdpsTableSchema
+     attr_accessor :mCols
+     def initialize(jsonobj = nil)
+       @mCols = Array.new
+       unless jsonobj.nil?
+         columns = jsonobj["columns"]
+         columns.each do |object|
+           appendColumn(object["name"], object["type"])
+         end
+       end
+     end
+ 
+     def getColumnCount
+       @mCols.size
+     end
+ 
+     def getColumnType(idx)
+       if idx < 0 || idx >= @mCols.size
+         raise "idx out of range"
+       end
+       @mCols.at(idx).mType
+     end
+ 
+     def appendColumn(name, type)
+       col = OdpsTableColumn.new(name, type, @mCols.size)
+       @mCols.push(col)
+     end
+   end
+ end
+ 
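
A minimal usage sketch of the schema classes above (the JSON shape is inferred from the constructor; the column names are invented):

    require 'json'
    schema_json = JSON.parse('{"columns": [{"name": "uid", "type": "string"},
                                           {"name": "clicks", "type": "bigint"}]}')
    schema = OdpsDatahub::OdpsTableSchema.new(schema_json)
    schema.getColumnCount    # => 2
    schema.getColumnType(1)  # => "bigint", i.e. $ODPS_BIGINT
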
@@ -0,0 +1,57 @@
+ #
+ #Licensed to the Apache Software Foundation (ASF) under one
+ #or more contributor license agreements. See the NOTICE file
+ #distributed with this work for additional information
+ #regarding copyright ownership. The ASF licenses this file
+ #to you under the Apache License, Version 2.0 (the
+ #"License"); you may not use this file except in compliance
+ #with the License. You may obtain a copy of the License at
+ #
+ #  http://www.apache.org/licenses/LICENSE-2.0
+ #
+ #Unless required by applicable law or agreed to in writing,
+ #software distributed under the License is distributed on an
+ #"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ #KIND, either express or implied. See the License for the
+ #specific language governing permissions and limitations
+ #under the License.
+ #
+ require 'rexml/document'
+ 
+ module OdpsDatahub
+   # Builds the XML payloads used when submitting SQL tasks and jobs to ODPS.
+   class XmlTemplate
+     # XML fragment describing a single SQL task.
+     def self.getTaskXml(taskName, sqlString)
+       task_template = %{<SQL>
+         <Name>#{taskName}</Name>
+         <Comment/>
+         <Config>
+           <Property>
+             <Name>settings</Name>
+             <Value>{"odps.sql.udf.strict.mode": "true"}</Value>
+           </Property>
+         </Config>
+         <Query><![CDATA[#{sqlString}]]></Query>
+       </SQL>
+       }
+       return task_template
+     end
+ 
+     # Job envelope wrapping one or more task fragments (taskStr).
+     def self.getJobXml(name, comment, priority, taskStr, runMode)
+       job_template = %{<?xml version="1.0" encoding="utf-8"?>
+       <Instance>
+         <Job>
+           <Name>#{name}</Name>
+           <Comment>#{comment}</Comment>
+           <Priority>#{priority}</Priority>
+           <Tasks>
+             #{taskStr}
+           </Tasks>
+           <DAG>
+             <RunMode>#{runMode}</RunMode>
+           </DAG>
+         </Job>
+       </Instance>}
+       return job_template
+     end
+   end
+ end
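
The two templates nest: getTaskXml produces the <SQL> fragment that is passed to getJobXml as taskStr. A hedged sketch (the names and the "Sequence" run mode are illustrative values, not confirmed API constants):

    task_xml = OdpsDatahub::XmlTemplate.getTaskXml("sql_task_1", "SELECT COUNT(*) FROM dual;")
    job_xml  = OdpsDatahub::XmlTemplate.getJobXml("job_1", "demo", 9, task_xml, "Sequence")
    # job_xml is now a complete <Instance> document; the SQL travels inside
    # a CDATA section, so it needs no XML escaping.
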
@@ -0,0 +1,21 @@
+ ### Generated by rprotoc. DO NOT EDIT!
+ ### <proto file: xstream_pack.proto>
+ #
+ # package apsara.odps.tunnel.proto;
+ #
+ # message XStreamPack
+ # {
+ #   required bytes pack_data = 1;
+ #   optional bytes pack_meta = 2;
+ # }
+ require 'protobuf'
+ require 'protobuf/message'
+ require 'protobuf/enum'
+ 
+ 
+ module OdpsDatahub
+   class XStreamPack < ::Protobuf::Message
+     required :bytes, :pack_data, 1
+     optional :bytes, :pack_meta, 2
+   end
+ end
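
XStreamPack is the message the tunnel writer serializes before sending a batch. A round-trip sketch, assuming the rprotoc-era ruby-protobuf API (newer protobuf gems spell these methods encode/decode):

    pack = OdpsDatahub::XStreamPack.new
    pack.pack_data = "packed rows"      # required field
    pack.pack_meta = "meta"             # optional field
    bytes = pack.serialize_to_string    # wire bytes for the request body

    decoded = OdpsDatahub::XStreamPack.new
    decoded.parse_from_string(bytes)
    decoded.pack_data                   # => "packed rows"
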
@@ -0,0 +1,8 @@
+ 
+ package apsara.odps.tunnel.proto;
+ 
+ message XStreamPack
+ {
+     required bytes pack_data = 1;
+     optional bytes pack_meta = 2;
+ }
@@ -0,0 +1,373 @@
+ #
+ #Licensed to the Apache Software Foundation (ASF) under one
+ #or more contributor license agreements. See the NOTICE file
+ #distributed with this work for additional information
+ #regarding copyright ownership. The ASF licenses this file
+ #to you under the Apache License, Version 2.0 (the
+ #"License"); you may not use this file except in compliance
+ #with the License. You may obtain a copy of the License at
+ #
+ #  http://www.apache.org/licenses/LICENSE-2.0
+ #
+ #Unless required by applicable law or agreed to in writing,
+ #software distributed under the License is distributed on an
+ #"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ #KIND, either express or implied. See the License for the
+ #specific language governing permissions and limitations
+ #under the License.
+ #
+ module Fluent
+   class ODPSOutput < Fluent::BufferedOutput
+     Fluent::Plugin.register_output('aliyun_odps', self)
+ 
+     def initialize
+       super
+       require 'time'
+       require_relative 'stream_client'
+       @compressor = nil
+     end
+ 
+     config_param :path, :string, :default => ""
+     config_param :aliyun_access_id, :string, :default => nil
+     config_param :aliyun_access_key, :string, :default => nil, :secret => true
+     config_param :aliyun_odps_endpoint, :string, :default => nil
+     config_param :aliyun_odps_hub_endpoint, :string, :default => nil
+     config_param :project, :string, :default => nil
+     config_param :format, :string, :default => 'out_file'
+     # optional prefix stripped from the tag by format_tag before it is used as the chunk key
+     config_param :remove_tag_prefix, :string, :default => nil
+ 
+     attr_accessor :tables
+ 
+     unless method_defined?(:log)
+       define_method(:log) { $log }
+     end
+     # TODO: Merge SQLInput's TableElement
+     class TableElement
+       include Configurable
+ 
+       config_param :table, :string, :default => nil
+       config_param :fields, :string, :default => nil
+       config_param :partition, :string, :default => nil
+       config_param :num_retries, :integer, :default => 5
+       config_param :shard_number, :integer, :default => 1
+       config_param :thread_number, :integer, :default => 1
+       config_param :time_format, :string, :default => nil
+       config_param :record_batch_size, :integer, :default => 10
+       config_param :time_out, :integer, :default => 300
+ 
+       attr_accessor :partitionList
+       attr_reader :client
+       attr_reader :writer
+       attr_reader :pattern
+       attr_reader :log
+ 
+       def initialize(pattern, log)
+         super()
+         @pattern = MatchPattern.create(pattern)
+         @log = log
+         @writer = Array.new
+       end
+ 
+       # initialize the table configuration
+       def configure(conf)
+         super
+         # map a record to an array of values in the order given by `fields`,
+         # e.g. fields "uid,clicks" turns {"uid" => "u1", "clicks" => 3} into ["u1", 3]
+         @format_proc = Proc.new { |record|
+           values = []
+           @fields.split(',').each { |key|
+             unless record.has_key?(key)
+               @log.warn "table " + @table + ": record has no value for field " + key
+             end
+             values << record[key]
+           }
+           values
+         }
+       end
+ 
+       def init(config)
+         odpsConfig = OdpsDatahub::OdpsConfig.new(config[:aliyun_access_id],
+                                                  config[:aliyun_access_key],
+                                                  config[:aliyun_odps_endpoint],
+                                                  config[:aliyun_odps_hub_endpoint],
+                                                  config[:project])
+         if @record_batch_size <= 0
+           raise "table " + @table + ": record_batch_size must be greater than 0"
+         end
+         begin
+           @client = OdpsDatahub::StreamClient.new(odpsConfig, config[:project], @table)
+           @client.loadShard(@shard_number)
+           allLoaded = false
+           loadtime = 0
+           while !allLoaded do
+             count = 0
+             # getShardStatus returns JSON like [{"ShardId": "0","State": "loaded"},{"ShardId": "1","State": "loaded"}]
+             @client.getShardStatus.each { |shard|
+               if shard["State"] != "loaded"
+                 sleep(5)
+                 loadtime += 5
+                 break
+               else
+                 count += 1
+               end
+               if count == @shard_number
+                 allLoaded = true
+                 @log.info "All shards are loaded successfully"
+               end
+               if loadtime >= 300
+                 raise "Loading shards timed out"
+               end
+             }
+           end
+           for i in 0..@thread_number-1
+             @writer[i] = @client.createStreamArrayWriter()
+           end
+           # cache the existing partitions as "k1=v1,k2=v2" strings
+           partitionMaps = @client.getPartitionList
+           @partitionList = []
+           for map in partitionMaps do
+             partitionName = ''
+             map.each { |k, v|
+               partitionName += k + "=" + v + ","
+             }
+             @partitionList << partitionName.chomp(",")
+           end
+         rescue => e
+           raise "loadShard failed, " + e.message
+         end
+       end
+ 
+       # import data
+       def import(chunk)
+         records = []
+         partitions = Hash.new
+         chunk.msgpack_each { |tag, time, data|
+           begin
+             # if a partition is configured
+             unless @partition.nil? || @partition.empty?
+               # if the partition contains ${...} placeholders
+               if @partition.include? "=${"
+                 # resolve each comma-separated partition part
+                 partition_arrays = @partition.split(',')
+                 partition_name = ''
+                 i = 1
+                 for p in partition_arrays do
+                   # time-formatted placeholder, e.g. pt=${key.strftime('%Y%m%d')}
+                   if p.include? "strftime"
+                     key = p[p.index("{")+1, p.index(".strftime")-1-p.index("{")]
+                     partition_column = p[0, p.index("=")]
+                     timeFormat = p[p.index("(")+2, p.index(")")-3-p.index("(")]
+                     if data.has_key?(key)
+                       if @time_format == nil
+                         partition_value = Time.parse(data[key]).strftime(timeFormat)
+                       else
+                         partition_value = Time.strptime(data[key], @time_format).strftime(timeFormat)
+                       end
+                       if i == 1
+                         partition_name += partition_column + "=" + partition_value
+                       else
+                         partition_name += "," + partition_column + "=" + partition_value
+                       end
+                     else
+                       raise "partition has no corresponding source key or the partition expression is wrong: " + data.to_s
+                     end
+                   else
+                     # plain placeholder, e.g. pt=${key}
+                     key = p[p.index("{")+1, p.index("}")-1-p.index("{")]
+                     partition_column = p[0, p.index("=")]
+                     if data.has_key?(key)
+                       partition_value = data[key]
+                       if i == 1
+                         partition_name += partition_column + "=" + partition_value
+                       else
+                         partition_name += "," + partition_column + "=" + partition_value
+                       end
+                     else
+                       raise "partition has no corresponding source key or the partition expression is wrong: " + data.to_s
+                     end
+                   end
+                   i += 1
+                 end
+               else
+                 partition_name = @partition
+               end
+               if partitions[partition_name] == nil
+                 partitions[partition_name] = []
+               end
+               partitions[partition_name] << @format_proc.call(data)
+             else
+               records << @format_proc.call(data)
+             end
+           rescue => e
+             raise "Failed to format the data: " + e.message
+           end
+         }
+ 
+         begin
+           # write with multiple threads
+           sendThread = Array.new
+           unless @partition.nil? || @partition.empty?
+             partitions.each { |k, v|
+               @log.info k
+               # if the partition does not exist yet, create it
+               unless @partitionList.include?(k)
+                 @client.addPartition(k)
+                 @partitionList << k
+                 @log.info "add partition " + k
+               end
+             }
+             for thread in 0..@thread_number-1
+               sendThread[thread] = Thread.start(thread) do |threadId|
+                 retryTime = @num_retries
+                 begin
+                   partitions.each { |k, v|
+                     # each thread sends an equal share; the last thread also sends the remainder
+                     sendCount = v.size / @thread_number
+                     restCount = 0
+                     if threadId == @thread_number-1
+                       restCount = v.size % @thread_number
+                     end
+                     @writer[threadId].write(v[sendCount*threadId..sendCount*(threadId+1)+restCount-1], k)
+                     @log.info "Successfully imported " + (sendCount+restCount).to_s + " records to partition:" + k + ", table:" + @table + " at threadId:" + threadId.to_s
+                   }
+                 rescue => e
+                   if retryTime > 0
+                     @log.info "Failed to write, retrying in 2 sec. Error at threadId:" + threadId.to_s + " Msg:" + e.message
+                     sleep(2)
+                     retryTime -= 1
+                     retry
+                   else
+                     raise e
+                   end
+                 end
+               end
+             end
+           else
+             @log.info records.size.to_s + " records to be sent"
+             for thread in 0..@thread_number-1
+               sendThread[thread] = Thread.start(thread) do |threadId|
+                 retryTime = @num_retries
+                 # send records from sendCount*threadId to sendCount*(threadId+1)-1
+                 sendCount = records.size / @thread_number
+                 restCount = 0
+                 if threadId == @thread_number-1
+                   restCount = records.size % @thread_number
+                 end
+                 begin
+                   @writer[threadId].write(records[sendCount*threadId..sendCount*(threadId+1)+restCount-1])
+                   @log.info "Successfully imported " + (sendCount+restCount).to_s + " records to table:" + @table + " at threadId:" + threadId.to_s
+                 rescue => e
+                   if retryTime > 0
+                     @log.info "Failed to write, retrying in 2 sec. Error at threadId:" + threadId.to_s + " Msg:" + e.message
+                     sleep(2)
+                     retryTime -= 1
+                     retry
+                   else
+                     raise e
+                   end
+                 end
+               end
+             end
+           end
+           for thread in 0..@thread_number-1
+             sendThread[thread].join
+           end
+         rescue => e
+           # re-raise so that Fluentd's own retry mechanism takes over
+           raise "write records failed, " + e.message
+         end
+       end
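+ 
+       # Slicing example (hypothetical numbers): with 10 buffered records and
+       # thread_number = 3, sendCount is 3, so the threads write
+       # records[0..2], records[3..5] and records[6..9]; the last thread
+       # also takes the restCount = 10 % 3 = 1 remainder record.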
+ 
+       def close()
+         @client.loadShard(0)
+       end
+     end
+ 
+     # This method is called before starting.
+     # 'conf' is a Hash that includes configuration parameters.
+     # If the configuration is invalid, raise Fluent::ConfigError.
+     def configure(conf)
+       super
+       # You can also refer to raw parameters via conf[name].
+       @tables = []
+       conf.elements.select { |e|
+         e.name == 'table'
+       }.each { |e|
+         te = TableElement.new(e.arg, log)
+         te.configure(e)
+         if e.arg.empty?
+           log.warn "no table definition"
+         else
+           @tables << te
+         end
+       }
+       if @tables.empty?
+         raise ConfigError, "There is no <table>. <table> is required"
+       end
+     end
+ 
+     # This method is called when starting.
+     # Open sockets or files here.
+     def start
+       super
+       config = {
+         :aliyun_access_id => @aliyun_access_id,
+         :aliyun_access_key => @aliyun_access_key,
+         :project => @project,
+         :aliyun_odps_endpoint => @aliyun_odps_endpoint,
+         :aliyun_odps_hub_endpoint => @aliyun_odps_hub_endpoint,
+       }
+       # initialize each table object
+       @tables.each { |te|
+         te.init(config)
+       }
+       log.info "the number of table objects is " + @tables.size.to_s
+     end
+ 
+     # This method is called when shutting down.
+     # Shutdown the thread and close sockets or files here.
+     def shutdown
+       super
+       @tables.each do |te|
+         te.close()
+       end
+     end
+ 
+     # This method is called when an event reaches Fluentd.
+     # Serialize the event; import decodes the chunk with msgpack_each.
+     def format(tag, time, record)
+       [tag, time, record].to_msgpack
+     end
+ 
+     # This method is called every flush interval. Write the buffer chunk
+     # to files or databases here.
+     # 'chunk' is a buffer chunk that includes multiple formatted
+     # events. You can use 'data = chunk.read' to get all events and
+     # 'chunk.open {|io| ... }' to get IO objects.
+     #
+     # NOTE! This method is called by an internal thread, not Fluentd's main
+     # thread, so IO waits do not affect other plugins.
+     def write(chunk)
+       # pick the first table whose pattern matches the chunk key
+       @tables.each { |table|
+         if table.pattern.match(chunk.key)
+           log.info "Begin to import the data, the matched chunk key is " + chunk.key
+           return table.import(chunk)
+         end
+       }
+     end
+ 
+     # Tag each chunk with the (possibly prefix-stripped) tag so that write
+     # can match it against the <table> patterns.
+     def emit(tag, es, chain)
+       super(tag, es, chain, format_tag(tag))
+     end
+ 
+     def format_tag(tag)
+       if @remove_tag_prefix
+         tag.gsub(@remove_tag_prefix, '')
+       else
+         tag
+       end
+     end
+   end
+ end
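
For reference, a plausible Fluentd configuration for this plugin, assembled from the config_param declarations and the <table> handling above (endpoints, project, table and field names are invented placeholders):

    <match odps.**>
      type aliyun_odps
      aliyun_access_id YOUR_ACCESS_ID
      aliyun_access_key YOUR_ACCESS_KEY
      aliyun_odps_endpoint http://service.odps.aliyun.com/api
      aliyun_odps_hub_endpoint http://dh.odps.aliyun.com
      project my_project
      <table odps.access_log>
        table access_log
        fields uid,clicks,ctime
        partition pt=${ctime.strftime('%Y%m%d')}
        shard_number 1
        thread_number 2
        num_retries 3
      </table>
    </match>

The <table> argument (odps.access_log) is the match pattern tested against the chunk key in write; the partition value shows the ${key.strftime('...')} placeholder form that import resolves per record, while a literal value such as pt=fixed would be used as-is.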