fluent-plugin-datahub 0.0.1
- checksums.yaml +7 -0
- data/Gemfile +3 -0
- data/README.md +38 -0
- data/Rakefile +14 -0
- data/VERSION +1 -0
- data/build.sh +11 -0
- data/fluent-plugin-datahub.gemspec +22 -0
- data/lib/fluent/plugin/datahub/datahub-client.rb +27 -0
- data/lib/fluent/plugin/datahub/datahub-http-client-test.rb +343 -0
- data/lib/fluent/plugin/datahub/datahub-http-client.rb +229 -0
- data/lib/fluent/plugin/datahub/datahub-project.rb +59 -0
- data/lib/fluent/plugin/datahub/datahub-put-record-result.rb +23 -0
- data/lib/fluent/plugin/datahub/datahub-record-entity.rb +136 -0
- data/lib/fluent/plugin/datahub/datahub-record-schema.rb +73 -0
- data/lib/fluent/plugin/datahub/datahub-shard.rb +13 -0
- data/lib/fluent/plugin/datahub/datahub-topic.rb +73 -0
- data/lib/fluent/plugin/out_datahub.rb +402 -0
- data/sample/csv_sample.conf +22 -0
- data/sample/csv_sample.csv +14 -0
- data/sample/log_sample.conf +17 -0
- data/sample/log_sample.log +1 -0
- metadata +97 -0
+++ data/lib/fluent/plugin/datahub/datahub-record-schema.rb
@@ -0,0 +1,73 @@
require "json"

class RecordField

  def initialize(name, type)
    @name = name
    @type = type
  end

  def get_name()
    return @name
  end

  def get_type()
    return @type
  end

  def to_json(*a)
    field_map = {}
    field_map["name"] = @name
    field_map["type"] = @type
    return field_map.to_json(*a)
  end

end

class RecordSchema
  def initialize()
    @fields = []

    @encoding = nil

    @fields_map = {}
  end

  def setEncoding(encoding)
    if ["US-ASCII", "ASCII-8BIT", "UTF-8", "ISO-8859-1", "Shift_JIS", "EUC-JP", "Windows-31J", "BINARY", "CP932", "eucJP"].include?(encoding)
      @encoding = encoding
    else
      raise "Unsupported encoding type [" + encoding.to_s + "]."
    end
  end

  def get_encoding
    return @encoding
  end

  def add_field(field)
    @fields.push(field)
    @fields_map[field.get_name] = field
  end

  def get_field(name)
    # @fields.each do |field|
    #   if field.get_name == name
    #     return field
    #   end
    # end
    # return nil
    return @fields_map[name]
  end

  def get_fields()
    return @fields
  end

  def to_json(*a)
    tuple = {}
    tuple["fields"] = @fields
    tuple.to_json(*a)
  end
end
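For reference, a minimal sketch of how these two classes fit together, using only what the file above defines; the field names, types and expected JSON output are illustrative placeholders:

require_relative "datahub-record-schema"

# Build a schema with two illustrative fields (names and types are placeholders).
schema = RecordSchema.new
schema.add_field(RecordField.new("uid", "BIGINT"))
schema.add_field(RecordField.new("name", "STRING"))
schema.setEncoding("UTF-8")

puts schema.get_field("uid").get_type   # => "BIGINT"
puts schema.to_json                     # expected: {"fields":[{"name":"uid","type":"BIGINT"},{"name":"name","type":"STRING"}]}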
+++ data/lib/fluent/plugin/datahub/datahub-shard.rb
@@ -0,0 +1,13 @@
require_relative "datahub-http-client"
require_relative "datahub-topic"

class DatahubShard
  attr_accessor :shard_id
  attr_accessor :state
  attr_accessor :begin_key
  attr_accessor :end_key
  attr_accessor :right_shard_id
  attr_accessor :left_shard_id
  attr_accessor :parent_shard_ids
  attr_accessor :closed_time
end
+++ data/lib/fluent/plugin/datahub/datahub-topic.rb
@@ -0,0 +1,73 @@
require_relative "datahub-http-client"
require_relative "datahub-project"
require_relative "datahub-shard"
require_relative "datahub-put-record-result"

class DatahubTopic
  attr_accessor :shard_count
  attr_accessor :lifecycle
  attr_accessor :record_type
  attr_accessor :record_schema
  attr_accessor :comment
  attr_accessor :create_time
  attr_accessor :last_modify_time

  def initialize(datahub_http_client, project_name, topic_name)
    @client = datahub_http_client
    @project_name = project_name
    @topic_name = topic_name
  end

  def list_shards()
    result_map = @client.list_shards(@project_name, @topic_name)
    shard_array = result_map["Shards"]

    shards = []

    for i in 0...shard_array.size
      shard = DatahubShard.new

      shard_map = shard_array[i]
      shard.begin_key = shard_map["BeginKey"]
      shard.end_key = shard_map["EndKey"]
      shard.left_shard_id = shard_map["LeftShardId"]
      shard.parent_shard_ids = shard_map["ParentShardIds"]
      shard.right_shard_id = shard_map["RightShardId"]
      shard.shard_id = shard_map["ShardId"]
      shard.state = shard_map["State"]

      shards.push(shard)
    end

    return shards
  end

  def get_cursor(shard_id, offset=DateTime.now.strftime('%Q'), type="OLDEST")
    result_map = @client.get_shard_cursor(@project_name, @topic_name, shard_id, offset, type)
    return result_map["Cursor"]
  end

  def write_data(record_entities)
    put_record_result = PutRecordResult.new
    result_map = @client.write_data_to_topic(@project_name, @topic_name, record_entities)

    if result_map["FailedRecordCount"] > 0
      put_record_result.failed_record_count = result_map["FailedRecordCount"]
      for i in 0...result_map["FailedRecords"].size
        result_error = result_map["FailedRecords"][i]
        put_record_result.failed_record_index.push(result_error["Index"])
        error_entity = {}
        error_entity["error_code"] = result_error["ErrorCode"]
        error_entity["error_message"] = result_error["ErrorMessage"]
        put_record_result.failed_record_error.push(error_entity)
        put_record_result.failed_record_list.push(record_entities[result_error["Index"]])
      end
    end
    return put_record_result
  end

  def read_data(shard_id, cursor, count)
    @client.read_data_from_shard_with_cursor(@project_name, @topic_name, shard_id, cursor, count)
  end

end
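DatahubTopic is normally driven through DatahubClient (datahub-client.rb) and RecordEntity (datahub-record-entity.rb), whose bodies are not reproduced in this excerpt. A hedged sketch of the flow that out_datahub.rb (below) follows, with placeholder endpoint, credentials, project/topic names and field values:

require_relative "datahub/datahub-client"          # require paths as used by out_datahub.rb
require_relative "datahub/datahub-record-entity"

# All values below are placeholders.
client  = DatahubClient.new("<datahub-endpoint>", "<access-id>", "<access-key>")
project = client.get_project("my_project")
topic   = project.get_topic("my_topic")            # a DatahubTopic with record_schema populated

shards = topic.list_shards
cursor = topic.get_cursor(shards[0].shard_id)      # defaults: current time as offset, type "OLDEST"

entity = RecordEntity.new(topic.record_schema)
entity.setString("name", "example")                # assumes a STRING field named "name" in the topic schema
entity.set_shard_id(shards[0].shard_id)

result = topic.write_data([entity])
puts "failed records: #{result.failed_record_count}"

write_data returns a PutRecordResult; when FailedRecordCount is non-zero, failed_record_error and failed_record_list describe which records failed and why, which is what write_data_with_retry in out_datahub.rb uses to decide what to retry.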
+++ data/lib/fluent/plugin/out_datahub.rb
@@ -0,0 +1,402 @@
require_relative "datahub/datahub-client"

module Fluent
  class DatahubOutput < BufferedOutput
    Fluent::Plugin::register_output('datahub', self)

    # datahub access id
    config_param :access_id, :string

    # datahub access key
    config_param :access_key, :string

    # datahub service endpoint
    config_param :endpoint, :string

    # datahub project name
    config_param :project_name, :string

    # datahub topic name
    config_param :topic_name, :string

    # Number of retries
    config_param :retry_times, :integer, :default => -1

    # Retry interval: how long to wait before the next retry, in seconds
    config_param :retry_interval, :integer, :default => 3

    # Columns to submit. Users may configure the topic's columns to collect some or all of them.
    # Defaults to an empty array, meaning all fields are submitted in the topic's order.
    # The columns need not be listed in order, but each one must exist in the topic's schema.
    config_param :column_names, :array, :default => []

    # Keys used to read values from the source record; the values are written to DataHub.
    # Defaults to an empty array, in which case the record is read using column_names.
    config_param :source_keys, :array, :default => []

    # Whether to keep writing when dirty data is encountered.
    # When enabled, @dirty_data_file must be specified.
    config_param :dirty_data_continue, :bool, :default => false

    # Dirty data file name; required when @dirty_data_continue is enabled.
    # Note: the dirty data file is split into two parts, .part1 and .part2;
    # .part1 holds the older dirty data and .part2 the newer data.
    config_param :dirty_data_file, :string, :default => ""

    # Maximum size of the dirty data file; the file is kept under this size.
    # Currently this value is only a rough guideline.
    config_param :dirty_data_file_max_size, :integer, :required => false, :default => 50024000

    # Write to the specified shard_id
    config_param :shard_id, :string, :required => false, :default => ""

    # Hash the values of the specified fields and route each record to a shard based on that hash
    config_param :shard_keys, :array, :required => false, :default => []

    # Fluentd's built-in retry count; defaults to 0 here because retries could rewrite data
    config_param :retry_limit, :integer, :default => 0

    # How many records to write to DataHub per request; default 100, and a single batch must not exceed 3m
    config_param :put_data_batch_size, :integer, :default => 100

    config_param :data_encoding, :string, :default => nil

    # Used internally, not exposed as configuration:
    # cursor for distributing records across shards
    attr_accessor :shard_cursor

    # Lock for file writes
    @@file_lock = Mutex.new

    def configure(conf)
      super
      @client = DatahubClient.new(@endpoint, @access_id, @access_key)
      @datahub_project = @client.get_project(@project_name)
      @datahub_topic = @datahub_project.get_topic(@topic_name)

      @shards = get_active_shard
      @shard_count = @shards.size

      @logger = log
      @shard_cursor = 0

      # A single put to DataHub is limited to 3000 records
      @put_data_max_size = 3000

      @target_source_column_map = {}

      # Validate parameters up front
      check_params
    end

    def check_params
      schema = @datahub_topic.record_schema
      if @data_encoding != nil
        schema.setEncoding(@data_encoding)
      end

      fields = schema.get_fields

      # Ensure every user-configured column exists in the topic
      if @column_names.size > 0
        for i in 0...@column_names.size do
          column_name = @column_names[i]
          column_index = find_column_index(fields, column_name)
          if column_index == -1
            @logger.error "Column: " + column_name + " not found, please check your config"
            raise "Column: " + column_name + " not found, please check your config"
          end
        end
      end

      if @source_keys.size == 0
        @source_keys = @column_names
      end

      #puts "source_key size: " + @source_keys.to_s
      #puts "column_names: " + @column_names.to_s

      if @source_keys.size > 0 and @column_names.size != @source_keys.size
        @logger.error "source_keys's size must be equal to column_names's size, please check your config"
        raise "source_keys's size must be equal to column_names's size, please check your config"
      else
        for i in 0...@column_names.size do
          @target_source_column_map[@column_names[i]] = @source_keys[i]
        end
      end

      #puts @target_source_column_map

      if @shard_count < 1
        raise "there must be at least 1 active shard!"
      end

      # When dirty_data_continue is enabled, a dirty data file must be specified
      if @dirty_data_continue
        if @dirty_data_file.to_s.chomp.length == 0
          raise "Dirty data file path can not be empty"
        end
      end

      # Check that the shard_keys fields are valid
      if @shard_keys.size > 0
        for i in 0...@shard_keys.size
          shard_key = @shard_keys[i]
          shard_key_index = find_column_index(fields, shard_key)
          if shard_key_index == -1
            @logger.error "Shard key: " + shard_key + " not found in schema, please check your config"
            raise "Shard key: " + shard_key + " not found in schema, please check your config"
          end
        end
      end

    end

    # Find the actual index of a column in the topic's schema.
    # Returns -1 if the column is not found.
    def find_column_index(fields, column_name)
      for i in 0...fields.size do
        name = fields[i].get_name
        if name == column_name
          return i
        end
      end
      return -1
    end

    def start
      super
    end

    def shutdown
      super
    end

    def format(tag, time, record)
      [tag, time, record].to_json + '\n'
    end

    def format(tag, time, record)
      [tag, time, record].to_msgpack
    end

    def write(chunk)
      record_entities = []
      schema = @datahub_topic.record_schema

      chunk.msgpack_each do |tag, time, record|
        entity = RecordEntity.new(schema)
        convert_success = record_to_entity(entity, record)
        entity.set_shard_id(get_shard_id(record))
        if convert_success
          record_entities.push(entity)
        end
        if record_entities.size >= @put_data_max_size
          write_data_with_retry(record_entities)
          # puts record_entities.to_json
          record_entities.clear
          # puts "after clear ; " + record_entities.to_json
        elsif record_entities.size >= @put_data_batch_size
          write_data_with_retry(record_entities)
          #puts record_entities.to_json
          record_entities.clear
          #puts "after clear ; " + record_entities.to_json
        end
      end

      if record_entities.size > 0
        write_data_with_retry(record_entities)
        # record_entities.clear
      end
    end

    # Write data to DataHub, retrying according to @retry_times
    def write_data_with_retry(record_entities)
      tmp_retry_times = @retry_times
      put_result = nil
      while true
        begin
          put_result = @datahub_topic.write_data(record_entities)
        rescue => e
          @logger.warn "Put " + record_entities.size.to_s + " records to datahub failed, total " + record_entities.size.to_s + ", message = " + e.message
          if tmp_retry_times > 0
            sleep @retry_interval
            @logger.warn "Now retry(" + (@retry_times - tmp_retry_times + 1).to_s + ")..."
            tmp_retry_times -= 1
            next
          else
            if !@dirty_data_continue
              @logger.error "Dirty data found, exit process now."
              puts "Dirty data found, exit process now."
              raise "try to exit!"
            else
              # No retries left (or retries disabled): write to the dirty data file
              for i in 0...record_entities.size
                record_entity = record_entities[i]
                @logger.error "Put record: " + record_entity.get_columns_map.to_s + " failed, " + put_result.failed_record_error[i].to_s
                write_as_dirty_data(record_entity.get_columns_map)
              end
              break
            end
          end
        end

        #puts record_entities.to_json
        if put_result != nil and put_result.failed_record_count > 0
          if tmp_retry_times > 0
            # Retry according to retry_times
            @logger.warn "Put " + put_result.failed_record_count.to_s + " records to datahub failed, total " + record_entities.size.to_s
            sleep @retry_interval
            @logger.warn "Now retry(" + (@retry_times - tmp_retry_times + 1).to_s + ")..."
            tmp_retry_times -= 1
            record_entities = put_result.failed_record_list

            # When writing round-robin and a shard is no longer active
            # (error_code == "InvalidShardOperation"), refresh the shard list
            fresh_shard_flag = false
            if @shard_id.empty? and @shard_keys.size == 0
              for i in 0...put_result.failed_record_count
                error_entity = put_result.failed_record_error[i]
                if error_entity["error_code"] == "InvalidShardOperation"
                  unless fresh_shard_flag
                    @shards = get_active_shard
                    @shard_count = @shards.size
                    fresh_shard_flag = true
                  end
                  # puts "before: " + record_entities[i].to_json
                  record_entities[i].set_shard_id(get_shard_id(record_entities[i]))
                  # puts record_entities[i].to_json
                end
              end
            end
          else
            if !@dirty_data_continue
              @logger.error "Dirty data found, exit process now."
              puts "Dirty data found, exit process now."
              raise "try to exit!"
            else
              # No retries left (or retries disabled): write to the dirty data file
              for i in 0...put_result.failed_record_count
                record_entity = put_result.failed_record_list[i]
                @logger.error "Put record: " + record_entity.get_columns_map.to_s + " failed, " + put_result.failed_record_error[i].to_s
                write_as_dirty_data(record_entity.get_columns_map)
              end
              break
            end
          end
        else
          @logger.info "Put data to datahub success, total " + record_entities.size.to_s
          break
        end
      end
    end

    # Convert a record into an entity
    def record_to_entity(entity, record)
      schema = entity.get_schema
      @column_names.each do |column|
        begin
          source_key = @target_source_column_map[column]
          if record.has_key?(source_key)
            field = schema.get_field(column)
            if field == nil
              raise "Unknown column name of data"
            else
              field_type = field.get_type
              if field_type == "BIGINT"
                entity.setBigInt(column, record[source_key])
              elsif field_type == "DOUBLE"
                entity.setDouble(column, record[source_key])
              elsif field_type == "BOOLEAN"
                entity.setBoolean(column, record[source_key])
              elsif field_type == "STRING"
                entity.setString(column, record[source_key])
              elsif field_type == "TIMESTAMP"
                entity.setTimeStamp(column, record[source_key])
              else
                raise "Unknown schema type of data"
              end
            end
          end
        rescue => e
          @logger.error "Parse data: " + column + "[" + record[source_key].to_s + "] failed, " + e.message
          if !@dirty_data_continue
            @logger.error "Dirty data found, exit process now."
            puts "Dirty data found, exit process now."
            raise "try to exit!"
          else
            # Ignored bad data is written straight to the dirty data file
            write_as_dirty_data(record)
          end
          return false
        end
      end
      return true
    end


    # Dirty data file handling
    def write_as_dirty_data(record)
      dirty_file_part1_name = @dirty_data_file + ".part1"
      dirty_file_part2_name = @dirty_data_file + ".part2"

      # TODO: write under a lock
      @@file_lock.synchronize {
        dirty_file_part2 = File.open(dirty_file_part2_name, "a+")
        dirty_file_part2.puts(record.to_json)
        dirty_file_part2.close
        if File.size(dirty_file_part2_name) > @dirty_data_file_max_size / 2
          # .part1 and .part2 each hold data:
          # older data rolls over to .part1, newer data goes to .part2
          FileUtils.mv(dirty_file_part2_name, dirty_file_part1_name)
        end
      }
    end

    # Determine the shard_id to write to
    def get_shard_id(record)
      if @shard_id != nil and !@shard_id.empty?
        return @shard_id
      elsif @shard_keys != nil and @shard_keys.size > 0
        # Hash-based routing
        hash_string = ""
        for i in 0...@shard_keys.size
          shard_key = @shard_keys[i]
          source_key = @target_source_column_map[shard_key]
          if record[source_key] != nil
            hash_string += record[source_key].to_s + ","
          end
        end
        hashed_value = hash_code(hash_string)
        index = hashed_value % @shard_count
        return @shards[index].shard_id
      else
        # Round-robin routing
        idx = @shard_cursor % @shard_count
        @shard_cursor = idx + 1
        shard_id = @shards[idx].shard_id
        # puts "idx: " + idx.to_s
        # puts "shard_id: " + shard_id.to_s
        return shard_id
      end
    end

    # Produce the same hashcode as Java
    def hash_code(str)
      str.each_char.reduce(0) do |result, char|
        [((result << 5) - result) + char.ord].pack('L').unpack('l').first
      end
    end

    # Get the shards that are in ACTIVE state
    def get_active_shard
      all_shards = @datahub_topic.list_shards
      active_shards = []
      all_shards.each do |shard|
        if shard.state == "ACTIVE"
          active_shards.push(shard)
        end
      end

      return active_shards
    end

  end
end
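Putting the parameters above together, a match block for this output might look like the sketch below. This is only an illustration drawn from the config_param declarations; it is not the bundled sample/csv_sample.conf, and the endpoint, credentials, project, topic and column lists are placeholders (older Fluentd versions use `type` instead of `@type`). As get_shard_id shows, records go to a fixed shard when shard_id is set, are hashed across shards when shard_keys is set, and are otherwise written round-robin.

<match your.tag.**>
  @type datahub
  endpoint      <datahub-endpoint>
  access_id     <your-access-id>
  access_key    <your-access-key>
  project_name  my_project
  topic_name    my_topic

  # Optional: choose columns and routing (defaults are shown in the code above)
  column_names  ["uid", "name"]
  source_keys   ["uid", "name"]
  shard_keys    ["uid"]
  retry_times   3
  retry_interval 3
  put_data_batch_size 100

  # Keep going on dirty data, spilling bad records to a local file
  dirty_data_continue true
  dirty_data_file     /tmp/datahub_dirty.data
</match>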