fluent-plugin-datahub 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,73 @@
+ require "json"
+
+ class RecordField
+
+   def initialize(name, type)
+     @name = name
+     @type = type
+   end
+
+   def get_name()
+     return @name
+   end
+
+   def get_type()
+     return @type
+   end
+
+   def to_json(*a)
+     field_map = {}
+     field_map["name"] = @name
+     field_map["type"] = @type
+     return field_map.to_json(*a)
+   end
+
+ end
+
+ class RecordSchema
+   def initialize()
+     @fields = []
+
+     @encoding = nil
+
+     @fields_map = {}
+   end
+
+   def setEncoding(encoding)
+     if ["US-ASCII", "ASCII-8BIT", "UTF-8", "ISO-8859-1", "Shift_JIS", "EUC-JP", "Windows-31J", "BINARY", "CP932", "eucJP"].include?(encoding)
+       @encoding = encoding
+     else
+       raise "Unsupported encoding type [" + encoding.to_s + "]."
+     end
+   end
+
+   def get_encoding
+     return @encoding
+   end
+
+   def add_field(field)
+     @fields.push(field)
+     @fields_map[field.get_name] = field
+   end
+
+   def get_field(name)
+     # Constant-time lookup via the name-to-field map maintained by add_field
+     return @fields_map[name]
+   end
+
+   def get_fields()
+     return @fields
+   end
+
+   def to_json(*a)
+     tuple = {}
+     tuple["fields"] = @fields
+     tuple.to_json(*a)
+   end
+ end
+
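
For orientation, a minimal usage sketch of the two classes above; the field names and types here are invented placeholders:

    schema = RecordSchema.new
    schema.add_field(RecordField.new("uid", "BIGINT"))
    schema.add_field(RecordField.new("msg", "STRING"))

    schema.get_field("uid").get_type  # => "BIGINT"
    puts schema.to_json               # => {"fields":[{"name":"uid","type":"BIGINT"},{"name":"msg","type":"STRING"}]}
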
@@ -0,0 +1,13 @@
+ require_relative "datahub-http-client"
+ require_relative "datahub-topic"
+
+ class DatahubShard
+   attr_accessor :shard_id
+   attr_accessor :state
+   attr_accessor :begin_key
+   attr_accessor :end_key
+   attr_accessor :right_shard_id
+   attr_accessor :left_shard_id
+   attr_accessor :parent_shard_ids
+   attr_accessor :closed_time
+ end
@@ -0,0 +1,73 @@
+ require "date"
+ require_relative "datahub-http-client"
+ require_relative "datahub-project"
+ require_relative "datahub-shard"
+ require_relative "datahub-put-record-result"
+
+ class DatahubTopic
+   attr_accessor :shard_count
+   attr_accessor :lifecycle
+   attr_accessor :record_type
+   attr_accessor :record_schema
+   attr_accessor :comment
+   attr_accessor :create_time
+   attr_accessor :last_modify_time
+
+   def initialize(datahub_http_client, project_name, topic_name)
+     @client = datahub_http_client
+     @project_name = project_name
+     @topic_name = topic_name
+   end
+
+   def list_shards()
+     result_map = @client.list_shards(@project_name, @topic_name)
+     shard_array = result_map["Shards"]
+
+     shards = []
+
+     for i in 0...shard_array.size
+       shard = DatahubShard.new
+
+       shard_map = shard_array[i]
+       shard.begin_key = shard_map["BeginKey"]
+       shard.end_key = shard_map["EndKey"]
+       shard.left_shard_id = shard_map["LeftShardId"]
+       shard.parent_shard_ids = shard_map["ParentShardIds"]
+       shard.right_shard_id = shard_map["RightShardId"]
+       shard.shard_id = shard_map["ShardId"]
+       shard.state = shard_map["State"]
+
+       shards.push(shard)
+     end
+
+     return shards
+   end
+
+   # Fetch a cursor for the shard; the offset defaults to the current time in
+   # epoch milliseconds and the cursor type to "OLDEST".
+   def get_cursor(shard_id, offset=DateTime.now.strftime('%Q'), type="OLDEST")
+     result_map = @client.get_shard_cursor(@project_name, @topic_name, shard_id, offset, type)
+     return result_map["Cursor"]
+   end
+
+   def write_data(record_entities)
+     put_record_result = PutRecordResult.new
+     result_map = @client.write_data_to_topic(@project_name, @topic_name, record_entities)
+
+     if result_map["FailedRecordCount"] > 0
+       put_record_result.failed_record_count = result_map["FailedRecordCount"]
+       for i in 0...result_map["FailedRecords"].size
+         result_error = result_map["FailedRecords"][i]
+         put_record_result.failed_record_index.push(result_error["Index"])
+         error_entity = {}
+         error_entity["error_code"] = result_error["ErrorCode"]
+         error_entity["error_message"] = result_error["ErrorMessage"]
+         put_record_result.failed_record_error.push(error_entity)
+         put_record_result.failed_record_list.push(record_entities[result_error["Index"]])
+       end
+     end
+     return put_record_result
+   end
+
+   def read_data(shard_id, cursor, count)
+     @client.read_data_from_shard_with_cursor(@project_name, @topic_name, shard_id, cursor, count)
+   end
+
+ end
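
A hedged end-to-end sketch of DatahubTopic, assuming the DatahubClient construction used by the output plugin below; the endpoint, credentials, and names are placeholders:

    client = DatahubClient.new("http://dh-cn-hangzhou.aliyuncs.com", "your_access_id", "your_access_key")
    topic  = client.get_project("your_project").get_topic("your_topic")

    shard  = topic.list_shards.find { |s| s.state == "ACTIVE" }
    cursor = topic.get_cursor(shard.shard_id)              # defaults: current time, "OLDEST"
    data   = topic.read_data(shard.shard_id, cursor, 10)   # raw result map from the client

    result = topic.write_data(record_entities)             # record_entities: an array of RecordEntity
    failed = result.failed_record_count.to_i               # may be unset when nothing failed
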
@@ -0,0 +1,402 @@
+ require "fileutils"
+
+ require_relative "datahub/datahub-client"
+
+ module Fluent
+   class DatahubOutput < BufferedOutput
+     Fluent::Plugin.register_output('datahub', self)
+
+     # datahub access id
+     config_param :access_id, :string
+
+     # datahub access key
+     config_param :access_key, :string
+
+     # datahub service endpoint
+     config_param :endpoint, :string
+
+     # datahub project name
+     config_param :project_name, :string
+
+     # datahub topic name
+     config_param :topic_name, :string
+
+     # Number of retries; -1 means retry without limit
+     config_param :retry_times, :integer, :default => -1
+
+     # Retry interval, i.e. the delay before the next retry, in seconds
+     config_param :retry_interval, :integer, :default => 3
+
+     # Columns to submit: either all of the topic's columns or a subset.
+     # Defaults to an empty array, meaning all fields are submitted in topic order.
+     # The configured columns need not preserve the topic's order, but each one
+     # must exist in the topic's schema.
+     config_param :column_names, :array, :default => []
+
+     # Source keys to collect from: each record is read by these keys and written to datahub.
+     # Defaults to an empty array, in which case column_names is used to read the record.
+     config_param :source_keys, :array, :default => []
+
+     # Whether to keep writing when dirty data is encountered.
+     # When enabled, dirty_data_file must be specified.
+     config_param :dirty_data_continue, :bool, :default => false
+
+     # Dirty data file name; required when dirty_data_continue is enabled.
+     # Note: the dirty data file is split into two parts, .part1 and .part2;
+     # part1 holds the older dirty data and part2 the newer.
+     config_param :dirty_data_file, :string, :default => ""
+
+     # Maximum size of the dirty data file; keeps the file below this bound.
+     # Currently the value is only advisory.
+     config_param :dirty_data_file_max_size, :integer, :required => false, :default => 50024000
+
+     # Write to the specified shard_id
+     config_param :shard_id, :string, :required => false, :default => ""
+
+     # Hash the values of the given fields and route each record to a shard by that hash
+     config_param :shard_keys, :array, :required => false, :default => []
+
+     # fluentd's built-in retry count; defaults to 0 here because retrying may rewrite data
+     config_param :retry_limit, :integer, :default => 0
+
+     # How many records to write to datahub per request; defaults to 100,
+     # and a single request must not exceed 3MB
+     config_param :put_data_batch_size, :integer, :default => 100
+
+     config_param :data_encoding, :string, :default => nil
+
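+     # For illustration only (not in the original source): a hypothetical fluentd
+     # match section wiring up the parameters above; every value below is a
+     # placeholder.
+     #
+     #   <match your.tag.**>
+     #     @type datahub
+     #     access_id    your_access_id
+     #     access_key   your_access_key
+     #     endpoint     http://dh-cn-hangzhou.aliyuncs.com
+     #     project_name your_project
+     #     topic_name   your_topic
+     #     column_names ["uid", "msg"]
+     #     shard_keys   ["uid"]
+     #     dirty_data_continue true
+     #     dirty_data_file     /tmp/datahub_dirty.data
+     #   </match>
+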
+     # Used internally, not exposed as configuration:
+     # the cursor for round-robin shard dispatch
+     attr_accessor :shard_cursor
+
+     # Lock guarding dirty-data file writes
+     @@file_lock = Mutex.new
+
+     def configure(conf)
+       super
+       @client = DatahubClient.new(@endpoint, @access_id, @access_key)
+       @datahub_project = @client.get_project(@project_name)
+       @datahub_topic = @datahub_project.get_topic(@topic_name)
+
+       @shards = get_active_shard
+       @shard_count = @shards.size
+
+       @logger = log
+       @shard_cursor = 0
+
+       # A single put to datahub is capped at 3000 records
+       @put_data_max_size = 3000
+
+       @target_source_column_map = {}
+
+       # Validate the parameters up front
+       check_params
+     end
+
+     def check_params
+       schema = @datahub_topic.record_schema
+       if @data_encoding != nil
+         schema.setEncoding(@data_encoding)
+       end
+
+       fields = schema.get_fields
+
+       # Ensure every configured column exists in the topic
+       if @column_names.size > 0
+         for i in 0...@column_names.size do
+           column_name = @column_names[i]
+           column_index = find_column_index(fields, column_name)
+           if column_index == -1
+             @logger.error "Column: " + column_name + " not found, please check your config"
+             raise "Column: " + column_name + " not found, please check your config"
+           end
+         end
+       end
+
+       if @source_keys.size == 0
+         @source_keys = @column_names
+       end
+
+       if @source_keys.size > 0 and @column_names.size != @source_keys.size
+         @logger.error "source_keys's size must be equal to column_names's size, please check your config"
+         raise "source_keys's size must be equal to column_names's size, please check your config"
+       else
+         for i in 0...@column_names.size do
+           @target_source_column_map[@column_names[i]] = @source_keys[i]
+         end
+       end
+
+       if @shard_count < 1
+         raise "there must be at least 1 active shard!"
+       end
+
+       # When dirty_data_continue is set, a dirty data file must be given
+       if @dirty_data_continue
+         if @dirty_data_file.to_s.chomp.length == 0
+           raise "Dirty data file path can not be empty"
+         end
+       end
+
+       # Check that every shard key exists in the schema
+       if @shard_keys.size > 0
+         for i in 0...@shard_keys.size
+           shard_key = @shard_keys[i]
+           shard_key_index = find_column_index(fields, shard_key)
+           if shard_key_index == -1
+             @logger.error "Shard key: " + shard_key + " not found in schema, please check your config"
+             raise "Shard key: " + shard_key + " not found in schema, please check your config"
+           end
+         end
+       end
+     end
+
+     # Find the real index of a column in the topic's schema;
+     # returns -1 if not found
+     def find_column_index(fields, column_name)
+       for i in 0...fields.size do
+         name = fields[i].get_name
+         if name == column_name
+           return i
+         end
+       end
+       return -1
+     end
+
+     def start
+       super
+     end
+
+     def shutdown
+       super
+     end
+
+     # Serialize events as msgpack; write below consumes them with msgpack_each.
+     # (The original source also defined a JSON-based format here, which this
+     # definition silently shadowed; the dead duplicate has been removed.)
+     def format(tag, time, record)
+       [tag, time, record].to_msgpack
+     end
+
+     def write(chunk)
+       record_entities = []
+       schema = @datahub_topic.record_schema
+
+       chunk.msgpack_each do |tag, time, record|
+         entity = RecordEntity.new(schema)
+         convert_success = record_to_entity(entity, record)
+         entity.set_shard_id(get_shard_id(record))
+         if convert_success
+           record_entities.push(entity)
+         end
+         # Flush a batch once it reaches the hard cap or the configured batch size
+         if record_entities.size >= @put_data_max_size
+           write_data_with_retry(record_entities)
+           record_entities.clear
+         elsif record_entities.size >= @put_data_batch_size
+           write_data_with_retry(record_entities)
+           record_entities.clear
+         end
+       end
+
+       if record_entities.size > 0
+         write_data_with_retry(record_entities)
+       end
+     end
+
+     # Write data to datahub, retrying up to retry_times
+     def write_data_with_retry(record_entities)
+       tmp_retry_times = @retry_times
+       put_result = nil
+       while true
+         begin
+           put_result = @datahub_topic.write_data(record_entities)
+         rescue => e
+           @logger.warn "Put records to datahub failed, total " + record_entities.size.to_s + ", message = " + e.message
+           if tmp_retry_times > 0
+             sleep @retry_interval
+             @logger.warn "Now retry(" + (@retry_times - tmp_retry_times + 1).to_s + ")..."
+             tmp_retry_times -= 1
+             next
+           else
+             if !@dirty_data_continue
+               @logger.error "Dirty data found, exit process now."
+               puts "Dirty data found, exit process now."
+               raise "try to exit!"
+             else
+               # No retry / retries exhausted: write the records to the dirty data file.
+               # The whole request failed, so log the exception message rather than
+               # per-record errors (put_result is nil on this path).
+               for i in 0...record_entities.size
+                 record_entity = record_entities[i]
+                 @logger.error "Put record: " + record_entity.get_columns_map.to_s + " failed, " + e.message
+                 write_as_dirty_data(record_entity.get_columns_map)
+               end
+               break
+             end
+           end
+         end
+
+         if put_result != nil and put_result.failed_record_count > 0
+           if tmp_retry_times > 0
+             # Retry according to retry_times
+             @logger.warn "Put " + put_result.failed_record_count.to_s + " records to datahub failed, total " + record_entities.size.to_s
+             sleep @retry_interval
+             @logger.warn "Now retry(" + (@retry_times - tmp_retry_times + 1).to_s + ")..."
+             tmp_retry_times -= 1
+             record_entities = put_result.failed_record_list
+
+             # Under round-robin dispatch, if a shard is no longer active
+             # (error_code == "InvalidShardOperation"), refresh the shard list
+             fresh_shard_flag = false
+             if @shard_id.empty? and @shard_keys.size == 0
+               for i in 0...put_result.failed_record_count
+                 error_entity = put_result.failed_record_error[i]
+                 if error_entity["error_code"] == "InvalidShardOperation"
+                   unless fresh_shard_flag
+                     @shards = get_active_shard
+                     @shard_count = @shards.size
+                     fresh_shard_flag = true
+                   end
+                   record_entities[i].set_shard_id(get_shard_id(record_entities[i]))
+                 end
+               end
+             end
+           else
+             if !@dirty_data_continue
+               @logger.error "Dirty data found, exit process now."
+               puts "Dirty data found, exit process now."
+               raise "try to exit!"
+             else
+               # No retry / retries exhausted: write failed records to the dirty data file
+               for i in 0...put_result.failed_record_count
+                 record_entity = put_result.failed_record_list[i]
+                 @logger.error "Put record: " + record_entity.get_columns_map.to_s + " failed, " + put_result.failed_record_error[i].to_s
+                 write_as_dirty_data(record_entity.get_columns_map)
+               end
+               break
+             end
+           end
+         else
+           @logger.info "Put data to datahub success, total " + record_entities.size.to_s
+           break
+         end
+       end
+     end
+
+     # Convert a record to a RecordEntity
+     def record_to_entity(entity, record)
+       schema = entity.get_schema
+       @column_names.each do |column|
+         begin
+           source_key = @target_source_column_map[column]
+           if record.has_key?(source_key)
+             field = schema.get_field(column)
+             if field == nil
+               raise "Unknown column name of data"
+             else
+               field_type = field.get_type
+               if field_type == "BIGINT"
+                 entity.setBigInt(column, record[source_key])
+               elsif field_type == "DOUBLE"
+                 entity.setDouble(column, record[source_key])
+               elsif field_type == "BOOLEAN"
+                 entity.setBoolean(column, record[source_key])
+               elsif field_type == "STRING"
+                 entity.setString(column, record[source_key])
+               elsif field_type == "TIMESTAMP"
+                 entity.setTimeStamp(column, record[source_key])
+               else
+                 raise "Unknown schema type of data"
+               end
+             end
+           end
+         rescue => e
+           @logger.error "Parse data: " + column + "[" + record[source_key].to_s + "] failed, " + e.message
+           if !@dirty_data_continue
+             @logger.error "Dirty data found, exit process now."
+             puts "Dirty data found, exit process now."
+             raise "try to exit!"
+           else
+             # Skipped bad data goes straight to the dirty data file
+             write_as_dirty_data(record)
+           end
+           return false
+         end
+       end
+       return true
+     end
+
+
+     # Dirty data file handling
+     def write_as_dirty_data(record)
+       dirty_file_part1_name = @dirty_data_file + ".part1"
+       dirty_file_part2_name = @dirty_data_file + ".part2"
+
+       # Write under the file lock
+       @@file_lock.synchronize {
+         dirty_file_part2 = File.open(dirty_file_part2_name, "a+")
+         dirty_file_part2.puts(record.to_json)
+         dirty_file_part2.close
+         if File.size(dirty_file_part2_name) > @dirty_data_file_max_size / 2
+           # .part1 and .part2 hold the data separately:
+           # older data rolls into part1, new data goes to part2
+           FileUtils.mv(dirty_file_part2_name, dirty_file_part1_name)
+         end
+       }
+     end
+
+     # Pick the shard_id to write to
+     def get_shard_id(record)
+       if @shard_id != nil and !@shard_id.empty?
+         return @shard_id
+       elsif @shard_keys != nil and @shard_keys.size > 0
+         # Hash dispatch: route by the hash of the shard key values
+         hash_string = ""
+         for i in 0...@shard_keys.size
+           shard_key = @shard_keys[i]
+           source_key = @target_source_column_map[shard_key]
+           if record[source_key] != nil
+             hash_string += record[source_key].to_s + ","
+           end
+         end
+         hashed_value = hash_code(hash_string)
+         # Ruby's % is non-negative for a positive divisor, so this is a
+         # valid index even when hashed_value is negative
+         index = hashed_value % @shard_count
+         return @shards[index].shard_id
+       else
+         # Round-robin dispatch
+         idx = @shard_cursor % @shard_count
+         @shard_cursor = idx + 1
+         shard_id = @shards[idx].shard_id
+         return shard_id
+       end
+     end
+
+     # Produce the same hashcode as Java's String.hashCode,
+     # wrapping to signed 32 bits at each step
+     def hash_code(str)
+       str.each_char.reduce(0) do |result, char|
+         [((result << 5) - result) + char.ord].pack('L').unpack('l').first
+       end
+     end
+
+     # Fetch the shards in ACTIVE state
+     def get_active_shard
+       all_shards = @datahub_topic.list_shards
+       active_shards = []
+       all_shards.each do |shard|
+         if shard.state == "ACTIVE"
+           active_shards.push(shard)
+         end
+       end
+
+       return active_shards
+     end
+
+   end
+ end
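
The Java-compatible hash above can be checked in isolation; this standalone sketch copies hash_code verbatim (96354 is what Java's "abc".hashCode() returns):

    def hash_code(str)
      str.each_char.reduce(0) do |result, char|
        [((result << 5) - result) + char.ord].pack('L').unpack('l').first
      end
    end

    puts hash_code("abc")      # => 96354, matching Java
    puts hash_code("abc") % 4  # => 2, a valid shard index; Ruby's % never goes negative here
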