logstash-input-kafka 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/.gitignore +3 -0
- data/Gemfile +3 -0
- data/LICENSE +13 -0
- data/README.md +40 -0
- data/Rakefile +6 -0
- data/lib/logstash/inputs/kafka.rb +153 -0
- data/logstash-input-kafka.gemspec +32 -0
- data/rakelib/publish.rake +9 -0
- data/rakelib/vendor.rake +169 -0
- data/spec/inputs/kafka.rb +57 -0
- metadata +107 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
+---
+!binary "U0hBMQ==":
+  metadata.gz: !binary |-
+    N2M1OTFjOGRmNTgyNWEyNjg3NTM0NDhkM2Y5NzNhNDUxYmQ5NjY1OQ==
+  data.tar.gz: !binary |-
+    NGYzOTRhZmYyMDExYzc1MTAxMGI1ODM3ODc4OWYwZGUyODI5NGY0Yw==
+SHA512:
+  metadata.gz: !binary |-
+    OGJhZTZlYWZmOTc1OWYxYzg1YWVjNzYzMzJjMzQ0MTgyODczYjlhODUyZDhl
+    Zjc2YWY5NmY4Y2NmNmM5MDJjNTI2ZmU0ZGE2MTIwMGNhZTk1MmM4NGMwZTY4
+    MzQ4MjE2N2Q4NWExYmIxODY3MzE4ZDk4YjdkNDU5ZGY0MTU2NWI=
+  data.tar.gz: !binary |-
+    YzY1Mjg1NjM2MmUwMTc5MzNlNWQyODRiNmQzMTEyYTQ2MDU2N2ZlZmZmZTMx
+    ZGRiNjRlNmRjZWU3MjcxZmE3NDFiMzAzOTUxOTNkNTQyYTljNjNhZTFhNjY4
+    ZjhiNzVlMmIzZmNmMzZmNzQxZTY5ZTY3MjY3OTFjZmY3NWU0ODM=
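The keys above are base64-encoded algorithm names (U0hBMQ== decodes to SHA1) and the values are base64-wrapped hex digests of the gem's metadata.gz and data.tar.gz members. As a rough verification sketch, not part of the package, and assuming the two members have already been extracted from the .gem archive into the current directory:

    # Sketch only: compare the recorded SHA512 digests against extracted gem members.
    require 'digest'
    require 'yaml'

    checksums = YAML.load_file('checksums.yaml')   # Psych decodes the !binary values into plain digest strings
    %w[metadata.gz data.tar.gz].each do |member|
      expected = checksums['SHA512'][member]       # hex digest recorded when the gem was built
      actual   = Digest::SHA512.file(member).hexdigest
      puts "#{member}: #{actual == expected ? 'ok' : 'MISMATCH'}"
    end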
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,13 @@
+Copyright (c) 2012-2014 Elasticsearch <http://www.elasticsearch.org>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
data/README.md
ADDED
@@ -0,0 +1,40 @@
+logstash-input-kafka
+====================
+
+Apache Kafka input for Logstash. This input will consume messages from a Kafka topic using the high level consumer API exposed by Kafka.
+
+For more information about Kafka, refer to this [documentation](http://kafka.apache.org/documentation.html).
+
+Information about the high level consumer API can be found [here](http://kafka.apache.org/documentation.html#highlevelconsumerapi).
+
+Logstash Configuration
+====================
+
+See http://kafka.apache.org/documentation.html#consumerconfigs for details about the Kafka consumer options.
+
+    input {
+        kafka {
+            topic_id => ... # string (required), The topic to consume messages from
+            zk_connect => ... # string (optional), default: "localhost:2181", Specifies the ZooKeeper connection string in the form hostname:port
+            group_id => ... # string (optional), default: "logstash", A string that uniquely identifies the group of consumer processes
+            reset_beginning => ... # boolean (optional), default: false, Specify whether to jump to beginning of the queue when there is no initial offset in ZK
+            consumer_threads => ... # number (optional), default: 1, Number of threads to read from the partitions
+            queue_size => ... # number (optional), default: 20, Internal Logstash queue size used to hold events in memory
+            rebalance_max_retries => ... # number (optional), default: 4
+            rebalance_backoff_ms => ... # number (optional), default: 2000
+            consumer_timeout_ms => ... # number (optional), default: -1
+            consumer_restart_on_error => ... # boolean (optional), default: true
+            consumer_restart_sleep_ms => ... # number (optional), default: 0
+            decorate_events => ... # boolean (optional), default: false, Option to add Kafka metadata like topic, message size to the event
+            consumer_id => ... # string (optional), default: nil
+            fetch_message_max_bytes => ... # number (optional), default: 1048576
+        }
+    }
+
+The default codec is json.
+
+Dependencies
+====================
+
+* Apache Kafka version 0.8.1.1
+* jruby-kafka library
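For illustration only, a filled-in configuration might look like the following; the ZooKeeper hosts, chroot path and topic name are hypothetical placeholders, not values shipped with the plugin:

    input {
        kafka {
            zk_connect       => "zk1:2181,zk2:2181,zk3:2181/kafka"
            topic_id         => "web_logs"
            group_id         => "logstash"
            consumer_threads => 2
            decorate_events  => true
        }
    }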
data/Rakefile
ADDED
data/lib/logstash/inputs/kafka.rb
ADDED
@@ -0,0 +1,153 @@
+require 'logstash/namespace'
+require 'logstash/inputs/base'
+require 'logstash-input-kafka_jars'
+
+# This input will read events from a Kafka topic. It uses the high level consumer API provided
+# by Kafka to read messages from the broker. It also maintains the state of what has been
+# consumed using Zookeeper. The default input codec is json
+#
+# The only required configuration is the topic name. By default it will connect to a Zookeeper
+# running on localhost. All the broker information is read from Zookeeper state
+#
+# Ideally you should have as many threads as the number of partitions for a perfect balance --
+# more threads than partitions means that some threads will be idle
+#
+# For more information see http://kafka.apache.org/documentation.html#theconsumer
+#
+# Kafka consumer configuration: http://kafka.apache.org/documentation.html#consumerconfigs
+#
+class LogStash::Inputs::Kafka < LogStash::Inputs::Base
+  config_name 'kafka'
+  milestone 1
+
+  default :codec, 'json'
+
+  # Specifies the ZooKeeper connection string in the form hostname:port where host and port are
+  # the host and port of a ZooKeeper server. You can also specify multiple hosts in the form
+  # hostname1:port1,hostname2:port2,hostname3:port3.
+  #
+  # The server may also have a ZooKeeper chroot path as part of its ZooKeeper connection string
+  # which puts its data under some path in the global ZooKeeper namespace. If so the consumer
+  # should use the same chroot path in its connection string. For example to give a chroot path of
+  # /chroot/path you would give the connection string as
+  # hostname1:port1,hostname2:port2,hostname3:port3/chroot/path.
+  config :zk_connect, :validate => :string, :default => 'localhost:2181'
+  # A string that uniquely identifies the group of consumer processes to which this consumer
+  # belongs. By setting the same group id multiple processes indicate that they are all part of
+  # the same consumer group.
+  config :group_id, :validate => :string, :default => 'logstash'
+  # The topic to consume messages from
+  config :topic_id, :validate => :string, :required => true
+  # Specify whether to jump to beginning of the queue when there is no initial offset in
+  # ZooKeeper, or if an offset is out of range. If this is false, messages are consumed
+  # from the latest offset
+  #
+  # If reset_beginning is true, the consumer will check ZooKeeper to see if any other group members
+  # are present and active. If not, the consumer deletes any offset information in the ZooKeeper
+  # and starts at the smallest offset. If other group members are present reset_beginning will not
+  # work and the consumer threads will rejoin the consumer group.
+  config :reset_beginning, :validate => :boolean, :default => false
+  # Number of threads to read from the partitions. Ideally you should have as many threads as the
+  # number of partitions for a perfect balance. More threads than partitions means that some
+  # threads will be idle. Fewer threads means a single thread could be consuming from more than
+  # one partition
+  config :consumer_threads, :validate => :number, :default => 1
+  # Internal Logstash queue size used to hold events in memory after it has been read from Kafka
+  config :queue_size, :validate => :number, :default => 20
+  # When a new consumer joins a consumer group the set of consumers attempt to "rebalance" the
+  # load to assign partitions to each consumer. If the set of consumers changes while this
+  # assignment is taking place the rebalance will fail and retry. This setting controls the
+  # maximum number of attempts before giving up.
+  config :rebalance_max_retries, :validate => :number, :default => 4
+  # Backoff time between retries during rebalance.
+  config :rebalance_backoff_ms, :validate => :number, :default => 2000
+  # Throw a timeout exception to the consumer if no message is available for consumption after
+  # the specified interval
+  config :consumer_timeout_ms, :validate => :number, :default => -1
+  # Option to restart the consumer loop on error
+  config :consumer_restart_on_error, :validate => :boolean, :default => true
+  # Time in millis to wait for consumer to restart after an error
+  config :consumer_restart_sleep_ms, :validate => :number, :default => 0
+  # Option to add Kafka metadata like topic, message size to the event
+  config :decorate_events, :validate => :boolean, :default => false
+  # A unique id for the consumer; generated automatically if not set.
+  config :consumer_id, :validate => :string, :default => nil
+  # The number of bytes of messages to attempt to fetch for each topic-partition in each fetch
+  # request. These bytes will be read into memory for each partition, so this helps control
+  # the memory used by the consumer. The fetch request size must be at least as large as the
+  # maximum message size the server allows or else it is possible for the producer to send
+  # messages larger than the consumer can fetch.
+  config :fetch_message_max_bytes, :validate => :number, :default => 1048576
+
+  public
+  def register
+    require 'jruby-kafka'
+    options = {
+      :zk_connect => @zk_connect,
+      :group_id => @group_id,
+      :topic_id => @topic_id,
+      :rebalance_max_retries => @rebalance_max_retries,
+      :rebalance_backoff_ms => @rebalance_backoff_ms,
+      :consumer_timeout_ms => @consumer_timeout_ms,
+      :consumer_restart_on_error => @consumer_restart_on_error,
+      :consumer_restart_sleep_ms => @consumer_restart_sleep_ms,
+      :consumer_id => @consumer_id,
+      :fetch_message_max_bytes => @fetch_message_max_bytes
+    }
+    if @reset_beginning
+      options[:reset_beginning] = 'from-beginning'
+    end # if :reset_beginning
+    @kafka_client_queue = SizedQueue.new(@queue_size)
+    @consumer_group = Kafka::Group.new(options)
+    @logger.info('Registering kafka', :group_id => @group_id, :topic_id => @topic_id, :zk_connect => @zk_connect)
+  end # def register
+
+  public
+  def run(logstash_queue)
+    # noinspection JRubyStringImportInspection
+    java_import 'kafka.common.ConsumerRebalanceFailedException'
+    @logger.info('Running kafka', :group_id => @group_id, :topic_id => @topic_id, :zk_connect => @zk_connect)
+    begin
+      @consumer_group.run(@consumer_threads, @kafka_client_queue)
+      begin
+        while true
+          event = @kafka_client_queue.pop
+          queue_event("#{event}", logstash_queue)
+        end
+      rescue LogStash::ShutdownSignal
+        @logger.info('Kafka got shutdown signal')
+        @consumer_group.shutdown
+      end
+      until @kafka_client_queue.empty?
+        queue_event("#{@kafka_client_queue.pop}", logstash_queue)
+      end
+      @logger.info('Done running kafka input')
+    rescue => e
+      @logger.warn('kafka client threw exception, restarting',
+                   :exception => e)
+      if @consumer_group.running?
+        @consumer_group.shutdown
+      end
+      sleep(Float(@consumer_restart_sleep_ms) * 1 / 1000)
+      retry
+    end
+    finished
+  end # def run
+
+  private
+  def queue_event(msg, output_queue)
+    begin
+      @codec.decode(msg) do |event|
+        decorate(event)
+        if @decorate_events
+          event['kafka'] = {:msg_size => msg.bytesize, :topic => @topic_id, :consumer_group => @group_id}
+        end
+        output_queue << event
+      end # @codec.decode
+    rescue => e # parse or event creation error
+      @logger.error('Failed to create event', :message => msg, :exception => e,
+                    :backtrace => e.backtrace)
+    end # begin
+  end # def queue_event
+
+end #class LogStash::Inputs::Kafka
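To make the behaviour of queue_event concrete: with the default json codec and decorate_events => true, a Kafka message whose payload is {"msg":"hello"} would be decoded into a Logstash event whose fields look roughly like the sketch below. The topic, consumer group and timestamp values are illustrative, taken from the configuration and the clock, not fixed by the plugin:

    {
        "msg"        => "hello",
        "@version"   => "1",
        "@timestamp" => "2014-11-05T00:00:00.000Z",
        "kafka"      => { :msg_size => 15, :topic => "web_logs", :consumer_group => "logstash" }
    }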
data/logstash-input-kafka.gemspec
ADDED
@@ -0,0 +1,32 @@
+Gem::Specification.new do |s|
+
+  s.name            = 'logstash-input-kafka'
+  s.version         = '0.1.0'
+  s.licenses        = ['Apache License (2.0)']
+  s.summary         = 'This input will read events from a Kafka topic. It uses the high level consumer API provided by Kafka to read messages from the broker'
+  s.description     = 'This input will read events from a Kafka topic. It uses the high level consumer API provided by Kafka to read messages from the broker'
+  s.authors         = ['Elasticsearch']
+  s.email           = 'richard.pijnenburg@elasticsearch.com'
+  s.homepage        = 'http://logstash.net/'
+  s.require_paths   = ['lib']
+
+  # Files
+  s.files = `git ls-files`.split($\)
+
+  # Tests
+  s.test_files = s.files.grep(%r{^(test|spec|features)/})
+
+  # Special flag to let us know this is actually a logstash plugin
+  s.metadata = { 'logstash_plugin' => 'true', 'group' => 'input' }
+
+  # Jar dependencies
+  s.requirements << "jar 'org.apache.kafka:kafka_2.10', '0.8.1.1'"
+
+  # Gem dependencies
+  s.add_runtime_dependency 'logstash', '>= 1.4.0', '< 2.0.0'
+  s.add_runtime_dependency 'jar-dependencies', ['~> 0.1.0']
+
+  s.add_runtime_dependency 'jruby-kafka', ['>= 0.2.1']
+
+end
+
data/rakelib/publish.rake
ADDED
@@ -0,0 +1,9 @@
+require "gem_publisher"
+
+desc "Publish gem to RubyGems.org"
+task :publish_gem do |t|
+  gem_file = Dir.glob(File.expand_path('../*.gemspec', File.dirname(__FILE__))).first
+  gem = GemPublisher.publish_if_updated(gem_file, :rubygems)
+  puts "Published #{gem}" if gem
+end
+
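With RubyGems credentials configured, the task above would typically be invoked from the plugin root as shown below (usage note only, not part of the package):

    rake publish_gem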
data/rakelib/vendor.rake
ADDED
@@ -0,0 +1,169 @@
+require "net/http"
+require "uri"
+require "digest/sha1"
+
+def vendor(*args)
+  return File.join("vendor", *args)
+end
+
+directory "vendor/" => ["vendor"] do |task, args|
+  mkdir task.name
+end
+
+def fetch(url, sha1, output)
+
+  puts "Downloading #{url}"
+  actual_sha1 = download(url, output)
+
+  if actual_sha1 != sha1
+    fail "SHA1 does not match (expected '#{sha1}' but got '#{actual_sha1}')"
+  end
+end # def fetch
+
+def file_fetch(url, sha1)
+  filename = File.basename(URI(url).path)
+  output = "vendor/#{filename}"
+  task output => [ "vendor/" ] do
+    begin
+      actual_sha1 = file_sha1(output)
+      if actual_sha1 != sha1
+        fetch(url, sha1, output)
+      end
+    rescue Errno::ENOENT
+      fetch(url, sha1, output)
+    end
+  end.invoke
+
+  return output
+end
+
+def file_sha1(path)
+  digest = Digest::SHA1.new
+  fd = File.new(path, "r")
+  while true
+    begin
+      digest << fd.sysread(16384)
+    rescue EOFError
+      break
+    end
+  end
+  return digest.hexdigest
+ensure
+  fd.close if fd
+end
+
+def download(url, output)
+  uri = URI(url)
+  digest = Digest::SHA1.new
+  tmp = "#{output}.tmp"
+  Net::HTTP.start(uri.host, uri.port, :use_ssl => (uri.scheme == "https")) do |http|
+    request = Net::HTTP::Get.new(uri.path)
+    http.request(request) do |response|
+      fail "HTTP fetch failed for #{url}. #{response}" unless ["200", "301"].include?(response.code)
+      size = (response["content-length"].to_i || -1).to_f
+      count = 0
+      File.open(tmp, "w") do |fd|
+        response.read_body do |chunk|
+          fd.write(chunk)
+          digest << chunk
+          if size > 0 && $stdout.tty?
+            count += chunk.bytesize
+            $stdout.write(sprintf("\r%0.2f%%", count / size * 100))
+          end
+        end
+      end
+      $stdout.write("\r \r") if $stdout.tty?
+    end
+  end
+
+  File.rename(tmp, output)
+
+  return digest.hexdigest
+rescue SocketError => e
+  puts "Failure while downloading #{url}: #{e}"
+  raise
+ensure
+  File.unlink(tmp) if File.exist?(tmp)
+end # def download
+
+def untar(tarball, &block)
+  require "archive/tar/minitar"
+  tgz = Zlib::GzipReader.new(File.open(tarball))
+  # Pull out typesdb
+  tar = Archive::Tar::Minitar::Input.open(tgz)
+  tar.each do |entry|
+    path = block.call(entry)
+    next if path.nil?
+    parent = File.dirname(path)
+
+    mkdir_p parent unless File.directory?(parent)
+
+    # Skip this file if the output file is the same size
+    if entry.directory?
+      mkdir path unless File.directory?(path)
+    else
+      entry_mode = entry.instance_eval { @mode } & 0777
+      if File.exists?(path)
+        stat = File.stat(path)
+        # TODO(sissel): Submit a patch to archive-tar-minitar upstream to
+        # expose headers in the entry.
+        entry_size = entry.instance_eval { @size }
+        # If file sizes are same, skip writing.
+        next if stat.size == entry_size && (stat.mode & 0777) == entry_mode
+      end
+      puts "Extracting #{entry.full_name} from #{tarball} #{entry_mode.to_s(8)}"
+      File.open(path, "w") do |fd|
+        # eof? check lets us skip empty files. Necessary because the API provided by
+        # Archive::Tar::Minitar::Reader::EntryStream only mostly acts like an
+        # IO object. Something about empty files in this EntryStream causes
+        # IO.copy_stream to throw "can't convert nil into String" on JRuby
+        # TODO(sissel): File a bug about this.
+        while !entry.eof?
+          chunk = entry.read(16384)
+          fd.write(chunk)
+        end
+        #IO.copy_stream(entry, fd)
+      end
+      File.chmod(entry_mode, path)
+    end
+  end
+  tar.close
+  File.unlink(tarball) if File.file?(tarball)
+end # def untar
+
+def ungz(file)
+
+  outpath = file.gsub('.gz', '')
+  tgz = Zlib::GzipReader.new(File.open(file))
+  begin
+    File.open(outpath, "w") do |out|
+      IO::copy_stream(tgz, out)
+    end
+    File.unlink(file)
+  rescue
+    File.unlink(outpath) if File.file?(outpath)
+    raise
+  end
+  tgz.close
+end
+
+desc "Process any vendor files required for this plugin"
+task "vendor" do |task, args|
+
+  @files.each do |file|
+    download = file_fetch(file['url'], file['sha1'])
+    if download =~ /.tar.gz/
+      prefix = download.gsub('.tar.gz', '').gsub('vendor/', '')
+      untar(download) do |entry|
+        if !file['files'].nil?
+          next unless file['files'].include?(entry.full_name.gsub(prefix, ''))
+          out = entry.full_name.split("/").last
+        end
+        File.join('vendor', out)
+      end
+    elsif download =~ /.gz/
+      ungz(download)
+    end
+  end
+
+end
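The vendor task above iterates over an @files list that is not defined in this file; it is expected to be provided elsewhere (for example by the Rakefile, whose contents are not shown in this diff). A hypothetical entry matching the keys the task reads (url, sha1 and an optional files whitelist) might look like this; the URL, SHA1 and file list are placeholders, not values from this package:

    # Hypothetical @files definition consumed by the "vendor" task above.
    @files = [
      {
        'url'   => 'https://example.org/dist/some-dependency-1.0.tar.gz',
        'sha1'  => '0000000000000000000000000000000000000000',
        'files' => ['/lib/some-dependency.jar']  # matched against tarball paths with the top-level prefix stripped
      }
    ]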
data/spec/inputs/kafka.rb
ADDED
@@ -0,0 +1,57 @@
+# encoding: utf-8
+
+require 'rspec'
+require 'insist'
+require 'logstash/namespace'
+require 'logstash/inputs/kafka'
+require 'logstash/errors'
+
+describe LogStash::Inputs::Kafka do
+  extend LogStash::RSpec
+
+  let (:kafka_config) {{:topic_id => 'test'}}
+
+  it 'should populate kafka config with default values' do
+    kafka = LogStash::Inputs::Kafka.new(kafka_config)
+    insist { kafka.zk_connect } == 'localhost:2181'
+    insist { kafka.topic_id } == 'test'
+    insist { kafka.group_id } == 'logstash'
+    !insist { kafka.reset_beginning }
+  end
+
+  it 'should register and load kafka jars without errors' do
+    kafka = LogStash::Inputs::Kafka.new(kafka_config)
+    kafka.register
+  end
+
+  it 'should retrieve event from kafka' do
+    # Extend class to control behavior
+    class LogStash::Inputs::TestKafka < LogStash::Inputs::Kafka
+      milestone 1
+      private
+      def queue_event(msg, output_queue)
+        super(msg, output_queue)
+        # need to raise exception here to stop the infinite loop
+        raise LogStash::ShutdownSignal
+      end
+    end
+
+    kafka = LogStash::Inputs::TestKafka.new(kafka_config)
+    kafka.register
+
+    class Kafka::Group
+      public
+      def run(a_num_threads, a_queue)
+        a_queue << 'Kafka message'
+      end
+    end
+
+    logstash_queue = Queue.new
+    kafka.run logstash_queue
+    e = logstash_queue.pop
+    insist { e['message'] } == 'Kafka message'
+    # no metadata by default
+    insist { e['kafka'] } == nil
+  end
+
+end
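A hedged usage note, not part of the package: assuming JRuby with the rspec, insist, logstash and jruby-kafka gems available (the gemspec declares no development dependencies), the suite would typically be run from the plugin root as:

    bundle exec rspec spec/inputs/kafka.rb

The second and third examples call register, so the Kafka jars referenced by the gemspec's jar requirement must be resolvable on the load path.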
metadata
ADDED
@@ -0,0 +1,107 @@
+--- !ruby/object:Gem::Specification
+name: logstash-input-kafka
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Elasticsearch
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2014-11-05 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: logstash
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 1.4.0
+    - - <
+      - !ruby/object:Gem::Version
+        version: 2.0.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 1.4.0
+    - - <
+      - !ruby/object:Gem::Version
+        version: 2.0.0
+- !ruby/object:Gem::Dependency
+  name: jar-dependencies
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.1.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.1.0
+- !ruby/object:Gem::Dependency
+  name: jruby-kafka
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 0.2.1
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 0.2.1
+description: This input will read events from a Kafka topic. It uses the high level
+  consumer API provided by Kafka to read messages from the broker
+email: richard.pijnenburg@elasticsearch.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- Gemfile
+- LICENSE
+- README.md
+- Rakefile
+- lib/logstash/inputs/kafka.rb
+- logstash-input-kafka.gemspec
+- rakelib/publish.rake
+- rakelib/vendor.rake
+- spec/inputs/kafka.rb
+homepage: http://logstash.net/
+licenses:
+- Apache License (2.0)
+metadata:
+  logstash_plugin: 'true'
+  group: input
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements:
+- jar 'org.apache.kafka:kafka_2.10', '0.8.1.1'
+rubyforge_project:
+rubygems_version: 2.4.1
+signing_key:
+specification_version: 4
+summary: This input will read events from a Kafka topic. It uses the high level consumer
+  API provided by Kafka to read messages from the broker
+test_files:
+- spec/inputs/kafka.rb