logstash-kafka 0.7.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 00adb861cf25fabd675f94d437b0e4dc36a7fbad
+   data.tar.gz: ce5d97a0e3b9081bb60f595364da29125e2b6a5d
+ SHA512:
+   metadata.gz: 1b0675791f6af79c8cc64371787cb322ba108bdc2d431cf55bf49a543b92a8d4cab09efeb593803b50b05d7dd88f6f3404d76f3207de665c816d5b061183abd2
+   data.tar.gz: 919df956e078237ee9f01de276227b5f61ab454faf830e794962c76fc2132ce0cd928137aeaa648ef36e4fc8e2e9b66a3e893101cde6f150595ac017d401edec
lib/logstash/inputs/kafka.rb ADDED
@@ -0,0 +1,152 @@
+ require 'logstash/namespace'
+ require 'logstash/inputs/base'
+ require 'jruby-kafka'
+
+ # This input will read events from a Kafka topic. It uses the high-level consumer API provided
+ # by Kafka to read messages from the broker. It also maintains the state of what has been
+ # consumed using ZooKeeper. The default input codec is json.
+ #
+ # The only required configuration is the topic name. By default it will connect to a ZooKeeper
+ # running on localhost. All the broker information is read from ZooKeeper state.
+ #
+ # Ideally you should have as many threads as the number of partitions for a perfect balance --
+ # more threads than partitions means that some threads will be idle.
+ #
+ # For more information see http://kafka.apache.org/documentation.html#theconsumer
+ #
+ # Kafka consumer configuration: http://kafka.apache.org/documentation.html#consumerconfigs
+ #
+ class LogStash::Inputs::Kafka < LogStash::Inputs::Base
+   config_name 'kafka'
+   milestone 1
+
+   default :codec, 'json'
+
+   # Specifies the ZooKeeper connection string in the form hostname:port, where host and port are
+   # the host and port of a ZooKeeper server. You can also specify multiple hosts in the form
+   # hostname1:port1,hostname2:port2,hostname3:port3.
+   #
+   # The server may also have a ZooKeeper chroot path as part of its ZooKeeper connection string,
+   # which puts its data under some path in the global ZooKeeper namespace. If so, the consumer
+   # should use the same chroot path in its connection string. For example, to give a chroot path
+   # of /chroot/path you would give the connection string as
+   # hostname1:port1,hostname2:port2,hostname3:port3/chroot/path.
+   config :zk_connect, :validate => :string, :default => 'localhost:2181'
+   # A string that uniquely identifies the group of consumer processes to which this consumer
+   # belongs. By setting the same group id, multiple processes indicate that they are all part of
+   # the same consumer group.
+   config :group_id, :validate => :string, :default => 'logstash'
+   # The topic to consume messages from
+   config :topic_id, :validate => :string, :required => true
+   # Specify whether to jump to the beginning of the queue when there is no initial offset in
+   # ZooKeeper, or if an offset is out of range. If this is false, messages are consumed
+   # from the latest offset.
+   #
+   # If reset_beginning is true, the consumer will check ZooKeeper to see if any other group members
+   # are present and active. If not, the consumer deletes any offset information in ZooKeeper
+   # and starts at the smallest offset. If other group members are present, reset_beginning will not
+   # work and the consumer threads will rejoin the consumer group.
+   config :reset_beginning, :validate => :boolean, :default => false
+   # Number of threads to read from the partitions. Ideally you should have as many threads as the
+   # number of partitions for a perfect balance. More threads than partitions means that some
+   # threads will be idle. Fewer threads means a single thread could be consuming from more than
+   # one partition.
+   config :consumer_threads, :validate => :number, :default => 1
+   # Internal Logstash queue size used to hold events in memory after they have been read from Kafka
+   config :queue_size, :validate => :number, :default => 20
+   # When a new consumer joins a consumer group, the set of consumers attempts to "rebalance" the
+   # load to assign partitions to each consumer. If the set of consumers changes while this
+   # assignment is taking place, the rebalance will fail and retry. This setting controls the
+   # maximum number of attempts before giving up.
+   config :rebalance_max_retries, :validate => :number, :default => 4
+   # Backoff time between retries during rebalance.
+   config :rebalance_backoff_ms, :validate => :number, :default => 2000
+   # Throw a timeout exception to the consumer if no message is available for consumption after
+   # the specified interval.
+   config :consumer_timeout_ms, :validate => :number, :default => -1
+   # Option to restart the consumer loop on error
+   config :consumer_restart_on_error, :validate => :boolean, :default => true
+   # Time in millis to wait for the consumer to restart after an error
+   config :consumer_restart_sleep_ms, :validate => :number, :default => 0
+   # Option to add Kafka metadata like topic and message size to the event
+   config :decorate_events, :validate => :boolean, :default => false
+   # A unique id for the consumer; generated automatically if not set.
+   config :consumer_id, :validate => :string, :default => nil
+   # The number of bytes of messages to attempt to fetch for each topic-partition in each fetch
+   # request. These bytes will be read into memory for each partition, so this helps control
+   # the memory used by the consumer. The fetch request size must be at least as large as the
+   # maximum message size the server allows, or else it is possible for the producer to send
+   # messages larger than the consumer can fetch.
+   config :fetch_message_max_bytes, :validate => :number, :default => 1048576
+
+   public
+   def register
+     LogStash::Logger.setup_log4j(@logger)
+     options = {
+       :zk_connect => @zk_connect,
+       :group_id => @group_id,
+       :topic_id => @topic_id,
+       :rebalance_max_retries => @rebalance_max_retries,
+       :rebalance_backoff_ms => @rebalance_backoff_ms,
+       :consumer_timeout_ms => @consumer_timeout_ms,
+       :consumer_restart_on_error => @consumer_restart_on_error,
+       :consumer_restart_sleep_ms => @consumer_restart_sleep_ms,
+       :consumer_id => @consumer_id,
+       :fetch_message_max_bytes => @fetch_message_max_bytes
+     }
+     if @reset_beginning
+       options[:reset_beginning] = 'from-beginning'
+     end # if :reset_beginning
+     @kafka_client_queue = SizedQueue.new(@queue_size)
+     @consumer_group = Kafka::Group.new(options)
+     @logger.info('Registering kafka', :group_id => @group_id, :topic_id => @topic_id, :zk_connect => @zk_connect)
+   end # def register
+
+   public
+   def run(logstash_queue)
+     java_import 'kafka.common.ConsumerRebalanceFailedException'
+     @logger.info('Running kafka', :group_id => @group_id, :topic_id => @topic_id, :zk_connect => @zk_connect)
+     begin
+       @consumer_group.run(@consumer_threads, @kafka_client_queue)
+       begin
+         while true
+           event = @kafka_client_queue.pop
+           queue_event("#{event}", logstash_queue)
+         end
+       rescue LogStash::ShutdownSignal
+         @logger.info('Kafka got shutdown signal')
+         @consumer_group.shutdown
+       end
+       until @kafka_client_queue.empty?
+         queue_event("#{@kafka_client_queue.pop}", logstash_queue)
+       end
+       @logger.info('Done running kafka input')
+     rescue => e
+       @logger.warn('kafka client threw exception, restarting',
+                    :exception => e)
+       if @consumer_group.running?
+         @consumer_group.shutdown
+       end
+       sleep(Float(@consumer_restart_sleep_ms) * 1 / 1000)
+       retry
+     end
+     finished
+   end # def run
+
+   private
+   def queue_event(msg, output_queue)
+     begin
+       @codec.decode(msg) do |event|
+         decorate(event)
+         if @decorate_events
+           event['kafka'] = {'msg_size' => msg.bytesize, 'topic' => @topic_id, 'consumer_group' => @group_id}
+         end
+         output_queue << event
+       end # @codec.decode
+     rescue => e # parse or event creation error
+       @logger.error('Failed to create event', :message => msg, :exception => e,
+                     :backtrace => e.backtrace)
+     end # begin
+   end # def queue_event
+
+ end #class LogStash::Inputs::Kafka
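A minimal pipeline configuration for this input might look like the sketch below; it is not part of the released files. The ZooKeeper address, topic name, and consumer group are placeholders, every option shown maps to a config declared above, and anything omitted falls back to its documented default (for example, codec => json).

    input {
      kafka {
        zk_connect => "localhost:2181"    # ZooKeeper connection string
        topic_id => "logs"                # required: topic to consume from
        group_id => "logstash"            # consumer group shared by all Logstash consumers
        consumer_threads => 1             # ideally one thread per partition
        decorate_events => true           # add topic/consumer_group/msg_size metadata to events
      }
    }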
lib/logstash/outputs/kafka.rb ADDED
@@ -0,0 +1,159 @@
+ require 'logstash/namespace'
+ require 'logstash/outputs/base'
+ require 'jruby-kafka'
+
+ # Write events to a Kafka topic. This uses the Kafka Producer API to write messages to a topic on
+ # the broker.
+ #
+ # The only required configuration is the topic name. The default codec is json,
+ # so events will be persisted on the broker in json format. If you select a codec of plain,
+ # Logstash will encode your messages with a timestamp and hostname in addition to the message
+ # itself. If you want nothing but your message to pass through, you should make the output
+ # configuration something like:
+ #   output {
+ #     kafka {
+ #       codec => plain {
+ #         format => "%{message}"
+ #       }
+ #     }
+ #   }
+ # For more information see http://kafka.apache.org/documentation.html#theproducer
+ #
+ # Kafka producer configuration: http://kafka.apache.org/documentation.html#producerconfigs
+ class LogStash::Outputs::Kafka < LogStash::Outputs::Base
+   config_name 'kafka'
+   milestone 1
+
+   default :codec, 'json'
+   # This is for bootstrapping and the producer will only use it for getting metadata (topics,
+   # partitions and replicas). The socket connections for sending the actual data will be
+   # established based on the broker information returned in the metadata. The format is
+   # host1:port1,host2:port2, and the list can be a subset of brokers or a VIP pointing to a
+   # subset of brokers.
+   config :broker_list, :validate => :string, :default => 'localhost:9092'
+   # The topic to produce the messages to
+   config :topic_id, :validate => :string, :required => true
+   # This parameter allows you to specify the compression codec for all data generated by this
+   # producer. Valid values are "none", "gzip" and "snappy".
+   config :compression_codec, :validate => %w( none gzip snappy ), :default => 'none'
+   # This parameter allows you to set whether compression should be turned on for particular
+   # topics. If the compression codec is anything other than NoCompressionCodec,
+   # enable compression only for the specified topics, if any. If the list of compressed topics is
+   # empty, then enable the specified compression codec for all topics. If the compression codec
+   # is NoCompressionCodec, compression is disabled for all topics.
+   config :compressed_topics, :validate => :string, :default => ''
+   # This value controls when a produce request is considered completed. Specifically,
+   # how many other brokers must have committed the data to their log and acknowledged this to the
+   # leader. For more info, see -- http://kafka.apache.org/documentation.html#producerconfigs
+   config :request_required_acks, :validate => [-1,0,1], :default => 0
+   # The serializer class for messages. The default encoder takes a byte[] and returns the same byte[]
+   config :serializer_class, :validate => :string, :default => 'kafka.serializer.StringEncoder'
+   # The partitioner class for partitioning messages amongst partitions in the topic. The default
+   # partitioner is based on the hash of the key. If the key is null,
+   # the message is sent to a random partition in the broker.
+   # NOTE: topic_metadata_refresh_interval_ms controls how long the producer will keep writing to
+   # the same partition in the topic. This defaults to 10 mins, so the producer will continue to
+   # write to a single partition for 10 mins before it switches.
+   config :partitioner_class, :validate => :string, :default => 'kafka.producer.DefaultPartitioner'
+   # The amount of time the broker will wait trying to meet the request.required.acks requirement
+   # before sending back an error to the client.
+   config :request_timeout_ms, :validate => :number, :default => 10000
+   # This parameter specifies whether the messages are sent asynchronously in a background thread.
+   # Valid values are (1) async for asynchronous send and (2) sync for synchronous send. By
+   # setting the producer to async we allow batching together of requests (which is great for
+   # throughput) but open the possibility of a failure of the client machine dropping unsent data.
+   config :producer_type, :validate => %w( sync async ), :default => 'sync'
+   # The serializer class for keys (defaults to the same as for messages if nothing is given)
+   config :key_serializer_class, :validate => :string, :default => nil
+   # This property will cause the producer to automatically retry a failed send request. This
+   # property specifies the number of retries when such failures occur. Note that setting a
+   # non-zero value here can lead to duplicates in the case of network errors that cause a message
+   # to be sent but the acknowledgement to be lost.
+   config :message_send_max_retries, :validate => :number, :default => 3
+   # Before each retry, the producer refreshes the metadata of relevant topics to see if a new
+   # leader has been elected. Since leader election takes a bit of time,
+   # this property specifies the amount of time that the producer waits before refreshing the
+   # metadata.
+   config :retry_backoff_ms, :validate => :number, :default => 100
+   # The producer generally refreshes the topic metadata from brokers when there is a failure
+   # (partition missing, leader not available...). It will also poll regularly (default: every
+   # 10min so 600000ms). If you set this to a negative value, metadata will only get refreshed on
+   # failure. If you set this to zero, the metadata will get refreshed after each message sent
+   # (not recommended). Important note: the refresh happens only AFTER the message is sent,
+   # so if the producer never sends a message the metadata is never refreshed.
+   config :topic_metadata_refresh_interval_ms, :validate => :number, :default => 600 * 1000
+   # Maximum time to buffer data when using async mode. For example, a setting of 100 will try to
+   # batch together 100ms of messages to send at once. This will improve throughput but adds
+   # message delivery latency due to the buffering.
+   config :queue_buffering_max_ms, :validate => :number, :default => 5000
+   # The maximum number of unsent messages that can be queued up by the producer when using async
+   # mode before either the producer must be blocked or data must be dropped.
+   config :queue_buffering_max_messages, :validate => :number, :default => 10000
+   # The amount of time to block before dropping messages when running in async mode and the
+   # buffer has reached queue.buffering.max.messages. If set to 0, events will be enqueued
+   # immediately or dropped if the queue is full (the producer send call will never block). If set
+   # to -1, the producer will block indefinitely and never willingly drop a send.
+   config :queue_enqueue_timeout_ms, :validate => :number, :default => -1
+   # The number of messages to send in one batch when using async mode. The producer will wait
+   # until either this number of messages is ready to send or queue.buffering.max.ms is reached.
+   config :batch_num_messages, :validate => :number, :default => 200
+   # Socket write buffer size
+   config :send_buffer_bytes, :validate => :number, :default => 100 * 1024
+   # The client id is a user-specified string sent in each request to help trace calls. It should
+   # logically identify the application making the request.
+   config :client_id, :validate => :string, :default => ''
+
+   public
+   def register
+     LogStash::Logger.setup_log4j(@logger)
+     options = {
+       :broker_list => @broker_list,
+       :compression_codec => @compression_codec,
+       :compressed_topics => @compressed_topics,
+       :request_required_acks => @request_required_acks,
+       :serializer_class => @serializer_class,
+       :partitioner_class => @partitioner_class,
+       :request_timeout_ms => @request_timeout_ms,
+       :producer_type => @producer_type,
+       :key_serializer_class => @key_serializer_class,
+       :message_send_max_retries => @message_send_max_retries,
+       :retry_backoff_ms => @retry_backoff_ms,
+       :topic_metadata_refresh_interval_ms => @topic_metadata_refresh_interval_ms,
+       :queue_buffering_max_ms => @queue_buffering_max_ms,
+       :queue_buffering_max_messages => @queue_buffering_max_messages,
+       :queue_enqueue_timeout_ms => @queue_enqueue_timeout_ms,
+       :batch_num_messages => @batch_num_messages,
+       :send_buffer_bytes => @send_buffer_bytes,
+       :client_id => @client_id
+     }
+     @producer = Kafka::Producer.new(options)
+     @producer.connect
+
+     @logger.info('Registering kafka producer', :topic_id => @topic_id, :broker_list => @broker_list)
+
+     @codec.on_event do |event|
+       begin
+         @producer.send_msg(@topic_id, nil, event)
+       rescue LogStash::ShutdownSignal
+         @logger.info('Kafka producer got shutdown signal')
+       rescue => e
+         @logger.warn('kafka producer threw exception, restarting',
+                      :exception => e)
+       end
+     end
+   end # def register
+
+   def receive(event)
+     return unless output?(event)
+     if event == LogStash::SHUTDOWN
+       finished
+       return
+     end
+     @codec.encode(event)
+   end
+
+   def teardown
+     @producer.close
+   end
+
+ end #class LogStash::Outputs::Kafka
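As a usage sketch (not part of the released files), a matching output configuration could look like the following. The broker list and topic name are placeholders, and the async settings simply illustrate the batching-related options documented above; every option shown corresponds to a config declared in this file.

    output {
      kafka {
        broker_list => "localhost:9092"   # bootstrap brokers, host1:port1,host2:port2
        topic_id => "logs"                # required: topic to produce to
        compression_codec => "snappy"     # "none", "gzip" or "snappy"
        producer_type => "async"          # batch requests in a background thread
        batch_num_messages => 200         # messages per async batch
        queue_buffering_max_ms => 5000    # max time to buffer in async mode
      }
    }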
metadata ADDED
@@ -0,0 +1,60 @@
+ --- !ruby/object:Gem::Specification
+ name: logstash-kafka
+ version: !ruby/object:Gem::Version
+   version: 0.7.0
+ platform: java
+ authors:
+ - Joseph Lawson
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2015-01-20 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: jruby-kafka
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 1.0.0.beta
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 1.0.0.beta
+ description: this is primarily to be used as an interface for logstash
+ email:
+ - joe@joekiller.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/logstash/inputs/kafka.rb
+ - lib/logstash/outputs/kafka.rb
+ homepage: https://github.com/joekiller/jruby-kafka
+ licenses:
+ - Apache 2.0
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.2.2
+ signing_key:
+ specification_version: 4
+ summary: jruby Kafka wrapper
+ test_files: []