logstash-output-webhdfs 3.1.0-java
- checksums.yaml +7 -0
- data/CHANGELOG.md +31 -0
- data/CONTRIBUTORS +18 -0
- data/Gemfile +11 -0
- data/LICENSE +202 -0
- data/NOTICE.TXT +5 -0
- data/README.md +98 -0
- data/docs/index.asciidoc +296 -0
- data/lib/logstash/outputs/webhdfs.rb +261 -0
- data/lib/logstash/outputs/webhdfs_helper.rb +119 -0
- data/lib/logstash-output-webhdfs_jars.rb +4 -0
- data/logstash-output-webhdfs.gemspec +32 -0
- data/spec/integration/webhdfs_spec.rb +130 -0
- data/spec/outputs/webhdfs_helper_spec.rb +37 -0
- data/spec/outputs/webhdfs_spec.rb +71 -0
- data/vendor/jar-dependencies/org/xerial/snappy/snappy-java/1.1.10.5/snappy-java-1.1.10.5.jar +0 -0
- metadata +141 -0
data/lib/logstash/outputs/webhdfs.rb
@@ -0,0 +1,261 @@
# encoding: utf-8
require "logstash/namespace"
require "logstash/outputs/base"
require "stud/buffer"
require "logstash/outputs/webhdfs_helper"

# This plugin sends Logstash events into files in HDFS via
# the https://hadoop.apache.org/docs/r1.0.4/webhdfs.html[webhdfs] REST API.
#
# ==== Dependencies
# This plugin has no dependency on jars from hadoop, thus reducing configuration and compatibility
# problems. It uses the webhdfs gem from Kazuki Ohta and TAGOMORI Satoshi (@see: https://github.com/kzk/webhdfs).
# zlib is an optional dependency, needed only if you use the gzip compression functionality.
#
# ==== Operational Notes
# If you get an error like:
#
#   Max write retries reached. Exception: initialize: name or service not known {:level=>:error}
#
# make sure that the hostname of your namenode is resolvable on the host running Logstash. When creating/appending
# to a file, webhdfs sometimes sends a `307 TEMPORARY_REDIRECT` with the `HOSTNAME` of the machine it is running on.
#
# ==== Usage
# This is an example of Logstash config:
#
# [source,ruby]
# ----------------------------------
# input {
#   ...
# }
# filter {
#   ...
# }
# output {
#   webhdfs {
#     host => "127.0.0.1"                                             # (required)
#     port => 50070                                                   # (optional, default: 50070)
#     path => "/user/logstash/dt=%{+YYYY-MM-dd}/logstash-%{+HH}.log"  # (required)
#     user => "hue"                                                   # (required)
#   }
# }
# ----------------------------------

class LogStash::Outputs::WebHdfs < LogStash::Outputs::Base

  include Stud::Buffer
  include LogStash::Outputs::WebHdfsHelper

  config_name "webhdfs"

  MAGIC = "\x82SNAPPY\x0".force_encoding Encoding::ASCII_8BIT
  DEFAULT_VERSION = 1
  MINIMUM_COMPATIBLE_VERSION = 1

  # The server name for webhdfs/httpfs connections.
  config :host, :validate => :string, :required => true

  # The server port for webhdfs/httpfs connections.
  config :port, :validate => :number, :default => 50070

  # Standby namenode for HA HDFS.
  config :standby_host, :validate => :string, :default => false

  # Standby namenode port for HA HDFS.
  config :standby_port, :validate => :number, :default => 50070

  # The username for webhdfs.
  config :user, :validate => :string, :required => true

  # The path to the file to write to. Event fields can be used here,
  # as well as date fields in the joda time format, e.g.:
  # `/user/logstash/dt=%{+YYYY-MM-dd}/%{@source_host}-%{+HH}.log`
  config :path, :validate => :string, :required => true

  # Send data to webhdfs at this interval, in seconds.
  config :idle_flush_time, :validate => :number, :default => 1

  # Send data to webhdfs once the event count exceeds this value, even if `idle_flush_time` has not been reached.
  config :flush_size, :validate => :number, :default => 500

  # WebHdfs open timeout, default 30s.
  config :open_timeout, :validate => :number, :default => 30

  # The WebHdfs read timeout, default 30s.
  config :read_timeout, :validate => :number, :default => 30

  # Use httpfs mode if set to true, else webhdfs.
  config :use_httpfs, :validate => :boolean, :default => false

  # Avoid appending to the same file from multiple threads.
  # This solves some problems with multiple logstash output threads and locked file leases in webhdfs.
  # If this option is set to true, %{[@metadata][thread_id]} needs to be used in the path config setting.
  config :single_file_per_thread, :validate => :boolean, :default => false

  # Retry some known webhdfs errors. These may be caused by race conditions when appending to the same file, etc.
  config :retry_known_errors, :validate => :boolean, :default => true

  # How long to wait between retries, in seconds.
  config :retry_interval, :validate => :number, :default => 0.5

  # How many times to retry. If retry_times is exceeded, an error will be logged and the event will be discarded.
  config :retry_times, :validate => :number, :default => 5

  # Compress output. One of ['none', 'snappy', 'gzip'].
  config :compression, :validate => ["none", "snappy", "gzip"], :default => "none"

  # Set snappy chunksize. Only necessary for stream format. Defaults to 32k. Max is 65536.
  # @see http://code.google.com/p/snappy/source/browse/trunk/framing_format.txt
  config :snappy_bufsize, :validate => :number, :default => 32768

  # Set snappy format. One of "stream", "file". Set to "stream" to be hive compatible.
  config :snappy_format, :validate => ["stream", "file"], :default => "stream"

  # Set kerberos authentication.
  config :use_kerberos_auth, :validate => :boolean, :default => false

  # Set kerberos keytab file. Note that the gssapi library needs to be available to use this.
  config :kerberos_keytab, :validate => :string

  # Set ssl authentication. Note that the openssl library needs to be available to use this.
  config :use_ssl_auth, :validate => :boolean, :default => false

  # Set ssl key file.
  config :ssl_key, :validate => :string

  # Set ssl cert file.
  config :ssl_cert, :validate => :string

  ## Set codec.
  default :codec, 'line'

  public

  def register
    load_module('webhdfs')

    # In case of snappy, the jars are already included and no wrapper module has to be loaded.
    if @compression == "gzip"
      load_module('zlib')
    end
    @main_namenode_failed = false
    @standby_client = false
    @files = {}
    # Create and test standby client if configured.
    if @standby_host
      @standby_client = prepare_client(@standby_host, @standby_port, @user)
      begin
        test_client(@standby_client)
      rescue => e
        logger.warn("Could not connect to standby namenode #{@standby_client.host}. Error: #{e.message}. Trying main webhdfs namenode.")
      end
    end
    @client = prepare_client(@host, @port, @user)
    begin
      test_client(@client)
    rescue => e
      # If no standby host is configured, we need to exit here.
      if not @standby_host
        raise
      else
        # If a standby host is configured, try this before giving up.
        logger.error("Could not connect to #{@client.host}:#{@client.port}. Error: #{e.message}")
        do_failover
      end
    end
    # Make sure @path contains the %{[@metadata][thread_id]} format value if @single_file_per_thread is set to true.
    if @single_file_per_thread and !@path.include? "%{[@metadata][thread_id]}"
      @logger.error("Please set the %{[@metadata][thread_id]} format value in @path if @single_file_per_thread is active.")
      raise LogStash::ConfigurationError
    end
    buffer_initialize(
      :max_items => @flush_size,
      :max_interval => @idle_flush_time,
      :logger => @logger
    )
    @codec.on_event do |event, encoded_event|
      encoded_event
    end
  end # def register

  def receive(event)
    buffer_receive(event)
  end # def receive

  def flush(events=nil, close=false)
    return if not events
    newline = "\n"
    output_files = Hash.new { |hash, key| hash[key] = "" }
    events.collect do |event|
      # Add thread_id to event metadata to be used as a format value in the path configuration.
      if @single_file_per_thread
        event.set("[@metadata][thread_id]", Thread.current.object_id.to_s)
      end
      path = event.sprintf(@path)
      event_as_string = @codec.encode(event)
      event_as_string += newline unless event_as_string.end_with? newline
      output_files[path] << event_as_string
    end
    output_files.each do |path, output|
      if @compression == "gzip"
        path += ".gz"
        output = compress_gzip(output)
      elsif @compression == "snappy"
        path += ".snappy"
        if @snappy_format == "file"
          output = compress_snappy_file(output)
        else
          output = compress_snappy_stream(output)
        end
      end
      write_data(path, output)
    end
  end

  def write_data(path, data)
    # Retry up to @retry_times. This can solve problems like leases being held by another process,
    # which sadly is no KNOWN_ERROR in ruby's webhdfs client.
    write_tries = 0
    begin
      # Try to append to an already existing file, which will work most of the time.
      @client.append(path, data)
    # File does not exist, so create it.
    rescue WebHDFS::FileNotFoundError
      # Add snappy header if format is "file".
      if @compression == "snappy" and @snappy_format == "file"
        @client.create(path, get_snappy_header! + data)
      else
        @client.create(path, data)
      end
    # Handle other write errors and retry to write max. @retry_times.
    rescue => e
      # Handle StandbyException and do failover. Still we want to exit if write_tries >= @retry_times.
      if @standby_client && (e.message.match(/Failed to connect to host/) || e.message.match(/StandbyException/))
        do_failover
        write_tries += 1
        retry
      end
      if write_tries < @retry_times
        @logger.warn("webhdfs write caused an exception: #{e.message}. Maybe you should increase retry_interval or reduce number of workers. Retrying...")
        sleep(@retry_interval * write_tries)
        write_tries += 1
        retry
      else
        # Issue error after max retries.
        @logger.error("Max write retries reached. Events will be discarded. Exception: #{e.message}")
      end
    end
  end

  def do_failover
    if not @standby_client
      return
    end
    @logger.warn("Failing over from #{@client.host}:#{@client.port} to #{@standby_client.host}:#{@standby_client.port}.")
    @client, @standby_client = @standby_client, @client
  end

  def close
    buffer_flush(:final => true)
  end # def close
end # class LogStash::Outputs::WebHdfs
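For reference, here is a fuller pipeline sketch exercising the options defined above. All hostnames and values are illustrative, not recommendations; only `host`, `path`, and `user` are required.

output {
  webhdfs {
    host => "namenode.example.org"
    standby_host => "standby.example.org"   # optional HA failover target
    port => 50070
    user => "hue"
    path => "/user/logstash/dt=%{+YYYY-MM-dd}/%{[@metadata][thread_id]}-%{+HH}.log"
    single_file_per_thread => true          # requires %{[@metadata][thread_id]} in path
    flush_size => 500
    idle_flush_time => 1
    retry_times => 5
    retry_interval => 0.5
    compression => "snappy"
    snappy_format => "stream"               # "stream" is hive compatible
  }
}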
data/lib/logstash/outputs/webhdfs_helper.rb
@@ -0,0 +1,119 @@
require "logstash/namespace"

module LogStash
  module Outputs
    module WebHdfsHelper

      # Load a module.
      # @param module_name [String] A module name
      # @raise [LoadError] If the module could not be loaded
      def load_module(module_name)
        begin
          require module_name
        rescue LoadError
          @logger.error("Module #{module_name} could not be loaded.")
          raise
        end
      end

      # Set up a WebHDFS client.
      # @param host [String] The WebHDFS location
      # @param port [Number] The port used for communication
      # @param username [String] A valid HDFS user
      # @return [WebHDFS] A configured client instance
      def prepare_client(host, port, username)
        client = WebHDFS::Client.new(host, port, username)
        if @use_kerberos_auth
          require 'gssapi'
          client.kerberos = true
          client.kerberos_keytab = @kerberos_keytab
        end
        if @use_ssl_auth
          require 'openssl'
          client.ssl = true
          client.ssl_key = OpenSSL::PKey::RSA.new(open(@ssl_key))
          client.ssl_cert = OpenSSL::X509::Certificate.new(open(@ssl_cert))
        end
        client.httpfs_mode = @use_httpfs
        client.open_timeout = @open_timeout
        client.read_timeout = @read_timeout
        client.retry_known_errors = @retry_known_errors
        client.retry_interval = @retry_interval if @retry_interval
        client.retry_times = @retry_times if @retry_times
        client
      end

      # Test the client connection.
      # @param client [WebHDFS] webhdfs client object.
      def test_client(client)
        begin
          client.list('/')
        rescue => e
          @logger.error("Webhdfs check request failed. (namenode: #{client.host}:#{client.port}, Exception: #{e.message})")
          raise
        end
      end

      # Compress data using gzip.
      # @param data [String] stream of data to be compressed
      # @return [String] the compressed stream of data
      def compress_gzip(data)
        buffer = StringIO.new('', 'w')
        compressor = Zlib::GzipWriter.new(buffer)
        begin
          compressor.write(data)
        ensure
          compressor.close()
        end
        buffer.string
      end

      # Compress data using the snappy file format.
      # @param data [binary] stream of data to be compressed
      # @return [String] the compressed stream of data
      def compress_snappy_file(data)
        # Encode data to ASCII_8BIT (binary).
        data = data.encode(Encoding::ASCII_8BIT, "binary", :undef => :replace)
        buffer = StringIO.new('', 'w')
        buffer.set_encoding(Encoding::ASCII_8BIT)
        compressed = snappy_deflate(data)
        buffer << [compressed.size, compressed].pack("Na*")
        buffer.string
      end

      def snappy_deflate(input)
        raw_bytes = input.bytes.to_java :byte # needed to force the instance to be a byte[] and match the argument type of the subsequent Snappy call

        compressed = Java::org.xerial.snappy.Snappy.compress(raw_bytes)

        String.from_java_bytes(compressed)
      end

      def snappy_inflate(input)
        raw_bytes = input.bytes.to_java :byte # needed to force the instance to be a byte[] and match the argument type of the subsequent Snappy call
        uncompressed_length = Java::org.xerial.snappy.Snappy.uncompressedLength(raw_bytes, 0, raw_bytes.length)
        uncompressed = Java::byte[uncompressed_length].new
        Java::org.xerial.snappy.Snappy.uncompress(raw_bytes, 0, raw_bytes.length, uncompressed, 0)

        String.from_java_bytes(uncompressed)
      end

      def compress_snappy_stream(data)
        # Encode data to ASCII_8BIT (binary).
        data = data.encode(Encoding::ASCII_8BIT, "binary", :undef => :replace)
        buffer = StringIO.new
        buffer.set_encoding(Encoding::ASCII_8BIT)
        chunks = data.scan(/.{1,#{@snappy_bufsize}}/m)
        chunks.each do |chunk|
          compressed = snappy_deflate(chunk)
          buffer << [chunk.size, compressed.size, compressed].pack("NNa*")
        end
        return buffer.string
      end

      def get_snappy_header!
        [MAGIC, DEFAULT_VERSION, MINIMUM_COMPATIBLE_VERSION].pack("a8NN")
      end

    end
  end
end
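A note on the two snappy layouts above: `compress_snappy_file` writes a single `<compressed size (32-bit)> <compressed bytes>` record (preceded by the `get_snappy_header!` preamble when the output plugin creates the file), while `compress_snappy_stream` writes one `<raw size> <compressed size> <compressed bytes>` frame per `@snappy_bufsize` chunk. As a minimal sketch, a hypothetical reader for the stream framing (not part of this gem) could look like:

# Hypothetical reader for the framing written by compress_snappy_stream.
# Each frame is: 4-byte raw size, 4-byte compressed size, compressed bytes
# (both sizes big-endian, matching the pack("NNa*") call above).
def each_snappy_frame(buffer)
  offset = 0
  while offset < buffer.bytesize
    raw_size, compressed_size = buffer[offset, 8].unpack("NN")
    yield raw_size, buffer[offset + 8, compressed_size]
    offset += 8 + compressed_size
  end
end

Each yielded payload can then be passed to `snappy_inflate`, much as the helper spec below strips the 4-byte length prefix of the `file` format before inflating.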
data/logstash-output-webhdfs.gemspec
@@ -0,0 +1,32 @@
# encoding: utf-8
Gem::Specification.new do |s|

  s.name = 'logstash-output-webhdfs'
  s.version = '3.1.0'
  s.licenses = ['Apache License (2.0)']
  s.summary = "Sends Logstash events to HDFS using the `webhdfs` REST API"
  s.description = "This gem is a Logstash plugin required to be installed on top of the Logstash core pipeline using $LS_HOME/bin/logstash-plugin install gemname. This gem is not a stand-alone program"
  s.authors = ["Björn Puttmann, loshkovskyi, Elastic"]
  s.email = 'b.puttmann@dbap.de'
  s.homepage = "http://www.dbap.de"
  s.require_paths = ['lib', 'vendor/jar-dependencies']

  # Files
  s.files = Dir["lib/**/*","spec/**/*","*.gemspec","*.md","CONTRIBUTORS","Gemfile","LICENSE","NOTICE.TXT", "vendor/jar-dependencies/**/*.jar", "vendor/jar-dependencies/**/*.rb", "VERSION", "docs/**/*"]

  # Tests
  s.test_files = s.files.grep(%r{^(test|spec|features)/})

  # Special flag to let us know this is actually a logstash plugin
  s.metadata = { "logstash_plugin" => "true", "logstash_group" => "output" }

  # Gem dependencies
  s.add_runtime_dependency "logstash-core-plugin-api", ">= 1.60", "<= 2.99"
  s.add_runtime_dependency 'webhdfs'
  s.add_development_dependency 'logstash-devutils'

  s.add_development_dependency 'logstash-codec-line'
  s.add_development_dependency 'logstash-codec-json'

  s.platform = 'java'
end
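As the description above notes, the gem is not a stand-alone program: it is installed into an existing Logstash with `$LS_HOME/bin/logstash-plugin install logstash-output-webhdfs`. The `java` platform and the extra `vendor/jar-dependencies` require path are what ship the bundled snappy-java jar (see the file list at the top) alongside the Ruby sources.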
data/spec/integration/webhdfs_spec.rb
@@ -0,0 +1,130 @@
# encoding: utf-8
require 'logstash/devutils/rspec/spec_helper'
require 'logstash/outputs/webhdfs'
require 'webhdfs'
require 'json'

describe LogStash::Outputs::WebHdfs, :integration => true do
  let(:host) { 'localhost' }
  let(:port) { 50070 }
  let(:user) { 'test' }
  let(:test_file) { '/user/' + user + '/%{host}.test' }
  let(:hdfs_file_name) { 'user/' + user + '/localhost.test' }

  let(:config) { { 'host' => host, 'user' => user, 'path' => test_file, 'compression' => 'none' } }

  subject(:plugin) { LogStash::Plugin.lookup("output", "webhdfs").new(config) }

  let(:webhdfs_client) { WebHDFS::Client.new(host, port, user) }

  let(:event) { LogStash::Event.new('message' => 'Hello world!', 'source' => 'out of the blue',
                                    'type' => 'generator', 'host' => 'localhost' ) }

  describe "register and close" do

    it 'should register with default values' do
      expect { subject.register }.to_not raise_error
    end

  end

  describe '#write' do

    let(:config) { { 'host' => host, 'user' => user, 'flush_size' => 10,
                     'path' => test_file, 'compression' => 'none' } }

    after(:each) do
      webhdfs_client.delete(hdfs_file_name)
    end

    describe "writing plain files" do

      before(:each) do
        subject.register
        subject.receive(event)
        subject.close
      end

      it 'should use the correct filename pattern' do
        expect { webhdfs_client.read(hdfs_file_name) }.to_not raise_error
      end

      context "using the line codec without format" do

        let(:config) { { 'host' => host, 'user' => user, 'flush_size' => 10,
                         'path' => test_file, 'compression' => 'none', 'codec' => 'line' } }

        it 'should match the event data' do
          expect(webhdfs_client.read(hdfs_file_name).strip()).to eq(event.to_s)
        end

      end

      context "using the json codec" do

        let(:config) { { 'host' => host, 'user' => user, 'flush_size' => 10,
                         'path' => test_file, 'compression' => 'none', 'codec' => 'json' } }

        it 'should match the event data' do
          expect(webhdfs_client.read(hdfs_file_name).strip()).to eq(event.to_json)
        end

      end

      context "when flushing events" do

        let(:config) { { 'host' => host, 'user' => user, 'flush_size' => 10, 'idle_flush_time' => 2,
                         'path' => test_file, 'compression' => 'none', 'codec' => 'json' } }

        before(:each) do
          webhdfs_client.delete(hdfs_file_name)
        end

        it 'should flush after configured idle time' do
          subject.register
          subject.receive(event)
          expect { webhdfs_client.read(hdfs_file_name) }.to raise_error(WebHDFS::FileNotFoundError)
          sleep 3
          expect { webhdfs_client.read(hdfs_file_name) }.to_not raise_error
          expect(webhdfs_client.read(hdfs_file_name).strip()).to eq(event.to_json)
        end

      end

    end

    describe "#compression" do

      before(:each) do
        subject.register
        for _ in 0...500
          subject.receive(event)
        end
        subject.close
      end

      context "when using no compression" do

        let(:config) { { 'host' => host, 'user' => user, 'flush_size' => 10,
                         'path' => test_file, 'compression' => 'none', 'codec' => 'line' } }

        it 'should write some messages uncompressed' do
          expect(webhdfs_client.read(hdfs_file_name).lines.count).to eq(500)
        end

      end

      context "when using gzip compression" do

        let(:config) { { 'host' => host, 'user' => user,
                         'path' => test_file, 'compression' => 'gzip', 'codec' => 'line' } }

        it 'should write some messages gzip compressed' do
          # window_bits 47 enables gzip decoding.
          expect(Zlib::Inflate.new(47).inflate(webhdfs_client.read("#{hdfs_file_name}.gz")).lines.count).to eq(500)
          webhdfs_client.delete("#{hdfs_file_name}.gz")
        end
      end
    end
  end
end
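Note that these examples assume a namenode reachable at `localhost:50070` with a `test` user allowed to write under `/user/test`; the `:integration => true` tag is conventionally used to keep them out of the default unit-test run.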
data/spec/outputs/webhdfs_helper_spec.rb
@@ -0,0 +1,37 @@
# encoding: utf-8
require 'logstash/devutils/rspec/spec_helper'
require 'logstash/outputs/webhdfs'
require 'webhdfs'
require 'logstash-output-webhdfs_jars'


describe "webhdfs helpers" do

  let(:host) { 'localhost' }
  let(:user) { 'hadoop' }
  let(:path) { '/test.log' }

  let(:config) { { 'host' => host, 'user' => user, 'path' => path, 'compression' => 'none' } }

  let(:sample_data) { "Something very very very long to compress" }

  subject(:plugin) { LogStash::Plugin.lookup("output", "webhdfs").new(config) }

  context "when compressing using vendor snappy" do
    it "should return a valid byte array" do
      compressed = subject.compress_snappy_file(sample_data)

      expect(compressed).not_to be(:nil)
    end

    it "should contain all the data" do
      compressed = subject.compress_snappy_file(sample_data)

      # Remove the length integer (32 bit, 4 bytes) added by compress_snappy_file from compressed.
      uncompressed = subject.snappy_inflate(compressed[4..-1])

      expect(uncompressed).to eq(sample_data)
    end
  end
end
data/spec/outputs/webhdfs_spec.rb
@@ -0,0 +1,71 @@
# encoding: utf-8
require 'logstash/devutils/rspec/spec_helper'
require 'logstash/outputs/webhdfs'
require 'webhdfs'
require 'json'

describe 'outputs/webhdfs' do

  let(:host) { 'localhost' }
  let(:user) { 'hadoop' }
  let(:path) { '/test.log' }

  let(:config) { { 'host' => host, 'user' => user, 'path' => path, 'compression' => 'none' } }

  subject(:plugin) { LogStash::Plugin.lookup("output", "webhdfs").new(config) }

  describe '#initializing' do

    it 'should fail to register without required values' do
      plugin = LogStash::Plugin.lookup("output", "webhdfs")
      expect { plugin.new }.to raise_error(LogStash::ConfigurationError)
    end

    context "default values" do

      it 'should have default port' do
        expect(subject.port).to eq(50070)
      end

      it 'should have default idle_flush_time' do
        expect(subject.idle_flush_time).to eq(1)
      end

      it 'should have default flush_size' do
        expect(subject.flush_size).to eq(500)
      end

      it 'should have default open_timeout' do
        expect(subject.open_timeout).to eq(30)
      end

      it 'should have default read_timeout' do
        expect(subject.read_timeout).to eq(30)
      end

      it 'should have default use_httpfs' do
        expect(subject.use_httpfs).to eq(false)
      end

      it 'should have default retry_known_errors' do
        expect(subject.retry_known_errors).to eq(true)
      end

      it 'should have default retry_interval' do
        expect(subject.retry_interval).to eq(0.5)
      end

      it 'should have default retry_times' do
        expect(subject.retry_times).to eq(5)
      end

      it 'should have default snappy_bufsize' do
        expect(subject.snappy_bufsize).to eq(32768)
      end

      it 'should have default snappy_format' do
        expect(subject.snappy_format).to eq('stream')
      end

    end
  end
end