logstash-output-webhdfs 0.0.2 → 0.1.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: b62674a3d35ba63e84ddb865b846c9b89c1bd08a
- data.tar.gz: 3e35d2d01038c0c8e57f798b13aa31f9e46614d7
+ metadata.gz: 4d8c5f3670364dd6a301dfdf281b8265a7020a2d
+ data.tar.gz: e6abb77dd5507ae0d1388ab28989baaf4d517d8d
  SHA512:
- metadata.gz: 4e1b78ca8e557ac1b76a4d4aaa79b137092f440ee5e183e2347be1a13b9c36dbb852349aa4e3be59641f89e4cc2167918704e75a383fc7de294fbe0ccf874412
- data.tar.gz: 2db171e7d65de252e3eedf7b301a20e234a88b53b0c62c7b5bf80d25fbc44d951ea123274256dd312fd9710cd80c686d1ea4cf87206a184aa042905ea264581b
+ metadata.gz: 1f27095ed386f4984557345179bae49757312fbbd8e45fd1070ca1c24f0b3aac4deaaa8303aa279248a406f895260ba39067979548ff0e889b74df1f282a11bc
+ data.tar.gz: 89dd5efbb67f0ec3a2ce61446fe9962661bfb8933ceb94e792d6669b14e01b518658e266eb9f2acf6755761efe360c6c15bc8cabf7d08caca2128f31ea5f1c4f
data/.gitignore CHANGED
@@ -1,19 +1 @@
- *.gem
- *.rbc
- .bundle
- .config
- coverage
- InstalledFiles
- lib/bundler/man
- pkg
- rdoc
- spec/reports
- test/tmp
- test/version_tmp
- tmp
- .idea/
- # YARD artifacts
- .yardoc
- _yardoc
- doc/
- *.conf
+ *.lock
data/CHANGELOG.md ADDED
@@ -0,0 +1,2 @@
+ ## 0.1.0
+ * First version of the webhdfs plugin output
data/CONTRIBUTORS ADDED
@@ -0,0 +1,9 @@
+ The following is a list of people who have contributed ideas, code, bug
+ reports, or in general have helped logstash along its way.
+
+ Contributors:
+
+ Note: If you've sent us patches, bug reports, or otherwise contributed to
+ Logstash, and you aren't on the list above and want to be, please let us know
+ and we'll make sure you're here. Contributions from folks like you are what make
+ open source awesome.
data/LICENSE CHANGED
@@ -1,13 +1,13 @@
- # Copyright 2014-2015 dbap GmbH. <http://www.dbap.de>
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
+ Copyright (c) 2012-2015 Elasticsearch <http://www.elasticsearch.org>
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
data/NOTICE.TXT ADDED
@@ -0,0 +1,5 @@
+ Elasticsearch
+ Copyright 2012-2015 Elasticsearch
+
+ This product includes software developed by The Apache Software
+ Foundation (http://www.apache.org/).
data/README.md CHANGED
@@ -1,41 +1,86 @@
- logstash-webhdfs
- ================
+ # Logstash Plugin

- A logstash plugin to store events via webhdfs.
-
- Tested with v1.3.3, v1.4.0 and 1.5.0.
+ This is a plugin for [Logstash](https://github.com/elasticsearch/logstash).

  It is fully free and fully open source. The license is Apache 2.0, meaning you are pretty much free to use it however you want in whatever way.

- This plugin only has a mandatory dependency on the webhdfs gem from Kazuki Ohta and TAGOMORI Satoshi (@see: https://github.com/kzk/webhdfs). Optional dependencies are zlib and snappy gem.
+ ## Documentation
+
+ Logstash provides infrastructure to automatically generate documentation for this plugin. We use the asciidoc format to write documentation so any comments in the source code will be first converted into asciidoc and then into html. All plugin documentation are placed under one [central location](http://www.elasticsearch.org/guide/en/logstash/current/).
+
+ - For formatting code or config example, you can use the asciidoc `[source,ruby]` directive
+ - For more asciidoc formatting tips, see the excellent reference here https://github.com/elasticsearch/docs#asciidoc-guide
+
+ ## Need Help?
+
+ Need help? Try #logstash on freenode IRC or the https://discuss.elastic.co/c/logstash discussion forum.
+
+ ## Developing
+
+ ### 1. Plugin Developement and Testing

- No jars from hadoop are needed, thus reducing configuration and compatibility problems.
+ #### Code
+ - To get started, you'll need JRuby with the Bundler gem installed.

- ## Installation
- Change into your logstash install directory and execute:
+ - Create a new plugin or clone and existing from the GitHub [logstash-plugins](https://github.com/logstash-plugins) organization. We also provide [example plugins](https://github.com/logstash-plugins?query=example).
+
+ - Install dependencies
+ ```sh
+ bundle install
  ```
- bin/plugin install logstash-output-webhdfs
+
+ #### Test
+
+ - Update your dependencies
+
+ ```sh
+ bundle install
  ```

- ## Documentation
+ - Run tests
+
+ ```sh
+ bundle exec rspec
+ ```
+
+ ### 2. Running your unpublished Plugin in Logstash
+
+ #### 2.1 Run in a local Logstash clone
+
+ - Edit Logstash `Gemfile` and add the local plugin path, for example:
+ ```ruby
+ gem "logstash-filter-awesome", :path => "/your/local/logstash-filter-awesome"
+ ```
+ - Install plugin
+ ```sh
+ bin/plugin install --no-verify
+ ```
+ - Run Logstash with your plugin
+ ```sh
+ bin/logstash -e 'filter {awesome {}}'
+ ```
+ At this point any modifications to the plugin code will be applied to this local Logstash setup. After modifying the plugin, simply rerun Logstash.
+
+ #### 2.2 Run in an installed Logstash
+
+ You can use the same **2.1** method to run your plugin in an installed Logstash by editing its `Gemfile` and pointing the `:path` to your local plugin development directory or you can build the gem and install it using:
+
+ - Build your plugin gem
+ ```sh
+ gem build logstash-filter-awesome.gemspec
+ ```
+ - Install the plugin from the Logstash home
+ ```sh
+ bin/plugin install /your/local/plugin/logstash-filter-awesome.gem
+ ```
+ - Start Logstash and proceed to test the plugin
+
+ ## Contributing
+
+ All contributions are welcome: ideas, patches, documentation, bug reports, complaints, and even something you drew up on a napkin.
+
+ Programming is not a required skill. Whatever you've seen about open source and maintainers or community members saying "send patches or die" - you will not see that here.
+
+ It is more important to the community that you are able to contribute.

- Example configuration:
-
- output {
- webhdfs {
- workers => 2
- server => "your.nameno.de:14000"
- user => "flume"
- path => "/user/flume/logstash/dt=%{+Y}-%{+M}-%{+d}/logstash-%{+H}.log"
- flush_size => 500
- compression => "snappy"
- idle_flush_time => 10
- retry_interval => 0.5
- }
- }
-
- For a complete list of options, see config section in source code.
-
- This plugin has dependencies on:
- * webhdfs module @<https://github.com/kzk/webhdfs>
- * snappy module @<https://github.com/miyucy/snappy>
+ For more information about contributing, see the [CONTRIBUTING](https://github.com/elasticsearch/logstash/blob/master/CONTRIBUTING.md) file.
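Note: the development instructions above come from the generic Logstash plugin template and use `logstash-filter-awesome` as a placeholder. For this gem the same steps would look roughly like the following sketch (the local checkout path is hypothetical; adjust it to your environment, and note that Logstash falls back to a stdin input when only an output is given):

```sh
# In the Logstash Gemfile, point :path at your local checkout (hypothetical path):
#   gem "logstash-output-webhdfs", :path => "/your/local/logstash-output-webhdfs"
bin/plugin install --no-verify
bin/logstash -e 'output { webhdfs { host => "127.0.0.1" user => "hue" path => "/user/logstash/dt=%{+YYYY-MM-dd}/logstash-%{+HH}.log" } }'
```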
data/lib/logstash/outputs/webhdfs.rb CHANGED
@@ -2,76 +2,71 @@
  require "logstash/namespace"
  require "logstash/outputs/base"
  require "stud/buffer"
+ require "logstash/outputs/webhdfs_helper"

- # Summary: Plugin to send logstash events to to files in HDFS via webhdfs
- # restapi.
+ # This plugin sends Logstash events into files in HDFS via
+ # the https://hadoop.apache.org/docs/r1.0.4/webhdfs.html[webhdfs] REST API.
  #
- # This plugin only has a mandatory dependency on the webhdfs gem from
- # Kazuki Ohta and TAGOMORI Satoshi (@see: https://github.com/kzk/webhdfs).
- # Optional dependencies are zlib and snappy gem.
- # No jars from hadoop are needed, thus reducing configuration and compatibility
- # problems.
+ # ==== Dependencies
+ # This plugin has no dependency on jars from hadoop, thus reducing configuration and compatibility
+ # problems. It uses the webhdfs gem from Kazuki Ohta and TAGOMORI Satoshi (@see: https://github.com/kzk/webhdfs).
+ # Optional dependencies are zlib and snappy gem if you use the compression functionality.
  #
+ # ==== Operational Notes
  # If you get an error like:
  #
  # Max write retries reached. Exception: initialize: name or service not known {:level=>:error}
  #
- # make sure, that the hostname of your namenode is resolvable on the host running logstash. When creating/appending
- # to a file, webhdfs somtime sends a 307 TEMPORARY_REDIRECT with the HOSTNAME of the machine its running on.
+ # make sure that the hostname of your namenode is resolvable on the host running Logstash. When creating/appending
+ # to a file, webhdfs somtime sends a `307 TEMPORARY_REDIRECT` with the `HOSTNAME` of the machine its running on.
  #
- # USAGE:
- # This is an example of logstash config:
+ # ==== Usage
+ # This is an example of Logstash config:
  #
  # [source,ruby]
  # ----------------------------------
- # webhdfs {
- # server => "127.0.0.1:50070" # (required)
- # path => "/user/logstash/dt=%{+YYYY-MM-dd}/logstash-%{+HH}.log" # (required)
- # user => "hue" # (optional)
- # message_format => "%{@source_host}" # (optional)
- # idle_flush_time => 10 # (optional)
- # flush_size => 50 # (optional)
- # open_timeout => 15 # (optional)
- # read_timeout => 15 # (optional)
- # use_httpfs => true # (optional)
- # retry_interval => 1 # (optional)
- # retry_times => 3 # (optional)
- # compress => "snappy" # (optional)
- # remove_at_timestamp => false # (optional)
+ # input {
+ # ...
+ # }
+ # filter {
+ # ...
+ # }
+ # output {
+ # webhdfs {
+ # server => "127.0.0.1:50070" # (required)
+ # path => "/user/logstash/dt=%{+YYYY-MM-dd}/logstash-%{+HH}.log" # (required)
+ # user => "hue" # (required)
+ # }
  # }
  # ----------------------------------
- #
- # Author: Bjoern Puttmann <b.puttmann@dbap.de> - dbap GmbH, Münster, Germany.

  class LogStash::Outputs::WebHdfs < LogStash::Outputs::Base
+
  include Stud::Buffer
+ include LogStash::Outputs::WebHdfsHelper

  config_name "webhdfs"
- milestone 1

- if RUBY_VERSION[0..2] == '1.8'
- MAGIC = "\x82SNAPPY\x0"
- else
- MAGIC = "\x82SNAPPY\x0".force_encoding Encoding::ASCII_8BIT
- end
+ MAGIC = "\x82SNAPPY\x0".force_encoding Encoding::ASCII_8BIT
  DEFAULT_VERSION = 1
  MINIMUM_COMPATIBLE_VERSION = 1

- # The server name and port for webhdfs/httpfs connections.
- config :server, :validate => :string, :required => true
+ # The server name for webhdfs/httpfs connections.
+ config :host, :validate => :string, :required => true
+
+ # The server port for webhdfs/httpfs connections.
+ config :port, :validate => :number, :default => 50070

  # The Username for webhdfs.
- config :user, :validate => :string, :required => false
+ config :user, :validate => :string, :required => true

  # The path to the file to write to. Event fields can be used here,
  # as well as date fields in the joda time format, e.g.:
- # ....
- # "/user/logstash/dt=%{+YYYY-MM-dd}/%{@source_host}-%{+HH}.log"
- # ....
+ # `/user/logstash/dt=%{+YYYY-MM-dd}/%{@source_host}-%{+HH}.log`
  config :path, :validate => :string, :required => true

  # The format to use when writing events to the file. This value
- # supports any string and can include %{name} and other dynamic
+ # supports any string and can include `%{name}` and other dynamic
  # strings.
  #
  # If this setting is omitted, the full json representation of the
@@ -81,13 +76,13 @@ class LogStash::Outputs::WebHdfs < LogStash::Outputs::Base
  # Sending data to webhdfs in x seconds intervals.
  config :idle_flush_time, :validate => :number, :default => 1

- # Sending data to webhdfs if event count is above, even if store_interval_in_secs is not reached.
+ # Sending data to webhdfs if event count is above, even if `store_interval_in_secs` is not reached.
  config :flush_size, :validate => :number, :default => 500

- # WebHdfs open timeout, default 30s (in ruby net/http).
+ # WebHdfs open timeout, default 30s.
  config :open_timeout, :validate => :number, :default => 30

- # The WebHdfs read timeout, default 30s (in ruby net/http).
+ # The WebHdfs read timeout, default 30s.
  config :read_timeout, :validate => :number, :default => 30

  # Use httpfs mode if set to true, else webhdfs.
@@ -99,7 +94,7 @@ class LogStash::Outputs::WebHdfs < LogStash::Outputs::Base
  # How long should we wait between retries.
  config :retry_interval, :validate => :number, :default => 0.5

- # How many times should we retry.
+ # How many times should we retry. If retry_times is exceeded, an error will be logged and the event will be discarded.
  config :retry_times, :validate => :number, :default => 5

  # Compress output. One of ['none', 'snappy', 'gzip']
@@ -112,82 +107,49 @@ class LogStash::Outputs::WebHdfs < LogStash::Outputs::Base
  # Set snappy format. One of "stream", "file". Set to stream to be hive compatible.
  config :snappy_format, :validate => ["stream", "file"], :default => "stream"

- # Remove @timestamp field. Hive does not like a leading "@", but we need @timestamp for path calculation.
- config :remove_at_timestamp, :validate => :boolean, :default => true
+ ## Set codec.
+ default :codec, 'line'

  public

  def register
- begin
- require 'webhdfs'
- rescue LoadError
- @logger.error("Module webhdfs could not be loaded.")
- end
+ load_module('webhdfs')
  if @compression == "gzip"
- begin
- require "zlib"
- rescue LoadError
- @logger.error("Gzip compression selected but zlib module could not be loaded.")
- end
+ load_module('zlib')
  elsif @compression == "snappy"
- begin
- require "snappy"
- rescue LoadError
- @logger.error("Snappy compression selected but snappy module could not be loaded.")
- end
+ load_module('snappy')
  end
  @files = {}
- @host, @port = @server.split(':')
  @client = prepare_client(@host, @port, @user)
  # Test client connection.
  begin
  @client.list('/')
  rescue => e
  @logger.error("Webhdfs check request failed. (namenode: #{@client.host}:#{@client.port}, Exception: #{e.message})")
+ raise
  end
  buffer_initialize(
- :max_items => @flush_size,
- :max_interval => @idle_flush_time,
- :logger => @logger
+ :max_items => @flush_size,
+ :max_interval => @idle_flush_time,
+ :logger => @logger
  )
+ @codec.on_event do |event, encoded_event|
+ encoded_event
+ end
  end # def register

- public
  def receive(event)
  return unless output?(event)
  buffer_receive(event)
  end # def receive

- def prepare_client(host, port, username)
- client = WebHDFS::Client.new(host, port, username)
- if @use_httpfs
- client.httpfs_mode = true
- end
- client.open_timeout = @open_timeout
- client.read_timeout = @read_timeout
- if @retry_known_errors
- client.retry_known_errors = true
- client.retry_interval = @retry_interval if @retry_interval
- client.retry_times = @retry_times if @retry_times
- end
- client
- end
-
  def flush(events=nil, teardown=false)
  return if not events
- # Avoid creating a new string for newline every time
- newline = "\n".freeze
+ newline = "\n"
  output_files = Hash.new { |hash, key| hash[key] = "" }
  events.collect do |event|
  path = event.sprintf(@path)
- if @remove_at_timestamp
- event.remove("@timestamp")
- end
- if @message_format
- event_as_string = event.sprintf(@message_format)
- else
- event_as_string = event.to_json
- end
+ event_as_string = @codec.encode(event)
  event_as_string += newline unless event_as_string.end_with? newline
  output_files[path] << event_as_string
  end
@@ -203,69 +165,18 @@ class LogStash::Outputs::WebHdfs < LogStash::Outputs::Base
  output = compress_snappy_stream(output)
  end
  end
- write_tries = 0
- while write_tries < @retry_times do
- begin
- write_data(path, output)
- break
- rescue => e
- write_tries += 1
- # Retry max_retry times. This can solve problems like leases being hold by another process. Sadly this is no
- # KNOWN_ERROR in rubys webhdfs client.
- if write_tries < @retry_times
- @logger.warn("Retrying webhdfs write for multiple times. Maybe you should increase retry_interval or reduce number of workers.")
- sleep(@retry_interval * write_tries)
- next
- else
- # Issue error after max retries.
- @logger.error("Max write retries reached. Exception: #{e.message}")
- end
- end
- end
+ write_data(path, output)
  end
  end

- def compress_gzip(data)
- buffer = StringIO.new('','w')
- compressor = Zlib::GzipWriter.new(buffer)
- begin
- compressor.write data
- ensure
- compressor.close()
- end
- buffer.string
- end
-
- def compress_snappy_file(data)
- # Encode data to ASCII_8BIT (binary)
- data= data.encode(Encoding::ASCII_8BIT, "binary", :undef => :replace)
- buffer = StringIO.new('', 'w')
- buffer.set_encoding Encoding::ASCII_8BIT unless RUBY_VERSION =~ /^1\.8/
- compressed = Snappy.deflate(data)
- buffer << [compressed.size, compressed].pack("Na*")
- buffer.string
- end
-
- def compress_snappy_stream(data)
- # Encode data to ASCII_8BIT (binary)
- data= data.encode(Encoding::ASCII_8BIT, "binary", :undef => :replace)
- buffer = StringIO.new
- buffer.set_encoding Encoding::ASCII_8BIT unless RUBY_VERSION =~ /^1\.8/
- chunks = data.scan(/.{1,#{@snappy_bufsize}}/m)
- chunks.each do |chunk|
- compressed = Snappy.deflate(chunk)
- buffer << [chunk.size, compressed.size, compressed].pack("NNa*")
- end
- return buffer.string
- end
-
- def get_snappy_header!
- [MAGIC, DEFAULT_VERSION, MINIMUM_COMPATIBLE_VERSION].pack("a8NN")
- end
-
  def write_data(path, data)
+ # Retry max_retry times. This can solve problems like leases being hold by another process. Sadly this is no
+ # KNOWN_ERROR in rubys webhdfs client.
+ write_tries = 0
  begin
+ # Try to append to already existing file, which will work most of the times.
  @client.append(path, data)
+ # File does not exist, so create it.
  rescue WebHDFS::FileNotFoundError
  # Add snappy header if format is "file".
  if @compression == "snappy" and @snappy_format == "file"
@@ -273,10 +184,21 @@ class LogStash::Outputs::WebHdfs < LogStash::Outputs::Base
  elsif
  @client.create(path, data)
  end
+ # Handle other write errors and retry to write max. @retry_times.
+ rescue => e
+ if write_tries < @retry_times
+ @logger.warn("webhdfs write caused an exception: #{e.message}. Maybe you should increase retry_interval or reduce number of workers. Retrying...")
+ sleep(@retry_interval * write_tries)
+ write_tries += 1
+ retry
+ else
+ # Issue error after max retries.
+ @logger.error("Max write retries reached. Events will be discarded. Exception: #{e.message}")
+ end
  end
  end

  def teardown
  buffer_flush(:final => true)
  end # def teardown
- end
+ end # class LogStash::Outputs::WebHdfs
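Note: 0.1.0 replaces the single `server => "host:port"` option with separate `host` and `port` settings, makes `user` mandatory, and writes events through a standard codec (default `line`) instead of the old `message_format`/`remove_at_timestamp` handling. A minimal sketch of an equivalent configuration under the new options (values are illustrative only):

```ruby
# Logstash pipeline configuration, not plain Ruby; all values are examples.
output {
  webhdfs {
    host => "your.nameno.de"          # formerly a single option: server => "host:port"
    port => 50070                     # new option, defaults to 50070
    user => "flume"                   # now required
    path => "/user/flume/logstash/dt=%{+YYYY-MM-dd}/logstash-%{+HH}.log"
    flush_size => 500
    idle_flush_time => 10
    compression => "snappy"           # needs the snappy gem
    codec => "json"                   # replaces message_format
  }
}
```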
data/lib/logstash/outputs/webhdfs_helper.rb ADDED
@@ -0,0 +1,82 @@
+ require "logstash/namespace"
+
+ module LogStash
+ module Outputs
+ module WebHdfsHelper
+
+ # Load a module
+ # @param module_name [String] A module name
+ # @raise [LoadError] If the module count not be loaded
+ def load_module(module_name)
+ begin
+ require module_name
+ rescue LoadError
+ @logger.error("Module #{module_name} could not be loaded.")
+ raise
+ end
+ end
+
+ # Setup a WebHDFS client
+ # @param host [String] The WebHDFS location
+ # @param port [Number] The port used to do the communication
+ # @param username [String] A valid HDFS user
+ # @return [WebHDFS] An setup client instance
+ def prepare_client(host, port, username)
+ client = WebHDFS::Client.new(host, port, username)
+ client.httpfs_mode = @use_httpfs
+ client.open_timeout = @open_timeout
+ client.read_timeout = @read_timeout
+ client.retry_known_errors = @retry_known_errors
+ client.retry_interval = @retry_interval if @retry_interval
+ client.retry_times = @retry_times if @retry_times
+ client
+ end
+
+
+ # Compress data using the gzip methods.
+ # @param data [String] stream of data to be compressed
+ # @return [String] the compressed stream of data
+ def compress_gzip(data)
+ buffer = StringIO.new('','w')
+ compressor = Zlib::GzipWriter.new(buffer)
+ begin
+ compressor.write(data)
+ ensure
+ compressor.close()
+ end
+ buffer.string
+ end
+
+ # Compress snappy file.
+ # @param data [binary] stream of data to be compressed
+ # @return [String] the compressed stream of data
+ def compress_snappy_file(data)
+ # Encode data to ASCII_8BIT (binary)
+ data= data.encode(Encoding::ASCII_8BIT, "binary", :undef => :replace)
+ buffer = StringIO.new('', 'w')
+ buffer.set_encoding(Encoding::ASCII_8BIT)
+ compressed = Snappy.deflate(data)
+ buffer << [compressed.size, compressed].pack("Na*")
+ buffer.string
+ end
+
+ def compress_snappy_stream(data)
+ # Encode data to ASCII_8BIT (binary)
+ data= data.encode(Encoding::ASCII_8BIT, "binary", :undef => :replace)
+ buffer = StringIO.new
+ buffer.set_encoding(Encoding::ASCII_8BIT)
+ chunks = data.scan(/.{1,#{@snappy_bufsize}}/m)
+ chunks.each do |chunk|
+ compressed = Snappy.deflate(chunk)
+ buffer << [chunk.size, compressed.size, compressed].pack("NNa*")
+ end
+ return buffer.string
+ end
+
+ def get_snappy_header!
+ [MAGIC, DEFAULT_VERSION, MINIMUM_COMPATIBLE_VERSION].pack("a8NN")
+ end
+
+ end
+ end
+ end
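Note: `compress_snappy_stream` frames every chunk as a big-endian uncompressed size, a big-endian compressed size, and the raw Snappy bytes (`pack("NNa*")`), while `compress_snappy_file` writes a single `pack("Na*")` frame behind the header from `get_snappy_header!`. The reader below is a sketch, not part of the gem, that walks the stream framing back into the original payload; it assumes the same snappy gem (`Snappy.inflate`) the plugin uses:

```ruby
require "snappy"
require "stringio"

# Rebuild the payload written by compress_snappy_stream: each frame is
# [uncompressed_size (N), compressed_size (N), compressed bytes (a*)].
def read_snappy_stream(data)
  io = StringIO.new(data)
  out = "".force_encoding(Encoding::ASCII_8BIT)
  until io.eof?
    uncompressed_size, compressed_size = io.read(8).unpack("NN")
    chunk = Snappy.inflate(io.read(compressed_size))
    raise "corrupt snappy frame" unless chunk.bytesize == uncompressed_size
    out << chunk
  end
  out
end
```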
data/logstash-output-webhdfs.gemspec CHANGED
@@ -1,11 +1,11 @@
  Gem::Specification.new do |s|

  s.name = 'logstash-output-webhdfs'
- s.version = '0.0.2'
+ s.version = '0.1.0'
  s.licenses = ['Apache License (2.0)']
  s.summary = "Plugin to write events to hdfs via webhdfs."
  s.description = "This gem is a logstash plugin required to be installed on top of the Logstash core pipeline using $LS_HOME/bin/plugin install gemname. This gem is not a stand-alone program"
- s.authors = ["Björn Puttmann, loshkovskyi"]
+ s.authors = ["Björn Puttmann, loshkovskyi, Elastic"]
  s.email = 'b.puttmann@dbap.de'
  s.homepage = "http://www.dbap.de"
  s.require_paths = ["lib"]
@@ -24,4 +24,9 @@ Gem::Specification.new do |s|
  s.add_runtime_dependency 'webhdfs'
  s.add_runtime_dependency 'snappy'
  s.add_development_dependency 'logstash-devutils'
+
+ s.add_development_dependency 'logstash-codec-line'
+ s.add_development_dependency 'logstash-codec-json'
+
+
  end
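Note: with the version bumped to 0.1.0, building and installing the gem into a Logstash installation would look roughly like this (run from the plugin checkout and the Logstash home respectively; the paths are illustrative):

```sh
gem build logstash-output-webhdfs.gemspec
# produces logstash-output-webhdfs-0.1.0.gem
bin/plugin install /path/to/logstash-output-webhdfs-0.1.0.gem
```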
data/spec/integration/webhdfs_spec.rb ADDED
@@ -0,0 +1,131 @@
+ # encoding: utf-8
+ require 'logstash/devutils/rspec/spec_helper'
+ require 'logstash/outputs/webhdfs'
+ require 'webhdfs'
+ require 'json'
+
+ describe LogStash::Outputs::WebHdfs, :integration => true do
+
+ let(:host) { 'localhost' }
+ let(:port) { 50070 }
+ let(:user) { 'vagrant' }
+
+ let(:test_file) { "/test.file" }
+
+ let(:event) { LogStash::Event.new('message' => 'Hello world!', 'source' => 'out of the blue',
+ 'type' => 'generator', 'host' => 'localhost' ) }
+
+ let(:config) { { 'host' => host, 'user' => user,
+ 'path' => test_file, 'compression' => 'none' } }
+
+ subject { LogStash::Plugin.lookup("output", "webhdfs").new(config) }
+
+ let(:client) { WebHDFS::Client.new(host, port, user) }
+
+ describe "register and teardown" do
+
+ it 'should register with default values' do
+ expect { subject.register }.to_not raise_error
+ end
+
+ end
+
+ describe '#write' do
+
+ let(:config) { { 'host' => host, 'user' => user, 'flush_size' => 10,
+ 'path' => "/%{host}_test.log", 'compression' => 'none' } }
+
+ after(:each) do
+ client.delete(test_file)
+ end
+
+ describe "writing plain files" do
+
+ before(:each) do
+ subject.register
+ subject.receive(event)
+ subject.teardown
+ end
+
+ it 'should use the correct filename pattern' do
+ expect { client.read('localhost_test.log') }.to_not raise_error
+ end
+
+ context "using the line codec" do
+
+ let(:config) { { 'host' => host, 'user' => user, 'flush_size' => 10,
+ 'path' => test_file, 'compression' => 'none', 'codec' => 'line' } }
+
+ it 'should match the event data' do
+ expect(client.read(test_file).strip()).to eq(event.to_s)
+ end
+ end
+
+ context "using the json codec" do
+
+ let(:config) { { 'host' => host, 'user' => user, 'flush_size' => 10,
+ 'path' => test_file, 'compression' => 'none', 'codec' => 'json' } }
+
+
+ it 'should match the event data' do
+ expect(client.read(test_file).strip()).to eq(event.to_json)
+ end
+
+ end
+
+ context "when flushing events" do
+
+ let(:config) { { 'host' => host, 'user' => user, 'flush_size' => 10, 'idle_flush_time' => 2,
+ 'path' => test_file, 'compression' => 'none', 'codec' => 'json' } }
+
+ before(:each) do
+ client.delete(test_file)
+ end
+
+ it 'should flush after configured idle time' do
+ subject.register
+ subject.receive(event)
+ expect { client.read(test_file) }.to raise_error(error=WebHDFS::FileNotFoundError)
+ sleep 3
+ expect { client.read(test_file) }.to_not raise_error
+ expect(client.read(test_file).strip()).to eq(event.to_json)
+ end
+ end
+
+ end
+
+ describe "#compression" do
+
+ before(:each) do
+ subject.register
+ for _ in 0...500
+ subject.receive(event)
+ end
+ subject.teardown
+ end
+
+ context "when using no compression" do
+
+ let(:config) { { 'host' => host, 'user' => user, 'flush_size' => 10,
+ 'path' => test_file, 'compression' => 'none', 'codec' => 'line' } }
+
+ it 'should write some messages uncompressed' do
+ expect(client.read(test_file).lines.count).to eq(500)
+ end
+
+ end
+
+ context "when using gzip compression" do
+
+ let(:config) { { 'host' => host, 'user' => user,
+ 'path' => test_file, 'compression' => 'gzip', 'codec' => 'line' } }
+
+ it 'should write some messages gzip compressed' do
+ expect(Zlib::Inflate.new(window_bits=47).inflate(client.read("#{test_file}.gz")).lines.count ).to eq(500)
+ end
+ end
+
+ end
+
+ end
+ end
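Note: these examples are tagged `:integration => true` and are meant to run against a live HDFS; the defaults expect a namenode on `localhost:50070` and a `vagrant` HDFS user. One way to select only this file (assuming such an environment is available) is rspec's tag filter:

```sh
bundle exec rspec spec/integration/webhdfs_spec.rb --tag integration
```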
data/spec/outputs/webhdfs_spec.rb CHANGED
@@ -6,125 +6,66 @@ require 'json'

  describe 'outputs/webhdfs' do

- webhdfs_server = 'localhost'
- webhdfs_port = 50070
- webhdfs_user = 'hadoop'
- path_to_testlog = '/user/hadoop/test.log'
- current_logfile_name = '/user/hadoop/test.log'
- current_config = ""
+ let(:host) { 'localhost' }
+ let(:user) { 'hadoop' }
+ let(:path) { '/test.log' }

- event = LogStash::Event.new(
- 'message' => 'Hello world!',
- 'source' => 'out of the blue',
- 'type' => 'generator',
- 'host' => 'localhost',
- '@timestamp' => LogStash::Timestamp.now)
+ let(:config) { { 'host' =>host, 'user' => user, 'path' => path, 'compression' => 'none' } }

- default_config = { 'server' => webhdfs_server + ':' + webhdfs_port.to_s,
- 'user' => webhdfs_user,
- 'path' => path_to_testlog,
- 'compression' => 'none' }
+ subject(:plugin) { LogStash::Plugin.lookup("output", "webhdfs").new(config) }

- client = WebHDFS::Client.new(webhdfs_server, webhdfs_port, webhdfs_user)
-
- context 'when initializing' do
+ describe '#initializing' do

  it 'should fail to register without required values' do
- expect { LogStash::Plugin.lookup("output", "webhdfs").new() }.to raise_error(error=LogStash::ConfigurationError)
+ plugin = LogStash::Plugin.lookup("output", "webhdfs")
+ expect { plugin.new }.to raise_error(error=LogStash::ConfigurationError)
  end

- it 'should register with default values' do
- subject = LogStash::Plugin.lookup("output", "webhdfs").new(default_config)
- expect { subject.register }.to_not raise_error
- end
+ context "default values" do

- it 'should have default config values' do
- subject = LogStash::Plugin.lookup("output", "webhdfs").new(default_config)
- insist { subject.idle_flush_time } == 1
- insist { subject.flush_size } == 500
- insist { subject.open_timeout } == 30
- insist { subject.read_timeout } == 30
- insist { subject.use_httpfs } == false
- insist { subject.retry_known_errors } == true
- insist { subject.retry_interval } == 0.5
- insist { subject.retry_times } == 5
- insist { subject.snappy_bufsize } == 32768
- insist { subject.snappy_format } == 'stream'
- insist { subject.remove_at_timestamp } == true
- end
- end
+ it 'should have default port' do
+ expect(subject.port).to eq(50070)
+ end

- context 'when writing messages' do
+ it 'should have default idle_flush_time' do
+ expect(subject.idle_flush_time).to eq(1)
+ end
+ it 'should have default flush_size' do
+ expect(subject.flush_size).to eq(500)
+ end

- before :each do
- current_logfile_name = path_to_testlog
- current_config = default_config.clone
- end
+ it 'should have default open_timeout' do
+ expect(subject.open_timeout).to eq(30)
+ end

- it 'should match the event data' do
- subject = LogStash::Plugin.lookup("output", "webhdfs").new(current_config)
- subject.register
- subject.receive(event)
- subject.teardown
- insist { client.read(current_logfile_name).strip } == event.to_json
- end
+ it 'should have default read_timeout' do
+ expect(subject.read_timeout).to eq(30)
+ end

- it 'should match the configured pattern' do
- current_config['message_format'] = '%{message} came %{source}.'
- subject = LogStash::Plugin.lookup("output", "webhdfs").new(current_config)
- subject.register
- subject.receive(event)
- subject.teardown
- insist { client.read(current_logfile_name).strip } == 'Hello world! came out of the blue.'
- end
+ it 'should have default use_httpfs' do
+ expect(subject.use_httpfs).to eq(false)
+ end

- # Hive does not like a leading "@", but we need @timestamp for path calculation.
- it 'should remove the @timestamp field if configured' do
- current_config['remove_at_timestamp'] = true
- current_config['message_format'] = '%{@timestamp} should be missing.'
- subject = LogStash::Plugin.lookup("output", "webhdfs").new(current_config)
- subject.register
- subject.receive(event)
- subject.teardown
- insist { client.read(current_logfile_name).strip } == '%{@timestamp} should be missing.'
- end
+ it 'should have default retry_known_errors' do
+ expect(subject.retry_known_errors).to eq(true)
+ end

- it 'should flush after configured idle time' do
- current_config['idle_flush_time'] = 2
- subject = LogStash::Plugin.lookup("output", "webhdfs").new(current_config)
- subject.register
- subject.receive(event)
- expect { client.read(current_logfile_name) }.to raise_error(error=WebHDFS::FileNotFoundError)
- sleep 3
- insist { client.read(current_logfile_name).strip } == event.to_json
- end
+ it 'should have default retry_interval' do
+ expect(subject.retry_interval).to eq(0.5)
+ end

- it 'should write some messages uncompressed' do
- subject = LogStash::Plugin.lookup("output", "webhdfs").new(current_config)
- subject.register
- for _ in 0..499
- subject.receive(event)
+ it 'should have default retry_times' do
+ expect(subject.retry_times).to eq(5)
  end
- subject.teardown
- insist { client.read(current_logfile_name).lines.count } == 500
- end

- it 'should write some messages gzip compressed' do
- current_logfile_name = current_logfile_name + ".gz"
- current_config['compression'] = 'gzip'
- subject = LogStash::Plugin.lookup("output", "webhdfs").new(current_config)
- subject.register
- for _ in 0..499
- subject.receive(event)
+ it 'should have default snappy_bufsize' do
+ expect(subject.snappy_bufsize).to eq(32768)
  end
- subject.teardown
- insist { Zlib::Inflate.new(window_bits=47).inflate(client.read(current_logfile_name)).lines.count } == 500
- end

- after :each do
- client.delete(current_logfile_name)
- end
+ it 'should have default snappy_format' do
+ expect(subject.snappy_format).to eq('stream')
+ end

+ end
  end
-
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: logstash-output-webhdfs
  version: !ruby/object:Gem::Version
- version: 0.0.2
+ version: 0.1.0
  platform: ruby
  authors:
- - Björn Puttmann, loshkovskyi
+ - Björn Puttmann, loshkovskyi, Elastic
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2015-05-22 00:00:00.000000000 Z
+ date: 2015-08-02 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: logstash-core
@@ -72,6 +72,34 @@ dependencies:
  version: '0'
  prerelease: false
  type: :development
+ - !ruby/object:Gem::Dependency
+ name: logstash-codec-line
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ prerelease: false
+ type: :development
+ - !ruby/object:Gem::Dependency
+ name: logstash-codec-json
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ prerelease: false
+ type: :development
  description: This gem is a logstash plugin required to be installed on top of the Logstash core pipeline using $LS_HOME/bin/plugin install gemname. This gem is not a stand-alone program
  email: b.puttmann@dbap.de
  executables: []
@@ -79,12 +107,17 @@ extensions: []
  extra_rdoc_files: []
  files:
  - .gitignore
+ - CHANGELOG.md
+ - CONTRIBUTORS
  - Gemfile
  - LICENSE
+ - NOTICE.TXT
  - README.md
  - Rakefile
  - lib/logstash/outputs/webhdfs.rb
+ - lib/logstash/outputs/webhdfs_helper.rb
  - logstash-output-webhdfs.gemspec
+ - spec/integration/webhdfs_spec.rb
  - spec/outputs/webhdfs_spec.rb
  homepage: http://www.dbap.de
  licenses:
@@ -108,9 +141,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  version: '0'
  requirements: []
  rubyforge_project:
- rubygems_version: 2.2.4
+ rubygems_version: 2.1.9
  signing_key:
  specification_version: 4
  summary: Plugin to write events to hdfs via webhdfs.
  test_files:
+ - spec/integration/webhdfs_spec.rb
  - spec/outputs/webhdfs_spec.rb