logstash-output-webhdfs 3.1.0-java
- checksums.yaml +7 -0
- data/CHANGELOG.md +31 -0
- data/CONTRIBUTORS +18 -0
- data/Gemfile +11 -0
- data/LICENSE +202 -0
- data/NOTICE.TXT +5 -0
- data/README.md +98 -0
- data/docs/index.asciidoc +296 -0
- data/lib/logstash/outputs/webhdfs.rb +261 -0
- data/lib/logstash/outputs/webhdfs_helper.rb +119 -0
- data/lib/logstash-output-webhdfs_jars.rb +4 -0
- data/logstash-output-webhdfs.gemspec +32 -0
- data/spec/integration/webhdfs_spec.rb +130 -0
- data/spec/outputs/webhdfs_helper_spec.rb +37 -0
- data/spec/outputs/webhdfs_spec.rb +71 -0
- data/vendor/jar-dependencies/org/xerial/snappy/snappy-java/1.1.10.5/snappy-java-1.1.10.5.jar +0 -0
- metadata +141 -0
data/lib/logstash/outputs/webhdfs.rb
@@ -0,0 +1,261 @@
# encoding: utf-8
require "logstash/namespace"
require "logstash/outputs/base"
require "stud/buffer"
require "logstash/outputs/webhdfs_helper"

# This plugin sends Logstash events into files in HDFS via
# the https://hadoop.apache.org/docs/r1.0.4/webhdfs.html[webhdfs] REST API.
#
# ==== Dependencies
# This plugin has no dependency on jars from hadoop, thus reducing configuration and compatibility
# problems. It uses the webhdfs gem from Kazuki Ohta and TAGOMORI Satoshi (@see: https://github.com/kzk/webhdfs).
# zlib is an optional dependency, needed only if you use the gzip compression functionality.
#
# ==== Operational Notes
# If you get an error like:
#
#   Max write retries reached. Exception: initialize: name or service not known {:level=>:error}
#
# make sure that the hostname of your namenode is resolvable on the host running Logstash. When creating/appending
# to a file, webhdfs sometimes sends a `307 TEMPORARY_REDIRECT` with the `HOSTNAME` of the machine it is running on.
#
# ==== Usage
# This is an example of Logstash config:
#
# [source,ruby]
# ----------------------------------
# input {
#   ...
# }
# filter {
#   ...
# }
# output {
#   webhdfs {
#     host => "127.0.0.1"                                             # (required)
#     port => 50070                                                   # (optional, default: 50070)
#     path => "/user/logstash/dt=%{+YYYY-MM-dd}/logstash-%{+HH}.log"  # (required)
#     user => "hue"                                                   # (required)
#   }
# }
# ----------------------------------

class LogStash::Outputs::WebHdfs < LogStash::Outputs::Base

  include Stud::Buffer
  include LogStash::Outputs::WebHdfsHelper

  config_name "webhdfs"

  MAGIC = "\x82SNAPPY\x0".force_encoding Encoding::ASCII_8BIT
  DEFAULT_VERSION = 1
  MINIMUM_COMPATIBLE_VERSION = 1

  # The server name for webhdfs/httpfs connections.
  config :host, :validate => :string, :required => true

  # The server port for webhdfs/httpfs connections.
  config :port, :validate => :number, :default => 50070

  # Standby namenode for HA HDFS.
  config :standby_host, :validate => :string, :default => false

  # Standby namenode port for HA HDFS.
  config :standby_port, :validate => :number, :default => 50070

  # The username for webhdfs.
  config :user, :validate => :string, :required => true

  # The path to the file to write to. Event fields can be used here,
  # as well as date fields in the joda time format, e.g.:
  # `/user/logstash/dt=%{+YYYY-MM-dd}/%{@source_host}-%{+HH}.log`
  config :path, :validate => :string, :required => true

  # Send data to webhdfs at this interval, in seconds.
  config :idle_flush_time, :validate => :number, :default => 1

  # Send data to webhdfs once the event count exceeds this value, even if `idle_flush_time` has not been reached.
  config :flush_size, :validate => :number, :default => 500

  # WebHdfs open timeout, default 30s.
  config :open_timeout, :validate => :number, :default => 30

  # The WebHdfs read timeout, default 30s.
  config :read_timeout, :validate => :number, :default => 30

  # Use httpfs mode if set to true, else webhdfs.
  config :use_httpfs, :validate => :boolean, :default => false

  # Avoid appending to the same file from multiple threads.
  # This solves some problems with multiple logstash output threads and locked file leases in webhdfs.
  # If this option is set to true, %{[@metadata][thread_id]} needs to be used in the path config setting.
  config :single_file_per_thread, :validate => :boolean, :default => false

  # Retry some known webhdfs errors. These may be caused by race conditions when appending to the same file, etc.
  config :retry_known_errors, :validate => :boolean, :default => true

  # How long to wait between retries, in seconds.
  config :retry_interval, :validate => :number, :default => 0.5

  # How many times to retry. If retry_times is exceeded, an error will be logged and the event will be discarded.
  config :retry_times, :validate => :number, :default => 5

  # Compress output. One of ['none', 'snappy', 'gzip'].
  config :compression, :validate => ["none", "snappy", "gzip"], :default => "none"

  # Set snappy chunksize. Only necessary for stream format. Defaults to 32k. Max is 65536.
  # @see http://code.google.com/p/snappy/source/browse/trunk/framing_format.txt
  config :snappy_bufsize, :validate => :number, :default => 32768

  # Set snappy format. One of "stream", "file". Set to "stream" to be hive compatible.
  config :snappy_format, :validate => ["stream", "file"], :default => "stream"

  # Set kerberos authentication.
  config :use_kerberos_auth, :validate => :boolean, :default => false

  # Set kerberos keytab file. Note that the gssapi library needs to be available to use this.
  config :kerberos_keytab, :validate => :string

  # Set ssl authentication. Note that the openssl library needs to be available to use this.
  config :use_ssl_auth, :validate => :boolean, :default => false

  # Set ssl key file.
  config :ssl_key, :validate => :string

  # Set ssl cert file.
  config :ssl_cert, :validate => :string

  ## Set codec.
  default :codec, 'line'

  public

  def register
    load_module('webhdfs')

    # In case of snappy, the jars are already included and no wrapper module has to be loaded.
    if @compression == "gzip"
      load_module('zlib')
    end
    @main_namenode_failed = false
    @standby_client = false
    @files = {}
    # Create and test standby client if configured.
    if @standby_host
      @standby_client = prepare_client(@standby_host, @standby_port, @user)
      begin
        test_client(@standby_client)
      rescue => e
        logger.warn("Could not connect to standby namenode #{@standby_client.host}. Error: #{e.message}. Trying main webhdfs namenode.")
      end
    end
    @client = prepare_client(@host, @port, @user)
    begin
      test_client(@client)
    rescue => e
      # If no standby host is configured, we need to exit here.
      if not @standby_host
        raise
      else
        # If a standby host is configured, try this before giving up.
        logger.error("Could not connect to #{@client.host}:#{@client.port}. Error: #{e.message}")
        do_failover
      end
    end
    # Make sure @path contains the %{[@metadata][thread_id]} format value if @single_file_per_thread is set to true.
    if @single_file_per_thread and !@path.include? "%{[@metadata][thread_id]}"
      @logger.error("Please set the %{[@metadata][thread_id]} format value in @path if @single_file_per_thread is active.")
      raise LogStash::ConfigurationError
    end
    buffer_initialize(
      :max_items => @flush_size,
      :max_interval => @idle_flush_time,
      :logger => @logger
    )
    @codec.on_event do |event, encoded_event|
      encoded_event
    end
  end # def register

  def receive(event)
    buffer_receive(event)
  end # def receive

  def flush(events=nil, close=false)
    return if not events
    newline = "\n"
    output_files = Hash.new { |hash, key| hash[key] = "" }
    events.collect do |event|
      # Add thread_id to event metadata to be used as a format value in the path configuration.
      if @single_file_per_thread
        event.set("[@metadata][thread_id]", Thread.current.object_id.to_s)
      end
      path = event.sprintf(@path)
      event_as_string = @codec.encode(event)
      event_as_string += newline unless event_as_string.end_with? newline
      output_files[path] << event_as_string
    end
    output_files.each do |path, output|
      if @compression == "gzip"
        path += ".gz"
        output = compress_gzip(output)
      elsif @compression == "snappy"
        path += ".snappy"
        if @snappy_format == "file"
          output = compress_snappy_file(output)
        else
          output = compress_snappy_stream(output)
        end
      end
      write_data(path, output)
    end
  end

  def write_data(path, data)
    # Retry up to @retry_times. This can solve problems like leases being held by another process,
    # which sadly is no KNOWN_ERROR in ruby's webhdfs client.
    write_tries = 0
    begin
      # Try to append to an already existing file, which will work most of the time.
      @client.append(path, data)
    # File does not exist, so create it.
    rescue WebHDFS::FileNotFoundError
      # Add snappy header if format is "file".
      if @compression == "snappy" and @snappy_format == "file"
        @client.create(path, get_snappy_header! + data)
      else
        @client.create(path, data)
      end
    # Handle other write errors and retry to write max. @retry_times.
    rescue => e
      # Handle StandbyException and do failover. Still we want to exit if write_tries >= @retry_times.
      if @standby_client && (e.message.match(/Failed to connect to host/) || e.message.match(/StandbyException/))
        do_failover
        write_tries += 1
        retry
      end
      if write_tries < @retry_times
        @logger.warn("webhdfs write caused an exception: #{e.message}. Maybe you should increase retry_interval or reduce number of workers. Retrying...")
        sleep(@retry_interval * write_tries)
        write_tries += 1
        retry
      else
        # Issue error after max retries.
        @logger.error("Max write retries reached. Events will be discarded. Exception: #{e.message}")
      end
    end
  end

  def do_failover
    if not @standby_client
      return
    end
    @logger.warn("Failing over from #{@client.host}:#{@client.port} to #{@standby_client.host}:#{@standby_client.port}.")
    @client, @standby_client = @standby_client, @client
  end

  def close
    buffer_flush(:final => true)
  end # def close
end # class LogStash::Outputs::WebHdfs
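For reference, here is a fuller pipeline sketch exercising the options defined above. All hostnames and values are illustrative, not recommendations; only `host`, `path`, and `user` are required.

output {
  webhdfs {
    host => "namenode.example.org"
    standby_host => "standby.example.org"   # optional HA failover target
    port => 50070
    user => "hue"
    path => "/user/logstash/dt=%{+YYYY-MM-dd}/%{[@metadata][thread_id]}-%{+HH}.log"
    single_file_per_thread => true          # requires %{[@metadata][thread_id]} in path
    flush_size => 500
    idle_flush_time => 1
    retry_times => 5
    retry_interval => 0.5
    compression => "snappy"
    snappy_format => "stream"               # "stream" is hive compatible
  }
}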
data/lib/logstash/outputs/webhdfs_helper.rb
@@ -0,0 +1,119 @@
require "logstash/namespace"

module LogStash
  module Outputs
    module WebHdfsHelper

      # Load a module.
      # @param module_name [String] A module name
      # @raise [LoadError] If the module could not be loaded
      def load_module(module_name)
        begin
          require module_name
        rescue LoadError
          @logger.error("Module #{module_name} could not be loaded.")
          raise
        end
      end

      # Set up a WebHDFS client.
      # @param host [String] The WebHDFS location
      # @param port [Number] The port used for communication
      # @param username [String] A valid HDFS user
      # @return [WebHDFS] A configured client instance
      def prepare_client(host, port, username)
        client = WebHDFS::Client.new(host, port, username)
        if @use_kerberos_auth
          require 'gssapi'
          client.kerberos = true
          client.kerberos_keytab = @kerberos_keytab
        end
        if @use_ssl_auth
          require 'openssl'
          client.ssl = true
          client.ssl_key = OpenSSL::PKey::RSA.new(open(@ssl_key))
          client.ssl_cert = OpenSSL::X509::Certificate.new(open(@ssl_cert))
        end
        client.httpfs_mode = @use_httpfs
        client.open_timeout = @open_timeout
        client.read_timeout = @read_timeout
        client.retry_known_errors = @retry_known_errors
        client.retry_interval = @retry_interval if @retry_interval
        client.retry_times = @retry_times if @retry_times
        client
      end

      # Test the client connection.
      # @param client [WebHDFS] webhdfs client object.
      def test_client(client)
        begin
          client.list('/')
        rescue => e
          @logger.error("Webhdfs check request failed. (namenode: #{client.host}:#{client.port}, Exception: #{e.message})")
          raise
        end
      end

      # Compress data using gzip.
      # @param data [String] stream of data to be compressed
      # @return [String] the compressed stream of data
      def compress_gzip(data)
        buffer = StringIO.new('', 'w')
        compressor = Zlib::GzipWriter.new(buffer)
        begin
          compressor.write(data)
        ensure
          compressor.close()
        end
        buffer.string
      end

      # Compress data using the snappy file format.
      # @param data [binary] stream of data to be compressed
      # @return [String] the compressed stream of data
      def compress_snappy_file(data)
        # Encode data to ASCII_8BIT (binary).
        data = data.encode(Encoding::ASCII_8BIT, "binary", :undef => :replace)
        buffer = StringIO.new('', 'w')
        buffer.set_encoding(Encoding::ASCII_8BIT)
        compressed = snappy_deflate(data)
        buffer << [compressed.size, compressed].pack("Na*")
        buffer.string
      end

      def snappy_deflate(input)
        raw_bytes = input.bytes.to_java :byte # needed to force the instance to be a byte[] and match the argument type of the subsequent Snappy call

        compressed = Java::org.xerial.snappy.Snappy.compress(raw_bytes)

        String.from_java_bytes(compressed)
      end

      def snappy_inflate(input)
        raw_bytes = input.bytes.to_java :byte # needed to force the instance to be a byte[] and match the argument type of the subsequent Snappy call
        uncompressed_length = Java::org.xerial.snappy.Snappy.uncompressedLength(raw_bytes, 0, raw_bytes.length)
        uncompressed = Java::byte[uncompressed_length].new
        Java::org.xerial.snappy.Snappy.uncompress(raw_bytes, 0, raw_bytes.length, uncompressed, 0)

        String.from_java_bytes(uncompressed)
      end

      def compress_snappy_stream(data)
        # Encode data to ASCII_8BIT (binary).
        data = data.encode(Encoding::ASCII_8BIT, "binary", :undef => :replace)
        buffer = StringIO.new
        buffer.set_encoding(Encoding::ASCII_8BIT)
        chunks = data.scan(/.{1,#{@snappy_bufsize}}/m)
        chunks.each do |chunk|
          compressed = snappy_deflate(chunk)
          buffer << [chunk.size, compressed.size, compressed].pack("NNa*")
        end
        return buffer.string
      end

      def get_snappy_header!
        [MAGIC, DEFAULT_VERSION, MINIMUM_COMPATIBLE_VERSION].pack("a8NN")
      end

    end
  end
end
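A note on the two snappy layouts above: `compress_snappy_file` writes a single `<compressed size (32-bit)> <compressed bytes>` record (preceded by the `get_snappy_header!` preamble when the output plugin creates the file), while `compress_snappy_stream` writes one `<raw size> <compressed size> <compressed bytes>` frame per `@snappy_bufsize` chunk. As a minimal sketch, a hypothetical reader for the stream framing (not part of this gem) could look like:

# Hypothetical reader for the framing written by compress_snappy_stream.
# Each frame is: 4-byte raw size, 4-byte compressed size, compressed bytes
# (both sizes big-endian, matching the pack("NNa*") call above).
def each_snappy_frame(buffer)
  offset = 0
  while offset < buffer.bytesize
    raw_size, compressed_size = buffer[offset, 8].unpack("NN")
    yield raw_size, buffer[offset + 8, compressed_size]
    offset += 8 + compressed_size
  end
end

Each yielded payload can then be passed to `snappy_inflate`, much as the helper spec below strips the 4-byte length prefix of the `file` format before inflating.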
data/logstash-output-webhdfs.gemspec
@@ -0,0 +1,32 @@
# encoding: utf-8
Gem::Specification.new do |s|

  s.name = 'logstash-output-webhdfs'
  s.version = '3.1.0'
  s.licenses = ['Apache License (2.0)']
  s.summary = "Sends Logstash events to HDFS using the `webhdfs` REST API"
  s.description = "This gem is a Logstash plugin required to be installed on top of the Logstash core pipeline using $LS_HOME/bin/logstash-plugin install gemname. This gem is not a stand-alone program"
  s.authors = ["Björn Puttmann, loshkovskyi, Elastic"]
  s.email = 'b.puttmann@dbap.de'
  s.homepage = "http://www.dbap.de"
  s.require_paths = ['lib', 'vendor/jar-dependencies']

  # Files
  s.files = Dir["lib/**/*","spec/**/*","*.gemspec","*.md","CONTRIBUTORS","Gemfile","LICENSE","NOTICE.TXT", "vendor/jar-dependencies/**/*.jar", "vendor/jar-dependencies/**/*.rb", "VERSION", "docs/**/*"]

  # Tests
  s.test_files = s.files.grep(%r{^(test|spec|features)/})

  # Special flag to let us know this is actually a logstash plugin
  s.metadata = { "logstash_plugin" => "true", "logstash_group" => "output" }

  # Gem dependencies
  s.add_runtime_dependency "logstash-core-plugin-api", ">= 1.60", "<= 2.99"
  s.add_runtime_dependency 'webhdfs'
  s.add_development_dependency 'logstash-devutils'

  s.add_development_dependency 'logstash-codec-line'
  s.add_development_dependency 'logstash-codec-json'

  s.platform = 'java'
end
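As the description above notes, the gem is not a stand-alone program: it is installed into an existing Logstash with `$LS_HOME/bin/logstash-plugin install logstash-output-webhdfs`. The `java` platform and the extra `vendor/jar-dependencies` require path are what ship the bundled snappy-java jar (see the file list at the top) alongside the Ruby sources.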
data/spec/integration/webhdfs_spec.rb
@@ -0,0 +1,130 @@
# encoding: utf-8
require 'logstash/devutils/rspec/spec_helper'
require 'logstash/outputs/webhdfs'
require 'webhdfs'
require 'json'

describe LogStash::Outputs::WebHdfs, :integration => true do
  let(:host) { 'localhost' }
  let(:port) { 50070 }
  let(:user) { 'test' }
  let(:test_file) { '/user/' + user + '/%{host}.test' }
  let(:hdfs_file_name) { 'user/' + user + '/localhost.test' }

  let(:config) { { 'host' => host, 'user' => user, 'path' => test_file, 'compression' => 'none' } }

  subject(:plugin) { LogStash::Plugin.lookup("output", "webhdfs").new(config) }

  let(:webhdfs_client) { WebHDFS::Client.new(host, port, user) }

  let(:event) { LogStash::Event.new('message' => 'Hello world!', 'source' => 'out of the blue',
                                    'type' => 'generator', 'host' => 'localhost' ) }

  describe "register and close" do

    it 'should register with default values' do
      expect { subject.register }.to_not raise_error
    end

  end

  describe '#write' do

    let(:config) { { 'host' => host, 'user' => user, 'flush_size' => 10,
                     'path' => test_file, 'compression' => 'none' } }

    after(:each) do
      webhdfs_client.delete(hdfs_file_name)
    end

    describe "writing plain files" do

      before(:each) do
        subject.register
        subject.receive(event)
        subject.close
      end

      it 'should use the correct filename pattern' do
        expect { webhdfs_client.read(hdfs_file_name) }.to_not raise_error
      end

      context "using the line codec without format" do

        let(:config) { { 'host' => host, 'user' => user, 'flush_size' => 10,
                         'path' => test_file, 'compression' => 'none', 'codec' => 'line' } }

        it 'should match the event data' do
          expect(webhdfs_client.read(hdfs_file_name).strip()).to eq(event.to_s)
        end

      end

      context "using the json codec" do

        let(:config) { { 'host' => host, 'user' => user, 'flush_size' => 10,
                         'path' => test_file, 'compression' => 'none', 'codec' => 'json' } }

        it 'should match the event data' do
          expect(webhdfs_client.read(hdfs_file_name).strip()).to eq(event.to_json)
        end

      end

      context "when flushing events" do

        let(:config) { { 'host' => host, 'user' => user, 'flush_size' => 10, 'idle_flush_time' => 2,
                         'path' => test_file, 'compression' => 'none', 'codec' => 'json' } }

        before(:each) do
          webhdfs_client.delete(hdfs_file_name)
        end

        it 'should flush after configured idle time' do
          subject.register
          subject.receive(event)
          expect { webhdfs_client.read(hdfs_file_name) }.to raise_error(WebHDFS::FileNotFoundError)
          sleep 3
          expect { webhdfs_client.read(hdfs_file_name) }.to_not raise_error
          expect(webhdfs_client.read(hdfs_file_name).strip()).to eq(event.to_json)
        end

      end

    end

    describe "#compression" do

      before(:each) do
        subject.register
        for _ in 0...500
          subject.receive(event)
        end
        subject.close
      end

      context "when using no compression" do

        let(:config) { { 'host' => host, 'user' => user, 'flush_size' => 10,
                         'path' => test_file, 'compression' => 'none', 'codec' => 'line' } }

        it 'should write some messages uncompressed' do
          expect(webhdfs_client.read(hdfs_file_name).lines.count).to eq(500)
        end

      end

      context "when using gzip compression" do

        let(:config) { { 'host' => host, 'user' => user,
                         'path' => test_file, 'compression' => 'gzip', 'codec' => 'line' } }

        it 'should write some messages gzip compressed' do
          # window_bits 47 enables gzip decoding.
          expect(Zlib::Inflate.new(47).inflate(webhdfs_client.read("#{hdfs_file_name}.gz")).lines.count).to eq(500)
          webhdfs_client.delete("#{hdfs_file_name}.gz")
        end
      end
    end
  end
end
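Note that these examples assume a namenode reachable at `localhost:50070` with a `test` user allowed to write under `/user/test`; the `:integration => true` tag is conventionally used to keep them out of the default unit-test run.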
data/spec/outputs/webhdfs_helper_spec.rb
@@ -0,0 +1,37 @@
# encoding: utf-8
require 'logstash/devutils/rspec/spec_helper'
require 'logstash/outputs/webhdfs'
require 'webhdfs'
require 'logstash-output-webhdfs_jars'


describe "webhdfs helpers" do

  let(:host) { 'localhost' }
  let(:user) { 'hadoop' }
  let(:path) { '/test.log' }

  let(:config) { { 'host' => host, 'user' => user, 'path' => path, 'compression' => 'none' } }

  let(:sample_data) { "Something very very very long to compress" }

  subject(:plugin) { LogStash::Plugin.lookup("output", "webhdfs").new(config) }

  context "when compressing using vendor snappy" do
    it "should return a valid byte array" do
      compressed = subject.compress_snappy_file(sample_data)

      expect(compressed).not_to be(:nil)
    end

    it "should contain all the data" do
      compressed = subject.compress_snappy_file(sample_data)

      # Remove the length integer (32 bit, 4 bytes) added by compress_snappy_file from compressed.
      uncompressed = subject.snappy_inflate(compressed[4..-1])

      expect(uncompressed).to eq(sample_data)
    end
  end
end
data/spec/outputs/webhdfs_spec.rb
@@ -0,0 +1,71 @@
# encoding: utf-8
require 'logstash/devutils/rspec/spec_helper'
require 'logstash/outputs/webhdfs'
require 'webhdfs'
require 'json'

describe 'outputs/webhdfs' do

  let(:host) { 'localhost' }
  let(:user) { 'hadoop' }
  let(:path) { '/test.log' }

  let(:config) { { 'host' => host, 'user' => user, 'path' => path, 'compression' => 'none' } }

  subject(:plugin) { LogStash::Plugin.lookup("output", "webhdfs").new(config) }

  describe '#initializing' do

    it 'should fail to register without required values' do
      plugin = LogStash::Plugin.lookup("output", "webhdfs")
      expect { plugin.new }.to raise_error(LogStash::ConfigurationError)
    end

    context "default values" do

      it 'should have default port' do
        expect(subject.port).to eq(50070)
      end

      it 'should have default idle_flush_time' do
        expect(subject.idle_flush_time).to eq(1)
      end

      it 'should have default flush_size' do
        expect(subject.flush_size).to eq(500)
      end

      it 'should have default open_timeout' do
        expect(subject.open_timeout).to eq(30)
      end

      it 'should have default read_timeout' do
        expect(subject.read_timeout).to eq(30)
      end

      it 'should have default use_httpfs' do
        expect(subject.use_httpfs).to eq(false)
      end

      it 'should have default retry_known_errors' do
        expect(subject.retry_known_errors).to eq(true)
      end

      it 'should have default retry_interval' do
        expect(subject.retry_interval).to eq(0.5)
      end

      it 'should have default retry_times' do
        expect(subject.retry_times).to eq(5)
      end

      it 'should have default snappy_bufsize' do
        expect(subject.snappy_bufsize).to eq(32768)
      end

      it 'should have default snappy_format' do
        expect(subject.snappy_format).to eq('stream')
      end

    end
  end
end