logstash-output-webhdfs 0.0.2 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -19
- data/CHANGELOG.md +2 -0
- data/CONTRIBUTORS +9 -0
- data/LICENSE +13 -13
- data/NOTICE.TXT +5 -0
- data/README.md +76 -31
- data/lib/logstash/outputs/webhdfs.rb +71 -149
- data/lib/logstash/outputs/webhdfs_helper.rb +82 -0
- data/logstash-output-webhdfs.gemspec +7 -2
- data/spec/integration/webhdfs_spec.rb +131 -0
- data/spec/outputs/webhdfs_spec.rb +41 -100
- metadata +38 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4d8c5f3670364dd6a301dfdf281b8265a7020a2d
|
4
|
+
data.tar.gz: e6abb77dd5507ae0d1388ab28989baaf4d517d8d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1f27095ed386f4984557345179bae49757312fbbd8e45fd1070ca1c24f0b3aac4deaaa8303aa279248a406f895260ba39067979548ff0e889b74df1f282a11bc
|
7
|
+
data.tar.gz: 89dd5efbb67f0ec3a2ce61446fe9962661bfb8933ceb94e792d6669b14e01b518658e266eb9f2acf6755761efe360c6c15bc8cabf7d08caca2128f31ea5f1c4f
|
data/.gitignore
CHANGED
data/CHANGELOG.md
ADDED
data/CONTRIBUTORS
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
The following is a list of people who have contributed ideas, code, bug
|
2
|
+
reports, or in general have helped logstash along its way.
|
3
|
+
|
4
|
+
Contributors:
|
5
|
+
|
6
|
+
Note: If you've sent us patches, bug reports, or otherwise contributed to
|
7
|
+
Logstash, and you aren't on the list above and want to be, please let us know
|
8
|
+
and we'll make sure you're here. Contributions from folks like you are what make
|
9
|
+
open source awesome.
|
data/LICENSE
CHANGED
@@ -1,13 +1,13 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
1
|
+
Copyright (c) 2012-2015 Elasticsearch <http://www.elasticsearch.org>
|
2
|
+
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
you may not use this file except in compliance with the License.
|
5
|
+
You may obtain a copy of the License at
|
6
|
+
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
See the License for the specific language governing permissions and
|
13
|
+
limitations under the License.
|
data/NOTICE.TXT
ADDED
data/README.md
CHANGED
@@ -1,41 +1,86 @@
|
|
1
|
-
|
2
|
-
================
|
1
|
+
# Logstash Plugin
|
3
2
|
|
4
|
-
|
5
|
-
|
6
|
-
Tested with v1.3.3, v1.4.0 and 1.5.0.
|
3
|
+
This is a plugin for [Logstash](https://github.com/elasticsearch/logstash).
|
7
4
|
|
8
5
|
It is fully free and fully open source. The license is Apache 2.0, meaning you are pretty much free to use it however you want in whatever way.
|
9
6
|
|
10
|
-
|
7
|
+
## Documentation
|
8
|
+
|
9
|
+
Logstash provides infrastructure to automatically generate documentation for this plugin. We use the asciidoc format to write documentation so any comments in the source code will be first converted into asciidoc and then into html. All plugin documentation is placed under one [central location](http://www.elasticsearch.org/guide/en/logstash/current/).
|
10
|
+
|
11
|
+
- For formatting code or config example, you can use the asciidoc `[source,ruby]` directive
|
12
|
+
- For more asciidoc formatting tips, see the excellent reference here https://github.com/elasticsearch/docs#asciidoc-guide
|
13
|
+
|
14
|
+
## Need Help?
|
15
|
+
|
16
|
+
Need help? Try #logstash on freenode IRC or the https://discuss.elastic.co/c/logstash discussion forum.
|
17
|
+
|
18
|
+
## Developing
|
19
|
+
|
20
|
+
### 1. Plugin Development and Testing
|
11
21
|
|
12
|
-
|
22
|
+
#### Code
|
23
|
+
- To get started, you'll need JRuby with the Bundler gem installed.
|
13
24
|
|
14
|
-
|
15
|
-
|
25
|
+
- Create a new plugin or clone an existing one from the GitHub [logstash-plugins](https://github.com/logstash-plugins) organization. We also provide [example plugins](https://github.com/logstash-plugins?query=example).
|
26
|
+
|
27
|
+
- Install dependencies
|
28
|
+
```sh
|
29
|
+
bundle install
|
16
30
|
```
|
17
|
-
|
31
|
+
|
32
|
+
#### Test
|
33
|
+
|
34
|
+
- Update your dependencies
|
35
|
+
|
36
|
+
```sh
|
37
|
+
bundle install
|
18
38
|
```
|
19
39
|
|
20
|
-
|
40
|
+
- Run tests
|
41
|
+
|
42
|
+
```sh
|
43
|
+
bundle exec rspec
|
44
|
+
```
|
45
|
+
|
46
|
+
### 2. Running your unpublished Plugin in Logstash
|
47
|
+
|
48
|
+
#### 2.1 Run in a local Logstash clone
|
49
|
+
|
50
|
+
- Edit Logstash `Gemfile` and add the local plugin path, for example:
|
51
|
+
```ruby
|
52
|
+
gem "logstash-filter-awesome", :path => "/your/local/logstash-filter-awesome"
|
53
|
+
```
|
54
|
+
- Install plugin
|
55
|
+
```sh
|
56
|
+
bin/plugin install --no-verify
|
57
|
+
```
|
58
|
+
- Run Logstash with your plugin
|
59
|
+
```sh
|
60
|
+
bin/logstash -e 'filter {awesome {}}'
|
61
|
+
```
|
62
|
+
At this point any modifications to the plugin code will be applied to this local Logstash setup. After modifying the plugin, simply rerun Logstash.
|
63
|
+
|
64
|
+
#### 2.2 Run in an installed Logstash
|
65
|
+
|
66
|
+
You can use the same **2.1** method to run your plugin in an installed Logstash by editing its `Gemfile` and pointing the `:path` to your local plugin development directory or you can build the gem and install it using:
|
67
|
+
|
68
|
+
- Build your plugin gem
|
69
|
+
```sh
|
70
|
+
gem build logstash-filter-awesome.gemspec
|
71
|
+
```
|
72
|
+
- Install the plugin from the Logstash home
|
73
|
+
```sh
|
74
|
+
bin/plugin install /your/local/plugin/logstash-filter-awesome.gem
|
75
|
+
```
|
76
|
+
- Start Logstash and proceed to test the plugin
|
77
|
+
|
78
|
+
## Contributing
|
79
|
+
|
80
|
+
All contributions are welcome: ideas, patches, documentation, bug reports, complaints, and even something you drew up on a napkin.
|
81
|
+
|
82
|
+
Programming is not a required skill. Whatever you've seen about open source and maintainers or community members saying "send patches or die" - you will not see that here.
|
83
|
+
|
84
|
+
It is more important to the community that you are able to contribute.
|
21
85
|
|
22
|
-
|
23
|
-
|
24
|
-
output {
|
25
|
-
webhdfs {
|
26
|
-
workers => 2
|
27
|
-
server => "your.nameno.de:14000"
|
28
|
-
user => "flume"
|
29
|
-
path => "/user/flume/logstash/dt=%{+Y}-%{+M}-%{+d}/logstash-%{+H}.log"
|
30
|
-
flush_size => 500
|
31
|
-
compression => "snappy"
|
32
|
-
idle_flush_time => 10
|
33
|
-
retry_interval => 0.5
|
34
|
-
}
|
35
|
-
}
|
36
|
-
|
37
|
-
For a complete list of options, see config section in source code.
|
38
|
-
|
39
|
-
This plugin has dependencies on:
|
40
|
-
* webhdfs module @<https://github.com/kzk/webhdfs>
|
41
|
-
* snappy module @<https://github.com/miyucy/snappy>
|
86
|
+
For more information about contributing, see the [CONTRIBUTING](https://github.com/elasticsearch/logstash/blob/master/CONTRIBUTING.md) file.
|
@@ -2,76 +2,71 @@
|
|
2
2
|
require "logstash/namespace"
|
3
3
|
require "logstash/outputs/base"
|
4
4
|
require "stud/buffer"
|
5
|
+
require "logstash/outputs/webhdfs_helper"
|
5
6
|
|
6
|
-
#
|
7
|
-
#
|
7
|
+
# This plugin sends Logstash events into files in HDFS via
|
8
|
+
# the https://hadoop.apache.org/docs/r1.0.4/webhdfs.html[webhdfs] REST API.
|
8
9
|
#
|
9
|
-
#
|
10
|
-
#
|
11
|
-
#
|
12
|
-
#
|
13
|
-
# problems.
|
10
|
+
# ==== Dependencies
|
11
|
+
# This plugin has no dependency on jars from hadoop, thus reducing configuration and compatibility
|
12
|
+
# problems. It uses the webhdfs gem from Kazuki Ohta and TAGOMORI Satoshi (@see: https://github.com/kzk/webhdfs).
|
13
|
+
# Optional dependencies are zlib and snappy gem if you use the compression functionality.
|
14
14
|
#
|
15
|
+
# ==== Operational Notes
|
15
16
|
# If you get an error like:
|
16
17
|
#
|
17
18
|
# Max write retries reached. Exception: initialize: name or service not known {:level=>:error}
|
18
19
|
#
|
19
|
-
# make sure
|
20
|
-
# to a file, webhdfs somtime sends a 307 TEMPORARY_REDIRECT with the HOSTNAME of the machine its running on.
|
20
|
+
# make sure that the hostname of your namenode is resolvable on the host running Logstash. When creating/appending
|
21
|
+
# to a file, webhdfs sometimes sends a `307 TEMPORARY_REDIRECT` with the `HOSTNAME` of the machine it's running on.
|
21
22
|
#
|
22
|
-
#
|
23
|
-
# This is an example of
|
23
|
+
# ==== Usage
|
24
|
+
# This is an example of Logstash config:
|
24
25
|
#
|
25
26
|
# [source,ruby]
|
26
27
|
# ----------------------------------
|
27
|
-
#
|
28
|
-
#
|
29
|
-
#
|
30
|
-
#
|
31
|
-
#
|
32
|
-
#
|
33
|
-
#
|
34
|
-
#
|
35
|
-
#
|
36
|
-
#
|
37
|
-
#
|
38
|
-
#
|
39
|
-
# compress => "snappy" # (optional)
|
40
|
-
# remove_at_timestamp => false # (optional)
|
28
|
+
# input {
|
29
|
+
# ...
|
30
|
+
# }
|
31
|
+
# filter {
|
32
|
+
# ...
|
33
|
+
# }
|
34
|
+
# output {
|
35
|
+
# webhdfs {
|
36
|
+
# server => "127.0.0.1:50070" # (required)
|
37
|
+
# path => "/user/logstash/dt=%{+YYYY-MM-dd}/logstash-%{+HH}.log" # (required)
|
38
|
+
# user => "hue" # (required)
|
39
|
+
# }
|
41
40
|
# }
|
42
41
|
# ----------------------------------
|
43
|
-
#
|
44
|
-
# Author: Bjoern Puttmann <b.puttmann@dbap.de> - dbap GmbH, Münster, Germany.
|
45
42
|
|
46
43
|
class LogStash::Outputs::WebHdfs < LogStash::Outputs::Base
|
44
|
+
|
47
45
|
include Stud::Buffer
|
46
|
+
include LogStash::Outputs::WebHdfsHelper
|
48
47
|
|
49
48
|
config_name "webhdfs"
|
50
|
-
milestone 1
|
51
49
|
|
52
|
-
|
53
|
-
MAGIC = "\x82SNAPPY\x0"
|
54
|
-
else
|
55
|
-
MAGIC = "\x82SNAPPY\x0".force_encoding Encoding::ASCII_8BIT
|
56
|
-
end
|
50
|
+
MAGIC = "\x82SNAPPY\x0".force_encoding Encoding::ASCII_8BIT
|
57
51
|
DEFAULT_VERSION = 1
|
58
52
|
MINIMUM_COMPATIBLE_VERSION = 1
|
59
53
|
|
60
|
-
# The server name
|
61
|
-
config :
|
54
|
+
# The server name for webhdfs/httpfs connections.
|
55
|
+
config :host, :validate => :string, :required => true
|
56
|
+
|
57
|
+
# The server port for webhdfs/httpfs connections.
|
58
|
+
config :port, :validate => :number, :default => 50070
|
62
59
|
|
63
60
|
# The Username for webhdfs.
|
64
|
-
config :user, :validate => :string, :required =>
|
61
|
+
config :user, :validate => :string, :required => true
|
65
62
|
|
66
63
|
# The path to the file to write to. Event fields can be used here,
|
67
64
|
# as well as date fields in the joda time format, e.g.:
|
68
|
-
#
|
69
|
-
# "/user/logstash/dt=%{+YYYY-MM-dd}/%{@source_host}-%{+HH}.log"
|
70
|
-
# ....
|
65
|
+
# `/user/logstash/dt=%{+YYYY-MM-dd}/%{@source_host}-%{+HH}.log`
|
71
66
|
config :path, :validate => :string, :required => true
|
72
67
|
|
73
68
|
# The format to use when writing events to the file. This value
|
74
|
-
# supports any string and can include
|
69
|
+
# supports any string and can include `%{name}` and other dynamic
|
75
70
|
# strings.
|
76
71
|
#
|
77
72
|
# If this setting is omitted, the full json representation of the
|
@@ -81,13 +76,13 @@ class LogStash::Outputs::WebHdfs < LogStash::Outputs::Base
|
|
81
76
|
# Sending data to webhdfs in x seconds intervals.
|
82
77
|
config :idle_flush_time, :validate => :number, :default => 1
|
83
78
|
|
84
|
-
# Sending data to webhdfs if event count is above, even if store_interval_in_secs is not reached.
|
79
|
+
# Send data to webhdfs once the buffered event count exceeds this value, even if `idle_flush_time` has not elapsed.
|
85
80
|
config :flush_size, :validate => :number, :default => 500
|
86
81
|
|
87
|
-
# WebHdfs open timeout, default 30s
|
82
|
+
# WebHdfs open timeout, default 30s.
|
88
83
|
config :open_timeout, :validate => :number, :default => 30
|
89
84
|
|
90
|
-
# The WebHdfs read timeout, default 30s
|
85
|
+
# The WebHdfs read timeout, default 30s.
|
91
86
|
config :read_timeout, :validate => :number, :default => 30
|
92
87
|
|
93
88
|
# Use httpfs mode if set to true, else webhdfs.
|
@@ -99,7 +94,7 @@ class LogStash::Outputs::WebHdfs < LogStash::Outputs::Base
|
|
99
94
|
# How long should we wait between retries.
|
100
95
|
config :retry_interval, :validate => :number, :default => 0.5
|
101
96
|
|
102
|
-
# How many times should we retry.
|
97
|
+
# How many times should we retry. If retry_times is exceeded, an error will be logged and the event will be discarded.
|
103
98
|
config :retry_times, :validate => :number, :default => 5
|
104
99
|
|
105
100
|
# Compress output. One of ['none', 'snappy', 'gzip']
|
@@ -112,82 +107,49 @@ class LogStash::Outputs::WebHdfs < LogStash::Outputs::Base
|
|
112
107
|
# Set snappy format. One of "stream", "file". Set to stream to be hive compatible.
|
113
108
|
config :snappy_format, :validate => ["stream", "file"], :default => "stream"
|
114
109
|
|
115
|
-
|
116
|
-
|
110
|
+
## Set codec.
|
111
|
+
default :codec, 'line'
|
117
112
|
|
118
113
|
public
|
119
114
|
|
120
115
|
def register
|
121
|
-
|
122
|
-
require 'webhdfs'
|
123
|
-
rescue LoadError
|
124
|
-
@logger.error("Module webhdfs could not be loaded.")
|
125
|
-
end
|
116
|
+
load_module('webhdfs')
|
126
117
|
if @compression == "gzip"
|
127
|
-
|
128
|
-
require "zlib"
|
129
|
-
rescue LoadError
|
130
|
-
@logger.error("Gzip compression selected but zlib module could not be loaded.")
|
131
|
-
end
|
118
|
+
load_module('zlib')
|
132
119
|
elsif @compression == "snappy"
|
133
|
-
|
134
|
-
require "snappy"
|
135
|
-
rescue LoadError
|
136
|
-
@logger.error("Snappy compression selected but snappy module could not be loaded.")
|
137
|
-
end
|
120
|
+
load_module('snappy')
|
138
121
|
end
|
139
122
|
@files = {}
|
140
|
-
@host, @port = @server.split(':')
|
141
123
|
@client = prepare_client(@host, @port, @user)
|
142
124
|
# Test client connection.
|
143
125
|
begin
|
144
126
|
@client.list('/')
|
145
127
|
rescue => e
|
146
128
|
@logger.error("Webhdfs check request failed. (namenode: #{@client.host}:#{@client.port}, Exception: #{e.message})")
|
129
|
+
raise
|
147
130
|
end
|
148
131
|
buffer_initialize(
|
149
|
-
|
150
|
-
|
151
|
-
|
132
|
+
:max_items => @flush_size,
|
133
|
+
:max_interval => @idle_flush_time,
|
134
|
+
:logger => @logger
|
152
135
|
)
|
136
|
+
@codec.on_event do |event, encoded_event|
|
137
|
+
encoded_event
|
138
|
+
end
|
153
139
|
end # def register
|
154
140
|
|
155
|
-
public
|
156
141
|
def receive(event)
|
157
142
|
return unless output?(event)
|
158
143
|
buffer_receive(event)
|
159
144
|
end # def receive
|
160
145
|
|
161
|
-
def prepare_client(host, port, username)
|
162
|
-
client = WebHDFS::Client.new(host, port, username)
|
163
|
-
if @use_httpfs
|
164
|
-
client.httpfs_mode = true
|
165
|
-
end
|
166
|
-
client.open_timeout = @open_timeout
|
167
|
-
client.read_timeout = @read_timeout
|
168
|
-
if @retry_known_errors
|
169
|
-
client.retry_known_errors = true
|
170
|
-
client.retry_interval = @retry_interval if @retry_interval
|
171
|
-
client.retry_times = @retry_times if @retry_times
|
172
|
-
end
|
173
|
-
client
|
174
|
-
end
|
175
|
-
|
176
146
|
def flush(events=nil, teardown=false)
|
177
147
|
return if not events
|
178
|
-
|
179
|
-
newline = "\n".freeze
|
148
|
+
newline = "\n"
|
180
149
|
output_files = Hash.new { |hash, key| hash[key] = "" }
|
181
150
|
events.collect do |event|
|
182
151
|
path = event.sprintf(@path)
|
183
|
-
|
184
|
-
event.remove("@timestamp")
|
185
|
-
end
|
186
|
-
if @message_format
|
187
|
-
event_as_string = event.sprintf(@message_format)
|
188
|
-
else
|
189
|
-
event_as_string = event.to_json
|
190
|
-
end
|
152
|
+
event_as_string = @codec.encode(event)
|
191
153
|
event_as_string += newline unless event_as_string.end_with? newline
|
192
154
|
output_files[path] << event_as_string
|
193
155
|
end
|
@@ -203,69 +165,18 @@ class LogStash::Outputs::WebHdfs < LogStash::Outputs::Base
|
|
203
165
|
output = compress_snappy_stream(output)
|
204
166
|
end
|
205
167
|
end
|
206
|
-
|
207
|
-
while write_tries < @retry_times do
|
208
|
-
begin
|
209
|
-
write_data(path, output)
|
210
|
-
break
|
211
|
-
rescue => e
|
212
|
-
write_tries += 1
|
213
|
-
# Retry max_retry times. This can solve problems like leases being hold by another process. Sadly this is no
|
214
|
-
# KNOWN_ERROR in rubys webhdfs client.
|
215
|
-
if write_tries < @retry_times
|
216
|
-
@logger.warn("Retrying webhdfs write for multiple times. Maybe you should increase retry_interval or reduce number of workers.")
|
217
|
-
sleep(@retry_interval * write_tries)
|
218
|
-
next
|
219
|
-
else
|
220
|
-
# Issue error after max retries.
|
221
|
-
@logger.error("Max write retries reached. Exception: #{e.message}")
|
222
|
-
end
|
223
|
-
end
|
224
|
-
end
|
168
|
+
write_data(path, output)
|
225
169
|
end
|
226
170
|
end
|
227
171
|
|
228
|
-
def compress_gzip(data)
|
229
|
-
buffer = StringIO.new('','w')
|
230
|
-
compressor = Zlib::GzipWriter.new(buffer)
|
231
|
-
begin
|
232
|
-
compressor.write data
|
233
|
-
ensure
|
234
|
-
compressor.close()
|
235
|
-
end
|
236
|
-
buffer.string
|
237
|
-
end
|
238
|
-
|
239
|
-
def compress_snappy_file(data)
|
240
|
-
# Encode data to ASCII_8BIT (binary)
|
241
|
-
data= data.encode(Encoding::ASCII_8BIT, "binary", :undef => :replace)
|
242
|
-
buffer = StringIO.new('', 'w')
|
243
|
-
buffer.set_encoding Encoding::ASCII_8BIT unless RUBY_VERSION =~ /^1\.8/
|
244
|
-
compressed = Snappy.deflate(data)
|
245
|
-
buffer << [compressed.size, compressed].pack("Na*")
|
246
|
-
buffer.string
|
247
|
-
end
|
248
|
-
|
249
|
-
def compress_snappy_stream(data)
|
250
|
-
# Encode data to ASCII_8BIT (binary)
|
251
|
-
data= data.encode(Encoding::ASCII_8BIT, "binary", :undef => :replace)
|
252
|
-
buffer = StringIO.new
|
253
|
-
buffer.set_encoding Encoding::ASCII_8BIT unless RUBY_VERSION =~ /^1\.8/
|
254
|
-
chunks = data.scan(/.{1,#{@snappy_bufsize}}/m)
|
255
|
-
chunks.each do |chunk|
|
256
|
-
compressed = Snappy.deflate(chunk)
|
257
|
-
buffer << [chunk.size, compressed.size, compressed].pack("NNa*")
|
258
|
-
end
|
259
|
-
return buffer.string
|
260
|
-
end
|
261
|
-
|
262
|
-
def get_snappy_header!
|
263
|
-
[MAGIC, DEFAULT_VERSION, MINIMUM_COMPATIBLE_VERSION].pack("a8NN")
|
264
|
-
end
|
265
|
-
|
266
172
|
def write_data(path, data)
|
173
|
+
# Retry max_retry times. This can solve problems like leases being held by another process. Sadly this is no
|
174
|
+
# KNOWN_ERROR in Ruby's webhdfs client.
|
175
|
+
write_tries = 0
|
267
176
|
begin
|
177
|
+
# Try to append to already existing file, which will work most of the times.
|
268
178
|
@client.append(path, data)
|
179
|
+
# File does not exist, so create it.
|
269
180
|
rescue WebHDFS::FileNotFoundError
|
270
181
|
# Add snappy header if format is "file".
|
271
182
|
if @compression == "snappy" and @snappy_format == "file"
|
@@ -273,10 +184,21 @@ class LogStash::Outputs::WebHdfs < LogStash::Outputs::Base
|
|
273
184
|
elsif
|
274
185
|
@client.create(path, data)
|
275
186
|
end
|
187
|
+
# Handle other write errors and retry to write max. @retry_times.
|
188
|
+
rescue => e
|
189
|
+
if write_tries < @retry_times
|
190
|
+
@logger.warn("webhdfs write caused an exception: #{e.message}. Maybe you should increase retry_interval or reduce number of workers. Retrying...")
|
191
|
+
sleep(@retry_interval * write_tries)
|
192
|
+
write_tries += 1
|
193
|
+
retry
|
194
|
+
else
|
195
|
+
# Issue error after max retries.
|
196
|
+
@logger.error("Max write retries reached. Events will be discarded. Exception: #{e.message}")
|
197
|
+
end
|
276
198
|
end
|
277
199
|
end
|
278
200
|
|
279
201
|
def teardown
|
280
202
|
buffer_flush(:final => true)
|
281
203
|
end # def teardown
|
282
|
-
end
|
204
|
+
end # class LogStash::Outputs::WebHdfs
|
@@ -0,0 +1,82 @@
|
|
1
|
+
require "logstash/namespace"
|
2
|
+
|
3
|
+
module LogStash
|
4
|
+
module Outputs
|
5
|
+
module WebHdfsHelper
|
6
|
+
|
7
|
+
# Load a module
|
8
|
+
# @param module_name [String] A module name
|
9
|
+
# @raise [LoadError] If the module could not be loaded
|
10
|
+
def load_module(module_name)
|
11
|
+
begin
|
12
|
+
require module_name
|
13
|
+
rescue LoadError
|
14
|
+
@logger.error("Module #{module_name} could not be loaded.")
|
15
|
+
raise
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
# Setup a WebHDFS client
|
20
|
+
# @param host [String] The WebHDFS location
|
21
|
+
# @param port [Number] The port used to do the communication
|
22
|
+
# @param username [String] A valid HDFS user
|
23
|
+
# @return [WebHDFS::Client] A configured client instance
|
24
|
+
def prepare_client(host, port, username)
|
25
|
+
client = WebHDFS::Client.new(host, port, username)
|
26
|
+
client.httpfs_mode = @use_httpfs
|
27
|
+
client.open_timeout = @open_timeout
|
28
|
+
client.read_timeout = @read_timeout
|
29
|
+
client.retry_known_errors = @retry_known_errors
|
30
|
+
client.retry_interval = @retry_interval if @retry_interval
|
31
|
+
client.retry_times = @retry_times if @retry_times
|
32
|
+
client
|
33
|
+
end
|
34
|
+
|
35
|
+
|
36
|
+
# Compress data using the gzip methods.
|
37
|
+
# @param data [String] stream of data to be compressed
|
38
|
+
# @return [String] the compressed stream of data
|
39
|
+
def compress_gzip(data)
|
40
|
+
buffer = StringIO.new('','w')
|
41
|
+
compressor = Zlib::GzipWriter.new(buffer)
|
42
|
+
begin
|
43
|
+
compressor.write(data)
|
44
|
+
ensure
|
45
|
+
compressor.close()
|
46
|
+
end
|
47
|
+
buffer.string
|
48
|
+
end
|
49
|
+
|
50
|
+
# Compress snappy file.
|
51
|
+
# @param data [binary] stream of data to be compressed
|
52
|
+
# @return [String] the compressed stream of data
|
53
|
+
def compress_snappy_file(data)
|
54
|
+
# Encode data to ASCII_8BIT (binary)
|
55
|
+
data= data.encode(Encoding::ASCII_8BIT, "binary", :undef => :replace)
|
56
|
+
buffer = StringIO.new('', 'w')
|
57
|
+
buffer.set_encoding(Encoding::ASCII_8BIT)
|
58
|
+
compressed = Snappy.deflate(data)
|
59
|
+
buffer << [compressed.size, compressed].pack("Na*")
|
60
|
+
buffer.string
|
61
|
+
end
|
62
|
+
|
63
|
+
def compress_snappy_stream(data)
|
64
|
+
# Encode data to ASCII_8BIT (binary)
|
65
|
+
data= data.encode(Encoding::ASCII_8BIT, "binary", :undef => :replace)
|
66
|
+
buffer = StringIO.new
|
67
|
+
buffer.set_encoding(Encoding::ASCII_8BIT)
|
68
|
+
chunks = data.scan(/.{1,#{@snappy_bufsize}}/m)
|
69
|
+
chunks.each do |chunk|
|
70
|
+
compressed = Snappy.deflate(chunk)
|
71
|
+
buffer << [chunk.size, compressed.size, compressed].pack("NNa*")
|
72
|
+
end
|
73
|
+
return buffer.string
|
74
|
+
end
|
75
|
+
|
76
|
+
def get_snappy_header!
|
77
|
+
[MAGIC, DEFAULT_VERSION, MINIMUM_COMPATIBLE_VERSION].pack("a8NN")
|
78
|
+
end
|
79
|
+
|
80
|
+
end
|
81
|
+
end
|
82
|
+
end
|
@@ -1,11 +1,11 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
|
3
3
|
s.name = 'logstash-output-webhdfs'
|
4
|
-
s.version = '0.0
|
4
|
+
s.version = '0.1.0'
|
5
5
|
s.licenses = ['Apache License (2.0)']
|
6
6
|
s.summary = "Plugin to write events to hdfs via webhdfs."
|
7
7
|
s.description = "This gem is a logstash plugin required to be installed on top of the Logstash core pipeline using $LS_HOME/bin/plugin install gemname. This gem is not a stand-alone program"
|
8
|
-
s.authors = ["Björn Puttmann, loshkovskyi"]
|
8
|
+
s.authors = ["Björn Puttmann, loshkovskyi, Elastic"]
|
9
9
|
s.email = 'b.puttmann@dbap.de'
|
10
10
|
s.homepage = "http://www.dbap.de"
|
11
11
|
s.require_paths = ["lib"]
|
@@ -24,4 +24,9 @@ Gem::Specification.new do |s|
|
|
24
24
|
s.add_runtime_dependency 'webhdfs'
|
25
25
|
s.add_runtime_dependency 'snappy'
|
26
26
|
s.add_development_dependency 'logstash-devutils'
|
27
|
+
|
28
|
+
s.add_development_dependency 'logstash-codec-line'
|
29
|
+
s.add_development_dependency 'logstash-codec-json'
|
30
|
+
|
31
|
+
|
27
32
|
end
|
@@ -0,0 +1,131 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'logstash/devutils/rspec/spec_helper'
|
3
|
+
require 'logstash/outputs/webhdfs'
|
4
|
+
require 'webhdfs'
|
5
|
+
require 'json'
|
6
|
+
|
7
|
+
describe LogStash::Outputs::WebHdfs, :integration => true do
|
8
|
+
|
9
|
+
let(:host) { 'localhost' }
|
10
|
+
let(:port) { 50070 }
|
11
|
+
let(:user) { 'vagrant' }
|
12
|
+
|
13
|
+
let(:test_file) { "/test.file" }
|
14
|
+
|
15
|
+
let(:event) { LogStash::Event.new('message' => 'Hello world!', 'source' => 'out of the blue',
|
16
|
+
'type' => 'generator', 'host' => 'localhost' ) }
|
17
|
+
|
18
|
+
let(:config) { { 'host' => host, 'user' => user,
|
19
|
+
'path' => test_file, 'compression' => 'none' } }
|
20
|
+
|
21
|
+
subject { LogStash::Plugin.lookup("output", "webhdfs").new(config) }
|
22
|
+
|
23
|
+
let(:client) { WebHDFS::Client.new(host, port, user) }
|
24
|
+
|
25
|
+
describe "register and teardown" do
|
26
|
+
|
27
|
+
it 'should register with default values' do
|
28
|
+
expect { subject.register }.to_not raise_error
|
29
|
+
end
|
30
|
+
|
31
|
+
end
|
32
|
+
|
33
|
+
describe '#write' do
|
34
|
+
|
35
|
+
let(:config) { { 'host' => host, 'user' => user, 'flush_size' => 10,
|
36
|
+
'path' => "/%{host}_test.log", 'compression' => 'none' } }
|
37
|
+
|
38
|
+
after(:each) do
|
39
|
+
client.delete(test_file)
|
40
|
+
end
|
41
|
+
|
42
|
+
describe "writing plain files" do
|
43
|
+
|
44
|
+
before(:each) do
|
45
|
+
subject.register
|
46
|
+
subject.receive(event)
|
47
|
+
subject.teardown
|
48
|
+
end
|
49
|
+
|
50
|
+
it 'should use the correct filename pattern' do
|
51
|
+
expect { client.read('localhost_test.log') }.to_not raise_error
|
52
|
+
end
|
53
|
+
|
54
|
+
context "using the line codec" do
|
55
|
+
|
56
|
+
let(:config) { { 'host' => host, 'user' => user, 'flush_size' => 10,
|
57
|
+
'path' => test_file, 'compression' => 'none', 'codec' => 'line' } }
|
58
|
+
|
59
|
+
it 'should match the event data' do
|
60
|
+
expect(client.read(test_file).strip()).to eq(event.to_s)
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
context "using the json codec" do
|
65
|
+
|
66
|
+
let(:config) { { 'host' => host, 'user' => user, 'flush_size' => 10,
|
67
|
+
'path' => test_file, 'compression' => 'none', 'codec' => 'json' } }
|
68
|
+
|
69
|
+
|
70
|
+
it 'should match the event data' do
|
71
|
+
expect(client.read(test_file).strip()).to eq(event.to_json)
|
72
|
+
end
|
73
|
+
|
74
|
+
end
|
75
|
+
|
76
|
+
context "when flushing events" do
|
77
|
+
|
78
|
+
let(:config) { { 'host' => host, 'user' => user, 'flush_size' => 10, 'idle_flush_time' => 2,
|
79
|
+
'path' => test_file, 'compression' => 'none', 'codec' => 'json' } }
|
80
|
+
|
81
|
+
before(:each) do
|
82
|
+
client.delete(test_file)
|
83
|
+
end
|
84
|
+
|
85
|
+
it 'should flush after configured idle time' do
|
86
|
+
subject.register
|
87
|
+
subject.receive(event)
|
88
|
+
expect { client.read(test_file) }.to raise_error(error=WebHDFS::FileNotFoundError)
|
89
|
+
sleep 3
|
90
|
+
expect { client.read(test_file) }.to_not raise_error
|
91
|
+
expect(client.read(test_file).strip()).to eq(event.to_json)
|
92
|
+
end
|
93
|
+
end
|
94
|
+
|
95
|
+
end
|
96
|
+
|
97
|
+
describe "#compression" do
|
98
|
+
|
99
|
+
before(:each) do
|
100
|
+
subject.register
|
101
|
+
for _ in 0...500
|
102
|
+
subject.receive(event)
|
103
|
+
end
|
104
|
+
subject.teardown
|
105
|
+
end
|
106
|
+
|
107
|
+
context "when using no compression" do
|
108
|
+
|
109
|
+
let(:config) { { 'host' => host, 'user' => user, 'flush_size' => 10,
|
110
|
+
'path' => test_file, 'compression' => 'none', 'codec' => 'line' } }
|
111
|
+
|
112
|
+
it 'should write some messages uncompressed' do
|
113
|
+
expect(client.read(test_file).lines.count).to eq(500)
|
114
|
+
end
|
115
|
+
|
116
|
+
end
|
117
|
+
|
118
|
+
context "when using gzip compression" do
|
119
|
+
|
120
|
+
let(:config) { { 'host' => host, 'user' => user,
|
121
|
+
'path' => test_file, 'compression' => 'gzip', 'codec' => 'line' } }
|
122
|
+
|
123
|
+
it 'should write some messages gzip compressed' do
|
124
|
+
expect(Zlib::Inflate.new(window_bits=47).inflate(client.read("#{test_file}.gz")).lines.count ).to eq(500)
|
125
|
+
end
|
126
|
+
end
|
127
|
+
|
128
|
+
end
|
129
|
+
|
130
|
+
end
|
131
|
+
end
|
@@ -6,125 +6,66 @@ require 'json'
|
|
6
6
|
|
7
7
|
describe 'outputs/webhdfs' do
|
8
8
|
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
path_to_testlog = '/user/hadoop/test.log'
|
13
|
-
current_logfile_name = '/user/hadoop/test.log'
|
14
|
-
current_config = ""
|
9
|
+
let(:host) { 'localhost' }
|
10
|
+
let(:user) { 'hadoop' }
|
11
|
+
let(:path) { '/test.log' }
|
15
12
|
|
16
|
-
|
17
|
-
'message' => 'Hello world!',
|
18
|
-
'source' => 'out of the blue',
|
19
|
-
'type' => 'generator',
|
20
|
-
'host' => 'localhost',
|
21
|
-
'@timestamp' => LogStash::Timestamp.now)
|
13
|
+
let(:config) { { 'host' =>host, 'user' => user, 'path' => path, 'compression' => 'none' } }
|
22
14
|
|
23
|
-
|
24
|
-
'user' => webhdfs_user,
|
25
|
-
'path' => path_to_testlog,
|
26
|
-
'compression' => 'none' }
|
15
|
+
subject(:plugin) { LogStash::Plugin.lookup("output", "webhdfs").new(config) }
|
27
16
|
|
28
|
-
|
29
|
-
|
30
|
-
context 'when initializing' do
|
17
|
+
describe '#initializing' do
|
31
18
|
|
32
19
|
it 'should fail to register without required values' do
|
33
|
-
|
20
|
+
plugin = LogStash::Plugin.lookup("output", "webhdfs")
|
21
|
+
expect { plugin.new }.to raise_error(error=LogStash::ConfigurationError)
|
34
22
|
end
|
35
23
|
|
36
|
-
|
37
|
-
subject = LogStash::Plugin.lookup("output", "webhdfs").new(default_config)
|
38
|
-
expect { subject.register }.to_not raise_error
|
39
|
-
end
|
24
|
+
context "default values" do
|
40
25
|
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
insist { subject.flush_size } == 500
|
45
|
-
insist { subject.open_timeout } == 30
|
46
|
-
insist { subject.read_timeout } == 30
|
47
|
-
insist { subject.use_httpfs } == false
|
48
|
-
insist { subject.retry_known_errors } == true
|
49
|
-
insist { subject.retry_interval } == 0.5
|
50
|
-
insist { subject.retry_times } == 5
|
51
|
-
insist { subject.snappy_bufsize } == 32768
|
52
|
-
insist { subject.snappy_format } == 'stream'
|
53
|
-
insist { subject.remove_at_timestamp } == true
|
54
|
-
end
|
55
|
-
end
|
26
|
+
it 'should have default port' do
|
27
|
+
expect(subject.port).to eq(50070)
|
28
|
+
end
|
56
29
|
|
57
|
-
|
30
|
+
it 'should have default idle_flush_time' do
|
31
|
+
expect(subject.idle_flush_time).to eq(1)
|
32
|
+
end
|
33
|
+
it 'should have default flush_size' do
|
34
|
+
expect(subject.flush_size).to eq(500)
|
35
|
+
end
|
58
36
|
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
end
|
37
|
+
it 'should have default open_timeout' do
|
38
|
+
expect(subject.open_timeout).to eq(30)
|
39
|
+
end
|
63
40
|
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
subject.receive(event)
|
68
|
-
subject.teardown
|
69
|
-
insist { client.read(current_logfile_name).strip } == event.to_json
|
70
|
-
end
|
41
|
+
it 'should have default read_timeout' do
|
42
|
+
expect(subject.read_timeout).to eq(30)
|
43
|
+
end
|
71
44
|
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
subject.register
|
76
|
-
subject.receive(event)
|
77
|
-
subject.teardown
|
78
|
-
insist { client.read(current_logfile_name).strip } == 'Hello world! came out of the blue.'
|
79
|
-
end
|
45
|
+
it 'should have default use_httpfs' do
|
46
|
+
expect(subject.use_httpfs).to eq(false)
|
47
|
+
end
|
80
48
|
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
current_config['message_format'] = '%{@timestamp} should be missing.'
|
85
|
-
subject = LogStash::Plugin.lookup("output", "webhdfs").new(current_config)
|
86
|
-
subject.register
|
87
|
-
subject.receive(event)
|
88
|
-
subject.teardown
|
89
|
-
insist { client.read(current_logfile_name).strip } == '%{@timestamp} should be missing.'
|
90
|
-
end
|
49
|
+
it 'should have default retry_known_errors' do
|
50
|
+
expect(subject.retry_known_errors).to eq(true)
|
51
|
+
end
|
91
52
|
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
subject.register
|
96
|
-
subject.receive(event)
|
97
|
-
expect { client.read(current_logfile_name) }.to raise_error(error=WebHDFS::FileNotFoundError)
|
98
|
-
sleep 3
|
99
|
-
insist { client.read(current_logfile_name).strip } == event.to_json
|
100
|
-
end
|
53
|
+
it 'should have default retry_interval' do
|
54
|
+
expect(subject.retry_interval).to eq(0.5)
|
55
|
+
end
|
101
56
|
|
102
|
-
|
103
|
-
|
104
|
-
subject.register
|
105
|
-
for _ in 0..499
|
106
|
-
subject.receive(event)
|
57
|
+
it 'should have default retry_times' do
|
58
|
+
expect(subject.retry_times).to eq(5)
|
107
59
|
end
|
108
|
-
subject.teardown
|
109
|
-
insist { client.read(current_logfile_name).lines.count } == 500
|
110
|
-
end
|
111
60
|
|
112
|
-
|
113
|
-
|
114
|
-
current_config['compression'] = 'gzip'
|
115
|
-
subject = LogStash::Plugin.lookup("output", "webhdfs").new(current_config)
|
116
|
-
subject.register
|
117
|
-
for _ in 0..499
|
118
|
-
subject.receive(event)
|
61
|
+
it 'should have default snappy_bufsize' do
|
62
|
+
expect(subject.snappy_bufsize).to eq(32768)
|
119
63
|
end
|
120
|
-
subject.teardown
|
121
|
-
insist { Zlib::Inflate.new(window_bits=47).inflate(client.read(current_logfile_name)).lines.count } == 500
|
122
|
-
end
|
123
64
|
|
124
|
-
|
125
|
-
|
126
|
-
|
65
|
+
it 'should have default snappy_format' do
|
66
|
+
expect(subject.snappy_format).to eq('stream')
|
67
|
+
end
|
127
68
|
|
69
|
+
end
|
128
70
|
end
|
129
|
-
|
130
71
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: logstash-output-webhdfs
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
|
-
- Björn Puttmann, loshkovskyi
|
7
|
+
- Björn Puttmann, loshkovskyi, Elastic
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-08-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: logstash-core
|
@@ -72,6 +72,34 @@ dependencies:
|
|
72
72
|
version: '0'
|
73
73
|
prerelease: false
|
74
74
|
type: :development
|
75
|
+
- !ruby/object:Gem::Dependency
|
76
|
+
name: logstash-codec-line
|
77
|
+
version_requirements: !ruby/object:Gem::Requirement
|
78
|
+
requirements:
|
79
|
+
- - '>='
|
80
|
+
- !ruby/object:Gem::Version
|
81
|
+
version: '0'
|
82
|
+
requirement: !ruby/object:Gem::Requirement
|
83
|
+
requirements:
|
84
|
+
- - '>='
|
85
|
+
- !ruby/object:Gem::Version
|
86
|
+
version: '0'
|
87
|
+
prerelease: false
|
88
|
+
type: :development
|
89
|
+
- !ruby/object:Gem::Dependency
|
90
|
+
name: logstash-codec-json
|
91
|
+
version_requirements: !ruby/object:Gem::Requirement
|
92
|
+
requirements:
|
93
|
+
- - '>='
|
94
|
+
- !ruby/object:Gem::Version
|
95
|
+
version: '0'
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
97
|
+
requirements:
|
98
|
+
- - '>='
|
99
|
+
- !ruby/object:Gem::Version
|
100
|
+
version: '0'
|
101
|
+
prerelease: false
|
102
|
+
type: :development
|
75
103
|
description: This gem is a logstash plugin required to be installed on top of the Logstash core pipeline using $LS_HOME/bin/plugin install gemname. This gem is not a stand-alone program
|
76
104
|
email: b.puttmann@dbap.de
|
77
105
|
executables: []
|
@@ -79,12 +107,17 @@ extensions: []
|
|
79
107
|
extra_rdoc_files: []
|
80
108
|
files:
|
81
109
|
- .gitignore
|
110
|
+
- CHANGELOG.md
|
111
|
+
- CONTRIBUTORS
|
82
112
|
- Gemfile
|
83
113
|
- LICENSE
|
114
|
+
- NOTICE.TXT
|
84
115
|
- README.md
|
85
116
|
- Rakefile
|
86
117
|
- lib/logstash/outputs/webhdfs.rb
|
118
|
+
- lib/logstash/outputs/webhdfs_helper.rb
|
87
119
|
- logstash-output-webhdfs.gemspec
|
120
|
+
- spec/integration/webhdfs_spec.rb
|
88
121
|
- spec/outputs/webhdfs_spec.rb
|
89
122
|
homepage: http://www.dbap.de
|
90
123
|
licenses:
|
@@ -108,9 +141,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
108
141
|
version: '0'
|
109
142
|
requirements: []
|
110
143
|
rubyforge_project:
|
111
|
-
rubygems_version: 2.
|
144
|
+
rubygems_version: 2.1.9
|
112
145
|
signing_key:
|
113
146
|
specification_version: 4
|
114
147
|
summary: Plugin to write events to hdfs via webhdfs.
|
115
148
|
test_files:
|
149
|
+
- spec/integration/webhdfs_spec.rb
|
116
150
|
- spec/outputs/webhdfs_spec.rb
|