fluent-plugin-webhdfs 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +64 -17
- data/Rakefile +9 -0
- data/fluent-plugin-webhdfs.gemspec +3 -3
- data/lib/fluent/plugin/out_webhdfs.rb +17 -13
- data/test/helper.rb +28 -0
- data/test/plugin/test_out_webhdfs.rb +58 -0
- metadata +9 -5
data/README.md
CHANGED
@@ -1,29 +1,76 @@
|
|
1
|
-
#
|
1
|
+
# fluent-plugin-webhdfs
|
2
2
|
|
3
|
-
|
3
|
+
Fluentd output plugin to write data into Hadoop HDFS over WebHDFS/HttpFs.
|
4
4
|
|
5
|
-
|
5
|
+
WebHDFSOutput slices data by time (in the specified unit) and stores it as plain-text files on HDFS. You can choose to:
|
6
6
|
|
7
|
-
|
7
|
+
* format whole data as serialized JSON, single attribute or separated multi attributes
|
8
|
+
* include time as line header, or not
|
9
|
+
* include tag as line header, or not
|
10
|
+
* change field separator (default: TAB)
|
11
|
+
* add new line as termination, or not
|
8
12
|
|
9
|
-
|
13
|
+
You can also specify the output file path as 'path /path/to/dir/access.%Y%m%d.log', which produces '/path/to/dir/access.20120316.log' on HDFS.
|
10
14
|
|
11
|
-
|
15
|
+
## Configuration
|
12
16
|
|
13
|
-
|
17
|
+
### WebHDFSOutput
|
14
18
|
|
15
|
-
|
19
|
+
To store data as time, tag, and JSON (the same format as 'type file') over WebHDFS:
|
16
20
|
|
17
|
-
|
21
|
+
<match access.**>
|
22
|
+
type webhdfs
|
23
|
+
host namenode.your.cluster.local
|
24
|
+
port 50070
|
25
|
+
path /path/on/hdfs/access.log.%Y%m%d_%H.log
|
26
|
+
</match>
|
18
27
|
|
19
|
-
|
28
|
+
With a username for pseudo authentication:
|
20
29
|
|
21
|
-
|
30
|
+
<match access.**>
|
31
|
+
type webhdfs
|
32
|
+
host namenode.your.cluster.local
|
33
|
+
port 50070
|
34
|
+
path /path/on/hdfs/access.log.%Y%m%d_%H.log
|
35
|
+
username hdfsuser
|
36
|
+
</match>
|
37
|
+
|
38
|
+
Store data over HttpFs (instead of WebHDFS):
|
22
39
|
|
23
|
-
|
40
|
+
<match access.**>
|
41
|
+
type webhdfs
|
42
|
+
host httpfs.node.your.cluster.local
|
43
|
+
port 14000
|
44
|
+
path /path/on/hdfs/access.log.%Y%m%d_%H.log
|
45
|
+
httpfs true
|
46
|
+
</match>
|
24
47
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
48
|
+
Store data as TSV (TAB-separated values) of the specified keys, without the time and with the tag (with the 'access' prefix removed):
|
49
|
+
|
50
|
+
<match access.**>
|
51
|
+
type webhdfs
|
52
|
+
host namenode.your.cluster.local
|
53
|
+
port 50070
|
54
|
+
path /path/on/hdfs/access.log.%Y%m%d_%H.log
|
55
|
+
|
56
|
+
field_separator TAB # or 'SPACE', 'COMMA'
|
57
|
+
output_include_time false
|
58
|
+
output_include_tag true
|
59
|
+
remove_prefix access
|
60
|
+
|
61
|
+
output_data_type attr:path,status,referer,agent,bytes
|
62
|
+
</match>
|
63
|
+
|
64
|
+
If a message doesn't have a specified attribute, fluent-plugin-webhdfs outputs 'NULL' in place of its value.
|
65
|
+
|
66
|
+
## TODO
|
67
|
+
|
68
|
+
* long run test
|
69
|
+
* over webhdfs and httpfs
|
70
|
+
* patches welcome!
|
71
|
+
|
72
|
+
## Copyright
|
73
|
+
|
74
|
+
* Copyright (c) 2012- TAGOMORI Satoshi (tagomoris)
|
75
|
+
* License
|
76
|
+
* Apache License, Version 2.0
|
data/Rakefile
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
2
|
Gem::Specification.new do |gem|
|
3
3
|
gem.name = "fluent-plugin-webhdfs"
|
4
|
-
gem.version = "0.0.3"
|
4
|
+
gem.version = "0.0.4"
|
5
5
|
gem.authors = ["TAGOMORI Satoshi"]
|
6
6
|
gem.email = ["tagomoris@gmail.com"]
|
7
7
|
gem.summary = %q{Fluentd plugin to write data on HDFS over WebHDFS, with flexible formatting}
|
8
|
-
gem.description = %q{For WebHDFS
|
9
|
-
gem.homepage = "https://github.com/
|
8
|
+
gem.description = %q{For WebHDFS and HttpFs of Hadoop HDFS}
|
9
|
+
gem.homepage = "https://github.com/fluent/fluent-plugin-webhdfs"
|
10
10
|
|
11
11
|
gem.files = `git ls-files`.split($\)
|
12
12
|
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
@@ -8,7 +8,10 @@ class Fluent::WebHDFSOutput < Fluent::TimeSlicedOutput
|
|
8
8
|
config_set_default :buffer_type, 'memory'
|
9
9
|
config_set_default :time_slice_format, '%Y%m%d'
|
10
10
|
|
11
|
-
config_param :
|
11
|
+
config_param :host, :string, :default => nil
|
12
|
+
config_param :port, :integer, :default => 50070
|
13
|
+
config_param :namenode, :string, :default => nil # host:port
|
14
|
+
|
12
15
|
config_param :path, :string
|
13
16
|
config_param :username, :string, :default => nil
|
14
17
|
|
@@ -16,6 +19,8 @@ class Fluent::WebHDFSOutput < Fluent::TimeSlicedOutput
|
|
16
19
|
|
17
20
|
include Fluent::Mixin::PlainTextFormatter
|
18
21
|
|
22
|
+
config_param :default_tag, :string, :default => 'tag_missing'
|
23
|
+
|
19
24
|
def initialize
|
20
25
|
super
|
21
26
|
require 'net/http'
|
@@ -36,11 +41,18 @@ class Fluent::WebHDFSOutput < Fluent::TimeSlicedOutput
|
|
36
41
|
|
37
42
|
super
|
38
43
|
|
39
|
-
|
40
|
-
|
44
|
+
if @host
|
45
|
+
@namenode_host = @host
|
46
|
+
@namenode_port = @port
|
47
|
+
elsif @namenode
|
48
|
+
unless /\A([a-zA-Z0-9][-a-zA-Z0-9.]*):(\d+)\Z/ =~ @namenode
|
49
|
+
raise Fluent::ConfigError, "Invalid config value about namenode: '#{@namenode}', needs NAMENODE_NAME:PORT"
|
50
|
+
end
|
51
|
+
@namenode_host = $1
|
52
|
+
@namenode_port = $2.to_i
|
53
|
+
else
|
54
|
+
raise Fluent::ConfigError, "WebHDFS host or namenode missing"
|
41
55
|
end
|
42
|
-
@namenode_host = $1
|
43
|
-
@namenode_port = $2.to_i
|
44
56
|
unless @path.index('/') == 0
|
45
57
|
raise Fluent::ConfigError, "Path on hdfs MUST starts with '/', but '#{@path}'"
|
46
58
|
end
|
@@ -52,7 +64,6 @@ class Fluent::WebHDFSOutput < Fluent::TimeSlicedOutput
|
|
52
64
|
if @httpfs
|
53
65
|
@client.httpfs_mode = true
|
54
66
|
end
|
55
|
-
@mutex = Mutex.new
|
56
67
|
end
|
57
68
|
|
58
69
|
def start
|
@@ -73,13 +84,6 @@ class Fluent::WebHDFSOutput < Fluent::TimeSlicedOutput
|
|
73
84
|
super
|
74
85
|
end
|
75
86
|
|
76
|
-
def record_to_string(record)
|
77
|
-
record.to_json
|
78
|
-
end
|
79
|
-
|
80
|
-
# def format(tag, time, record)
|
81
|
-
# end
|
82
|
-
|
83
87
|
def path_format(chunk_key)
|
84
88
|
Time.strptime(chunk_key, @time_slice_format).strftime(@path)
|
85
89
|
end
|
data/test/helper.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bundler'
|
3
|
+
begin
|
4
|
+
Bundler.setup(:default, :development)
|
5
|
+
rescue Bundler::BundlerError => e
|
6
|
+
$stderr.puts e.message
|
7
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
8
|
+
exit e.status_code
|
9
|
+
end
|
10
|
+
require 'test/unit'
|
11
|
+
|
12
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
13
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
14
|
+
require 'fluent/test'
|
15
|
+
unless ENV.has_key?('VERBOSE')
|
16
|
+
nulllogger = Object.new
|
17
|
+
nulllogger.instance_eval {|obj|
|
18
|
+
def method_missing(method, *args)
|
19
|
+
# pass
|
20
|
+
end
|
21
|
+
}
|
22
|
+
$log = nulllogger
|
23
|
+
end
|
24
|
+
|
25
|
+
require 'fluent/plugin/out_webhdfs'
|
26
|
+
|
27
|
+
class Test::Unit::TestCase
|
28
|
+
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
class WebHDFSOutputTest < Test::Unit::TestCase
|
4
|
+
CONFIG = %[
|
5
|
+
host namenode.local
|
6
|
+
path /hdfs/path/file.%Y%m%d.log
|
7
|
+
]
|
8
|
+
|
9
|
+
def create_driver(conf=CONFIG,tag='test')
|
10
|
+
Fluent::Test::OutputTestDriver.new(Fluent::WebHDFSOutput, tag).configure(conf)
|
11
|
+
end
|
12
|
+
|
13
|
+
def test_configure
|
14
|
+
d = create_driver
|
15
|
+
assert_equal 'namenode.local', d.instance.instance_eval{ @namenode_host }
|
16
|
+
assert_equal 50070, d.instance.instance_eval{ @namenode_port }
|
17
|
+
assert_equal '/hdfs/path/file.%Y%m%d.log', d.instance.path
|
18
|
+
assert_equal '%Y%m%d', d.instance.time_slice_format
|
19
|
+
assert_equal false, d.instance.httpfs
|
20
|
+
assert_nil d.instance.username
|
21
|
+
|
22
|
+
assert_equal true, d.instance.output_include_time
|
23
|
+
assert_equal true, d.instance.output_include_tag
|
24
|
+
assert_equal 'json', d.instance.output_data_type
|
25
|
+
assert_nil d.instance.remove_prefix
|
26
|
+
assert_equal 'TAB', d.instance.field_separator
|
27
|
+
assert_equal true, d.instance.add_newline
|
28
|
+
assert_equal 'tag_missing', d.instance.default_tag
|
29
|
+
|
30
|
+
d = create_driver %[
|
31
|
+
namenode server.local:14000
|
32
|
+
path /hdfs/path/file.%Y%m%d.%H%M.log
|
33
|
+
httpfs yes
|
34
|
+
username hdfs_user
|
35
|
+
]
|
36
|
+
assert_equal 'server.local', d.instance.instance_eval{ @namenode_host }
|
37
|
+
assert_equal 14000, d.instance.instance_eval{ @namenode_port }
|
38
|
+
assert_equal '/hdfs/path/file.%Y%m%d.%H%M.log', d.instance.path
|
39
|
+
assert_equal '%Y%m%d%H%M', d.instance.time_slice_format
|
40
|
+
assert_equal true, d.instance.httpfs
|
41
|
+
assert_equal 'hdfs_user', d.instance.username
|
42
|
+
end
|
43
|
+
|
44
|
+
def test_path_format
|
45
|
+
d = create_driver
|
46
|
+
assert_equal '/hdfs/path/file.%Y%m%d.log', d.instance.path
|
47
|
+
assert_equal '%Y%m%d', d.instance.time_slice_format
|
48
|
+
assert_equal '/hdfs/path/file.20120718.log', d.instance.path_format('20120718')
|
49
|
+
|
50
|
+
d = create_driver %[
|
51
|
+
namenode server.local:14000
|
52
|
+
path /hdfs/path/file.%Y%m%d.%H%M.log
|
53
|
+
]
|
54
|
+
assert_equal '/hdfs/path/file.%Y%m%d.%H%M.log', d.instance.path
|
55
|
+
assert_equal '%Y%m%d%H%M', d.instance.time_slice_format
|
56
|
+
assert_equal '/hdfs/path/file.20120718.1503.log', d.instance.path_format('201207181503')
|
57
|
+
end
|
58
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fluent-plugin-webhdfs
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-07-
|
12
|
+
date: 2012-07-18 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|
@@ -123,7 +123,7 @@ dependencies:
|
|
123
123
|
- - ! '>='
|
124
124
|
- !ruby/object:Gem::Version
|
125
125
|
version: 0.5.0
|
126
|
-
description: For WebHDFS
|
126
|
+
description: For WebHDFS and HttpFs of Hadoop HDFS
|
127
127
|
email:
|
128
128
|
- tagomoris@gmail.com
|
129
129
|
executables: []
|
@@ -137,7 +137,9 @@ files:
|
|
137
137
|
- Rakefile
|
138
138
|
- fluent-plugin-webhdfs.gemspec
|
139
139
|
- lib/fluent/plugin/out_webhdfs.rb
|
140
|
-
|
140
|
+
- test/helper.rb
|
141
|
+
- test/plugin/test_out_webhdfs.rb
|
142
|
+
homepage: https://github.com/fluent/fluent-plugin-webhdfs
|
141
143
|
licenses: []
|
142
144
|
post_install_message:
|
143
145
|
rdoc_options: []
|
@@ -161,4 +163,6 @@ rubygems_version: 1.8.21
|
|
161
163
|
signing_key:
|
162
164
|
specification_version: 3
|
163
165
|
summary: Fluentd plugin to write data on HDFS over WebHDFS, with flexible formatting
|
164
|
-
test_files:
|
166
|
+
test_files:
|
167
|
+
- test/helper.rb
|
168
|
+
- test/plugin/test_out_webhdfs.rb
|