fluent-plugin-webhdfs 0.0.3 → 0.0.4
- data/README.md +64 -17
- data/Rakefile +9 -0
- data/fluent-plugin-webhdfs.gemspec +3 -3
- data/lib/fluent/plugin/out_webhdfs.rb +17 -13
- data/test/helper.rb +28 -0
- data/test/plugin/test_out_webhdfs.rb +58 -0
- metadata +9 -5
data/README.md
CHANGED
```markdown
# fluent-plugin-webhdfs

Fluentd output plugin to write data into Hadoop HDFS over WebHDFS/HttpFs.

WebHDFSOutput slices data by time (in a specified unit) and stores it as plain-text files on HDFS. You can specify:

* how to format records: whole data as serialized JSON, a single attribute, or separated multiple attributes
* whether to include the time as a line header
* whether to include the tag as a line header
* the field separator (default: TAB)
* whether to terminate each line with a newline

You can also use time placeholders in the output file path: 'path /path/to/dir/access.%Y%m%d.log' produces '/path/to/dir/access.20120316.log' on HDFS.

## Configuration

### WebHDFSOutput

To store data by time, tag and JSON (same as 'type file') over WebHDFS:

    <match access.**>
      type webhdfs
      host namenode.your.cluster.local
      port 50070
      path /path/on/hdfs/access.log.%Y%m%d_%H.log
    </match>

With a username for pseudo authentication:

    <match access.**>
      type webhdfs
      host namenode.your.cluster.local
      port 50070
      path /path/on/hdfs/access.log.%Y%m%d_%H.log
      username hdfsuser
    </match>

To store data over HttpFs (instead of WebHDFS):

    <match access.**>
      type webhdfs
      host httpfs.node.your.cluster.local
      port 14000
      path /path/on/hdfs/access.log.%Y%m%d_%H.log
      httpfs true
    </match>

To store data as TSV (TAB-separated values) of specified keys, without time, with tag (prefix 'access' removed):

    <match access.**>
      type webhdfs
      host namenode.your.cluster.local
      port 50070
      path /path/on/hdfs/access.log.%Y%m%d_%H.log

      field_separator TAB # or 'SPACE', 'COMMA'
      output_include_time false
      output_include_tag true
      remove_prefix access

      output_data_type attr:path,status,referer,agent,bytes
    </match>

If a message doesn't have a specified attribute, fluent-plugin-webhdfs outputs 'NULL' in place of the value.

## TODO

* long-running tests
  * over WebHDFS and HttpFs
* patches welcome!

## Copyright

* Copyright (c) 2012- TAGOMORI Satoshi (tagomoris)
* License
  * Apache License, Version 2.0
```
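As a concrete illustration of the TSV configuration in the README above: each key listed in `output_data_type attr:...` becomes one column, the tag (with the prefix removed) leads the line, and missing attributes become 'NULL'. Below is a minimal Ruby sketch of that line format, not the plugin's actual formatter (which comes from Fluent::Mixin::PlainTextFormatter); the record, keys, and helper name are hypothetical:

```ruby
# Hypothetical sketch of the TSV line format described in the README;
# the real formatting is done by Fluent::Mixin::PlainTextFormatter.
KEYS = %w[path status referer agent bytes] # output_data_type attr:path,status,...
SEPARATOR = "\t"                           # field_separator TAB

def format_line(tag, record, remove_prefix: 'access')
  tag = tag.sub(/\A#{Regexp.escape(remove_prefix)}\.?/, '') # remove_prefix access
  fields = KEYS.map { |k| record.key?(k) ? record[k].to_s : 'NULL' } # missing -> 'NULL'
  ([tag] + fields).join(SEPARATOR) + "\n" # output_include_tag true, newline termination
end

record = { 'path' => '/index.html', 'status' => 200, 'agent' => 'curl' }
print format_line('access.web01', record)
# => "web01\t/index.html\t200\tNULL\tcurl\tNULL\n"
```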
data/fluent-plugin-webhdfs.gemspec
CHANGED
```diff
@@ -1,12 +1,12 @@
 # -*- encoding: utf-8 -*-
 Gem::Specification.new do |gem|
   gem.name = "fluent-plugin-webhdfs"
-  gem.version = "0.0.3"
+  gem.version = "0.0.4"
   gem.authors = ["TAGOMORI Satoshi"]
   gem.email = ["tagomoris@gmail.com"]
   gem.summary = %q{Fluentd plugin to write data on HDFS over WebHDFS, with flexible formatting}
-  gem.description = %q{For WebHDFS
-  gem.homepage = "https://github.com/
+  gem.description = %q{For WebHDFS and HttpFs of Hadoop HDFS}
+  gem.homepage = "https://github.com/fluent/fluent-plugin-webhdfs"

   gem.files = `git ls-files`.split($\)
   gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
```
data/lib/fluent/plugin/out_webhdfs.rb
CHANGED

```diff
@@ -8,7 +8,10 @@ class Fluent::WebHDFSOutput < Fluent::TimeSlicedOutput
   config_set_default :buffer_type, 'memory'
   config_set_default :time_slice_format, '%Y%m%d'
 
-  config_param :
+  config_param :host, :string, :default => nil
+  config_param :port, :integer, :default => 50070
+  config_param :namenode, :string, :default => nil # host:port
+
   config_param :path, :string
   config_param :username, :string, :default => nil
 
@@ -16,6 +19,8 @@ class Fluent::WebHDFSOutput < Fluent::TimeSlicedOutput
 
   include Fluent::Mixin::PlainTextFormatter
 
+  config_param :default_tag, :string, :default => 'tag_missing'
+
   def initialize
     super
     require 'net/http'
@@ -36,11 +41,18 @@ class Fluent::WebHDFSOutput < Fluent::TimeSlicedOutput
 
     super
 
-
-
+    if @host
+      @namenode_host = @host
+      @namenode_port = @port
+    elsif @namenode
+      unless /\A([a-zA-Z0-9][-a-zA-Z0-9.]*):(\d+)\Z/ =~ @namenode
+        raise Fluent::ConfigError, "Invalid config value about namenode: '#{@namenode}', needs NAMENODE_NAME:PORT"
+      end
+      @namenode_host = $1
+      @namenode_port = $2.to_i
+    else
+      raise Fluent::ConfigError, "WebHDFS host or namenode missing"
     end
-    @namenode_host = $1
-    @namenode_port = $2.to_i
     unless @path.index('/') == 0
       raise Fluent::ConfigError, "Path on hdfs MUST starts with '/', but '#{@path}'"
     end
```
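The configure hunk above accepts the namenode address either as separate host/port parameters or as a single namenode 'HOST:PORT' value, raising a configuration error when neither is given. A standalone sketch of that resolution logic, with ArgumentError standing in for Fluent::ConfigError so it runs outside Fluentd:

```ruby
# Standalone reproduction of the host/namenode resolution added in configure;
# ArgumentError stands in for Fluent::ConfigError outside of Fluentd.
def resolve_namenode(host: nil, port: 50070, namenode: nil)
  if host
    [host, port] # explicit host wins; port defaults to 50070
  elsif namenode
    unless /\A([a-zA-Z0-9][-a-zA-Z0-9.]*):(\d+)\Z/ =~ namenode
      raise ArgumentError, "Invalid config value about namenode: '#{namenode}', needs NAMENODE_NAME:PORT"
    end
    [$1, $2.to_i] # HOST and PORT captured by the regex
  else
    raise ArgumentError, "WebHDFS host or namenode missing"
  end
end

p resolve_namenode(host: 'namenode.your.cluster.local') # => ["namenode.your.cluster.local", 50070]
p resolve_namenode(namenode: 'server.local:14000')      # => ["server.local", 14000]
```

The remaining hunks of this file drop a now-unused mutex and the record_to_string/format helpers superseded by the PlainTextFormatter mixin: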
```diff
@@ -52,7 +64,6 @@ class Fluent::WebHDFSOutput < Fluent::TimeSlicedOutput
     if @httpfs
       @client.httpfs_mode = true
     end
-    @mutex = Mutex.new
   end
 
   def start
@@ -73,13 +84,6 @@ class Fluent::WebHDFSOutput < Fluent::TimeSlicedOutput
     super
   end
 
-  def record_to_string(record)
-    record.to_json
-  end
-
-  # def format(tag, time, record)
-  # end
-
   def path_format(chunk_key)
     Time.strptime(chunk_key, @time_slice_format).strftime(@path)
   end
```
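path_format above is the heart of the time slicing: the chunk key, already rendered with time_slice_format, is parsed back into a Time and then expanded into the configured path. Extracted as a runnable snippet, with the expected values taken from the tests below:

```ruby
require 'time' # for Time.strptime

# The body of WebHDFSOutput#path_format, lifted out of the plugin:
# parse the chunk key with time_slice_format, then format it into path.
def path_format(chunk_key, time_slice_format, path)
  Time.strptime(chunk_key, time_slice_format).strftime(path)
end

puts path_format('20120718', '%Y%m%d', '/hdfs/path/file.%Y%m%d.log')
# => /hdfs/path/file.20120718.log
puts path_format('201207181503', '%Y%m%d%H%M', '/hdfs/path/file.%Y%m%d.%H%M.log')
# => /hdfs/path/file.20120718.1503.log
```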
data/test/helper.rb
ADDED
```ruby
require 'rubygems'
require 'bundler'
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'test/unit'

$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
$LOAD_PATH.unshift(File.dirname(__FILE__))
require 'fluent/test'
unless ENV.has_key?('VERBOSE')
  nulllogger = Object.new
  nulllogger.instance_eval {|obj|
    def method_missing(method, *args)
      # pass
    end
  }
  $log = nulllogger
end

require 'fluent/plugin/out_webhdfs'

class Test::Unit::TestCase
end
```
data/test/plugin/test_out_webhdfs.rb
ADDED

```ruby
require 'helper'

class WebHDFSOutputTest < Test::Unit::TestCase
  CONFIG = %[
    host namenode.local
    path /hdfs/path/file.%Y%m%d.log
  ]

  def create_driver(conf=CONFIG, tag='test')
    Fluent::Test::OutputTestDriver.new(Fluent::WebHDFSOutput, tag).configure(conf)
  end

  def test_configure
    d = create_driver
    assert_equal 'namenode.local', d.instance.instance_eval{ @namenode_host }
    assert_equal 50070, d.instance.instance_eval{ @namenode_port }
    assert_equal '/hdfs/path/file.%Y%m%d.log', d.instance.path
    assert_equal '%Y%m%d', d.instance.time_slice_format
    assert_equal false, d.instance.httpfs
    assert_nil d.instance.username

    assert_equal true, d.instance.output_include_time
    assert_equal true, d.instance.output_include_tag
    assert_equal 'json', d.instance.output_data_type
    assert_nil d.instance.remove_prefix
    assert_equal 'TAB', d.instance.field_separator
    assert_equal true, d.instance.add_newline
    assert_equal 'tag_missing', d.instance.default_tag

    d = create_driver %[
      namenode server.local:14000
      path /hdfs/path/file.%Y%m%d.%H%M.log
      httpfs yes
      username hdfs_user
    ]
    assert_equal 'server.local', d.instance.instance_eval{ @namenode_host }
    assert_equal 14000, d.instance.instance_eval{ @namenode_port }
    assert_equal '/hdfs/path/file.%Y%m%d.%H%M.log', d.instance.path
    assert_equal '%Y%m%d%H%M', d.instance.time_slice_format
    assert_equal true, d.instance.httpfs
    assert_equal 'hdfs_user', d.instance.username
  end

  def test_path_format
    d = create_driver
    assert_equal '/hdfs/path/file.%Y%m%d.log', d.instance.path
    assert_equal '%Y%m%d', d.instance.time_slice_format
    assert_equal '/hdfs/path/file.20120718.log', d.instance.path_format('20120718')

    d = create_driver %[
      namenode server.local:14000
      path /hdfs/path/file.%Y%m%d.%H%M.log
    ]
    assert_equal '/hdfs/path/file.%Y%m%d.%H%M.log', d.instance.path
    assert_equal '%Y%m%d%H%M', d.instance.time_slice_format
    assert_equal '/hdfs/path/file.20120718.1503.log', d.instance.path_format('201207181503')
  end
end
```
metadata
CHANGED
```diff
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: fluent-plugin-webhdfs
 version: !ruby/object:Gem::Version
-  version: 0.0.3
+  version: 0.0.4
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-07-
+date: 2012-07-18 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
@@ -123,7 +123,7 @@ dependencies:
     - - ! '>='
       - !ruby/object:Gem::Version
         version: 0.5.0
-description: For WebHDFS
+description: For WebHDFS and HttpFs of Hadoop HDFS
 email:
 - tagomoris@gmail.com
 executables: []
@@ -137,7 +137,9 @@ files:
 - Rakefile
 - fluent-plugin-webhdfs.gemspec
 - lib/fluent/plugin/out_webhdfs.rb
-homepage: https://github.com/
+- test/helper.rb
+- test/plugin/test_out_webhdfs.rb
+homepage: https://github.com/fluent/fluent-plugin-webhdfs
 licenses: []
 post_install_message:
 rdoc_options: []
@@ -161,4 +163,6 @@ rubygems_version: 1.8.21
 signing_key:
 specification_version: 3
 summary: Fluentd plugin to write data on HDFS over WebHDFS, with flexible formatting
-test_files:
+test_files:
+- test/helper.rb
+- test/plugin/test_out_webhdfs.rb
```