fluent-plugin-webhdfs 0.7.1 → 1.0.0
- checksums.yaml +4 -4
- data/.travis.yml +1 -10
- data/Appraisals +0 -2
- data/README.md +55 -45
- data/fluent-plugin-webhdfs.gemspec +4 -4
- data/lib/fluent/plugin/out_webhdfs.rb +193 -100
- data/lib/fluent/plugin/webhdfs_compressor_bzip2.rb +3 -7
- data/lib/fluent/plugin/webhdfs_compressor_gzip.rb +3 -3
- data/lib/fluent/plugin/webhdfs_compressor_lzo_command.rb +3 -3
- data/lib/fluent/plugin/webhdfs_compressor_snappy.rb +2 -2
- data/lib/fluent/plugin/webhdfs_compressor_text.rb +2 -2
- data/test/helper.rb +5 -0
- data/test/plugin/test_compressor.rb +3 -3
- data/test/plugin/test_out_webhdfs.rb +179 -105
- metadata +18 -24
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8865ee69f790536d7ce119ead16abc7862efdf59
+  data.tar.gz: df3f7cc64b42d465733ca51815b9e066b9d4310c
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 384e20026a6b64a91c3ca59058e6e4e33480dda26057165cf9464006bc7e0b6d61170d97878abfe5c8c29a23b263e7a6db770f4cfcf4170b5de57ee3f2337da6
+  data.tar.gz: 464ec8333c3c97002150cabbeed4f13077abfe473944f4a56fb764414f25e6e03e793942f11ca270695c22cc813866e63c802debfa6f967ac08f98e4637b9e5d
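The checksum block above is how RubyGems pins the release artifacts. As a minimal sketch (not part of the diff; the file path is hypothetical), a published checksum can be recomputed with Ruby's stdlib Digest and compared against the entries above:

    require 'digest'

    data = File.binread('metadata.gz')  # hypothetical path to the gem's unpacked metadata.gz
    puts Digest::SHA1.hexdigest(data)   # compare with the SHA1 entry
    puts Digest::SHA512.hexdigest(data) # compare with the SHA512 entry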
data/.travis.yml
CHANGED
@@ -2,10 +2,9 @@ sudo: false
 language: ruby
 
 rvm:
-  - 2.0.0
   - 2.1
   - 2.2
-  - 2.3.
+  - 2.3.1
 
 branches:
   only:
@@ -23,12 +22,4 @@ script: bundle exec rake test
 
 gemfile:
   - Gemfile
-  - gemfiles/fluentd_v0.12.gemfile
   - gemfiles/fluentd_v0.14.gemfile
-
-matrix:
-  exclude:
-    - rvm: 2.0.0
-      gemfile: Gemfile
-    - rvm: 2.0.0
-      gemfile: gemfiles/fluentd_v0.14.gemfile
data/Appraisals
CHANGED
data/README.md
CHANGED
@@ -2,16 +2,18 @@
 
 [Fluentd](http://fluentd.org/) output plugin to write data into Hadoop HDFS over WebHDFS/HttpFs.
 
-
+The "webhdfs" output plugin formats data into plain text and stores it as files on HDFS. This plugin supports:
 
-*
-
-*
-* include tag as line header, or not
-* change field separator (default: TAB)
-* add new line as termination, or not
+* injecting tag and time into records (and outputting plain text data) using the `<inject>` section
+* formatting events into plain text by format plugins using the `<format>` section
+* controlling flushing using the `<buffer>` section
 
-
+Paths on HDFS can be generated from the event timestamp, tag or any other fields in records.
+
+### Older versions
+
+The `0.x.x` versions of this plugin are for older versions of Fluentd (v0.12.x). Old-style configuration parameters (`output_data_type`, `output_include_*` and others) are still supported, but deprecated.
+Users should use the `<format>` section to control how events are formatted into plain text.
 
 ## Configuration
 
@@ -26,15 +28,16 @@ To store data by time,tag,json (same with '@type file') over WebHDFS:
 
     path /path/on/hdfs/access.log.%Y%m%d_%H.log
   </match>
 
-If you want JSON object only (without time or tag or both on header of lines),
+If you want the JSON object only (without time and/or tag headers on lines), use the `<format>` section to specify the `json` formatter:
 
   <match access.**>
     @type webhdfs
     host namenode.your.cluster.local
     port 50070
     path /path/on/hdfs/access.log.%Y%m%d_%H.log
-
-
+    <format>
+      @type json
+    </format>
   </match>
 
 To specify namenode, `namenode` is also available:
@@ -45,14 +48,47 @@ To specify namenode, `namenode` is also available:
 
     path /path/on/hdfs/access.log.%Y%m%d_%H.log
   </match>
 
-To store data as
+To store data as JSON, including time and tag (using `<inject>`), over WebHDFS:
 
   <match access.**>
     @type webhdfs
     host namenode.your.cluster.local
     port 50070
     path /path/on/hdfs/access.log.%Y%m%d_%H.log
-
+    <buffer>
+      timekey_zone -0700 # to specify timezone used for "path" time placeholder formatting
+    </buffer>
+    <inject>
+      tag_key tag
+      time_key time
+      time_type string
+      timezone -0700
+    </inject>
+    <format>
+      @type json
+    </format>
+  </match>
+
+To store data as JSON, including time as unix time, using a path that includes the tag as a directory:
+
+  <match access.**>
+    @type webhdfs
+    host namenode.your.cluster.local
+    port 50070
+    path /path/on/hdfs/${tag}/access.log.%Y%m%d_%H.log
+    <buffer time,tag>
+      @type file                   # using file buffer
+      path /var/log/fluentd/buffer # buffer directory path
+      timekey 3h                   # create a file per 3h
+      timekey_use_utc true         # times in path are formatted in UTC (default false means localtime)
+    </buffer>
+    <inject>
+      time_key time
+      time_type unixtime
+    </inject>
+    <format>
+      @type json
+    </format>
   </match>
 
 With username of pseudo authentication:
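A sketch (assumed values, not plugin code) of what the `<inject>` section above adds to one record before the `json` formatter runs; with `time_type string` and `timezone -0700` the injected time comes out as an ISO8601-like string in that offset:

    require 'time'

    record = { "host" => "127.0.0.1", "method" => "GET" }  # hypothetical event
    record["tag"]  = "access.test"                         # tag_key tag
    record["time"] = Time.now.localtime(-7 * 3600).iso8601 # time_key time, time_type string, timezone -0700
    # => {"host"=>"127.0.0.1", "method"=>"GET", "tag"=>"access.test", "time"=>"2016-08-20T10:00:00-07:00"}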
@@ -75,24 +111,6 @@ Store data over HttpFs (instead of WebHDFS):
 
     httpfs true
   </match>
 
-Store data as TSV (TAB separated values) of specified keys, without time, with tag (removed prefix 'access'):
-
-  <match access.**>
-    @type webhdfs
-    host namenode.your.cluster.local
-    port 50070
-    path /path/on/hdfs/access.log.%Y%m%d_%H.log
-
-    field_separator TAB # or 'SPACE', 'COMMA' or 'SOH'(Start Of Heading: \001)
-    output_include_time false
-    output_include_tag true
-    remove_prefix access
-
-    output_data_type attr:path,status,referer,agent,bytes
-  </match>
-
-If message doesn't have specified attribute, fluent-plugin-webhdfs outputs 'NULL' instead of values.
-
 With ssl:
 
   <match access.**>
@@ -118,11 +136,8 @@ With kerberos authentication:
 
     port 50070
     path /path/on/hdfs/access.log.%Y%m%d_%H.log
     kerberos true
-    kerberos_keytab /path/to/keytab # if needed
   </match>
 
-NOTE: You need to install `gssapi` gem for kerberos. See https://github.com/kzk/webhdfs#for-kerberos-authentication
-
 If you want to compress data before storing it:
 
   <match access.**>
@@ -134,10 +149,7 @@ If you want to compress data before storing it:
 
   </match>
 
 Note that if you set `compress gzip`, then the suffix `.gz` will be added to the path (or `.bz2`, `.sz`, `.lzo`).
-Note that you have to install
-
-  - snappy: install snappy gem
-  - bzip2: install bzip2-ffi gem
+Note that you have to install the snappy gem if you want to set `compress snappy`.
 
 ### Namenode HA / Auto retry for WebHDFS known errors
 
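A sketch of the suffix rule described in the note above: each compressor contributes a fixed extension appended to the generated path (mapping taken from the note; `text` adds nothing):

    SUFFIX = { gzip: '.gz', bzip2: '.bz2', snappy: '.sz', lzo_command: '.lzo', text: '' }
    path = "/path/on/hdfs/access.log.20120820_10.log"
    path + SUFFIX[:gzip] # => "/path/on/hdfs/access.log.20120820_10.log.gz"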
@@ -164,7 +176,7 @@ And you can also specify to retry known hdfs errors (such as `LeaseExpiredException`):
 
 ### Performance notifications
 
 Writing data into a single file on HDFS from 2 or more fluentd nodes makes many bad blocks on HDFS. If you want to run 2 or more fluentd nodes with fluent-plugin-webhdfs, you should configure 'path' for each node.
-
+To include the hostname, `#{Socket.gethostname}` is available in Fluentd configuration string literals as a ruby expression (in `"..."` strings). This plugin also supports the `${uuid}` placeholder to include a random uuid in paths.
 
 For hostname:
 
@@ -172,7 +184,7 @@ For hostname:
 
     @type webhdfs
     host namenode.your.cluster.local
     port 50070
-    path /log/access/%Y%m%d
+    path "/log/access/%Y%m%d/#{Socket.gethostname}.log" # double quotes are needed to expand the ruby expression in the string
   </match>
 
 Or with random filename (to avoid duplicated file name only):
@@ -181,10 +193,10 @@ Or with random filename (to avoid duplicated file name only):
 
     @type webhdfs
     host namenode.your.cluster.local
     port 50070
-    path /log/access/%Y%m%d/${uuid
+    path /log/access/%Y%m%d/${uuid}.log
  </match>
 
-With configurations above, you can handle all of files of
+With the configurations above, you can handle all files under `/log/access/20120820/*` as access logs of the specified timeslice.
 
 For high load cluster nodes, you can specify timeouts for HTTP requests.
 
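A sketch (assumptions: the plugin expands `%Y%m%d` per chunk timekey and `${uuid}` per generated file) of how the two path styles above come out; `#{Socket.gethostname}` is plain Ruby string interpolation done once by Fluentd's config parser at startup:

    require 'socket'
    require 'securerandom'

    t = Time.utc(2012, 8, 20, 10)
    path = "/log/access/%Y%m%d/#{Socket.gethostname}.log" # "#{...}" expanded once, at startup
    t.strftime(path)                                      # => "/log/access/20120820/<hostname>.log"
    t.strftime("/log/access/%Y%m%d/${uuid}.log").sub('${uuid}', SecureRandom.uuid)
    # => e.g. "/log/access/20120820/3f1a....log" (random per generated file)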
@@ -220,15 +232,13 @@ With unstable datanodes that frequently go down, appending over WebHDFS may produce broken files:
 
     port 50070
 
     append no
-    path /log/access/%Y%m%d
+    path "/log/access/%Y%m%d/#{Socket.gethostname}.${chunk_id}.log"
   </match>
 
 `out_webhdfs` creates new files on hdfs per flush of fluentd, with the chunk id. You don't need to care about broken files from append operations.
 
 ## TODO
 
-* configuration example for Hadoop Namenode HA
-  * here, or docs.fluentd.org ?
 * patches welcome!
 
 ## Copyright
data/fluent-plugin-webhdfs.gemspec
CHANGED
@@ -2,7 +2,7 @@
 
 Gem::Specification.new do |gem|
   gem.name = "fluent-plugin-webhdfs"
-  gem.version = "0.7.1"
+  gem.version = "1.0.0"
   gem.authors = ["TAGOMORI Satoshi"]
   gem.email = ["tagomoris@gmail.com"]
   gem.summary = %q{Fluentd plugin to write data on HDFS over WebHDFS, with flexible formatting}
@@ -17,10 +17,10 @@ Gem::Specification.new do |gem|
 
   gem.add_development_dependency "rake"
   gem.add_development_dependency "test-unit"
+  gem.add_development_dependency "test-unit-rr"
   gem.add_development_dependency "appraisal"
   gem.add_development_dependency "snappy", '>= 0.0.13'
-  gem.
-  gem.add_runtime_dependency "fluentd", ['>= 0.10.59', "< 0.14.0"]
-  gem.add_runtime_dependency "fluent-mixin-plaintextformatter", '>= 0.2.1'
+  gem.add_runtime_dependency "fluentd", '>= 0.14.4'
   gem.add_runtime_dependency "webhdfs", '>= 0.6.0'
+  gem.add_runtime_dependency "bzip2-ffi"
 end
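The dependency change above is the crux of the 1.0.0 release: the fluent-mixin-plaintextformatter dependency is dropped and fluentd v0.14.4+ becomes required. A sketch (standard RubyGems API) of what the new constraint accepts:

    requirement = Gem::Requirement.new('>= 0.14.4')
    requirement.satisfied_by?(Gem::Version.new('0.14.8'))  # => true
    requirement.satisfied_by?(Gem::Version.new('0.12.29')) # => false; v0.12 users stay on the 0.x.x plugin line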
data/lib/fluent/plugin/out_webhdfs.rb
CHANGED
@@ -1,131 +1,138 @@
 # -*- coding: utf-8 -*-
 
+require 'fluent/plugin/output'
+require 'fluent/config/element'
+
+require 'webhdfs'
 require 'tempfile'
 require 'securerandom'
-require 'fluent/mixin/plaintextformatter'
 
-class Fluent::WebHDFSOutput < Fluent::TimeSlicedOutput
+class Fluent::Plugin::WebHDFSOutput < Fluent::Plugin::Output
   Fluent::Plugin.register_output('webhdfs', self)
 
-
-  config_set_default :time_slice_format, '%Y%m%d'
-
-  # For fluentd v0.12.16 or earlier
-  class << self
-    unless method_defined?(:desc)
-      def desc(description)
-      end
-    end
-  end
+  helpers :inject, :formatter, :compat_parameters
 
   desc 'WebHDFS/HttpFs host'
-  config_param :host, :string, :
+  config_param :host, :string, default: nil
   desc 'WebHDFS/HttpFs port'
-  config_param :port, :integer, :
+  config_param :port, :integer, default: 50070
   desc 'Namenode (host:port)'
-  config_param :namenode, :string, :
+  config_param :namenode, :string, default: nil # host:port
   desc 'Standby namenode for Namenode HA (host:port)'
-  config_param :standby_namenode, :string, :
+  config_param :standby_namenode, :string, default: nil # host:port
 
   desc 'Ignore errors on start up'
-  config_param :ignore_start_check_error, :bool, :
+  config_param :ignore_start_check_error, :bool, default: false
 
   desc 'Output file path on HDFS'
   config_param :path, :string
   desc 'User name for pseudo authentication'
-  config_param :username, :string, :
+  config_param :username, :string, default: nil
 
   desc 'Store data over HttpFs instead of WebHDFS'
-  config_param :httpfs, :bool, :
+  config_param :httpfs, :bool, default: false
 
   desc 'Number of seconds to wait for the connection to open'
-  config_param :open_timeout, :integer, :
+  config_param :open_timeout, :integer, default: 30 # from ruby net/http default
   desc 'Number of seconds to wait for one block to be read'
-  config_param :read_timeout, :integer, :
+  config_param :read_timeout, :integer, default: 60 # from ruby net/http default
 
   desc 'Retry automatically when known errors of HDFS are occurred'
-  config_param :retry_known_errors, :bool, :
+  config_param :retry_known_errors, :bool, default: false
   desc 'Retry interval'
-  config_param :retry_interval, :integer, :
+  config_param :retry_interval, :integer, default: nil
   desc 'The number of retries'
-  config_param :retry_times, :integer, :
+  config_param :retry_times, :integer, default: nil
 
   # how many write failures before switching to the standby namenode;
   # by default it's 11 times, which costs 1023 seconds inside fluentd,
   # considered enough to exclude failures caused by a temporary network problem or a single datanode failure
   desc 'How many times of write failure before switch to standby namenode'
-  config_param :failures_before_use_standby, :integer, :
-
-  include Fluent::Mixin::PlainTextFormatter
+  config_param :failures_before_use_standby, :integer, default: 11
 
-  config_param :
+  config_param :end_with_newline, :bool, default: true
 
   desc 'Append data or not'
-  config_param :append, :bool, :
+  config_param :append, :bool, default: true
 
   desc 'Use SSL or not'
-  config_param :ssl, :bool, :
+  config_param :ssl, :bool, default: false
   desc 'OpenSSL certificate authority file'
-  config_param :ssl_ca_file, :string, :
+  config_param :ssl_ca_file, :string, default: nil
   desc 'OpenSSL verify mode (none,peer)'
-  config_param :ssl_verify_mode, :
-    case val
-    when 'none'
-      :none
-    when 'peer'
-      :peer
-    else
-      raise Fluent::ConfigError, "unexpected parameter on ssl_verify_mode: #{val}"
-    end
-  end
+  config_param :ssl_verify_mode, :enum, list: [:none, :peer], default: :none
 
   desc 'Use kerberos authentication or not'
-  config_param :kerberos, :bool, :
-  desc 'kerberos keytab file'
-  config_param :kerberos_keytab, :string, :default => nil
+  config_param :kerberos, :bool, default: false
 
-  SUPPORTED_COMPRESS = [
+  SUPPORTED_COMPRESS = [:gzip, :bzip2, :snappy, :lzo_command, :text]
   desc "Compress method (#{SUPPORTED_COMPRESS.join(',')})"
-  config_param :compress, :
-
-
-
-
-
+  config_param :compress, :enum, list: SUPPORTED_COMPRESS, default: :text
+
+  config_param :remove_prefix, :string, default: nil, deprecated: "use @label for routing"
+  config_param :default_tag, :string, default: nil, deprecated: "use @label for routing"
+  config_param :null_value, :string, default: nil, deprecated: "use filter plugins to convert null values into any specified string"
+  config_param :suppress_log_broken_string, :bool, default: false, deprecated: "use @log_level for plugin to suppress such info logs"
 
   CHUNK_ID_PLACE_HOLDER = '${chunk_id}'
 
-
+  config_section :buffer do
+    config_set_default :chunk_keys, ["time"]
+  end
+
+  config_section :format do
+    config_set_default :@type, 'out_file'
+    config_set_default :localtime, false # default timezone is UTC
+  end
+
+  attr_reader :formatter, :compressor
 
   def initialize
     super
-    require 'net/http'
-    require 'time'
-    require 'webhdfs'
-
     @compressor = nil
-
-
-
-    unless method_defined?(:log)
-      define_method("log") { $log }
+    @standby_namenode_host = nil
+    @output_include_tag = @output_include_time = nil # TODO: deprecated
+    @header_separator = @field_separator = nil # TODO: deprecated
   end
 
   def configure(conf)
-
-
-
-
-
-
-
-
+    compat_parameters_convert(conf, :buffer, default_chunk_key: "time")
+
+    timekey = case conf["path"]
+              when /%S/ then 1
+              when /%M/ then 60
+              when /%H/ then 3600
+              else 86400
+              end
+    if conf.elements(name: "buffer").empty?
+      e = Fluent::Config::Element.new("buffer", "time", {}, [])
+      conf.elements << e
     end
+    buffer_config = conf.elements(name: "buffer").first
+    buffer_config["timekey"] = timekey unless buffer_config["timekey"]
 
-
+    compat_parameters_convert_plaintextformatter(conf)
 
     super
 
+    @formatter = formatter_create
+
+    if @using_formatter_config
+      @null_value = nil
+    else
+      @formatter.delimiter = "\x01" if @formatter.respond_to?(:delimiter) && @formatter.delimiter == 'SOH'
+      @null_value ||= 'NULL'
+    end
+
+    if @default_tag.nil? && !@using_formatter_config && @output_include_tag
+      @default_tag = "tag_missing"
+    end
+    if @remove_prefix
+      @remove_prefix_actual = @remove_prefix + "."
+      @remove_prefix_actual_length = @remove_prefix_actual.length
+    end
+
+    verify_config_placeholders_in_path!(conf)
     @replace_random_uuid = @path.include?('%{uuid}') || @path.include?('%{uuid_flush}')
     if @replace_random_uuid
       # to check SecureRandom.uuid is available or not (NotImplementedError raised in such environment)
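The `configure` hunk above derives a default buffer `timekey` from the finest strftime specifier found in `path`. A standalone sketch of the same rule, extracted from the added code for illustration:

    def default_timekey_for(path)
      case path
      when /%S/ then 1      # per-second files
      when /%M/ then 60     # per-minute files
      when /%H/ then 3600   # hourly files
      else 86400            # daily files
      end
    end

    default_timekey_for('/path/on/hdfs/access.log.%Y%m%d_%H.log') # => 3600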
@@ -136,14 +143,7 @@ class Fluent::WebHDFSOutput < Fluent::TimeSlicedOutput
       end
     end
 
-    begin
-      @compressor = COMPRESSOR_REGISTRY.lookup(@compress || 'text').new
-    rescue Fluent::ConfigError
-      raise
-    rescue
-      $log.warn "#{@comress} not found. Use 'text' instead"
-      @compressor = COMPRESSOR_REGISTRY.lookup('text').new
-    end
+    @compressor = COMPRESSOR_REGISTRY.lookup(@compress.to_s).new
 
     if @host
       @namenode_host = @host
@@ -178,7 +178,7 @@ class Fluent::WebHDFSOutput < Fluent::TimeSlicedOutput
       @client_standby = nil
     end
 
-
+    unless @append
       if @path.index(CHUNK_ID_PLACE_HOLDER).nil?
         raise Fluent::ConfigError, "path must contain ${chunk_id}, which is the placeholder for chunk_id, when append is set to false."
       end
@@ -204,7 +204,6 @@ class Fluent::WebHDFSOutput < Fluent::TimeSlicedOutput
     end
     if @kerberos
       client.kerberos = true
-      client.kerberos_keytab = @kerberos_keytab if @kerberos_keytab
     end
 
     client
@@ -242,14 +241,6 @@ class Fluent::WebHDFSOutput < Fluent::TimeSlicedOutput
     end
   end
 
-  def shutdown
-    super
-  end
-
-  def path_format(chunk_key)
-    Time.strptime(chunk_key, @time_slice_format).strftime(@path)
-  end
-
   def is_standby_exception(e)
     e.is_a?(WebHDFS::IOError) && e.message.match(/org\.apache\.hadoop\.ipc\.StandbyException/)
   end
@@ -261,12 +252,6 @@ class Fluent::WebHDFSOutput < Fluent::TimeSlicedOutput
     end
   end
 
-  def chunk_unique_id_to_str(unique_id)
-    unique_id.unpack('C*').map{|x| x.to_s(16).rjust(2,'0')}.join('')
-  end
-
-  # TODO check conflictions
-
   def send_data(path, data)
     if @append
       begin
@@ -281,7 +266,7 @@ class Fluent::WebHDFSOutput < Fluent::TimeSlicedOutput
 
   HOSTNAME_PLACEHOLDERS_DEPRECATED = ['${hostname}', '%{hostname}', '__HOSTNAME__']
   UUID_RANDOM_PLACEHOLDERS_DEPRECATED = ['${uuid}', '${uuid:random}', '__UUID__', '__UUID_RANDOM__']
-  UUID_OTHER_PLACEHOLDERS_OBSOLETED = ['${uuid:hostname}', '%{uuid:hostname}', '__UUID_HOSTNAME__', '${uuid:timestamp}', '%{uuid:timestamp}', '__UUID_TIMESTAMP__']
+  UUID_OTHER_PLACEHOLDERS_OBSOLETED = ['${uuid:hostname}', '%{uuid:hostname}', '__UUID_HOSTNAME__', '${uuid:timestamp}', '%{uuid:timestamp}', '__UUID_TIMESTAMP__']
 
   def verify_config_placeholders_in_path!(conf)
     return unless conf.has_key?('path')
@@ -310,20 +295,20 @@ class Fluent::WebHDFSOutput < Fluent::TimeSlicedOutput
           log.error "configuration placeholder #{ph} is now unsupported by webhdfs output plugin."
         end
       end
-      raise
+      raise ConfigError, "there are unsupported placeholders in path."
     end
   end
 
   def generate_path(chunk)
     hdfs_path = if @append
-
+                  extract_placeholders(@path, chunk.metadata)
                 else
-
+                  extract_placeholders(@path, chunk.metadata).gsub(CHUNK_ID_PLACE_HOLDER, dump_unique_id(chunk.unique_id))
                 end
     hdfs_path = "#{hdfs_path}#{@compressor.ext}"
     if @replace_random_uuid
       uuid_random = SecureRandom.uuid
-      hdfs_path
+      hdfs_path.gsub!('%{uuid}', uuid_random).gsub!('%{uuid_flush}', uuid_random)
     end
     hdfs_path
   end
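A sketch of the `${chunk_id}` substitution in `generate_path` above: the chunk's binary unique id is rendered as lowercase hex (the removed `chunk_unique_id_to_str` shows the encoding) and spliced into the path. The id here is hypothetical and shorter than a real 16-byte one:

    unique_id = "\x0f\xa0".b # hypothetical 2-byte id
    hex = unique_id.unpack('C*').map { |x| x.to_s(16).rjust(2, '0') }.join
    "/log/access/20120820/host1.${chunk_id}.log".gsub('${chunk_id}', hex)
    # => "/log/access/20120820/host1.0fa0.log"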
@@ -339,6 +324,48 @@ class Fluent::WebHDFSOutput < Fluent::TimeSlicedOutput
     end
   end
 
+  def format(tag, time, record)
+    if @remove_prefix # TODO: remove when it's obsoleted
+      if tag.start_with?(@remove_prefix_actual)
+        if tag.length > @remove_prefix_actual_length
+          tag = tag[@remove_prefix_actual_length..-1]
+        else
+          tag = @default_tag
+        end
+      elsif tag.start_with?(@remove_prefix)
+        if tag == @remove_prefix
+          tag = @default_tag
+        else
+          tag = tag.sub(@remove_prefix, '')
+        end
+      end
+    end
+
+    if @null_value # TODO: remove when it's obsoleted
+      check_keys = (record.keys + @null_convert_keys).uniq
+      check_keys.each do |key|
+        record[key] = @null_value if record[key].nil?
+      end
+    end
+
+    if @using_formatter_config
+      record = inject_values_to_record(tag, time, record)
+      line = @formatter.format(tag, time, record)
+    else # TODO: remove when it's obsoleted
+      time_str = @output_include_time ? @time_formatter.call(time) + @header_separator : ''
+      tag_str = @output_include_tag ? tag + @header_separator : ''
+      record_str = @formatter.format(tag, time, record)
+      line = time_str + tag_str + record_str
+    end
+    line << "\n" if @end_with_newline && !line.end_with?("\n")
+    line
+  rescue => e # remove this clause when @suppress_log_broken_string is obsoleted
+    unless @suppress_log_broken_string
+      log.info "unexpected error while formatting events, ignored", tag: tag, record: record, error: e
+    end
+    ''
+  end
+
   def write(chunk)
     hdfs_path = generate_path(chunk)
 
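A sketch of the old-style branch of `format` above: when no `<format>` section is configured, the line is assembled as optional time header, optional tag header, then the formatted record, joined with the header separator (TAB unless `field_separator` says otherwise):

    header_separator = "\t"                                # default
    time_str   = "2012-08-20T10:00:00Z" + header_separator # when output_include_time is true
    tag_str    = "access.test" + header_separator          # when output_include_tag is true
    record_str = '{"status":200,"bytes":512}'              # result of @formatter.format(tag, time, record)
    line = time_str + tag_str + record_str + "\n"          # end_with_newline defaults to true
    # => "2012-08-20T10:00:00Z\taccess.test\t{\"status\":200,\"bytes\":512}\n"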
@@ -369,6 +396,72 @@ class Fluent::WebHDFSOutput < Fluent::TimeSlicedOutput
     hdfs_path
   end
 
+  def compat_parameters_convert_plaintextformatter(conf)
+    if !conf.elements('format').empty? || !conf['output_data_type']
+      @using_formatter_config = true
+      @null_convert_keys = []
+      return
+    end
+
+    log.warn "webhdfs output plugin is working with old configuration parameters. use <inject>/<format> sections instead for further releases."
+    @using_formatter_config = false
+    @null_convert_keys = []
+
+    @header_separator = case conf['field_separator']
+                        when nil then "\t"
+                        when 'SPACE' then ' '
+                        when 'TAB' then "\t"
+                        when 'COMMA' then ','
+                        when 'SOH' then "\x01"
+                        else conf['field_separator']
+                        end
+
+    format_section = Fluent::Config::Element.new('format', '', {}, [])
+    case conf['output_data_type']
+    when '', 'json' # blank value is for compatibility reason (especially in testing)
+      format_section['@type'] = 'json'
+    when 'ltsv'
+      format_section['@type'] = 'ltsv'
+    else
+      unless conf['output_data_type'].start_with?('attr:')
+        raise Fluent::ConfigError, "output_data_type is invalid: #{conf['output_data_type']}"
+      end
+      format_section['@format'] = 'tsv'
+      keys_part = conf['output_data_type'].sub(/^attr:/, '')
+      @null_convert_keys = keys_part.split(',')
+      format_section['keys'] = keys_part
+      format_section['delimiter'] = case conf['field_separator']
+                                    when nil then '\t'
+                                    when 'SPACE' then ' '
+                                    when 'TAB' then '\t'
+                                    when 'COMMA' then ','
+                                    when 'SOH' then 'SOH' # fixed later
+                                    else conf['field_separator']
+                                    end
+    end
+
+    conf.elements << format_section
+
+    @output_include_time = conf.has_key?('output_include_time') ? Fluent::Config.bool_value(conf['output_include_time']) : true
+    @output_include_tag = conf.has_key?('output_include_tag') ? Fluent::Config.bool_value(conf['output_include_tag']) : true
+
+    if @output_include_time
+      # default timezone is UTC
+      using_localtime = if !conf.has_key?('utc') && !conf.has_key?('localtime')
+                          false
+                        elsif conf.has_key?('localtime') && conf.has_key?('utc')
+                          raise Fluent::ConfigError, "specify either 'localtime' or 'utc'"
+                        elsif conf.has_key?('localtime')
+                          Fluent::Config.bool_value('localtime')
+                        else
+                          Fluent::Config.bool_value('utc')
+                        end
+      @time_formatter = Fluent::TimeFormatter.new(conf['time_format'], using_localtime)
+    else
+      @time_formatter = nil
+    end
+  end
+
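A sketch of what `compat_parameters_convert_plaintextformatter` above synthesizes for an old-style `output_data_type attr:status,bytes` with `field_separator COMMA`: an equivalent `<format>` element appended to the configuration (Fluent::Config::Element is a Hash subclass, so `[]=` works just as the method uses it):

    require 'fluent/config/element'

    format_section = Fluent::Config::Element.new('format', '', {}, [])
    format_section['@format']   = 'tsv'
    format_section['keys']      = 'status,bytes'
    format_section['delimiter'] = ','
    # conf.elements << format_section  # as done at the end of the conversion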
   class Compressor
     include Fluent::Configurable
 
@@ -395,7 +488,7 @@ class Fluent::WebHDFSOutput < Fluent::TimeSlicedOutput
       begin
         Open3.capture3("#{command} -V")
       rescue Errno::ENOENT
-        raise
+        raise ConfigError, "'#{command}' utility must be in PATH for #{algo} compression"
       end
     end
   end