fluent-plugin-webhdfs 0.7.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +1 -10
- data/Appraisals +0 -2
- data/README.md +55 -45
- data/fluent-plugin-webhdfs.gemspec +4 -4
- data/lib/fluent/plugin/out_webhdfs.rb +193 -100
- data/lib/fluent/plugin/webhdfs_compressor_bzip2.rb +3 -7
- data/lib/fluent/plugin/webhdfs_compressor_gzip.rb +3 -3
- data/lib/fluent/plugin/webhdfs_compressor_lzo_command.rb +3 -3
- data/lib/fluent/plugin/webhdfs_compressor_snappy.rb +2 -2
- data/lib/fluent/plugin/webhdfs_compressor_text.rb +2 -2
- data/test/helper.rb +5 -0
- data/test/plugin/test_compressor.rb +3 -3
- data/test/plugin/test_out_webhdfs.rb +179 -105
- metadata +18 -24
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8865ee69f790536d7ce119ead16abc7862efdf59
+  data.tar.gz: df3f7cc64b42d465733ca51815b9e066b9d4310c
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 384e20026a6b64a91c3ca59058e6e4e33480dda26057165cf9464006bc7e0b6d61170d97878abfe5c8c29a23b263e7a6db770f4cfcf4170b5de57ee3f2337da6
+  data.tar.gz: 464ec8333c3c97002150cabbeed4f13077abfe473944f4a56fb764414f25e6e03e793942f11ca270695c22cc813866e63c802debfa6f967ac08f98e4637b9e5d
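A `.gem` archive is a tar whose members include `metadata.gz` and `data.tar.gz`; the checksums above apply to those members. A minimal verification sketch (assuming the gem file has been fetched and unpacked with `tar -xf fluent-plugin-webhdfs-1.0.0.gem`):

    require 'digest'

    # Compare locally computed digests of the unpacked .gem members against
    # the values published in checksums.yaml above.
    %w[metadata.gz data.tar.gz].each do |member|
      puts "#{member} sha1:   #{Digest::SHA1.file(member).hexdigest}"
      puts "#{member} sha512: #{Digest::SHA512.file(member).hexdigest}"
    end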
data/.travis.yml
CHANGED

@@ -2,10 +2,9 @@ sudo: false
 language: ruby
 
 rvm:
-  - 2.0.0
   - 2.1
   - 2.2
-  - 2.3.
+  - 2.3.1
 
 branches:
   only:
@@ -23,12 +22,4 @@ script: bundle exec rake test
 
 gemfile:
   - Gemfile
-  - gemfiles/fluentd_v0.12.gemfile
   - gemfiles/fluentd_v0.14.gemfile
-
-matrix:
-  exclude:
-    - rvm: 2.0.0
-      gemfile: Gemfile
-    - rvm: 2.0.0
-      gemfile: gemfiles/fluentd_v0.14.gemfile
data/Appraisals
CHANGED
data/README.md
CHANGED

@@ -2,16 +2,18 @@
 
 [Fluentd](http://fluentd.org/) output plugin to write data into Hadoop HDFS over WebHDFS/HttpFs.
 
-
+"webhdfs" output plugin formats data into plain text, and store it as files on HDFS. This plugin supports:
 
-*
-
-*
-* include tag as line header, or not
-* change field separator (default: TAB)
-* add new line as termination, or not
+* inject tag and time into record (and output plain text data) using `<inject>` section
+* format events into plain text by format plugins using `<format>` section
+* control flushing using `<buffer>` section
 
-
+Paths on HDFS can be generated from event timestamp, tag or any other fields in records.
+
+### Older versions
+
+The versions of `0.x.x` of this plugin are for older version of Fluentd (v0.12.x). Old style configuration parameters (using `output_data_type`, `output_include_*` or others) are still supported, but are deprecated.
+Users should use `<format>` section to control how to format events into plain text.
 
 ## Configuration
 
@@ -26,15 +28,16 @@ To store data by time,tag,json (same with '@type file') over WebHDFS:
       path /path/on/hdfs/access.log.%Y%m%d_%H.log
     </match>
 
-If you want JSON object only (without time or tag or both on header of lines),
+If you want JSON object only (without time or tag or both on header of lines), use `<format>` section to specify `json` formatter:
 
     <match access.**>
       @type webhdfs
       host namenode.your.cluster.local
       port 50070
       path /path/on/hdfs/access.log.%Y%m%d_%H.log
-
-
+      <format>
+        @type json
+      </format>
     </match>
 
 To specify namenode, `namenode` is also available:
@@ -45,14 +48,47 @@ To specify namenode, `namenode` is also available:
       path /path/on/hdfs/access.log.%Y%m%d_%H.log
     </match>
 
-To store data as
+To store data as JSON, including time and tag (using `<inject>`), over WebHDFS:
 
     <match access.**>
      @type webhdfs
      host namenode.your.cluster.local
      port 50070
      path /path/on/hdfs/access.log.%Y%m%d_%H.log
-
+      <buffer>
+        timekey_zone -0700 # to specify timezone used for "path" time placeholder formatting
+      </buffer>
+      <inject>
+        tag_key tag
+        time_key time
+        time_type string
+        timezone -0700
+      </inject>
+      <format>
+        @type json
+      </format>
+    </match>
+
+To store data as JSON, including time as unix time, using path including tag as directory:
+
+    <match access.**>
+      @type webhdfs
+      host namenode.your.cluster.local
+      port 50070
+      path /path/on/hdfs/${tag}/access.log.%Y%m%d_%H.log
+      <buffer time,tag>
+        @type file                   # using file buffer
+        path /var/log/fluentd/buffer # buffer directory path
+        timekey 3h                   # create a file per 3h
+        timekey_use_utc true         # time in path are formatted in UTC (default false means localtime)
+      </buffer>
+      <inject>
+        time_key time
+        time_type unixtime
+      </inject>
+      <format>
+        @type json
+      </format>
     </match>
 
 With username of pseudo authentication:
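For illustration (values hypothetical, not part of the README): the difference between the two `<inject>` examples above is only the shape of the injected `time` field, a formatted string for `time_type string` versus an integer for `time_type unixtime`:

    require 'time'
    require 'json'

    # Hypothetical record after <inject>, for the two time_type settings above.
    t = Time.parse('2012-08-20 15:04:05 -0700')
    puts({ 'tag' => 'access.test', 'time' => t.strftime('%Y-%m-%dT%H:%M:%S%z'), 'status' => 200 }.to_json)
    puts({ 'time' => t.to_i, 'status' => 200 }.to_json)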
@@ -75,24 +111,6 @@ Store data over HttpFs (instead of WebHDFS):
       httpfs true
     </match>
 
-Store data as TSV (TAB separated values) of specified keys, without time, with tag (removed prefix 'access'):
-
-    <match access.**>
-      @type webhdfs
-      host namenode.your.cluster.local
-      port 50070
-      path /path/on/hdfs/access.log.%Y%m%d_%H.log
-
-      field_separator TAB # or 'SPACE', 'COMMA' or 'SOH'(Start Of Heading: \001)
-      output_include_time false
-      output_include_tag true
-      remove_prefix access
-
-      output_data_type attr:path,status,referer,agent,bytes
-    </match>
-
-If message doesn't have specified attribute, fluent-plugin-webhdfs outputs 'NULL' instead of values.
-
 With ssl:
 
    <match access.**>
@@ -118,11 +136,8 @@ With kerberos authentication:
       port 50070
       path /path/on/hdfs/access.log.%Y%m%d_%H.log
       kerberos true
-      kerberos_keytab /path/to/keytab # if needed
     </match>
 
-NOTE: You need to install `gssapi` gem for kerberos. See https://github.com/kzk/webhdfs#for-kerberos-authentication
-
 If you want to compress data before storing it:
 
    <match access.**>
@@ -134,10 +149,7 @@ If you want to compress data before storing it:
     </match>
 
 Note that if you set `compress gzip`, then the suffix `.gz` will be added to path (or `.bz2`, `sz`, `.lzo`).
-Note that you have to install
-
-  - snappy: install snappy gem
-  - bzip2: install bzip2-ffi gem
+Note that you have to install snappy gem if you want to set `compress snappy`.
 
 ### Namenode HA / Auto retry for WebHDFS known errors
 
@@ -164,7 +176,7 @@ And you can also specify to retry known hdfs errors (such like `LeaseExpiredException`) automatically:
 ### Performance notifications
 
 Writing data on HDFS single file from 2 or more fluentd nodes, makes many bad blocks of HDFS. If you want to run 2 or more fluentd nodes with fluent-plugin-webhdfs, you should configure 'path' for each node.
-
+To include hostname, `#{Socket.gethostname}` is available in Fluentd configuration string literals by ruby expression (in `"..."` strings). This plugin also supports `${uuid}` placeholder to include random uuid in paths.
 
 For hostname:
 
@@ -172,7 +184,7 @@ For hostname:
       @type webhdfs
       host namenode.your.cluster.local
       port 50070
-      path /log/access/%Y%m%d
+      path "/log/access/%Y%m%d/#{Socket.gethostname}.log" # double quotes needed to expand ruby expression in string
     </match>
 
 Or with random filename (to avoid duplicated file name only):
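As a side note, the `#{Socket.gethostname}` part is ordinary Ruby string interpolation, evaluated when the double-quoted config literal is parsed; outside Fluentd the same expression behaves like:

    require 'socket'

    # The %Y%m%d placeholders are left untouched for the plugin's own time
    # formatting; only the embedded Ruby expression is expanded here.
    puts "/log/access/%Y%m%d/#{Socket.gethostname}.log"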
@@ -181,10 +193,10 @@ Or with random filename (to avoid duplicated file name only):
       @type webhdfs
       host namenode.your.cluster.local
       port 50070
-      path /log/access/%Y%m%d/${uuid
+      path /log/access/%Y%m%d/${uuid}.log
     </match>
 
-With configurations above, you can handle all of files of
+With configurations above, you can handle all of files of `/log/access/20120820/*` as specified timeslice access logs.
 
 For high load cluster nodes, you can specify timeouts for HTTP requests.
 
@@ -220,15 +232,13 @@ With unstable datanodes that frequently downs, appending over WebHDFS may produce broken files:
       port 50070
 
       append no
-      path /log/access/%Y%m%d
+      path "/log/access/%Y%m%d/#{Socket.gethostname}.${chunk_id}.log"
     </match>
 
 `out_webhdfs` creates new files on hdfs per flush of fluentd, with chunk id. You shouldn't care broken files from append operations.
 
 ## TODO
 
-* configuration example for Hadoop Namenode HA
-  * here, or docs.fluentd.org ?
 * patches welcome!
 
 ## Copyright
data/fluent-plugin-webhdfs.gemspec
CHANGED

@@ -2,7 +2,7 @@
 
 Gem::Specification.new do |gem|
   gem.name = "fluent-plugin-webhdfs"
-  gem.version = "0.
+  gem.version = "1.0.0"
   gem.authors = ["TAGOMORI Satoshi"]
   gem.email = ["tagomoris@gmail.com"]
   gem.summary = %q{Fluentd plugin to write data on HDFS over WebHDFS, with flexible formatting}
@@ -17,10 +17,10 @@ Gem::Specification.new do |gem|
 
   gem.add_development_dependency "rake"
   gem.add_development_dependency "test-unit"
+  gem.add_development_dependency "test-unit-rr"
   gem.add_development_dependency "appraisal"
   gem.add_development_dependency "snappy", '>= 0.0.13'
-  gem.
-  gem.add_runtime_dependency "fluentd", ['>= 0.10.59', "< 0.14.0"]
-  gem.add_runtime_dependency "fluent-mixin-plaintextformatter", '>= 0.2.1'
+  gem.add_runtime_dependency "fluentd", '>= 0.14.4'
   gem.add_runtime_dependency "webhdfs", '>= 0.6.0'
+  gem.add_runtime_dependency "bzip2-ffi"
 end
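The runtime dependency swap is the compatibility boundary of this release: 1.0.0 requires Fluentd v0.14.4+ and drops `fluent-mixin-plaintextformatter`, while `bzip2-ffi` becomes a regular dependency. An illustrative Gemfile pin (not part of the diff):

    source 'https://rubygems.org'

    gem 'fluent-plugin-webhdfs', '~> 1.0'   # requires fluentd >= 0.14.4
    # gem 'fluent-plugin-webhdfs', '~> 0.7' # stay here for the fluentd v0.12.x line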
data/lib/fluent/plugin/out_webhdfs.rb
CHANGED

@@ -1,131 +1,138 @@
 # -*- coding: utf-8 -*-
 
+require 'fluent/plugin/output'
+require 'fluent/config/element'
+
+require 'webhdfs'
 require 'tempfile'
 require 'securerandom'
-require 'fluent/mixin/plaintextformatter'
 
-class Fluent::WebHDFSOutput < Fluent::
+class Fluent::Plugin::WebHDFSOutput < Fluent::Plugin::Output
   Fluent::Plugin.register_output('webhdfs', self)
 
-
-  config_set_default :time_slice_format, '%Y%m%d'
-
-  # For fluentd v0.12.16 or earlier
-  class << self
-    unless method_defined?(:desc)
-      def desc(description)
-      end
-    end
-  end
+  helpers :inject, :formatter, :compat_parameters
 
   desc 'WebHDFS/HttpFs host'
-  config_param :host, :string, :
+  config_param :host, :string, default: nil
   desc 'WebHDFS/HttpFs port'
-  config_param :port, :integer, :
+  config_param :port, :integer, default: 50070
   desc 'Namenode (host:port)'
-  config_param :namenode, :string, :
+  config_param :namenode, :string, default: nil # host:port
   desc 'Standby namenode for Namenode HA (host:port)'
-  config_param :standby_namenode, :string, :
+  config_param :standby_namenode, :string, default: nil # host:port
 
   desc 'Ignore errors on start up'
-  config_param :ignore_start_check_error, :bool, :
+  config_param :ignore_start_check_error, :bool, default: false
 
   desc 'Output file path on HDFS'
   config_param :path, :string
   desc 'User name for pseudo authentication'
-  config_param :username, :string, :
+  config_param :username, :string, default: nil
 
   desc 'Store data over HttpFs instead of WebHDFS'
-  config_param :httpfs, :bool, :
+  config_param :httpfs, :bool, default: false
 
   desc 'Number of seconds to wait for the connection to open'
-  config_param :open_timeout, :integer, :
+  config_param :open_timeout, :integer, default: 30 # from ruby net/http default
   desc 'Number of seconds to wait for one block to be read'
-  config_param :read_timeout, :integer, :
+  config_param :read_timeout, :integer, default: 60 # from ruby net/http default
 
   desc 'Retry automatically when known errors of HDFS are occurred'
-  config_param :retry_known_errors, :bool, :
+  config_param :retry_known_errors, :bool, default: false
   desc 'Retry interval'
-  config_param :retry_interval, :integer, :
+  config_param :retry_interval, :integer, default: nil
   desc 'The number of retries'
-  config_param :retry_times, :integer, :
+  config_param :retry_times, :integer, default: nil
 
   # how many times of write failure before switch to standby namenode
   # by default it's 11 times that costs 1023 seconds inside fluentd,
   # which is considered enough to exclude the scenes that caused by temporary network fail or single datanode fail
   desc 'How many times of write failure before switch to standby namenode'
-  config_param :failures_before_use_standby, :integer, :
-
-  include Fluent::Mixin::PlainTextFormatter
+  config_param :failures_before_use_standby, :integer, default: 11
 
-  config_param :
+  config_param :end_with_newline, :bool, default: true
 
   desc 'Append data or not'
-  config_param :append, :bool, :
+  config_param :append, :bool, default: true
 
   desc 'Use SSL or not'
-  config_param :ssl, :bool, :
+  config_param :ssl, :bool, default: false
   desc 'OpenSSL certificate authority file'
-  config_param :ssl_ca_file, :string, :
+  config_param :ssl_ca_file, :string, default: nil
   desc 'OpenSSL verify mode (none,peer)'
-  config_param :ssl_verify_mode, :
-    case val
-    when 'none'
-      :none
-    when 'peer'
-      :peer
-    else
-      raise Fluent::ConfigError, "unexpected parameter on ssl_verify_mode: #{val}"
-    end
-  end
+  config_param :ssl_verify_mode, :enum, list: [:none, :peer], default: :none
 
   desc 'Use kerberos authentication or not'
-  config_param :kerberos, :bool, :
-  desc 'kerberos keytab file'
-  config_param :kerberos_keytab, :string, :default => nil
+  config_param :kerberos, :bool, default: false
 
-  SUPPORTED_COMPRESS = [
+  SUPPORTED_COMPRESS = [:gzip, :bzip2, :snappy, :lzo_command, :text]
   desc "Compress method (#{SUPPORTED_COMPRESS.join(',')})"
-  config_param :compress, :
-
-
-
-
-
+  config_param :compress, :enum, list: SUPPORTED_COMPRESS, default: :text
+
+  config_param :remove_prefix, :string, default: nil, deprecated: "use @label for routing"
+  config_param :default_tag, :string, default: nil, deprecated: "use @label for routing"
+  config_param :null_value, :string, default: nil, deprecated: "use filter plugins to convert null values into any specified string"
+  config_param :suppress_log_broken_string, :bool, default: false, deprecated: "use @log_level for plugin to suppress such info logs"
 
   CHUNK_ID_PLACE_HOLDER = '${chunk_id}'
 
-
+  config_section :buffer do
+    config_set_default :chunk_keys, ["time"]
+  end
+
+  config_section :format do
+    config_set_default :@type, 'out_file'
+    config_set_default :localtime, false # default timezone is UTC
+  end
+
+  attr_reader :formatter, :compressor
 
   def initialize
     super
-    require 'net/http'
-    require 'time'
-    require 'webhdfs'
-
     @compressor = nil
-
-
-
-    unless method_defined?(:log)
-      define_method("log") { $log }
+    @standby_namenode_host = nil
+    @output_include_tag = @output_include_time = nil # TODO: deprecated
+    @header_separator = @field_separator = nil # TODO: deprecated
   end
 
   def configure(conf)
-
-
-
-
-
-
-
-
+    compat_parameters_convert(conf, :buffer, default_chunk_key: "time")
+
+    timekey = case conf["path"]
+              when /%S/ then 1
+              when /%M/ then 60
+              when /%H/ then 3600
+              else 86400
+              end
+    if conf.elements(name: "buffer").empty?
+      e = Fluent::Config::Element.new("buffer", "time", {}, [])
+      conf.elements << e
     end
+    buffer_config = conf.elements(name: "buffer").first
+    buffer_config["timekey"] = timekey unless buffer_config["timekey"]
 
-
+    compat_parameters_convert_plaintextformatter(conf)
 
     super
 
+    @formatter = formatter_create
+
+    if @using_formatter_config
+      @null_value = nil
+    else
+      @formatter.delimiter = "\x01" if @formatter.respond_to?(:delimiter) && @formatter.delimiter == 'SOH'
+      @null_value ||= 'NULL'
+    end
+
+    if @default_tag.nil? && !@using_formatter_config && @output_include_tag
+      @default_tag = "tag_missing"
+    end
+    if @remove_prefix
+      @remove_prefix_actual = @remove_prefix + "."
+      @remove_prefix_actual_length = @remove_prefix_actual.length
+    end
+
+    verify_config_placeholders_in_path!(conf)
     @replace_random_uuid = @path.include?('%{uuid}') || @path.include?('%{uuid_flush}')
     if @replace_random_uuid
       # to check SecureRandom.uuid is available or not (NotImplementedError raised in such environment)
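The new `configure` derives a default buffer `timekey` from the finest strftime placeholder appearing in `path`. A standalone sketch of the same derivation:

    # The finest time placeholder in the path decides how often a file is cut.
    def default_timekey(path)
      case path
      when /%S/ then 1
      when /%M/ then 60
      when /%H/ then 3600
      else 86400
      end
    end

    p default_timekey('/log/access.log.%Y%m%d_%H.log') # => 3600
    p default_timekey('/log/access.log.%Y%m%d.log')    # => 86400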
@@ -136,14 +143,7 @@ class Fluent::WebHDFSOutput < Fluent::TimeSlicedOutput
       end
     end
 
-
-      @compressor = COMPRESSOR_REGISTRY.lookup(@compress || 'text').new
-    rescue Fluent::ConfigError
-      raise
-    rescue
-      $log.warn "#{@comress} not found. Use 'text' instead"
-      @compressor = COMPRESSOR_REGISTRY.lookup('text').new
-    end
+    @compressor = COMPRESSOR_REGISTRY.lookup(@compress.to_s).new
 
     if @host
       @namenode_host = @host
@@ -178,7 +178,7 @@ class Fluent::WebHDFSOutput < Fluent::TimeSlicedOutput
       @client_standby = nil
     end
 
-
+    unless @append
       if @path.index(CHUNK_ID_PLACE_HOLDER).nil?
         raise Fluent::ConfigError, "path must contain ${chunk_id}, which is the placeholder for chunk_id, when append is set to false."
       end
@@ -204,7 +204,6 @@ class Fluent::WebHDFSOutput < Fluent::TimeSlicedOutput
     end
     if @kerberos
       client.kerberos = true
-      client.kerberos_keytab = @kerberos_keytab if @kerberos_keytab
     end
 
     client
@@ -242,14 +241,6 @@ class Fluent::WebHDFSOutput < Fluent::TimeSlicedOutput
       end
     end
 
-  def shutdown
-    super
-  end
-
-  def path_format(chunk_key)
-    Time.strptime(chunk_key, @time_slice_format).strftime(@path)
-  end
-
   def is_standby_exception(e)
     e.is_a?(WebHDFS::IOError) && e.message.match(/org\.apache\.hadoop\.ipc\.StandbyException/)
   end
@@ -261,12 +252,6 @@ class Fluent::WebHDFSOutput < Fluent::TimeSlicedOutput
     end
   end
 
-  def chunk_unique_id_to_str(unique_id)
-    unique_id.unpack('C*').map{|x| x.to_s(16).rjust(2,'0')}.join('')
-  end
-
-  # TODO check conflictions
-
   def send_data(path, data)
     if @append
       begin
@@ -281,7 +266,7 @@ class Fluent::WebHDFSOutput < Fluent::TimeSlicedOutput
 
   HOSTNAME_PLACEHOLDERS_DEPRECATED = ['${hostname}', '%{hostname}', '__HOSTNAME__']
   UUID_RANDOM_PLACEHOLDERS_DEPRECATED = ['${uuid}', '${uuid:random}', '__UUID__', '__UUID_RANDOM__']
-  UUID_OTHER_PLACEHOLDERS_OBSOLETED = ['${uuid:hostname}', '%{uuid:hostname}', '__UUID_HOSTNAME__', '${uuid:timestamp}', '%{uuid:timestamp}', '__UUID_TIMESTAMP__']
+  UUID_OTHER_PLACEHOLDERS_OBSOLETED = ['${uuid:hostname}', '%{uuid:hostname}', '__UUID_HOSTNAME__', '${uuid:timestamp}', '%{uuid:timestamp}', '__UUID_TIMESTAMP__']
 
   def verify_config_placeholders_in_path!(conf)
     return unless conf.has_key?('path')
@@ -310,20 +295,20 @@ class Fluent::WebHDFSOutput < Fluent::TimeSlicedOutput
           log.error "configuration placeholder #{ph} is now unsupported by webhdfs output plugin."
         end
       end
-      raise
+      raise ConfigError, "there are unsupported placeholders in path."
     end
   end
 
   def generate_path(chunk)
     hdfs_path = if @append
-
+                  extract_placeholders(@path, chunk.metadata)
                 else
-
+                  extract_placeholders(@path, chunk.metadata).gsub(CHUNK_ID_PLACE_HOLDER, dump_unique_id(chunk.unique_id))
                 end
     hdfs_path = "#{hdfs_path}#{@compressor.ext}"
     if @replace_random_uuid
       uuid_random = SecureRandom.uuid
-      hdfs_path
+      hdfs_path.gsub!('%{uuid}', uuid_random).gsub!('%{uuid_flush}', uuid_random)
     end
     hdfs_path
   end
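`dump_unique_id` (a v0.14 core helper) takes over from the removed `chunk_unique_id_to_str`; both render the chunk's binary unique id as lowercase hex before it is substituted for `${chunk_id}`. The conversion, sketched standalone:

    # Same logic as the removed chunk_unique_id_to_str helper.
    unique_id = "\x01\x2a\xff".b # a stand-in for chunk.unique_id
    hex = unique_id.unpack('C*').map { |x| x.to_s(16).rjust(2, '0') }.join
    puts hex # => "012aff"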
@@ -339,6 +324,48 @@ class Fluent::WebHDFSOutput < Fluent::TimeSlicedOutput
     end
   end
 
+  def format(tag, time, record)
+    if @remove_prefix # TODO: remove when it's obsoleted
+      if tag.start_with?(@remove_prefix_actual)
+        if tag.length > @remove_prefix_actual_length
+          tag = tag[@remove_prefix_actual_length..-1]
+        else
+          tag = @default_tag
+        end
+      elsif tag.start_with?(@remove_prefix)
+        if tag == @remove_prefix
+          tag = @default_tag
+        else
+          tag = tag.sub(@remove_prefix, '')
+        end
+      end
+    end
+
+    if @null_value # TODO: remove when it's obsoleted
+      check_keys = (record.keys + @null_convert_keys).uniq
+      check_keys.each do |key|
+        record[key] = @null_value if record[key].nil?
+      end
+    end
+
+    if @using_formatter_config
+      record = inject_values_to_record(tag, time, record)
+      line = @formatter.format(tag, time, record)
+    else # TODO: remove when it's obsoleted
+      time_str = @output_include_time ? @time_formatter.call(time) + @header_separator : ''
+      tag_str = @output_include_tag ? tag + @header_separator : ''
+      record_str = @formatter.format(tag, time, record)
+      line = time_str + tag_str + record_str
+    end
+    line << "\n" if @end_with_newline && !line.end_with?("\n")
+    line
+  rescue => e # remove this clause when @suppress_log_broken_string is obsoleted
+    unless @suppress_log_broken_string
+      log.info "unexpected error while formatting events, ignored", tag: tag, record: record, error: e
+    end
+    ''
+  end
+
   def write(chunk)
     hdfs_path = generate_path(chunk)
 
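When old-style parameters are in effect, the compatibility branch of `format` assembles each line by hand: optional time, optional tag, then the formatted record, joined by the header separator. Schematically (values hypothetical):

    sep = "\t"
    time_str   = '2012-08-20T15:04:05Z' + sep # when output_include_time
    tag_str    = 'access.test' + sep          # when output_include_tag
    record_str = '{"status":200}'
    line = time_str + tag_str + record_str
    line << "\n" unless line.end_with?("\n")  # end_with_newline behavior
    print line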
@@ -369,6 +396,72 @@ class Fluent::WebHDFSOutput < Fluent::TimeSlicedOutput
     hdfs_path
   end
 
+  def compat_parameters_convert_plaintextformatter(conf)
+    if !conf.elements('format').empty? || !conf['output_data_type']
+      @using_formatter_config = true
+      @null_convert_keys = []
+      return
+    end
+
+    log.warn "webhdfs output plugin is working with old configuration parameters. use <inject>/<format> sections instead for further releases."
+    @using_formatter_config = false
+    @null_convert_keys = []
+
+    @header_separator = case conf['field_separator']
+                        when nil then "\t"
+                        when 'SPACE' then ' '
+                        when 'TAB' then "\t"
+                        when 'COMMA' then ','
+                        when 'SOH' then "\x01"
+                        else conf['field_separator']
+                        end
+
+    format_section = Fluent::Config::Element.new('format', '', {}, [])
+    case conf['output_data_type']
+    when '', 'json' # blank value is for compatibility reason (especially in testing)
+      format_section['@type'] = 'json'
+    when 'ltsv'
+      format_section['@type'] = 'ltsv'
+    else
+      unless conf['output_data_type'].start_with?('attr:')
+        raise Fluent::ConfigError, "output_data_type is invalid: #{conf['output_data_type']}"
+      end
+      format_section['@format'] = 'tsv'
+      keys_part = conf['output_data_type'].sub(/^attr:/, '')
+      @null_convert_keys = keys_part.split(',')
+      format_section['keys'] = keys_part
+      format_section['delimiter'] = case conf['field_separator']
+                                    when nil then '\t'
+                                    when 'SPACE' then ' '
+                                    when 'TAB' then '\t'
+                                    when 'COMMA' then ','
+                                    when 'SOH' then 'SOH' # fixed later
+                                    else conf['field_separator']
+                                    end
+    end
+
+    conf.elements << format_section
+
+    @output_include_time = conf.has_key?('output_include_time') ? Fluent::Config.bool_value(conf['output_include_time']) : true
+    @output_include_tag = conf.has_key?('output_include_tag') ? Fluent::Config.bool_value(conf['output_include_tag']) : true
+
+    if @output_include_time
+      # default timezone is UTC
+      using_localtime = if !conf.has_key?('utc') && !conf.has_key?('localtime')
+                          false
+                        elsif conf.has_key?('localtime') && conf.has_key?('utc')
+                          raise Fluent::ConfigError, "specify either 'localtime' or 'utc'"
+                        elsif conf.has_key?('localtime')
+                          Fluent::Config.bool_value('localtime')
+                        else
+                          Fluent::Config.bool_value('utc')
+                        end
+      @time_formatter = Fluent::TimeFormatter.new(conf['time_format'], using_localtime)
+    else
+      @time_formatter = nil
+    end
+  end
+
   class Compressor
     include Fluent::Configurable
 
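For old configurations the converter synthesizes the `<format>` section users would otherwise write by hand; e.g. `output_data_type attr:path,status` becomes a `tsv` formatter over those keys. A simplified sketch of that mapping (hypothetical helper name, TAB separator only):

    def format_attrs_for(output_data_type)
      keys_part = output_data_type.sub(/^attr:/, '')
      { '@format' => 'tsv', 'keys' => keys_part, 'delimiter' => '\t' }
    end

    p format_attrs_for('attr:path,status,referer,agent,bytes')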
@@ -395,7 +488,7 @@ class Fluent::WebHDFSOutput < Fluent::TimeSlicedOutput
       begin
         Open3.capture3("#{command} -V")
       rescue Errno::ENOENT
-        raise
+        raise ConfigError, "'#{command}' utility must be in PATH for #{algo} compression"
       end
     end
   end