fluentd 1.6.3 → 1.7.0
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of fluentd might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/.drone.yml +35 -0
- data/.github/ISSUE_TEMPLATE/bug_report.md +2 -0
- data/CHANGELOG.md +58 -0
- data/README.md +5 -1
- data/fluentd.gemspec +1 -1
- data/lib/fluent/clock.rb +4 -0
- data/lib/fluent/compat/output.rb +3 -3
- data/lib/fluent/compat/socket_util.rb +1 -1
- data/lib/fluent/config/element.rb +3 -3
- data/lib/fluent/config/literal_parser.rb +1 -1
- data/lib/fluent/config/section.rb +4 -1
- data/lib/fluent/error.rb +4 -0
- data/lib/fluent/event.rb +28 -24
- data/lib/fluent/event_router.rb +2 -1
- data/lib/fluent/log.rb +1 -1
- data/lib/fluent/msgpack_factory.rb +8 -0
- data/lib/fluent/plugin/bare_output.rb +4 -4
- data/lib/fluent/plugin/buf_file_single.rb +211 -0
- data/lib/fluent/plugin/buffer.rb +62 -63
- data/lib/fluent/plugin/buffer/chunk.rb +21 -3
- data/lib/fluent/plugin/buffer/file_chunk.rb +37 -12
- data/lib/fluent/plugin/buffer/file_single_chunk.rb +314 -0
- data/lib/fluent/plugin/buffer/memory_chunk.rb +2 -1
- data/lib/fluent/plugin/compressable.rb +10 -6
- data/lib/fluent/plugin/filter_grep.rb +2 -2
- data/lib/fluent/plugin/formatter_csv.rb +10 -6
- data/lib/fluent/plugin/in_syslog.rb +10 -3
- data/lib/fluent/plugin/in_tail.rb +7 -2
- data/lib/fluent/plugin/in_tcp.rb +34 -7
- data/lib/fluent/plugin/multi_output.rb +4 -4
- data/lib/fluent/plugin/out_exec_filter.rb +1 -0
- data/lib/fluent/plugin/out_file.rb +13 -3
- data/lib/fluent/plugin/out_forward.rb +126 -588
- data/lib/fluent/plugin/out_forward/ack_handler.rb +161 -0
- data/lib/fluent/plugin/out_forward/connection_manager.rb +113 -0
- data/lib/fluent/plugin/out_forward/error.rb +28 -0
- data/lib/fluent/plugin/out_forward/failure_detector.rb +84 -0
- data/lib/fluent/plugin/out_forward/handshake_protocol.rb +121 -0
- data/lib/fluent/plugin/out_forward/load_balancer.rb +111 -0
- data/lib/fluent/plugin/out_forward/socket_cache.rb +138 -0
- data/lib/fluent/plugin/out_http.rb +231 -0
- data/lib/fluent/plugin/output.rb +29 -35
- data/lib/fluent/plugin/parser.rb +77 -0
- data/lib/fluent/plugin/parser_csv.rb +75 -0
- data/lib/fluent/plugin_helper/server.rb +1 -1
- data/lib/fluent/plugin_helper/thread.rb +1 -0
- data/lib/fluent/root_agent.rb +1 -1
- data/lib/fluent/time.rb +4 -2
- data/lib/fluent/timezone.rb +21 -7
- data/lib/fluent/version.rb +1 -1
- data/test/command/test_fluentd.rb +1 -1
- data/test/command/test_plugin_generator.rb +18 -2
- data/test/config/test_configurable.rb +78 -40
- data/test/counter/test_store.rb +1 -1
- data/test/helper.rb +1 -0
- data/test/helpers/process_extenstion.rb +33 -0
- data/test/plugin/out_forward/test_ack_handler.rb +101 -0
- data/test/plugin/out_forward/test_connection_manager.rb +145 -0
- data/test/plugin/out_forward/test_handshake_protocol.rb +103 -0
- data/test/plugin/out_forward/test_load_balancer.rb +60 -0
- data/test/plugin/out_forward/test_socket_cache.rb +139 -0
- data/test/plugin/test_buf_file.rb +118 -2
- data/test/plugin/test_buf_file_single.rb +734 -0
- data/test/plugin/test_buffer.rb +4 -48
- data/test/plugin/test_buffer_file_chunk.rb +19 -1
- data/test/plugin/test_buffer_file_single_chunk.rb +620 -0
- data/test/plugin/test_formatter_csv.rb +16 -0
- data/test/plugin/test_in_syslog.rb +56 -6
- data/test/plugin/test_in_tail.rb +1 -1
- data/test/plugin/test_in_tcp.rb +25 -0
- data/test/plugin/test_out_forward.rb +75 -201
- data/test/plugin/test_out_http.rb +352 -0
- data/test/plugin/test_output_as_buffered.rb +27 -24
- data/test/plugin/test_parser.rb +40 -0
- data/test/plugin/test_parser_csv.rb +83 -0
- data/test/plugin_helper/test_record_accessor.rb +1 -1
- data/test/test_time_formatter.rb +140 -121
- metadata +33 -4
@@ -110,7 +110,7 @@ module Fluent::Plugin
|
|
110
110
|
end
|
111
111
|
|
112
112
|
if @regexps.size > 1
|
113
|
-
log.info "Top level multiple <regexp> is
|
113
|
+
log.info "Top level multiple <regexp> is interpreted as 'and' condition"
|
114
114
|
end
|
115
115
|
@regexps.each do |e|
|
116
116
|
raise Fluent::ConfigError, "Duplicate key: #{e.key}" if regexp_and_conditions.key?(e.key)
|
@@ -118,7 +118,7 @@ module Fluent::Plugin
|
|
118
118
|
end
|
119
119
|
|
120
120
|
if @excludes.size > 1
|
121
|
-
log.info "Top level multiple <exclude> is
|
121
|
+
log.info "Top level multiple <exclude> is interpreted as 'or' condition"
|
122
122
|
end
|
123
123
|
@excludes.each do |e|
|
124
124
|
raise Fluent::ConfigError, "Duplicate key: #{e.key}" if exclude_or_conditions.key?(e.key)
|
@@ -33,18 +33,22 @@ module Fluent
|
|
33
33
|
|
34
34
|
def configure(conf)
|
35
35
|
super
|
36
|
+
|
36
37
|
@fields = fields.select{|f| !f.empty? }
|
37
38
|
raise ConfigError, "empty value is specified in fields parameter" if @fields.empty?
|
38
39
|
|
39
|
-
@generate_opts = {col_sep: @delimiter, force_quotes: @force_quotes
|
40
|
+
@generate_opts = {col_sep: @delimiter, force_quotes: @force_quotes, headers: @fields,
|
41
|
+
row_sep: @add_newline ? :auto : "".force_encoding(Encoding::ASCII_8BIT)}
|
42
|
+
# Cache CSV object per thread to avoid internal state sharing
|
43
|
+
@cache = {}
|
40
44
|
end
|
41
45
|
|
42
46
|
def format(tag, time, record)
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
47
|
+
csv = (@cache[Thread.current] ||= CSV.new("".force_encoding(Encoding::ASCII_8BIT), @generate_opts))
|
48
|
+
line = (csv << record).string.dup
|
49
|
+
# Need manual cleanup because CSV writer doesn't provide such method.
|
50
|
+
csv.rewind
|
51
|
+
csv.truncate(0)
|
48
52
|
line
|
49
53
|
end
|
50
54
|
end
|
@@ -199,6 +199,13 @@ module Fluent::Plugin
|
|
199
199
|
|
200
200
|
private
|
201
201
|
|
202
|
+
def emit_unmatched(data, sock)
|
203
|
+
record = {"unmatched_line" => data}
|
204
|
+
record[@source_address_key] = sock.remote_addr if @source_address_key
|
205
|
+
record[@source_hostname_key] = sock.remote_host if @source_hostname_key
|
206
|
+
emit("#{@tag}.unmatched", Fluent::EventTime.now, record)
|
207
|
+
end
|
208
|
+
|
202
209
|
def message_handler(data, sock)
|
203
210
|
pri = nil
|
204
211
|
text = data
|
@@ -206,7 +213,7 @@ module Fluent::Plugin
|
|
206
213
|
m = SYSLOG_REGEXP.match(data)
|
207
214
|
unless m
|
208
215
|
if @emit_unmatched_lines
|
209
|
-
|
216
|
+
emit_unmatched(data, sock)
|
210
217
|
end
|
211
218
|
log.warn "invalid syslog message: #{data.dump}"
|
212
219
|
return
|
@@ -218,7 +225,7 @@ module Fluent::Plugin
|
|
218
225
|
@parser.parse(text) do |time, record|
|
219
226
|
unless time && record
|
220
227
|
if @emit_unmatched_lines
|
221
|
-
|
228
|
+
emit_unmatched(data, sock)
|
222
229
|
end
|
223
230
|
log.warn "failed to parse message", data: data
|
224
231
|
return
|
@@ -238,7 +245,7 @@ module Fluent::Plugin
|
|
238
245
|
end
|
239
246
|
rescue => e
|
240
247
|
if @emit_unmatched_lines
|
241
|
-
|
248
|
+
emit_unmatched(data, sock)
|
242
249
|
end
|
243
250
|
log.error "invalid input", data: data, error: e
|
244
251
|
log.error_backtrace
|
@@ -97,6 +97,10 @@ module Fluent::Plugin
|
|
97
97
|
desc 'Ignore repeated permission error logs'
|
98
98
|
config_param :ignore_repeated_permission_error, :bool, default: false
|
99
99
|
|
100
|
+
config_section :parse, required: false, multi: true, init: true, param_name: :parser_configs do
|
101
|
+
config_argument :usage, :string, default: 'in_tail_parser'
|
102
|
+
end
|
103
|
+
|
100
104
|
attr_reader :paths
|
101
105
|
|
102
106
|
@@pos_file_paths = {}
|
@@ -148,7 +152,8 @@ module Fluent::Plugin
|
|
148
152
|
method(:parse_singleline)
|
149
153
|
end
|
150
154
|
@file_perm = system_config.file_permission || FILE_PERMISSION
|
151
|
-
|
155
|
+
# parser is already created by parser helper
|
156
|
+
@parser = parser_create(usage: parser_config['usage'] || @parser_configs.first.usage)
|
152
157
|
end
|
153
158
|
|
154
159
|
def configure_tag
|
@@ -431,7 +436,7 @@ module Fluent::Plugin
|
|
431
436
|
end
|
432
437
|
}
|
433
438
|
rescue => e
|
434
|
-
log.warn line.
|
439
|
+
log.warn 'invalid line found', file: tail_watcher.path, line: line, error: e.to_s
|
435
440
|
log.debug_backtrace(e.backtrace)
|
436
441
|
end
|
437
442
|
end
|
data/lib/fluent/plugin/in_tcp.rb
CHANGED
@@ -61,13 +61,15 @@ module Fluent::Plugin
|
|
61
61
|
def start
|
62
62
|
super
|
63
63
|
|
64
|
-
|
65
|
-
|
66
|
-
|
64
|
+
del_size = @delimiter.length
|
65
|
+
if @_extract_enabled && @_extract_tag_key
|
66
|
+
server_create(:in_tcp_server_single_emit, @port, bind: @bind, resolve_name: !!@source_hostname_key) do |data, conn|
|
67
|
+
conn.buffer << data
|
68
|
+
buf = conn.buffer
|
67
69
|
pos = 0
|
68
|
-
while i =
|
69
|
-
msg =
|
70
|
-
pos = i +
|
70
|
+
while i = buf.index(@delimiter, pos)
|
71
|
+
msg = buf[pos...i]
|
72
|
+
pos = i + del_size
|
71
73
|
|
72
74
|
@parser.parse(msg) do |time, record|
|
73
75
|
unless time && record
|
@@ -83,7 +85,32 @@ module Fluent::Plugin
|
|
83
85
|
router.emit(tag, time, record)
|
84
86
|
end
|
85
87
|
end
|
86
|
-
|
88
|
+
buf.slice!(0, pos) if pos > 0
|
89
|
+
end
|
90
|
+
else
|
91
|
+
server_create(:in_tcp_server_batch_emit, @port, bind: @bind, resolve_name: !!@source_hostname_key) do |data, conn|
|
92
|
+
conn.buffer << data
|
93
|
+
buf = conn.buffer
|
94
|
+
pos = 0
|
95
|
+
es = Fluent::MultiEventStream.new
|
96
|
+
while i = buf.index(@delimiter, pos)
|
97
|
+
msg = buf[pos...i]
|
98
|
+
pos = i + del_size
|
99
|
+
|
100
|
+
@parser.parse(msg) do |time, record|
|
101
|
+
unless time && record
|
102
|
+
log.warn "pattern not matched", message: msg
|
103
|
+
next
|
104
|
+
end
|
105
|
+
|
106
|
+
time ||= extract_time_from_record(record) || Fluent::EventTime.now
|
107
|
+
record[@source_address_key] = conn.remote_addr if @source_address_key
|
108
|
+
record[@source_hostname_key] = conn.remote_host if @source_hostname_key
|
109
|
+
es.add(time, record)
|
110
|
+
end
|
111
|
+
end
|
112
|
+
router.emit_stream(@tag, es)
|
113
|
+
buf.slice!(0, pos) if pos > 0
|
87
114
|
end
|
88
115
|
end
|
89
116
|
end
|
@@ -44,7 +44,7 @@ module Fluent
|
|
44
44
|
@outputs = []
|
45
45
|
@outputs_statically_created = false
|
46
46
|
|
47
|
-
@
|
47
|
+
@counter_mutex = Mutex.new
|
48
48
|
# TODO: well organized counters
|
49
49
|
@num_errors = 0
|
50
50
|
@emit_count = 0
|
@@ -143,12 +143,12 @@ module Fluent
|
|
143
143
|
end
|
144
144
|
|
145
145
|
def emit_sync(tag, es)
|
146
|
-
@
|
146
|
+
@counter_mutex.synchronize{ @emit_count += 1 }
|
147
147
|
begin
|
148
148
|
process(tag, es)
|
149
|
-
@
|
149
|
+
@counter_mutex.synchronize{ @emit_records += es.size }
|
150
150
|
rescue
|
151
|
-
@
|
151
|
+
@counter_mutex.synchronize{ @num_errors += 1 }
|
152
152
|
raise
|
153
153
|
end
|
154
154
|
end
|
@@ -72,6 +72,17 @@ module Fluent::Plugin
|
|
72
72
|
attr_accessor :last_written_path # for tests
|
73
73
|
|
74
74
|
module SymlinkBufferMixin
|
75
|
+
def metadata(timekey: nil, tag: nil, variables: nil)
|
76
|
+
metadata = super
|
77
|
+
|
78
|
+
@latest_metadata ||= new_metadata(timekey: 0)
|
79
|
+
if metadata.timekey && (metadata.timekey >= @latest_metadata.timekey)
|
80
|
+
@latest_metadata = metadata
|
81
|
+
end
|
82
|
+
|
83
|
+
metadata
|
84
|
+
end
|
85
|
+
|
75
86
|
def output_plugin_for_symlink=(output_plugin)
|
76
87
|
@_output_plugin_for_symlink = output_plugin
|
77
88
|
end
|
@@ -86,8 +97,7 @@ module Fluent::Plugin
|
|
86
97
|
# timekey will be appended into that file chunk. On the other side, resumed file chunks might NOT
|
87
98
|
# have timekey, especially in the cases that resumed file chunks are generated by Fluentd v0.12.
|
88
99
|
# These chunks will be enqueued immediately, and will be flushed soon.
|
89
|
-
|
90
|
-
if chunk.metadata == latest_metadata
|
100
|
+
if chunk.metadata == @latest_metadata
|
91
101
|
sym_path = @_output_plugin_for_symlink.extract_placeholders(@_symlink_path, chunk)
|
92
102
|
FileUtils.mkdir_p(File.dirname(sym_path), mode: @_output_plugin_for_symlink.dir_perm)
|
93
103
|
FileUtils.ln_sf(chunk.path, sym_path)
|
@@ -115,7 +125,7 @@ module Fluent::Plugin
|
|
115
125
|
|
116
126
|
if conf.has_key?('utc') || conf.has_key?('localtime')
|
117
127
|
param_name = conf.has_key?('utc') ? 'utc' : 'localtime'
|
118
|
-
log.warn "'#{param_name}' is
|
128
|
+
log.warn "'#{param_name}' is deprecated for output plugin. This parameter is used for formatter plugin in compatibility layer. If you want to use same feature, use timekey_use_utc parameter in <buffer> directive instead"
|
119
129
|
end
|
120
130
|
|
121
131
|
super
|
@@ -20,13 +20,16 @@ require 'fluent/clock'
|
|
20
20
|
require 'base64'
|
21
21
|
|
22
22
|
require 'fluent/compat/socket_util'
|
23
|
+
require 'fluent/plugin/out_forward/handshake_protocol'
|
24
|
+
require 'fluent/plugin/out_forward/load_balancer'
|
25
|
+
require 'fluent/plugin/out_forward/socket_cache'
|
26
|
+
require 'fluent/plugin/out_forward/failure_detector'
|
27
|
+
require 'fluent/plugin/out_forward/error'
|
28
|
+
require 'fluent/plugin/out_forward/connection_manager'
|
29
|
+
require 'fluent/plugin/out_forward/ack_handler'
|
23
30
|
|
24
31
|
module Fluent::Plugin
|
25
32
|
class ForwardOutput < Output
|
26
|
-
class Error < StandardError; end
|
27
|
-
class NoNodesAvailable < Error; end
|
28
|
-
class ConnectionClosedError < Error; end
|
29
|
-
|
30
33
|
Fluent::Plugin.register_output('forward', self)
|
31
34
|
|
32
35
|
helpers :socket, :server, :timer, :thread, :compat_parameters
|
@@ -154,8 +157,6 @@ module Fluent::Plugin
|
|
154
157
|
@thread = nil
|
155
158
|
|
156
159
|
@usock = nil
|
157
|
-
@sock_ack_waiting = nil
|
158
|
-
@sock_ack_waiting_mutex = nil
|
159
160
|
@keep_alive_watcher_interval = 5 # TODO
|
160
161
|
end
|
161
162
|
|
@@ -176,10 +177,8 @@ module Fluent::Plugin
|
|
176
177
|
@heartbeat_type = :transport
|
177
178
|
end
|
178
179
|
|
179
|
-
if @dns_round_robin
|
180
|
-
|
181
|
-
raise Fluent::ConfigError, "forward output heartbeat type must be 'transport' or 'none' to use dns_round_robin option"
|
182
|
-
end
|
180
|
+
if @dns_round_robin && @heartbeat_type == :udp
|
181
|
+
raise Fluent::ConfigError, "forward output heartbeat type must be 'transport' or 'none' to use dns_round_robin option"
|
183
182
|
end
|
184
183
|
|
185
184
|
if @transport == :tls
|
@@ -201,15 +200,24 @@ module Fluent::Plugin
|
|
201
200
|
end
|
202
201
|
end
|
203
202
|
|
203
|
+
@ack_handler = @require_ack_response ? AckHandler.new(timeout: @ack_response_timeout, log: @log, read_length: @read_length) : nil
|
204
|
+
socket_cache = @keepalive ? SocketCache.new(@keepalive_timeout, @log) : nil
|
205
|
+
@connection_manager = ConnectionManager.new(
|
206
|
+
log: @log,
|
207
|
+
secure: !!@security,
|
208
|
+
connection_factory: method(:create_transfer_socket),
|
209
|
+
socket_cache: socket_cache,
|
210
|
+
)
|
211
|
+
|
204
212
|
@servers.each do |server|
|
205
213
|
failure = FailureDetector.new(@heartbeat_interval, @hard_timeout, Time.now.to_i.to_f)
|
206
214
|
name = server.name || "#{server.host}:#{server.port}"
|
207
215
|
|
208
216
|
log.info "adding forwarding server '#{name}'", host: server.host, port: server.port, weight: server.weight, plugin_id: plugin_id
|
209
217
|
if @heartbeat_type == :none
|
210
|
-
@nodes << NoneHeartbeatNode.new(self, server, failure: failure,
|
218
|
+
@nodes << NoneHeartbeatNode.new(self, server, failure: failure, connection_manager: @connection_manager, ack_handler: @ack_handler)
|
211
219
|
else
|
212
|
-
node = Node.new(self, server, failure: failure,
|
220
|
+
node = Node.new(self, server, failure: failure, connection_manager: @connection_manager, ack_handler: @ack_handler)
|
213
221
|
begin
|
214
222
|
node.validate_host_resolution!
|
215
223
|
rescue => e
|
@@ -251,31 +259,25 @@ module Fluent::Plugin
|
|
251
259
|
def start
|
252
260
|
super
|
253
261
|
|
254
|
-
|
255
|
-
|
256
|
-
if @ack_response_timeout && @delayed_commit_timeout != @ack_response_timeout
|
257
|
-
log.info "delayed_commit_timeout is overwritten by ack_response_timeout"
|
258
|
-
@delayed_commit_timeout = @ack_response_timeout + 2 # minimum ack_reader IO.select interval is 1s
|
259
|
-
end
|
260
|
-
|
261
|
-
@rand_seed = Random.new.seed
|
262
|
-
rebuild_weight_array
|
263
|
-
@rr = 0
|
262
|
+
@load_balancer = LoadBalancer.new(log)
|
263
|
+
@load_balancer.rebuild_weight_array(@nodes)
|
264
264
|
|
265
265
|
unless @heartbeat_type == :none
|
266
266
|
if @heartbeat_type == :udp
|
267
267
|
@usock = socket_create_udp(@nodes.first.host, @nodes.first.port, nonblock: true)
|
268
|
-
server_create_udp(:out_forward_heartbeat_receiver, 0, socket: @usock, max_bytes: @read_length
|
269
|
-
sockaddr = Socket.pack_sockaddr_in(sock.remote_port, sock.remote_host)
|
270
|
-
on_heartbeat(sockaddr, data)
|
271
|
-
end
|
268
|
+
server_create_udp(:out_forward_heartbeat_receiver, 0, socket: @usock, max_bytes: @read_length, &method(:on_udp_heatbeat_response_recv))
|
272
269
|
end
|
273
|
-
timer_execute(:out_forward_heartbeat_request, @heartbeat_interval, &method(:
|
270
|
+
timer_execute(:out_forward_heartbeat_request, @heartbeat_interval, &method(:on_heartbeat_timer))
|
274
271
|
end
|
275
272
|
|
276
273
|
if @require_ack_response
|
277
|
-
@
|
278
|
-
|
274
|
+
# Output#start sets @delayed_commit_timeout by @buffer_config.delayed_commit_timeout
|
275
|
+
# But it should be overwritten by ack_response_timeout to rollback chunks after timeout
|
276
|
+
if @delayed_commit_timeout != @ack_response_timeout
|
277
|
+
log.info "delayed_commit_timeout is overwritten by ack_response_timeout"
|
278
|
+
@delayed_commit_timeout = @ack_response_timeout + 2 # minimum ack_reader IO.select interval is 1s
|
279
|
+
end
|
280
|
+
|
279
281
|
thread_create(:out_forward_receiving_ack, &method(:ack_reader))
|
280
282
|
end
|
281
283
|
|
@@ -301,22 +303,22 @@ module Fluent::Plugin
|
|
301
303
|
@usock.close rescue nil
|
302
304
|
end
|
303
305
|
|
304
|
-
if @keepalive && @keepalive_timeout
|
305
|
-
@nodes.each(&:clear)
|
306
|
-
end
|
307
306
|
super
|
308
307
|
end
|
309
308
|
|
309
|
+
def stop
|
310
|
+
super
|
311
|
+
|
312
|
+
if @keepalive
|
313
|
+
@connection_manager.stop
|
314
|
+
end
|
315
|
+
end
|
316
|
+
|
310
317
|
def write(chunk)
|
311
318
|
return if chunk.empty?
|
312
319
|
tag = chunk.metadata.tag
|
313
|
-
select_a_healthy_node{|node| node.send_data(tag, chunk) }
|
314
|
-
end
|
315
320
|
|
316
|
-
|
317
|
-
def expired?(now)
|
318
|
-
time + timeout < now
|
319
|
-
end
|
321
|
+
@load_balancer.select_healthy_node { |node| node.send_data(tag, chunk) }
|
320
322
|
end
|
321
323
|
|
322
324
|
def try_write(chunk)
|
@@ -326,35 +328,7 @@ module Fluent::Plugin
|
|
326
328
|
return
|
327
329
|
end
|
328
330
|
tag = chunk.metadata.tag
|
329
|
-
|
330
|
-
chunk_id_base64 = Base64.encode64(chunk.unique_id)
|
331
|
-
current_time = Fluent::Clock.now
|
332
|
-
info = ACKWaitingSockInfo.new(sock, chunk.unique_id, chunk_id_base64, node, current_time, @ack_response_timeout)
|
333
|
-
@sock_ack_waiting_mutex.synchronize do
|
334
|
-
@sock_ack_waiting << info
|
335
|
-
end
|
336
|
-
end
|
337
|
-
|
338
|
-
def select_a_healthy_node
|
339
|
-
error = nil
|
340
|
-
|
341
|
-
wlen = @weight_array.length
|
342
|
-
wlen.times do
|
343
|
-
@rr = (@rr + 1) % wlen
|
344
|
-
node = @weight_array[@rr]
|
345
|
-
next unless node.available?
|
346
|
-
|
347
|
-
begin
|
348
|
-
ret = yield node
|
349
|
-
return ret, node
|
350
|
-
rescue
|
351
|
-
# for load balancing during detecting crashed servers
|
352
|
-
error = $! # use the latest error
|
353
|
-
end
|
354
|
-
end
|
355
|
-
|
356
|
-
raise error if error
|
357
|
-
raise NoNodesAvailable, "no nodes are available"
|
331
|
+
@load_balancer.select_healthy_node { |n| n.send_data(tag, chunk) }
|
358
332
|
end
|
359
333
|
|
360
334
|
def create_transfer_socket(host, port, hostname, &block)
|
@@ -403,130 +377,41 @@ module Fluent::Plugin
|
|
403
377
|
|
404
378
|
private
|
405
379
|
|
406
|
-
def
|
407
|
-
|
408
|
-
|
409
|
-
}
|
410
|
-
|
411
|
-
lost_weight = 0
|
412
|
-
regular_nodes.each {|n|
|
413
|
-
unless n.available?
|
414
|
-
lost_weight += n.weight
|
415
|
-
end
|
416
|
-
}
|
417
|
-
log.debug "rebuilding weight array", lost_weight: lost_weight
|
418
|
-
|
419
|
-
if lost_weight > 0
|
420
|
-
standby_nodes.each {|n|
|
421
|
-
if n.available?
|
422
|
-
regular_nodes << n
|
423
|
-
log.warn "using standby node #{n.host}:#{n.port}", weight: n.weight
|
424
|
-
lost_weight -= n.weight
|
425
|
-
break if lost_weight <= 0
|
426
|
-
end
|
427
|
-
}
|
428
|
-
end
|
429
|
-
|
430
|
-
weight_array = []
|
431
|
-
if regular_nodes.empty?
|
432
|
-
log.warn('No nodes are available')
|
433
|
-
@weight_array = weight_array
|
434
|
-
return @weight_array
|
435
|
-
end
|
436
|
-
|
437
|
-
gcd = regular_nodes.map {|n| n.weight }.inject(0) {|r,w| r.gcd(w) }
|
438
|
-
regular_nodes.each {|n|
|
439
|
-
(n.weight / gcd).times {
|
440
|
-
weight_array << n
|
441
|
-
}
|
442
|
-
}
|
443
|
-
|
444
|
-
# for load balancing during detecting crashed servers
|
445
|
-
coe = (regular_nodes.size * 6) / weight_array.size
|
446
|
-
weight_array *= coe if coe > 1
|
447
|
-
|
448
|
-
r = Random.new(@rand_seed)
|
449
|
-
weight_array.sort_by! { r.rand }
|
450
|
-
|
451
|
-
@weight_array = weight_array
|
452
|
-
end
|
453
|
-
|
454
|
-
def on_timer
|
455
|
-
@nodes.each {|n|
|
380
|
+
def on_heartbeat_timer
|
381
|
+
need_rebuild = false
|
382
|
+
@nodes.each do |n|
|
456
383
|
begin
|
457
384
|
log.trace "sending heartbeat", host: n.host, port: n.port, heartbeat_type: @heartbeat_type
|
458
385
|
n.usock = @usock if @usock
|
459
|
-
|
460
|
-
rebuild_weight_array
|
461
|
-
end
|
386
|
+
need_rebuild = n.send_heartbeat || need_rebuild
|
462
387
|
rescue Errno::EAGAIN, Errno::EWOULDBLOCK, Errno::EINTR, Errno::ECONNREFUSED, Errno::ETIMEDOUT => e
|
463
388
|
log.debug "failed to send heartbeat packet", host: n.host, port: n.port, heartbeat_type: @heartbeat_type, error: e
|
464
389
|
rescue => e
|
465
390
|
log.debug "unexpected error happen during heartbeat", host: n.host, port: n.port, heartbeat_type: @heartbeat_type, error: e
|
466
391
|
end
|
467
|
-
if n.tick
|
468
|
-
rebuild_weight_array
|
469
|
-
end
|
470
|
-
}
|
471
|
-
end
|
472
392
|
|
473
|
-
|
474
|
-
if node = @nodes.find {|n| n.sockaddr == sockaddr }
|
475
|
-
# log.trace "heartbeat arrived", name: node.name, host: node.host, port: node.port
|
476
|
-
if node.heartbeat
|
477
|
-
rebuild_weight_array
|
478
|
-
end
|
393
|
+
need_rebuild = n.tick || need_rebuild
|
479
394
|
end
|
480
|
-
end
|
481
395
|
|
482
|
-
|
483
|
-
|
396
|
+
if need_rebuild
|
397
|
+
@load_balancer.rebuild_weight_array(@nodes)
|
398
|
+
end
|
484
399
|
end
|
485
400
|
|
486
|
-
|
487
|
-
|
488
|
-
|
489
|
-
|
490
|
-
|
491
|
-
|
492
|
-
end
|
493
|
-
info = @sock_ack_waiting_mutex.synchronize{ @sock_ack_waiting.find{|i| i.sock == sock } }
|
494
|
-
|
495
|
-
# When connection is closed by remote host, socket is ready to read and #recv returns an empty string that means EOF.
|
496
|
-
# If this happens we assume the data wasn't delivered and retry it.
|
497
|
-
if raw_data.empty?
|
498
|
-
log.warn "destination node closed the connection. regard it as unavailable.", host: info.node.host, port: info.node.port
|
499
|
-
info.node.disable!
|
500
|
-
rollback_write(info.chunk_id, update_retry: false)
|
501
|
-
return nil
|
502
|
-
else
|
503
|
-
unpacker.feed(raw_data)
|
504
|
-
res = unpacker.read
|
505
|
-
log.trace "getting response from destination", host: info.node.host, port: info.node.port, chunk_id: dump_unique_id_hex(info.chunk_id), response: res
|
506
|
-
if res['ack'] != info.chunk_id_base64
|
507
|
-
# Some errors may have occurred when ack and chunk id is different, so send the chunk again.
|
508
|
-
log.warn "ack in response and chunk id in sent data are different", chunk_id: dump_unique_id_hex(info.chunk_id), ack: res['ack']
|
509
|
-
rollback_write(info.chunk_id, update_retry: false)
|
510
|
-
return nil
|
511
|
-
else
|
512
|
-
log.trace "got a correct ack response", chunk_id: dump_unique_id_hex(info.chunk_id)
|
401
|
+
def on_udp_heatbeat_response_recv(data, sock)
|
402
|
+
sockaddr = Socket.pack_sockaddr_in(sock.remote_port, sock.remote_host)
|
403
|
+
if node = @nodes.find { |n| n.sockaddr == sockaddr }
|
404
|
+
# log.trace "heartbeat arrived", name: node.name, host: node.host, port: node.port
|
405
|
+
if node.heartbeat
|
406
|
+
@load_balancer.rebuild_weight_array(@nodes)
|
513
407
|
end
|
514
|
-
return info.chunk_id
|
515
|
-
end
|
516
|
-
rescue => e
|
517
|
-
log.error "unexpected error while receiving ack message", error: e
|
518
|
-
log.error_backtrace
|
519
|
-
ensure
|
520
|
-
if @keepalive
|
521
|
-
info.node.socket_cache.dec_ref_by_value(info.sock)
|
522
408
|
else
|
523
|
-
|
524
|
-
info.sock.close rescue nil
|
409
|
+
log.warn("Unknown heartbeat response received from #{sock.remote_host}:#{sock.remote_port}")
|
525
410
|
end
|
411
|
+
end
|
526
412
|
|
527
|
-
|
528
|
-
|
529
|
-
end
|
413
|
+
def on_purge_obsolete_socks
|
414
|
+
@connection_manager.purge_obsolete_socks
|
530
415
|
end
|
531
416
|
|
532
417
|
def ack_reader
|
@@ -536,185 +421,33 @@ module Fluent::Plugin
|
|
536
421
|
@delayed_commit_timeout / 3.0
|
537
422
|
end
|
538
423
|
|
539
|
-
unpacker = Fluent::Engine.msgpack_unpacker
|
540
|
-
|
541
424
|
while thread_current_running?
|
542
|
-
|
543
|
-
|
544
|
-
begin
|
545
|
-
@sock_ack_waiting_mutex.synchronize do
|
546
|
-
new_list = []
|
547
|
-
@sock_ack_waiting.each do |info|
|
548
|
-
if info.expired?(now)
|
549
|
-
# There are 2 types of cases when no response has been received from socket:
|
550
|
-
# (1) the node does not support sending responses
|
551
|
-
# (2) the node does support sending response but responses have not arrived for some reasons.
|
552
|
-
log.warn "no response from node. regard it as unavailable.", host: info.node.host, port: info.node.port
|
553
|
-
info.node.disable!
|
554
|
-
if @keepalive
|
555
|
-
info.node.socket_cache.revoke_by_value(info.sock)
|
556
|
-
end
|
557
|
-
info.sock.close rescue nil
|
558
|
-
rollback_write(info.chunk_id, update_retry: false)
|
559
|
-
else
|
560
|
-
sockets << info.sock
|
561
|
-
new_list << info
|
562
|
-
end
|
563
|
-
end
|
564
|
-
@sock_ack_waiting = new_list
|
565
|
-
end
|
425
|
+
@ack_handler.collect_response(select_interval) do |chunk_id, node, sock, result|
|
426
|
+
@connection_manager.close(sock)
|
566
427
|
|
567
|
-
|
568
|
-
|
428
|
+
case result
|
429
|
+
when AckHandler::Result::SUCCESS
|
430
|
+
commit_write(chunk_id)
|
431
|
+
when AckHandler::Result::FAILED
|
432
|
+
node.disable!
|
433
|
+
rollback_write(chunk_id, update_retry: false)
|
434
|
+
when AckHandler::Result::CHUNKID_UNMATCHED
|
435
|
+
rollback_write(chunk_id, update_retry: false)
|
436
|
+
else
|
437
|
+
log.warn("BUG: invalid status #{result} #{chunk_id}")
|
569
438
|
|
570
|
-
|
571
|
-
|
572
|
-
|
439
|
+
if chunk_id
|
440
|
+
rollback_write(chunk_id, update_retry: false)
|
441
|
+
end
|
573
442
|
end
|
574
|
-
rescue => e
|
575
|
-
log.error "unexpected error while receiving ack", error: e
|
576
|
-
log.error_backtrace
|
577
443
|
end
|
578
444
|
end
|
579
445
|
end
|
580
446
|
|
581
447
|
class Node
|
582
|
-
|
583
|
-
|
584
|
-
|
585
|
-
def initialize(timeout, log)
|
586
|
-
@log = log
|
587
|
-
@timeout = timeout
|
588
|
-
@active_socks = {}
|
589
|
-
@inactive_socks = {}
|
590
|
-
@mutex = Mutex.new
|
591
|
-
end
|
592
|
-
|
593
|
-
def revoke(key = Thread.current.object_id)
|
594
|
-
@mutex.synchronize do
|
595
|
-
if @active_socks[key]
|
596
|
-
@inactive_socks[key] = @active_socks.delete(key)
|
597
|
-
@inactive_socks[key].ref = 0
|
598
|
-
end
|
599
|
-
end
|
600
|
-
end
|
601
|
-
|
602
|
-
def clear
|
603
|
-
@mutex.synchronize do
|
604
|
-
@inactive_socks.values.each do |s|
|
605
|
-
s.sock.close rescue nil
|
606
|
-
end
|
607
|
-
@inactive_socks.clear
|
608
|
-
|
609
|
-
@active_socks.values.each do |s|
|
610
|
-
s.sock.close rescue nil
|
611
|
-
end
|
612
|
-
@active_socks.clear
|
613
|
-
end
|
614
|
-
end
|
615
|
-
|
616
|
-
def purge_obsolete_socks
|
617
|
-
@mutex.synchronize do
|
618
|
-
@inactive_socks.keys.each do |k|
|
619
|
-
# 0 means sockets stored in this class received all acks
|
620
|
-
if @inactive_socks[k].ref <= 0
|
621
|
-
s = @inactive_socks.delete(k)
|
622
|
-
s.sock.close rescue nil
|
623
|
-
@log.debug("purged obsolete socket #{s.sock}")
|
624
|
-
end
|
625
|
-
end
|
626
|
-
|
627
|
-
@active_socks.keys.each do |k|
|
628
|
-
if expired?(k) && @active_socks[k].ref <= 0
|
629
|
-
@inactive_socks[k] = @active_socks.delete(k)
|
630
|
-
end
|
631
|
-
end
|
632
|
-
end
|
633
|
-
end
|
634
|
-
|
635
|
-
# We expect that `yield` returns a unique object in this class
|
636
|
-
def fetch_or(key = Thread.current.object_id)
|
637
|
-
@mutex.synchronize do
|
638
|
-
unless @active_socks[key]
|
639
|
-
@active_socks[key] = TimedSocket.new(timeout, yield, 1)
|
640
|
-
@log.debug("connect new socket #{@active_socks[key]}")
|
641
|
-
return @active_socks[key].sock
|
642
|
-
end
|
643
|
-
|
644
|
-
if expired?(key)
|
645
|
-
# Do not close this socket here in case of it will be used by other place (e.g. wait for receiving ack)
|
646
|
-
@inactive_socks[key] = @active_socks.delete(key)
|
647
|
-
@log.debug("connection #{@inactive_socks[key]} is expired. reconnecting...")
|
648
|
-
@active_socks[key] = TimedSocket.new(timeout, yield, 0)
|
649
|
-
end
|
650
|
-
|
651
|
-
@active_socks[key].ref += 1
|
652
|
-
@active_socks[key].sock
|
653
|
-
end
|
654
|
-
end
|
655
|
-
|
656
|
-
def dec_ref(key = Thread.current.object_id)
|
657
|
-
@mutex.synchronize do
|
658
|
-
if @active_socks[key]
|
659
|
-
@active_socks[key].ref -= 1
|
660
|
-
elsif @inactive_socks[key]
|
661
|
-
@inactive_socks[key].ref -= 1
|
662
|
-
else
|
663
|
-
@log.warn("Not found key for dec_ref: #{key}")
|
664
|
-
end
|
665
|
-
end
|
666
|
-
end
|
667
|
-
|
668
|
-
# This method is expected to be called in class which doesn't call #inc_ref
|
669
|
-
def dec_ref_by_value(val)
|
670
|
-
@mutex.synchronize do
|
671
|
-
sock = @active_socks.detect { |_, v| v.sock == val }
|
672
|
-
if sock
|
673
|
-
key = sock.first
|
674
|
-
@active_socks[key].ref -= 1
|
675
|
-
return
|
676
|
-
end
|
677
|
-
|
678
|
-
sock = @inactive_socks.detect { |_, v| v.sock == val }
|
679
|
-
if sock
|
680
|
-
key = sock.first
|
681
|
-
@inactive_socks[key].ref -= 1
|
682
|
-
return
|
683
|
-
else
|
684
|
-
@log.warn("Not found key for dec_ref_by_value: #{key}")
|
685
|
-
end
|
686
|
-
end
|
687
|
-
end
|
688
|
-
|
689
|
-
# This method is expected to be called in class which doesn't call #fetch_or
|
690
|
-
def revoke_by_value(val)
|
691
|
-
@mutex.synchronize do
|
692
|
-
sock = @active_socks.detect { |_, v| v.sock == val }
|
693
|
-
if sock
|
694
|
-
key = sock.first
|
695
|
-
@inactive_socks[key] = @active_socks.delete(key)
|
696
|
-
@inactive_socks[key].ref = 0
|
697
|
-
else
|
698
|
-
@log.debug("Not found for revoke_by_value :#{val}")
|
699
|
-
end
|
700
|
-
end
|
701
|
-
end
|
702
|
-
|
703
|
-
private
|
704
|
-
|
705
|
-
def timeout
|
706
|
-
@timeout && Time.now + @timeout
|
707
|
-
end
|
708
|
-
|
709
|
-
# This method is thread unsafe
|
710
|
-
def expired?(key = Thread.current.object_id)
|
711
|
-
@active_socks[key].timeout ? @active_socks[key].timeout < Time.now : false
|
712
|
-
end
|
713
|
-
end
|
714
|
-
|
715
|
-
# @param keepalive [Bool]
|
716
|
-
# @param keepalive_timeout [Integer | nil]
|
717
|
-
def initialize(sender, server, failure:, keepalive: false, keepalive_timeout: nil)
|
448
|
+
# @param connection_manager [Fluent::Plugin::ForwardOutput::ConnectionManager]
|
449
|
+
# @param ack_handler [Fluent::Plugin::ForwardOutput::AckHandler]
|
450
|
+
def initialize(sender, server, failure:, connection_manager:, ack_handler:)
|
718
451
|
@sender = sender
|
719
452
|
@log = sender.log
|
720
453
|
@compress = sender.compress
|
@@ -737,10 +470,13 @@ module Fluent::Plugin
|
|
737
470
|
|
738
471
|
@usock = nil
|
739
472
|
|
740
|
-
@
|
741
|
-
|
742
|
-
|
743
|
-
|
473
|
+
@handshake = HandshakeProtocol.new(
|
474
|
+
log: @log,
|
475
|
+
hostname: sender.security && sender.security.self_hostname,
|
476
|
+
shared_key: server.shared_key || (sender.security && sender.security.shared_key) || '',
|
477
|
+
password: server.password,
|
478
|
+
username: server.username,
|
479
|
+
)
|
744
480
|
|
745
481
|
@unpacker = Fluent::Engine.msgpack_unpacker
|
746
482
|
|
@@ -748,20 +484,15 @@ module Fluent::Plugin
|
|
748
484
|
@resolved_time = 0
|
749
485
|
@resolved_once = false
|
750
486
|
|
751
|
-
@
|
752
|
-
|
753
|
-
@socket_cache = SocketCache.new(keepalive_timeout, @log)
|
754
|
-
end
|
487
|
+
@connection_manager = connection_manager
|
488
|
+
@ack_handler = ack_handler
|
755
489
|
end
|
756
490
|
|
757
491
|
attr_accessor :usock
|
758
492
|
|
759
493
|
attr_reader :name, :host, :port, :weight, :standby, :state
|
760
|
-
attr_reader :sockaddr # used by
|
761
|
-
attr_reader :failure
|
762
|
-
attr_reader :socket_cache # for ack
|
763
|
-
|
764
|
-
RequestInfo = Struct.new(:state, :shared_key_nonce, :auth)
|
494
|
+
attr_reader :sockaddr # used by on_udp_heatbeat_response_recv
|
495
|
+
attr_reader :failure # for test
|
765
496
|
|
766
497
|
def validate_host_resolution!
|
767
498
|
resolved_host
|
@@ -783,13 +514,15 @@ module Fluent::Plugin
|
|
783
514
|
connect do |sock, ri|
|
784
515
|
if ri.state != :established
|
785
516
|
establish_connection(sock, ri)
|
786
|
-
|
517
|
+
if ri.state != :established
|
518
|
+
raise "Failed to establish connection to #{@host}:#{@port}"
|
519
|
+
end
|
787
520
|
end
|
788
521
|
end
|
789
522
|
end
|
790
523
|
|
791
524
|
def establish_connection(sock, ri)
|
792
|
-
while
|
525
|
+
while ri.state != :established
|
793
526
|
begin
|
794
527
|
# TODO: On Ruby 2.2 or earlier, read_nonblock doesn't work expectedly.
|
795
528
|
# We need rewrite around here using new socket/server plugin helper.
|
@@ -799,7 +532,9 @@ module Fluent::Plugin
|
|
799
532
|
next
|
800
533
|
end
|
801
534
|
@unpacker.feed_each(buf) do |data|
|
802
|
-
|
535
|
+
if @handshake.invoke(sock, ri, data) == :established
|
536
|
+
@log.debug "connection established", host: @host, port: @port
|
537
|
+
end
|
803
538
|
end
|
804
539
|
rescue IO::WaitReadable
|
805
540
|
# If the exception is Errno::EWOULDBLOCK or Errno::EAGAIN, it is extended by IO::WaitReadable.
|
@@ -814,17 +549,21 @@ module Fluent::Plugin
|
|
814
549
|
@log.warn "disconnected", host: @host, port: @port
|
815
550
|
disable!
|
816
551
|
break
|
552
|
+
rescue HeloError => e
|
553
|
+
@log.warn "received invalid helo message from #{@name}"
|
554
|
+
disable!
|
555
|
+
break
|
556
|
+
rescue PingpongError => e
|
557
|
+
@log.warn "connection refused to #{@name || @host}: #{e.message}"
|
558
|
+
disable!
|
559
|
+
break
|
817
560
|
end
|
818
561
|
end
|
819
562
|
end
|
820
563
|
|
821
564
|
def send_data_actual(sock, tag, chunk)
|
822
|
-
unless available?
|
823
|
-
raise ConnectionClosedError, "failed to establish connection with node #{@name}"
|
824
|
-
end
|
825
|
-
|
826
565
|
option = { 'size' => chunk.size, 'compressed' => @compress }
|
827
|
-
option['chunk'] = Base64.encode64(chunk.unique_id) if @
|
566
|
+
option['chunk'] = Base64.encode64(chunk.unique_id) if @ack_handler
|
828
567
|
|
829
568
|
# https://github.com/fluent/fluentd/wiki/Forward-Protocol-Specification-v1#packedforward-mode
|
830
569
|
# out_forward always uses str32 type for entries.
|
@@ -845,48 +584,26 @@ module Fluent::Plugin
|
|
845
584
|
end
|
846
585
|
|
847
586
|
def send_data(tag, chunk)
|
848
|
-
|
849
|
-
|
850
|
-
|
851
|
-
|
587
|
+
ack = @ack_handler && @ack_handler.create_ack(chunk.unique_id, self)
|
588
|
+
connect(nil, ack: ack) do |sock, ri|
|
589
|
+
if ri.state != :established
|
590
|
+
establish_connection(sock, ri)
|
852
591
|
|
853
|
-
|
854
|
-
|
855
|
-
|
856
|
-
if @keepalive
|
857
|
-
@socket_cache.revoke
|
858
|
-
else
|
859
|
-
sock.close rescue nil
|
592
|
+
if ri.state != :established
|
593
|
+
raise ConnectionClosedError, "failed to establish connection with node #{@name}"
|
594
|
+
end
|
860
595
|
end
|
861
|
-
raise
|
862
|
-
end
|
863
596
|
|
864
|
-
|
865
|
-
return sock # to read ACK from socket
|
597
|
+
send_data_actual(sock, tag, chunk)
|
866
598
|
end
|
867
599
|
|
868
|
-
if @keepalive
|
869
|
-
@socket_cache.dec_ref
|
870
|
-
else
|
871
|
-
sock.close_write rescue nil
|
872
|
-
sock.close rescue nil
|
873
|
-
end
|
874
600
|
heartbeat(false)
|
875
601
|
nil
|
876
602
|
end
|
877
603
|
|
878
|
-
def clear
|
879
|
-
@keepalive && @socket_cache.clear
|
880
|
-
end
|
881
|
-
|
882
|
-
def purge_obsolete_socks
|
883
|
-
unless @keepalive
|
884
|
-
raise "Don not call this method without keepalive option"
|
885
|
-
end
|
886
|
-
@socket_cache.purge_obsolete_socks
|
887
|
-
end
|
888
|
-
|
889
604
|
# FORWARD_TCP_HEARTBEAT_DATA = FORWARD_HEADER + ''.to_msgpack + [].to_msgpack
|
605
|
+
#
|
606
|
+
# @return [Boolean] return true if it needs to rebuild nodes
|
890
607
|
def send_heartbeat
|
891
608
|
begin
|
892
609
|
dest_addr = resolved_host
|
@@ -894,14 +611,14 @@ module Fluent::Plugin
|
|
894
611
|
rescue ::SocketError => e
|
895
612
|
if !@resolved_once && @sender.ignore_network_errors_at_startup
|
896
613
|
@log.warn "failed to resolve node name in heartbeating", server: @name || @host, error: e
|
897
|
-
return
|
614
|
+
return false
|
898
615
|
end
|
899
616
|
raise
|
900
617
|
end
|
901
618
|
|
902
619
|
case @sender.heartbeat_type
|
903
620
|
when :transport
|
904
|
-
connect(dest_addr) do |
|
621
|
+
connect(dest_addr) do |_ri, _sock|
|
905
622
|
## don't send any data to not cause a compatibility problem
|
906
623
|
# sock.write FORWARD_TCP_HEARTBEAT_DATA
|
907
624
|
|
@@ -910,8 +627,9 @@ module Fluent::Plugin
|
|
910
627
|
heartbeat(true)
|
911
628
|
end
|
912
629
|
when :udp
|
913
|
-
@usock.send "\0", 0, Socket.pack_sockaddr_in(@port,
|
914
|
-
|
630
|
+
@usock.send "\0", 0, Socket.pack_sockaddr_in(@port, dest_addr)
|
631
|
+
# response is going to receive at on_udp_heatbeat_response_recv
|
632
|
+
false
|
915
633
|
when :none # :none doesn't use this class
|
916
634
|
raise "BUG: heartbeat_type none must not use Node"
|
917
635
|
else
|
@@ -943,14 +661,14 @@ module Fluent::Plugin
|
|
943
661
|
def resolve_dns!
|
944
662
|
addrinfo_list = Socket.getaddrinfo(@host, @port, nil, Socket::SOCK_STREAM)
|
945
663
|
addrinfo = @sender.dns_round_robin ? addrinfo_list.sample : addrinfo_list.first
|
946
|
-
@sockaddr = Socket.pack_sockaddr_in(addrinfo[1], addrinfo[3]) # used by
|
664
|
+
@sockaddr = Socket.pack_sockaddr_in(addrinfo[1], addrinfo[3]) # used by on_udp_heatbeat_response_recv
|
947
665
|
addrinfo[3]
|
948
666
|
end
|
949
667
|
private :resolve_dns!
|
950
668
|
|
951
669
|
def tick
|
952
670
|
now = Time.now.to_f
|
953
|
-
|
671
|
+
unless available?
|
954
672
|
if @failure.hard_timeout?(now)
|
955
673
|
@failure.clear
|
956
674
|
end
|
@@ -959,7 +677,7 @@ module Fluent::Plugin
|
|
959
677
|
|
960
678
|
if @failure.hard_timeout?(now)
|
961
679
|
@log.warn "detached forwarding server '#{@name}'", host: @host, port: @port, hard_timeout: true
|
962
|
-
|
680
|
+
disable!
|
963
681
|
@resolved_host = nil # expire cached host
|
964
682
|
@failure.clear
|
965
683
|
return true
|
@@ -969,7 +687,7 @@ module Fluent::Plugin
|
|
969
687
|
phi = @failure.phi(now)
|
970
688
|
if phi > @sender.phi_threshold
|
971
689
|
@log.warn "detached forwarding server '#{@name}'", host: @host, port: @port, phi: phi, phi_threshold: @sender.phi_threshold
|
972
|
-
|
690
|
+
disable!
|
973
691
|
@resolved_host = nil # expire cached host
|
974
692
|
@failure.clear
|
975
693
|
return true
|
@@ -981,7 +699,7 @@ module Fluent::Plugin
|
|
981
699
|
def heartbeat(detect=true)
|
982
700
|
now = Time.now.to_f
|
983
701
|
@failure.add(now)
|
984
|
-
if detect &&
|
702
|
+
if detect && !available? && @failure.sample_size > @sender.recover_sample_size
|
985
703
|
@available = true
|
986
704
|
@log.warn "recovered forwarding server '#{@name}'", host: @host, port: @port
|
987
705
|
true
|
@@ -990,127 +708,10 @@ module Fluent::Plugin
|
|
990
708
|
end
|
991
709
|
end
|
992
710
|
|
993
|
-
def generate_salt
|
994
|
-
SecureRandom.hex(16)
|
995
|
-
end
|
996
|
-
|
997
|
-
def check_helo(ri, message)
|
998
|
-
@log.debug "checking helo"
|
999
|
-
# ['HELO', options(hash)]
|
1000
|
-
unless message.size == 2 && message[0] == 'HELO'
|
1001
|
-
return false
|
1002
|
-
end
|
1003
|
-
opts = message[1] || {}
|
1004
|
-
# make shared_key_check failed (instead of error) if protocol version mismatch exist
|
1005
|
-
ri.shared_key_nonce = opts['nonce'] || ''
|
1006
|
-
ri.auth = opts['auth'] || ''
|
1007
|
-
true
|
1008
|
-
end
|
1009
|
-
|
1010
|
-
def generate_ping(ri)
|
1011
|
-
@log.debug "generating ping"
|
1012
|
-
# ['PING', self_hostname, sharedkey\_salt, sha512\_hex(sharedkey\_salt + self_hostname + nonce + shared_key),
|
1013
|
-
# username || '', sha512\_hex(auth\_salt + username + password) || '']
|
1014
|
-
shared_key_hexdigest = Digest::SHA512.new.update(@shared_key_salt)
|
1015
|
-
.update(@sender.security.self_hostname)
|
1016
|
-
.update(ri.shared_key_nonce)
|
1017
|
-
.update(@shared_key)
|
1018
|
-
.hexdigest
|
1019
|
-
ping = ['PING', @sender.security.self_hostname, @shared_key_salt, shared_key_hexdigest]
|
1020
|
-
if !ri.auth.empty?
|
1021
|
-
password_hexdigest = Digest::SHA512.new.update(ri.auth).update(@username).update(@password).hexdigest
|
1022
|
-
ping.push(@username, password_hexdigest)
|
1023
|
-
else
|
1024
|
-
ping.push('','')
|
1025
|
-
end
|
1026
|
-
ping
|
1027
|
-
end
|
1028
|
-
|
1029
|
-
def check_pong(ri, message)
|
1030
|
-
@log.debug "checking pong"
|
1031
|
-
# ['PONG', bool(authentication result), 'reason if authentication failed',
|
1032
|
-
# self_hostname, sha512\_hex(salt + self_hostname + nonce + sharedkey)]
|
1033
|
-
unless message.size == 5 && message[0] == 'PONG'
|
1034
|
-
return false, 'invalid format for PONG message'
|
1035
|
-
end
|
1036
|
-
_pong, auth_result, reason, hostname, shared_key_hexdigest = message
|
1037
|
-
|
1038
|
-
unless auth_result
|
1039
|
-
return false, 'authentication failed: ' + reason
|
1040
|
-
end
|
1041
|
-
|
1042
|
-
if hostname == @sender.security.self_hostname
|
1043
|
-
return false, 'same hostname between input and output: invalid configuration'
|
1044
|
-
end
|
1045
|
-
|
1046
|
-
clientside = Digest::SHA512.new.update(@shared_key_salt).update(hostname).update(ri.shared_key_nonce).update(@shared_key).hexdigest
|
1047
|
-
unless shared_key_hexdigest == clientside
|
1048
|
-
return false, 'shared key mismatch'
|
1049
|
-
end
|
1050
|
-
|
1051
|
-
return true, nil
|
1052
|
-
end
|
1053
|
-
|
1054
|
-
def on_read(sock, ri, data)
|
1055
|
-
@log.trace __callee__
|
1056
|
-
|
1057
|
-
case ri.state
|
1058
|
-
when :helo
|
1059
|
-
unless check_helo(ri, data)
|
1060
|
-
@log.warn "received invalid helo message from #{@name}"
|
1061
|
-
disable! # shutdown
|
1062
|
-
return
|
1063
|
-
end
|
1064
|
-
sock.write(generate_ping(ri).to_msgpack)
|
1065
|
-
ri.state = :pingpong
|
1066
|
-
when :pingpong
|
1067
|
-
succeeded, reason = check_pong(ri, data)
|
1068
|
-
unless succeeded
|
1069
|
-
@log.warn "connection refused to #{@name || @host}: #{reason}"
|
1070
|
-
disable! # shutdown
|
1071
|
-
return
|
1072
|
-
end
|
1073
|
-
ri.state = :established
|
1074
|
-
@log.debug "connection established", host: @host, port: @port
|
1075
|
-
else
|
1076
|
-
raise "BUG: unknown session state: #{ri.state}"
|
1077
|
-
end
|
1078
|
-
end
|
1079
|
-
|
1080
711
|
private
|
1081
712
|
|
1082
|
-
def connect(host = nil)
|
1083
|
-
|
1084
|
-
if @keepalive
|
1085
|
-
ri = RequestInfo.new(:established)
|
1086
|
-
sock = @socket_cache.fetch_or do
|
1087
|
-
s = @sender.create_transfer_socket(host || resolved_host, port, @hostname)
|
1088
|
-
ri = RequestInfo.new(@sender.security ? :helo : :established) # overwrite if new connection
|
1089
|
-
s
|
1090
|
-
end
|
1091
|
-
[sock, ri]
|
1092
|
-
else
|
1093
|
-
@log.debug('connect new socket')
|
1094
|
-
[@sender.create_transfer_socket(host || resolved_host, port, @hostname), RequestInfo.new(@sender.security ? :helo : :established)]
|
1095
|
-
end
|
1096
|
-
|
1097
|
-
if block_given?
|
1098
|
-
ret = nil
|
1099
|
-
begin
|
1100
|
-
ret = yield(socket, request_info)
|
1101
|
-
rescue
|
1102
|
-
@socket_cache.revoke if @keepalive
|
1103
|
-
raise
|
1104
|
-
else
|
1105
|
-
@socket_cache.dec_ref if @keepalive
|
1106
|
-
ensure
|
1107
|
-
socket.close unless @keepalive
|
1108
|
-
end
|
1109
|
-
|
1110
|
-
ret
|
1111
|
-
else
|
1112
|
-
[socket, request_info]
|
1113
|
-
end
|
713
|
+
def connect(host = nil, ack: false, &block)
|
714
|
+
@connection_manager.connect(host: host || resolved_host, port: port, hostname: @hostname, ack: ack, &block)
|
1114
715
|
end
|
1115
716
|
end
|
1116
717
|
|
@@ -1128,68 +729,5 @@ module Fluent::Plugin
|
|
1128
729
|
true
|
1129
730
|
end
|
1130
731
|
end
|
1131
|
-
|
1132
|
-
class FailureDetector
|
1133
|
-
PHI_FACTOR = 1.0 / Math.log(10.0)
|
1134
|
-
SAMPLE_SIZE = 1000
|
1135
|
-
|
1136
|
-
def initialize(heartbeat_interval, hard_timeout, init_last)
|
1137
|
-
@heartbeat_interval = heartbeat_interval
|
1138
|
-
@last = init_last
|
1139
|
-
@hard_timeout = hard_timeout
|
1140
|
-
|
1141
|
-
# microsec
|
1142
|
-
@init_gap = (heartbeat_interval * 1e6).to_i
|
1143
|
-
@window = [@init_gap]
|
1144
|
-
end
|
1145
|
-
|
1146
|
-
def hard_timeout?(now)
|
1147
|
-
now - @last > @hard_timeout
|
1148
|
-
end
|
1149
|
-
|
1150
|
-
def add(now)
|
1151
|
-
if @window.empty?
|
1152
|
-
@window << @init_gap
|
1153
|
-
@last = now
|
1154
|
-
else
|
1155
|
-
gap = now - @last
|
1156
|
-
@window << (gap * 1e6).to_i
|
1157
|
-
@window.shift if @window.length > SAMPLE_SIZE
|
1158
|
-
@last = now
|
1159
|
-
end
|
1160
|
-
end
|
1161
|
-
|
1162
|
-
def phi(now)
|
1163
|
-
size = @window.size
|
1164
|
-
return 0.0 if size == 0
|
1165
|
-
|
1166
|
-
# Calculate weighted moving average
|
1167
|
-
mean_usec = 0
|
1168
|
-
fact = 0
|
1169
|
-
@window.each_with_index {|gap,i|
|
1170
|
-
mean_usec += gap * (1+i)
|
1171
|
-
fact += (1+i)
|
1172
|
-
}
|
1173
|
-
mean_usec = mean_usec / fact
|
1174
|
-
|
1175
|
-
# Normalize arrive intervals into 1sec
|
1176
|
-
mean = (mean_usec.to_f / 1e6) - @heartbeat_interval + 1
|
1177
|
-
|
1178
|
-
# Calculate phi of the phi accrual failure detector
|
1179
|
-
t = now - @last - @heartbeat_interval + 1
|
1180
|
-
phi = PHI_FACTOR * t / mean
|
1181
|
-
|
1182
|
-
return phi
|
1183
|
-
end
|
1184
|
-
|
1185
|
-
def sample_size
|
1186
|
-
@window.size
|
1187
|
-
end
|
1188
|
-
|
1189
|
-
def clear
|
1190
|
-
@window.clear
|
1191
|
-
@last = 0
|
1192
|
-
end
|
1193
|
-
end
|
1194
732
|
end
|
1195
733
|
end
|