fluentd 1.13.3 → 1.16.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (179) hide show
  1. checksums.yaml +4 -4
  2. data/.github/ISSUE_TEMPLATE/{bug_report.yaml → bug_report.yml} +2 -0
  3. data/.github/ISSUE_TEMPLATE/config.yml +2 -2
  4. data/.github/ISSUE_TEMPLATE/{feature_request.yaml → feature_request.yml} +1 -0
  5. data/.github/workflows/stale-actions.yml +11 -9
  6. data/.github/workflows/test.yml +32 -0
  7. data/CHANGELOG.md +490 -10
  8. data/CONTRIBUTING.md +2 -2
  9. data/MAINTAINERS.md +7 -5
  10. data/README.md +3 -23
  11. data/Rakefile +1 -1
  12. data/SECURITY.md +14 -0
  13. data/fluentd.gemspec +7 -8
  14. data/lib/fluent/command/cat.rb +13 -3
  15. data/lib/fluent/command/ctl.rb +6 -3
  16. data/lib/fluent/command/fluentd.rb +73 -65
  17. data/lib/fluent/command/plugin_config_formatter.rb +1 -1
  18. data/lib/fluent/compat/output.rb +9 -6
  19. data/lib/fluent/config/dsl.rb +1 -1
  20. data/lib/fluent/config/error.rb +12 -0
  21. data/lib/fluent/config/literal_parser.rb +2 -2
  22. data/lib/fluent/config/parser.rb +1 -1
  23. data/lib/fluent/config/v1_parser.rb +3 -3
  24. data/lib/fluent/config/yaml_parser/fluent_value.rb +47 -0
  25. data/lib/fluent/config/yaml_parser/loader.rb +108 -0
  26. data/lib/fluent/config/yaml_parser/parser.rb +166 -0
  27. data/lib/fluent/config/yaml_parser/section_builder.rb +107 -0
  28. data/lib/fluent/config/yaml_parser.rb +56 -0
  29. data/lib/fluent/config.rb +14 -1
  30. data/lib/fluent/counter/server.rb +1 -1
  31. data/lib/fluent/counter/validator.rb +3 -3
  32. data/lib/fluent/daemon.rb +2 -4
  33. data/lib/fluent/engine.rb +1 -1
  34. data/lib/fluent/env.rb +4 -0
  35. data/lib/fluent/error.rb +3 -0
  36. data/lib/fluent/event.rb +8 -4
  37. data/lib/fluent/event_router.rb +47 -2
  38. data/lib/fluent/file_wrapper.rb +137 -0
  39. data/lib/fluent/log/console_adapter.rb +66 -0
  40. data/lib/fluent/log.rb +44 -5
  41. data/lib/fluent/match.rb +1 -1
  42. data/lib/fluent/msgpack_factory.rb +6 -1
  43. data/lib/fluent/oj_options.rb +1 -2
  44. data/lib/fluent/plugin/bare_output.rb +49 -8
  45. data/lib/fluent/plugin/base.rb +26 -9
  46. data/lib/fluent/plugin/buf_file.rb +34 -5
  47. data/lib/fluent/plugin/buf_file_single.rb +32 -3
  48. data/lib/fluent/plugin/buffer/file_chunk.rb +1 -1
  49. data/lib/fluent/plugin/buffer.rb +216 -70
  50. data/lib/fluent/plugin/filter.rb +35 -1
  51. data/lib/fluent/plugin/filter_record_transformer.rb +1 -1
  52. data/lib/fluent/plugin/in_forward.rb +2 -2
  53. data/lib/fluent/plugin/in_http.rb +39 -10
  54. data/lib/fluent/plugin/in_monitor_agent.rb +4 -2
  55. data/lib/fluent/plugin/in_sample.rb +1 -1
  56. data/lib/fluent/plugin/in_syslog.rb +13 -1
  57. data/lib/fluent/plugin/in_tail/group_watch.rb +204 -0
  58. data/lib/fluent/plugin/in_tail/position_file.rb +33 -33
  59. data/lib/fluent/plugin/in_tail.rb +216 -84
  60. data/lib/fluent/plugin/in_tcp.rb +47 -2
  61. data/lib/fluent/plugin/input.rb +39 -1
  62. data/lib/fluent/plugin/metrics.rb +119 -0
  63. data/lib/fluent/plugin/metrics_local.rb +96 -0
  64. data/lib/fluent/plugin/multi_output.rb +43 -6
  65. data/lib/fluent/plugin/out_copy.rb +1 -1
  66. data/lib/fluent/plugin/out_exec_filter.rb +2 -2
  67. data/lib/fluent/plugin/out_file.rb +20 -2
  68. data/lib/fluent/plugin/out_forward/ack_handler.rb +19 -4
  69. data/lib/fluent/plugin/out_forward/socket_cache.rb +2 -0
  70. data/lib/fluent/plugin/out_forward.rb +17 -9
  71. data/lib/fluent/plugin/out_secondary_file.rb +39 -22
  72. data/lib/fluent/plugin/output.rb +167 -78
  73. data/lib/fluent/plugin/parser.rb +3 -4
  74. data/lib/fluent/plugin/parser_apache2.rb +1 -1
  75. data/lib/fluent/plugin/parser_json.rb +1 -1
  76. data/lib/fluent/plugin/parser_syslog.rb +1 -1
  77. data/lib/fluent/plugin/storage_local.rb +3 -5
  78. data/lib/fluent/plugin.rb +10 -1
  79. data/lib/fluent/plugin_helper/child_process.rb +3 -0
  80. data/lib/fluent/plugin_helper/event_emitter.rb +8 -1
  81. data/lib/fluent/plugin_helper/event_loop.rb +2 -2
  82. data/lib/fluent/plugin_helper/http_server/server.rb +2 -1
  83. data/lib/fluent/plugin_helper/metrics.rb +129 -0
  84. data/lib/fluent/plugin_helper/record_accessor.rb +1 -1
  85. data/lib/fluent/plugin_helper/retry_state.rb +14 -4
  86. data/lib/fluent/plugin_helper/server.rb +35 -6
  87. data/lib/fluent/plugin_helper/service_discovery.rb +2 -2
  88. data/lib/fluent/plugin_helper/socket.rb +13 -2
  89. data/lib/fluent/plugin_helper/thread.rb +3 -3
  90. data/lib/fluent/plugin_helper.rb +1 -0
  91. data/lib/fluent/plugin_id.rb +3 -2
  92. data/lib/fluent/registry.rb +2 -1
  93. data/lib/fluent/root_agent.rb +6 -0
  94. data/lib/fluent/rpc.rb +4 -3
  95. data/lib/fluent/supervisor.rb +283 -259
  96. data/lib/fluent/system_config.rb +13 -3
  97. data/lib/fluent/test/driver/base.rb +11 -5
  98. data/lib/fluent/test/driver/filter.rb +4 -0
  99. data/lib/fluent/test/startup_shutdown.rb +6 -8
  100. data/lib/fluent/time.rb +21 -20
  101. data/lib/fluent/version.rb +1 -1
  102. data/lib/fluent/win32api.rb +38 -0
  103. data/lib/fluent/winsvc.rb +5 -8
  104. data/templates/new_gem/test/helper.rb.erb +0 -1
  105. data/test/command/test_cat.rb +31 -2
  106. data/test/command/test_ctl.rb +1 -2
  107. data/test/command/test_fluentd.rb +209 -24
  108. data/test/command/test_plugin_config_formatter.rb +0 -1
  109. data/test/compat/test_parser.rb +6 -6
  110. data/test/config/test_system_config.rb +13 -11
  111. data/test/config/test_types.rb +1 -1
  112. data/test/log/test_console_adapter.rb +110 -0
  113. data/test/plugin/in_tail/test_io_handler.rb +26 -8
  114. data/test/plugin/in_tail/test_position_file.rb +48 -59
  115. data/test/plugin/out_forward/test_ack_handler.rb +39 -0
  116. data/test/plugin/out_forward/test_socket_cache.rb +26 -1
  117. data/test/plugin/test_bare_output.rb +14 -1
  118. data/test/plugin/test_base.rb +133 -1
  119. data/test/plugin/test_buf_file.rb +62 -23
  120. data/test/plugin/test_buf_file_single.rb +65 -0
  121. data/test/plugin/test_buffer.rb +267 -3
  122. data/test/plugin/test_buffer_chunk.rb +11 -0
  123. data/test/plugin/test_filter.rb +12 -1
  124. data/test/plugin/test_filter_parser.rb +1 -1
  125. data/test/plugin/test_filter_stdout.rb +2 -2
  126. data/test/plugin/test_in_forward.rb +9 -11
  127. data/test/plugin/test_in_http.rb +65 -3
  128. data/test/plugin/test_in_monitor_agent.rb +216 -11
  129. data/test/plugin/test_in_object_space.rb +9 -3
  130. data/test/plugin/test_in_syslog.rb +35 -0
  131. data/test/plugin/test_in_tail.rb +1393 -385
  132. data/test/plugin/test_in_tcp.rb +87 -2
  133. data/test/plugin/test_in_udp.rb +28 -0
  134. data/test/plugin/test_in_unix.rb +2 -2
  135. data/test/plugin/test_input.rb +12 -1
  136. data/test/plugin/test_metrics.rb +294 -0
  137. data/test/plugin/test_metrics_local.rb +96 -0
  138. data/test/plugin/test_multi_output.rb +25 -1
  139. data/test/plugin/test_out_exec.rb +6 -4
  140. data/test/plugin/test_out_exec_filter.rb +6 -2
  141. data/test/plugin/test_out_file.rb +34 -17
  142. data/test/plugin/test_out_forward.rb +78 -77
  143. data/test/plugin/test_out_http.rb +1 -0
  144. data/test/plugin/test_out_stdout.rb +2 -2
  145. data/test/plugin/test_output.rb +297 -12
  146. data/test/plugin/test_output_as_buffered.rb +44 -44
  147. data/test/plugin/test_output_as_buffered_compress.rb +32 -18
  148. data/test/plugin/test_output_as_buffered_retries.rb +54 -7
  149. data/test/plugin/test_output_as_buffered_secondary.rb +4 -4
  150. data/test/plugin/test_parser_regexp.rb +1 -6
  151. data/test/plugin/test_parser_syslog.rb +1 -1
  152. data/test/plugin_helper/test_cert_option.rb +1 -1
  153. data/test/plugin_helper/test_child_process.rb +38 -16
  154. data/test/plugin_helper/test_event_emitter.rb +29 -0
  155. data/test/plugin_helper/test_http_server_helper.rb +1 -1
  156. data/test/plugin_helper/test_metrics.rb +137 -0
  157. data/test/plugin_helper/test_retry_state.rb +602 -38
  158. data/test/plugin_helper/test_server.rb +78 -6
  159. data/test/plugin_helper/test_timer.rb +2 -2
  160. data/test/test_config.rb +191 -24
  161. data/test/test_event_router.rb +17 -0
  162. data/test/test_file_wrapper.rb +53 -0
  163. data/test/test_formatter.rb +24 -21
  164. data/test/test_log.rb +122 -40
  165. data/test/test_msgpack_factory.rb +32 -0
  166. data/test/test_plugin_classes.rb +102 -0
  167. data/test/test_root_agent.rb +30 -1
  168. data/test/test_supervisor.rb +477 -257
  169. data/test/test_time_parser.rb +22 -0
  170. metadata +55 -34
  171. data/.drone.yml +0 -35
  172. data/.github/workflows/issue-auto-closer.yml +0 -12
  173. data/.github/workflows/linux-test.yaml +0 -36
  174. data/.github/workflows/macos-test.yaml +0 -30
  175. data/.github/workflows/windows-test.yaml +0 -46
  176. data/.gitlab-ci.yml +0 -103
  177. data/lib/fluent/plugin/file_wrapper.rb +0 -187
  178. data/test/plugin/test_file_wrapper.rb +0 -126
  179. data/test/test_logger_initializer.rb +0 -46
@@ -160,13 +160,20 @@ module Fluent
160
160
  def resume
161
161
  stage = {}
162
162
  queue = []
163
+ exist_broken_file = false
163
164
 
164
165
  patterns = [@path]
165
166
  patterns.unshift @additional_resume_path if @additional_resume_path
166
167
  Dir.glob(escaped_patterns(patterns)) do |path|
167
168
  next unless File.file?(path)
168
169
 
169
- log.debug { "restoring buffer file: path = #{path}" }
170
+ if owner.respond_to?(:buffer_config) && owner.buffer_config&.flush_at_shutdown
171
+ # When `flush_at_shutdown` is `true`, the remaining chunk files during resuming are possibly broken
172
+ # since there may be a power failure or similar failure.
173
+ log.warn { "restoring buffer file: path = #{path}" }
174
+ else
175
+ log.debug { "restoring buffer file: path = #{path}" }
176
+ end
170
177
 
171
178
  m = new_metadata() # this metadata will be updated in FileSingleChunk.new
172
179
  mode = Fluent::Plugin::Buffer::FileSingleChunk.assume_chunk_state(path)
@@ -179,6 +186,7 @@ module Fluent
179
186
  chunk = Fluent::Plugin::Buffer::FileSingleChunk.new(m, path, mode, @key_in_path, compress: @compress)
180
187
  chunk.restore_size(@chunk_format) if @calc_num_records
181
188
  rescue Fluent::Plugin::Buffer::FileSingleChunk::FileChunkError => e
189
+ exist_broken_file = true
182
190
  handle_broken_files(path, mode, e)
183
191
  next
184
192
  end
@@ -193,6 +201,15 @@ module Fluent
193
201
 
194
202
  queue.sort_by!(&:modified_at)
195
203
 
204
+ # If one of the files is corrupted, other files may also be corrupted and be undetected.
205
+ # The time periods of each chunk are helpful to check the data.
206
+ if exist_broken_file
207
+ log.info "Since a broken chunk file was found, it is possible that other files remaining at the time of resuming were also broken. Here is the list of the files."
208
+ (stage.values + queue).each { |chunk|
209
+ log.info " #{chunk.path}:", :created_at => chunk.created_at, :modified_at => chunk.modified_at
210
+ }
211
+ end
212
+
196
213
  return stage, queue
197
214
  end
198
215
 
@@ -207,8 +224,20 @@ module Fluent
207
224
  end
208
225
 
209
226
  def handle_broken_files(path, mode, e)
210
- log.error "found broken chunk file during resume. Delete corresponding files:", path: path, mode: mode, err_msg: e.message
211
- # After support 'backup_dir' feature, these files are moved to backup_dir instead of unlink.
227
+ log.error "found broken chunk file during resume.", :path => path, :mode => mode, :err_msg => e.message
228
+ unique_id, _ = Fluent::Plugin::Buffer::FileSingleChunk.unique_id_and_key_from_path(path)
229
+ backup(unique_id) { |f|
230
+ File.open(path, 'rb') { |chunk|
231
+ chunk.set_encoding(Encoding::ASCII_8BIT)
232
+ chunk.sync = true
233
+ chunk.binmode
234
+ IO.copy_stream(chunk, f)
235
+ }
236
+ }
237
+ rescue => error
238
+ log.error "backup failed. Delete corresponding files.", :err_msg => error.message
239
+ ensure
240
+ log.warn "disable_chunk_backup is true. #{dump_unique_id_hex(unique_id)} chunk is thrown away." if @disable_chunk_backup
212
241
  File.unlink(path) rescue nil
213
242
  end
214
243
 
@@ -204,7 +204,7 @@ module Fluent
204
204
  end
205
205
  end
206
206
 
207
- # used only for queued v0.12 buffer path
207
+ # used only for queued v0.12 buffer path or broken files
208
208
  def self.unique_id_from_path(path)
209
209
  if /\.(b|q)([0-9a-f]+)\.[^\/]*\Z/n =~ path # //n switch means explicit 'ASCII-8BIT' pattern
210
210
  return $2.scan(/../).map{|x| x.to_i(16) }.pack('C*')
@@ -16,6 +16,8 @@
16
16
 
17
17
  require 'fluent/plugin/base'
18
18
  require 'fluent/plugin/owned_by_mixin'
19
+ require 'fluent/plugin_id'
20
+ require 'fluent/plugin_helper'
19
21
  require 'fluent/unique_id'
20
22
  require 'fluent/ext_monitor_require'
21
23
 
@@ -24,7 +26,9 @@ module Fluent
24
26
  class Buffer < Base
25
27
  include OwnedByMixin
26
28
  include UniqueId::Mixin
29
+ include PluginId
27
30
  include MonitorMixin
31
+ include PluginHelper::Mixin # for metrics
28
32
 
29
33
  class BufferError < StandardError; end
30
34
  class BufferOverflowError < BufferError; end
@@ -39,6 +43,8 @@ module Fluent
39
43
 
40
44
  configured_in :buffer
41
45
 
46
+ helpers_internal :metrics
47
+
42
48
  # TODO: system total buffer limit size in bytes by SystemConfig
43
49
 
44
50
  config_param :chunk_limit_size, :size, default: DEFAULT_CHUNK_LIMIT_SIZE
@@ -60,6 +66,9 @@ module Fluent
60
66
  desc 'Compress buffered data.'
61
67
  config_param :compress, :enum, list: [:text, :gzip], default: :text
62
68
 
69
+ desc 'If true, chunks are thrown away when unrecoverable error happens'
70
+ config_param :disable_chunk_backup, :bool, default: false
71
+
63
72
  Metadata = Struct.new(:timekey, :tag, :variables, :seq) do
64
73
  def initialize(timekey, tag, variables)
65
74
  super(timekey, tag, variables, 0)
@@ -153,8 +162,11 @@ module Fluent
153
162
  end
154
163
  end
155
164
 
165
+ # for metrics
166
+ attr_reader :stage_size_metrics, :stage_length_metrics, :queue_size_metrics, :queue_length_metrics
167
+ attr_reader :available_buffer_space_ratios_metrics, :total_queued_size_metrics
168
+ attr_reader :newest_timekey_metrics, :oldest_timekey_metrics
156
169
  # for tests
157
- attr_accessor :stage_size, :queue_size
158
170
  attr_reader :stage, :queue, :dequeued, :queued_num
159
171
 
160
172
  def initialize
@@ -171,12 +183,35 @@ module Fluent
171
183
  @queued_num = {} # metadata => int (number of queued chunks)
172
184
  @dequeued_num = {} # metadata => int (number of dequeued chunks)
173
185
 
174
- @stage_size = @queue_size = 0
186
+ @stage_length_metrics = nil
187
+ @stage_size_metrics = nil
188
+ @queue_length_metrics = nil
189
+ @queue_size_metrics = nil
190
+ @available_buffer_space_ratios_metrics = nil
191
+ @total_queued_size_metrics = nil
192
+ @newest_timekey_metrics = nil
193
+ @oldest_timekey_metrics = nil
175
194
  @timekeys = Hash.new(0)
176
195
  @enable_update_timekeys = false
177
196
  @mutex = Mutex.new
178
197
  end
179
198
 
199
+ def stage_size
200
+ @stage_size_metrics.get
201
+ end
202
+
203
+ def stage_size=(value)
204
+ @stage_size_metrics.set(value)
205
+ end
206
+
207
+ def queue_size
208
+ @queue_size_metrics.get
209
+ end
210
+
211
+ def queue_size=(value)
212
+ @queue_size_metrics.set(value)
213
+ end
214
+
180
215
  def persistent?
181
216
  false
182
217
  end
@@ -187,6 +222,28 @@ module Fluent
187
222
  unless @queue_limit_length.nil?
188
223
  @total_limit_size = @chunk_limit_size * @queue_limit_length
189
224
  end
225
+ @stage_length_metrics = metrics_create(namespace: "fluentd", subsystem: "buffer", name: "stage_length",
226
+ help_text: 'Length of stage buffers', prefer_gauge: true)
227
+ @stage_length_metrics.set(0)
228
+ @stage_size_metrics = metrics_create(namespace: "fluentd", subsystem: "buffer", name: "stage_byte_size",
229
+ help_text: 'Total size of stage buffers', prefer_gauge: true)
230
+ @stage_size_metrics.set(0) # Ensure zero.
231
+ @queue_length_metrics = metrics_create(namespace: "fluentd", subsystem: "buffer", name: "queue_length",
232
+ help_text: 'Length of queue buffers', prefer_gauge: true)
233
+ @queue_length_metrics.set(0)
234
+ @queue_size_metrics = metrics_create(namespace: "fluentd", subsystem: "buffer", name: "queue_byte_size",
235
+ help_text: 'Total size of queue buffers', prefer_gauge: true)
236
+ @queue_size_metrics.set(0) # Ensure zero.
237
+ @available_buffer_space_ratios_metrics = metrics_create(namespace: "fluentd", subsystem: "buffer", name: "available_buffer_space_ratios",
238
+ help_text: 'Ratio of available space in buffer', prefer_gauge: true)
239
+ @available_buffer_space_ratios_metrics.set(100) # Default is 100%.
240
+ @total_queued_size_metrics = metrics_create(namespace: "fluentd", subsystem: "buffer", name: "total_queued_size",
241
+ help_text: 'Total size of stage and queue buffers', prefer_gauge: true)
242
+ @total_queued_size_metrics.set(0)
243
+ @newest_timekey_metrics = metrics_create(namespace: "fluentd", subsystem: "buffer", name: "newest_timekey",
244
+ help_text: 'Newest timekey in buffer', prefer_gauge: true)
245
+ @oldest_timekey_metrics = metrics_create(namespace: "fluentd", subsystem: "buffer", name: "oldest_timekey",
246
+ help_text: 'Oldest timekey in buffer', prefer_gauge: true)
190
247
  end
191
248
 
192
249
  def enable_update_timekeys
@@ -198,15 +255,15 @@ module Fluent
198
255
 
199
256
  @stage, @queue = resume
200
257
  @stage.each_pair do |metadata, chunk|
201
- @stage_size += chunk.bytesize
258
+ @stage_size_metrics.add(chunk.bytesize)
202
259
  end
203
260
  @queue.each do |chunk|
204
261
  @queued_num[chunk.metadata] ||= 0
205
262
  @queued_num[chunk.metadata] += 1
206
- @queue_size += chunk.bytesize
263
+ @queue_size_metrics.add(chunk.bytesize)
207
264
  end
208
265
  update_timekeys
209
- log.debug "buffer started", instance: self.object_id, stage_size: @stage_size, queue_size: @queue_size
266
+ log.debug "buffer started", instance: self.object_id, stage_size: @stage_size_metrics.get, queue_size: @queue_size_metrics.get
210
267
  end
211
268
 
212
269
  def close
@@ -228,17 +285,19 @@ module Fluent
228
285
  def terminate
229
286
  super
230
287
  @dequeued = @stage = @queue = @queued_num = nil
231
- @stage_size = @queue_size = 0
288
+ @stage_length_metrics = @stage_size_metrics = @queue_length_metrics = @queue_size_metrics = nil
289
+ @available_buffer_space_ratios_metrics = @total_queued_size_metrics = nil
290
+ @newest_timekey_metrics = @oldest_timekey_metrics = nil
232
291
  @timekeys.clear
233
292
  end
234
293
 
235
294
  def storable?
236
- @total_limit_size > @stage_size + @queue_size
295
+ @total_limit_size > @stage_size_metrics.get + @queue_size_metrics.get
237
296
  end
238
297
 
239
298
  ## TODO: for back pressure feature
240
299
  # def used?(ratio)
241
- # @total_limit_size * ratio > @stage_size + @queue_size
300
+ # @total_limit_size * ratio > @stage_size_metrics.get + @queue_size_metrics.get
242
301
  # end
243
302
 
244
303
  def resume
@@ -276,12 +335,14 @@ module Fluent
276
335
  unstaged_chunks = {} # metadata => [chunk, chunk, ...]
277
336
  chunks_to_enqueue = []
278
337
  staged_bytesizes_by_chunk = {}
338
+ # track internal BufferChunkOverflowError in write_step_by_step
339
+ buffer_chunk_overflow_errors = []
279
340
 
280
341
  begin
281
342
  # sort metadata to get lock of chunks in same order with other threads
282
343
  metadata_and_data.keys.sort.each do |metadata|
283
344
  data = metadata_and_data[metadata]
284
- write_once(metadata, data, format: format, size: size) do |chunk, adding_bytesize|
345
+ write_once(metadata, data, format: format, size: size) do |chunk, adding_bytesize, error|
285
346
  chunk.mon_enter # add lock to prevent to be committed/rollbacked from other threads
286
347
  operated_chunks << chunk
287
348
  if chunk.staged?
@@ -296,6 +357,9 @@ module Fluent
296
357
  unstaged_chunks[metadata] ||= []
297
358
  unstaged_chunks[metadata] << chunk
298
359
  end
360
+ if error && !error.empty?
361
+ buffer_chunk_overflow_errors << error
362
+ end
299
363
  end
300
364
  end
301
365
 
@@ -344,7 +408,7 @@ module Fluent
344
408
  #
345
409
  staged_bytesizes_by_chunk.each do |chunk, bytesize|
346
410
  chunk.synchronize do
347
- synchronize { @stage_size += bytesize }
411
+ synchronize { @stage_size_metrics.add(bytesize) }
348
412
  log.on_trace { log.trace { "chunk #{chunk.path} size_added: #{bytesize} new_size: #{chunk.bytesize}" } }
349
413
  end
350
414
  end
@@ -353,7 +417,7 @@ module Fluent
353
417
  if c.staged? && (enqueue || chunk_size_full?(c))
354
418
  m = c.metadata
355
419
  enqueue_chunk(m)
356
- if unstaged_chunks[m]
420
+ if unstaged_chunks[m] && !unstaged_chunks[m].empty?
357
421
  u = unstaged_chunks[m].pop
358
422
  u.synchronize do
359
423
  if u.unstaged? && !chunk_size_full?(u)
@@ -361,7 +425,7 @@ module Fluent
361
425
  u.metadata.seq = 0
362
426
  synchronize {
363
427
  @stage[m] = u.staged!
364
- @stage_size += u.bytesize
428
+ @stage_size_metrics.add(u.bytesize)
365
429
  }
366
430
  end
367
431
  end
@@ -388,6 +452,10 @@ module Fluent
388
452
  end
389
453
  chunk.mon_exit rescue nil # this may raise ThreadError for chunks already committed
390
454
  end
455
+ unless buffer_chunk_overflow_errors.empty?
456
+ # Notify delayed BufferChunkOverflowError here
457
+ raise BufferChunkOverflowError, buffer_chunk_overflow_errors.join(", ")
458
+ end
391
459
  end
392
460
  end
393
461
 
@@ -428,8 +496,8 @@ module Fluent
428
496
  chunk.enqueued!
429
497
  end
430
498
  bytesize = chunk.bytesize
431
- @stage_size -= bytesize
432
- @queue_size += bytesize
499
+ @stage_size_metrics.sub(bytesize)
500
+ @queue_size_metrics.add(bytesize)
433
501
  end
434
502
  end
435
503
  nil
@@ -446,7 +514,7 @@ module Fluent
446
514
  @queued_num[metadata] = @queued_num.fetch(metadata, 0) + 1
447
515
  chunk.enqueued!
448
516
  end
449
- @queue_size += chunk.bytesize
517
+ @queue_size_metrics.add(chunk.bytesize)
450
518
  end
451
519
  end
452
520
 
@@ -512,7 +580,7 @@ module Fluent
512
580
  chunk = @dequeued.delete(chunk_id)
513
581
  return false unless chunk # already purged by other thread
514
582
  @queue.unshift(chunk)
515
- log.trace "chunk taken back", instance: self.object_id, chunk_id: dump_unique_id_hex(chunk_id), metadata: chunk.metadata
583
+ log.on_trace { log.trace "chunk taken back", instance: self.object_id, chunk_id: dump_unique_id_hex(chunk_id), metadata: chunk.metadata }
516
584
  @queued_num[chunk.metadata] += 1 # BUG if nil
517
585
  @dequeued_num[chunk.metadata] -= 1
518
586
  end
@@ -531,7 +599,7 @@ module Fluent
531
599
  begin
532
600
  bytesize = chunk.bytesize
533
601
  chunk.purge
534
- @queue_size -= bytesize
602
+ @queue_size_metrics.sub(bytesize)
535
603
  rescue => e
536
604
  log.error "failed to purge buffer chunk", chunk_id: dump_unique_id_hex(chunk_id), error_class: e.class, error: e
537
605
  log.error_backtrace
@@ -542,7 +610,7 @@ module Fluent
542
610
  @queued_num.delete(metadata)
543
611
  @dequeued_num.delete(metadata)
544
612
  end
545
- log.trace "chunk purged", instance: self.object_id, chunk_id: dump_unique_id_hex(chunk_id), metadata: metadata
613
+ log.on_trace { log.trace "chunk purged", instance: self.object_id, chunk_id: dump_unique_id_hex(chunk_id), metadata: metadata }
546
614
  end
547
615
 
548
616
  nil
@@ -562,7 +630,7 @@ module Fluent
562
630
  log.error_backtrace
563
631
  end
564
632
  end
565
- @queue_size = 0
633
+ @queue_size_metrics.set(0)
566
634
  end
567
635
  end
568
636
 
@@ -680,16 +748,14 @@ module Fluent
680
748
  modified_chunks = []
681
749
  modified_metadata = metadata
682
750
  get_next_chunk = ->(){
683
- c = if staged_chunk_used
684
- # Staging new chunk here is bad idea:
685
- # Recovering whole state including newly staged chunks is much harder than current implementation.
686
- modified_metadata = modified_metadata.dup_next
687
- generate_chunk(modified_metadata)
688
- else
689
- synchronize { @stage[modified_metadata] ||= generate_chunk(modified_metadata).staged! }
690
- end
691
- modified_chunks << c
692
- c
751
+ if staged_chunk_used
752
+ # Staging a new chunk here is a bad idea:
753
+ # Recovering whole state including newly staged chunks is much harder than current implementation.
754
+ modified_metadata = modified_metadata.dup_next
755
+ generate_chunk(modified_metadata)
756
+ else
757
+ synchronize { @stage[modified_metadata] ||= generate_chunk(modified_metadata).staged! }
758
+ end
693
759
  }
694
760
 
695
761
  writing_splits_index = 0
@@ -697,60 +763,116 @@ module Fluent
697
763
 
698
764
  while writing_splits_index < splits.size
699
765
  chunk = get_next_chunk.call
700
- chunk.synchronize do
701
- raise ShouldRetry unless chunk.writable?
702
- staged_chunk_used = true if chunk.staged?
766
+ errors = []
767
+ # The chunk must be locked until being passed to &block.
768
+ chunk.mon_enter
769
+ modified_chunks << {chunk: chunk, adding_bytesize: 0, errors: errors}
703
770
 
704
- original_bytesize = chunk.bytesize
705
- begin
706
- while writing_splits_index < splits.size
707
- split = splits[writing_splits_index]
708
- if format
709
- chunk.concat(format.call(split), split.size)
710
- else
711
- chunk.append(split, compress: @compress)
771
+ raise ShouldRetry unless chunk.writable?
772
+ staged_chunk_used = true if chunk.staged?
773
+
774
+ original_bytesize = committed_bytesize = chunk.bytesize
775
+ begin
776
+ while writing_splits_index < splits.size
777
+ split = splits[writing_splits_index]
778
+ formatted_split = format ? format.call(split) : nil
779
+
780
+ if split.size == 1 # Check BufferChunkOverflowError
781
+ determined_bytesize = nil
782
+ if @compress != :text
783
+ determined_bytesize = nil
784
+ elsif formatted_split
785
+ determined_bytesize = formatted_split.bytesize
786
+ elsif split.first.respond_to?(:bytesize)
787
+ determined_bytesize = split.first.bytesize
712
788
  end
713
789
 
714
- if chunk_size_over?(chunk) # split size is larger than difference between size_full? and size_over?
715
- chunk.rollback
790
+ if determined_bytesize && determined_bytesize > @chunk_limit_size
791
+ # It is an obvious case that BufferChunkOverflowError should be raised here.
792
+ # But if it raises here, already processed 'split' or
793
+ # the following 'split' will be lost completely.
794
+ # So it is a last resort to delay raising such an exception
795
+ errors << "a #{determined_bytesize} bytes record (nth: #{writing_splits_index}) is larger than buffer chunk limit size (#{@chunk_limit_size})"
796
+ writing_splits_index += 1
797
+ next
798
+ end
716
799
 
717
- if split.size == 1 && original_bytesize == 0
718
- big_record_size = format ? format.call(split).bytesize : split.first.bytesize
719
- raise BufferChunkOverflowError, "a #{big_record_size}bytes record is larger than buffer chunk limit size"
720
- end
800
+ if determined_bytesize.nil? || chunk.bytesize + determined_bytesize > @chunk_limit_size
801
+ # The split will (might) cause size over so keep already processed
802
+ # 'split' content here (allow performance regression a bit).
803
+ chunk.commit
804
+ committed_bytesize = chunk.bytesize
805
+ end
806
+ end
807
+
808
+ if format
809
+ chunk.concat(formatted_split, split.size)
810
+ else
811
+ chunk.append(split, compress: @compress)
812
+ end
813
+ adding_bytes = chunk.bytesize - committed_bytesize
814
+
815
+ if chunk_size_over?(chunk) # split size is larger than difference between size_full? and size_over?
816
+ chunk.rollback
817
+ committed_bytesize = chunk.bytesize
721
818
 
722
- if chunk_size_full?(chunk) || split.size == 1
723
- enqueue_chunk_before_retry = true
819
+ if split.size == 1 # Check BufferChunkOverflowError again
820
+ if adding_bytes > @chunk_limit_size
821
+ errors << "concatenated/appended a #{adding_bytes} bytes record (nth: #{writing_splits_index}) is larger than buffer chunk limit size (#{@chunk_limit_size})"
822
+ writing_splits_index += 1
823
+ next
724
824
  else
725
- splits_count *= 10
825
+ # As already processed content is kept after rollback, the unstaged chunk should be queued.
826
+ # After that, re-process current split again.
827
+ # New chunk should be allocated, to do it, modify @stage and so on.
828
+ synchronize { @stage.delete(modified_metadata) }
829
+ staged_chunk_used = false
830
+ chunk.unstaged!
831
+ break
726
832
  end
833
+ end
727
834
 
728
- raise ShouldRetry
835
+ if chunk_size_full?(chunk) || split.size == 1
836
+ enqueue_chunk_before_retry = true
837
+ else
838
+ splits_count *= 10
729
839
  end
730
840
 
731
- writing_splits_index += 1
841
+ raise ShouldRetry
842
+ end
843
+
844
+ writing_splits_index += 1
732
845
 
733
- if chunk_size_full?(chunk)
734
- break
735
- end
846
+ if chunk_size_full?(chunk)
847
+ break
736
848
  end
737
- rescue
738
- chunk.purge if chunk.unstaged? # unstaged chunk will leak unless purge it
739
- raise
740
849
  end
741
-
742
- block.call(chunk, chunk.bytesize - original_bytesize)
850
+ rescue
851
+ chunk.purge if chunk.unstaged? # unstaged chunk will leak unless purge it
852
+ raise
743
853
  end
854
+
855
+ modified_chunks.last[:adding_bytesize] = chunk.bytesize - original_bytesize
856
+ end
857
+ modified_chunks.each do |data|
858
+ block.call(data[:chunk], data[:adding_bytesize], data[:errors])
744
859
  end
745
860
  rescue ShouldRetry
746
- modified_chunks.each do |mc|
747
- mc.rollback rescue nil
748
- if mc.unstaged?
749
- mc.purge rescue nil
861
+ modified_chunks.each do |data|
862
+ chunk = data[:chunk]
863
+ chunk.rollback rescue nil
864
+ if chunk.unstaged?
865
+ chunk.purge rescue nil
750
866
  end
867
+ chunk.mon_exit rescue nil
751
868
  end
752
869
  enqueue_chunk(metadata) if enqueue_chunk_before_retry
753
870
  retry
871
+ ensure
872
+ modified_chunks.each do |data|
873
+ chunk = data[:chunk]
874
+ chunk.mon_exit
875
+ end
754
876
  end
755
877
 
756
878
  STATS_KEYS = [
@@ -765,28 +887,52 @@ module Fluent
765
887
  ]
766
888
 
767
889
  def statistics
768
- stage_size, queue_size = @stage_size, @queue_size
890
+ stage_size, queue_size = @stage_size_metrics.get, @queue_size_metrics.get
769
891
  buffer_space = 1.0 - ((stage_size + queue_size * 1.0) / @total_limit_size)
892
+ @stage_length_metrics.set(@stage.size)
893
+ @queue_length_metrics.set(@queue.size)
894
+ @available_buffer_space_ratios_metrics.set(buffer_space * 100)
895
+ @total_queued_size_metrics.set(stage_size + queue_size)
770
896
  stats = {
771
- 'stage_length' => @stage.size,
897
+ 'stage_length' => @stage_length_metrics.get,
772
898
  'stage_byte_size' => stage_size,
773
- 'queue_length' => @queue.size,
899
+ 'queue_length' => @queue_length_metrics.get,
774
900
  'queue_byte_size' => queue_size,
775
- 'available_buffer_space_ratios' => (buffer_space * 100).round(1),
776
- 'total_queued_size' => stage_size + queue_size,
901
+ 'available_buffer_space_ratios' => @available_buffer_space_ratios_metrics.get.round(1),
902
+ 'total_queued_size' => @total_queued_size_metrics.get,
777
903
  }
778
904
 
779
905
  tkeys = timekeys
780
906
  if (m = tkeys.min)
781
- stats['oldest_timekey'] = m
907
+ @oldest_timekey_metrics.set(m)
908
+ stats['oldest_timekey'] = @oldest_timekey_metrics.get
782
909
  end
783
910
  if (m = tkeys.max)
784
- stats['newest_timekey'] = m
911
+ @newest_timekey_metrics.set(m)
912
+ stats['newest_timekey'] = @newest_timekey_metrics.get
785
913
  end
786
914
 
787
915
  { 'buffer' => stats }
788
916
  end
789
917
 
918
+ def backup(chunk_unique_id)
919
+ unique_id = dump_unique_id_hex(chunk_unique_id)
920
+
921
+ if @disable_chunk_backup
922
+ log.warn "disable_chunk_backup is true. #{unique_id} chunk is not backed up."
923
+ return
924
+ end
925
+
926
+ safe_owner_id = owner.plugin_id.gsub(/[ "\/\\:;|*<>?]/, '_')
927
+ backup_base_dir = system_config.root_dir || DEFAULT_BACKUP_DIR
928
+ backup_file = File.join(backup_base_dir, 'backup', "worker#{fluentd_worker_id}", safe_owner_id, "#{unique_id}.log")
929
+ backup_dir = File.dirname(backup_file)
930
+
931
+ log.warn "bad chunk is moved to #{backup_file}"
932
+ FileUtils.mkdir_p(backup_dir, mode: system_config.dir_permission || Fluent::DEFAULT_DIR_PERMISSION) unless Dir.exist?(backup_dir)
933
+ File.open(backup_file, 'ab', system_config.file_permission || Fluent::DEFAULT_FILE_PERMISSION) { |f| yield f }
934
+ end
935
+
790
936
  private
791
937
 
792
938
  def optimistic_queued?(metadata = nil)
@@ -28,13 +28,47 @@ module Fluent
28
28
  include PluginLoggerMixin
29
29
  include PluginHelper::Mixin
30
30
 
31
- helpers_internal :event_emitter
31
+ helpers_internal :event_emitter, :metrics
32
32
 
33
33
  attr_reader :has_filter_with_time
34
34
 
35
35
  def initialize
36
36
  super
37
37
  @has_filter_with_time = has_filter_with_time?
38
+ @emit_records_metrics = nil
39
+ @emit_size_metrics = nil
40
+ @counter_mutex = Mutex.new
41
+ @enable_size_metrics = false
42
+ end
43
+
44
+ def emit_records
45
+ @emit_records_metrics.get
46
+ end
47
+
48
+ def emit_size
49
+ @emit_size_metrics.get
50
+ end
51
+
52
+ def configure(conf)
53
+ super
54
+
55
+ @emit_records_metrics = metrics_create(namespace: "fluentd", subsystem: "filter", name: "emit_records", help_text: "Number of count emit records")
56
+ @emit_size_metrics = metrics_create(namespace: "fluentd", subsystem: "filter", name: "emit_size", help_text: "Total size of emit events")
57
+ @enable_size_metrics = !!system_config.enable_size_metrics
58
+ end
59
+
60
+ def statistics
61
+ stats = {
62
+ 'emit_records' => @emit_records_metrics.get,
63
+ 'emit_size' => @emit_size_metrics.get,
64
+ }
65
+
66
+ { 'filter' => stats }
67
+ end
68
+
69
+ def measure_metrics(es)
70
+ @emit_records_metrics.add(es.size)
71
+ @emit_size_metrics.add(es.to_msgpack_stream.bytesize) if @enable_size_metrics
38
72
  end
39
73
 
40
74
  def filter(tag, time, record)
@@ -316,7 +316,7 @@ module Fluent::Plugin
316
316
  end
317
317
 
318
318
  (Object.instance_methods).each do |m|
319
- undef_method m unless m.to_s =~ /^__|respond_to_missing\?|object_id|public_methods|instance_eval|method_missing|define_singleton_method|respond_to\?|new_ostruct_member|^class$/
319
+ undef_method m unless /^__|respond_to_missing\?|object_id|public_methods|instance_eval|method_missing|define_singleton_method|respond_to\?|new_ostruct_member|^class$/.match?(m.to_s)
320
320
  end
321
321
  end
322
322
  end
@@ -40,7 +40,7 @@ module Fluent::Plugin
40
40
  config_param :backlog, :integer, default: nil
41
41
  # SO_LINGER 0 to send RST rather than FIN to avoid lots of connections sitting in TIME_WAIT at src
42
42
  desc 'The timeout time used to set linger option.'
43
- config_param :linger_timeout, :integer, default: 0
43
+ config_param :linger_timeout, :integer, default: nil, deprecated: "use transport directive"
44
44
  # This option is for Cool.io's loop wait timeout to avoid loop stuck at shutdown. Almost users don't need to change this value.
45
45
  config_param :blocking_timeout, :time, default: 0.5
46
46
  desc 'Try to resolve hostname from IP addresses or not.'
@@ -430,7 +430,7 @@ module Fluent::Plugin
430
430
  end
431
431
  _ping, hostname, shared_key_salt, shared_key_hexdigest, username, password_digest = message
432
432
 
433
- node = @nodes.select{|n| n[:address].include?(remote_addr) rescue false }.first
433
+ node = @nodes.find{|n| n[:address].include?(remote_addr) rescue false }
434
434
  if !node && !@security.allow_anonymous_source
435
435
  log.warn "Anonymous client disallowed", address: remote_addr, hostname: hostname
436
436
  return false, "anonymous source host '#{remote_addr}' denied", nil