logstash-output-s3-zst 1.0.0

@@ -0,0 +1,445 @@
+ # encoding: utf-8
+ require "logstash/outputs/base"
+ require "logstash/namespace"
+ require "logstash/plugin_mixins/aws_config"
+ require "stud/temporary"
+ require "stud/task"
+ require "concurrent"
+ require "socket"
+ require "thread"
+ require "tmpdir"
+ require "fileutils"
+ require "set"
+ require "pathname"
+ require "aws-sdk"
+ require "logstash/outputs/s3/patch"
+
+ Aws.eager_autoload!
+
+ # INFORMATION:
+ #
+ # This plugin batches and uploads Logstash events into Amazon Simple Storage Service (Amazon S3).
+ #
+ # Requirements:
+ # * Amazon S3 Bucket and S3 Access Permissions (typically access_key_id and secret_access_key)
+ # * S3 PutObject permission
+ #
+ # S3 outputs create temporary files in the OS's temporary directory; you can specify where to save them using the `temporary_directory` option.
+ #
+ # S3 output files have the following format:
+ #
+ # ls.s3.312bc026-2f5d-49bc-ae9f-5940cf4ad9a6.2013-04-18T10.00.tag_hello.part0.txt
+ #
+ #
+ # |=======
+ # | ls.s3 | indicates the Logstash S3 plugin |
+ # | 312bc026-2f5d-49bc-ae9f-5940cf4ad9a6 | a new, random UUID per file |
+ # | 2013-04-18T10.00 | the timestamp of the file, written when time_file is specified |
+ # | tag_hello | the event's tag |
+ # | part0 | if size_file is set, additional parts are generated whenever the file size exceeds size_file. When a file is full it is pushed to the bucket and then deleted from the temporary directory. Empty files are simply deleted and never pushed. |
+ # |=======
+ #
+ # Crash Recovery:
+ # * This plugin will recover and upload temporary log files after a crash or abnormal termination when `restore` is set to true.
+ #
+ ## [Note regarding time_file and size_file]:
+ #
+ ## Both the time_file and size_file settings can trigger a log "file rotation".
+ ## A log rotation pushes the current log "part" to S3 and deletes it from local temporary storage.
+ #
+ ## If you specify BOTH size_file and time_file, a file is created for each tag (if specified).
+ ## When EITHER time_file minutes have elapsed OR the log file size exceeds size_file, a log rotation is triggered.
+ ##
+ ## If you ONLY specify time_file but NOT size_file, one file for each tag (if specified) will be created.
+ ## When time_file minutes elapse, a log rotation will be triggered.
+ #
+ ## If you ONLY specify size_file but NOT time_file, one file for each tag (if specified) will be created.
+ ## When the size of a log file part exceeds size_file, a log rotation will be triggered.
+ #
+ ## If NEITHER size_file nor time_file is specified, ONLY one file for each tag (if specified) will be created.
+ ## WARNING: Since no log rotation is triggered, the S3 upload will only occur when Logstash restarts.
+ #
+ #
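+ # The behaviour described above can also be pinned down explicitly with the
+ # rotation_strategy option documented further below. As an illustrative sketch
+ # (values here are not from the original documentation), a purely time-based
+ # rotation would look like:
+ # [source,ruby]
+ #     rotation_strategy => "time"   # only elapsed time triggers a rotation
+ #     time_file => 5                # rotate and upload roughly every 5 minutes
+ #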
+ # #### Usage:
+ # This is an example of a Logstash config:
+ # [source,ruby]
+ # output {
+ #    s3-zst {
+ #      access_key_id => "crazy_key"             (required)
+ #      secret_access_key => "monkey_access_key" (required)
+ #      region => "eu-west-1"                    (optional, default = "us-east-1")
+ #      bucket => "your_bucket"                  (required)
+ #      size_file => 2048                        (optional) - Bytes
+ #      time_file => 5                           (optional) - Minutes
+ #      codec => "plain"                         (optional)
+ #      canned_acl => "private"                  (optional. Options are "private", "public-read", "public-read-write", "authenticated-read", "aws-exec-read", "bucket-owner-read", "bucket-owner-full-control", "log-delivery-write". Defaults to "private" )
+ #    }
+ # }
+ #
+ class LogStash::Outputs::S3ZST < LogStash::Outputs::Base
+   require "logstash/outputs/s3/writable_directory_validator"
+   require "logstash/outputs/s3/path_validator"
+   require "logstash/outputs/s3/write_bucket_permission_validator"
+   require "logstash/outputs/s3/size_rotation_policy"
+   require "logstash/outputs/s3/time_rotation_policy"
+   require "logstash/outputs/s3/size_and_time_rotation_policy"
+   require "logstash/outputs/s3/temporary_file"
+   require "logstash/outputs/s3/temporary_file_factory"
+   require "logstash/outputs/s3/uploader"
+   require "logstash/outputs/s3/file_repository"
+
+   include LogStash::PluginMixins::AwsConfig::V2
+
+   PREFIX_KEY_NORMALIZE_CHARACTER = "_"
+   PERIODIC_CHECK_INTERVAL_IN_SECONDS = 15
+   CRASH_RECOVERY_THREADPOOL = Concurrent::ThreadPoolExecutor.new({
+     :min_threads => 1,
+     :max_threads => 2,
+     :fallback_policy => :caller_runs
+   })
+
+   GZIP_ENCODING = "gzip"
+
+   config_name "s3-zst"
+   default :codec, "line"
+
+   concurrency :shared
+
+   # S3 bucket
+   config :bucket, :validate => :string, :required => true
+
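+   # The `additional_settings` hash below is merged into the AWS SDK client options
+   # (see `symbolized_settings` further down). As a hedged example (whether a given
+   # option exists depends on the aws-sdk version in use):
+   #
+   #   additional_settings => {
+   #     "force_path_style" => "true"
+   #   }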
+   config :additional_settings, :validate => :hash, :default => {}
+
+   # Set the maximum size of a file in bytes. When a file for a given prefix exceeds size_file,
+   # its contents are split across two or more files.
+   # If you have tags, a separate file of that size is generated for each tag.
+   #
+   # NOTE: setting size_file is recommended, because the plugin generates a local temporary file on disk and then puts it in the bucket.
+   config :size_file, :validate => :number, :default => 1024 * 1024 * 5
+
+   # Set the time, in MINUTES, to close the current sub_time_section of the bucket.
+   # If you also define size_file, you get a number of files related to the section and the current tag.
+   # If it is set to 0 and rotation_strategy is 'time' or 'size_and_time', the plugin raises a configuration error.
+   config :time_file, :validate => :number, :default => 15
+
+   # If `restore => false` is specified and Logstash crashes, the unprocessed files are not sent to the bucket.
+   #
+   # NOTE: the `restore => true` default assumes that multiple S3 outputs each set a unique `temporary_directory => ...`;
+   # if they do not, then only a single S3 output is safe to recover (since leftover files are processed and deleted).
+   config :restore, :validate => :boolean, :default => true
+
+   # The S3 canned ACL to use when putting the file. Defaults to "private".
+   config :canned_acl, :validate => ["private", "public-read", "public-read-write", "authenticated-read", "aws-exec-read", "bucket-owner-read", "bucket-owner-full-control", "log-delivery-write"],
+          :default => "private"
+
+   # Specifies whether or not to use S3's server side encryption. Defaults to no encryption.
+   config :server_side_encryption, :validate => :boolean, :default => false
+
+   # Specifies what type of encryption to use when SSE is enabled.
+   config :server_side_encryption_algorithm, :validate => ["AES256", "aws:kms"], :default => "AES256"
+
+   # The key to use when specified along with server_side_encryption => aws:kms.
+   # If server_side_encryption => aws:kms is set but this option is not, the default KMS key is used.
+   # http://docs.aws.amazon.com/AmazonS3/latest/dev/UsingKMSEncryption.html
+   config :ssekms_key_id, :validate => :string
+
+   # Specifies what S3 storage class to use when uploading the file.
+   # More information about the different storage classes can be found at:
+   # http://docs.aws.amazon.com/AmazonS3/latest/dev/storage-class-intro.html
+   # Defaults to STANDARD.
+   config :storage_class, :validate => ["STANDARD", "REDUCED_REDUNDANCY", "STANDARD_IA", "ONEZONE_IA"], :default => "STANDARD"
+
+   # Set the directory where Logstash stores the temporary files before sending them to S3,
+   # defaulting to the OS temporary directory, e.g. /tmp/logstash on Linux.
+   #
+   # NOTE: the reason we do not have a unique (isolated) temporary directory as a default, to support multiple plugin instances,
+   # is that we would have to rely on something static that does not change between restarts (e.g. a user-set id => ...).
+   config :temporary_directory, :validate => :string, :default => File.join(Dir.tmpdir, "logstash")
+
+   # Specify a prefix for the uploaded filename; this can simulate directories on S3. The prefix does not require a leading slash.
+   # This option supports string interpolation; be warned that this can create a lot of temporary local files.
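+   # As an illustrative sketch (not from the original docs), event-field and date
+   # interpolation can partition uploads by day, assuming the usual Logstash sprintf syntax:
+   #
+   #   prefix => "logs/%{type}/%{+YYYY}/%{+MM}/%{+dd}/"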
+   config :prefix, :validate => :string, :default => ''
+
+   # Specify how many workers to use to upload the files to S3.
+   config :upload_workers_count, :validate => :number, :default => (Concurrent.processor_count * 0.5).ceil
+
+   # Number of items we can keep in the local queue before uploading them.
+   config :upload_queue_size, :validate => :number, :default => 2 * (Concurrent.processor_count * 0.25).ceil
+
+   # Files larger than this number are uploaded using the S3 multipart APIs. Default threshold is 15MB.
+   config :upload_multipart_threshold, :validate => :number, :default => 15 * 1024 * 1024
+
+   # The version of the S3 signature hash to use. Normally uses the internal client default;
+   # it can be explicitly specified here.
+   config :signature_version, :validate => ['v2', 'v4']
+
+   # Define tags to be appended to the file on the S3 bucket.
+   #
+   # Example:
+   # tags => ["elasticsearch", "logstash", "kibana"]
+   #
+   # Will generate this file:
+   # "ls.s3.logstash.local.2015-01-01T00.00.tag_elasticsearch.logstash.kibana.part0.txt"
+   #
+   config :tags, :validate => :array, :default => []
+
+   # Specify the content encoding. Supports ("gzip"). Defaults to "none".
+   config :encoding, :validate => ["none", GZIP_ENCODING], :default => "none"
+
+   # Define the strategy used to decide when we need to rotate the file and push it to S3.
+   # The default strategy is to check both size and time; the first one to match will rotate the file.
+   config :rotation_strategy, :validate => ["size_and_time", "size", "time"], :default => "size_and_time"
+
+   # The common use case is to define permissions on the root bucket and give Logstash full access to write its logs.
+   # In some circumstances you need finer-grained permissions on a subfolder; this option allows you to disable the check at startup.
+   config :validate_credentials_on_root_bucket, :validate => :boolean, :default => true
+
+   # The number of times to retry a failed S3 upload.
+   config :retry_count, :validate => :number, :default => Float::INFINITY
+
+   # The amount of time to wait in seconds before attempting to retry a failed upload.
+   config :retry_delay, :validate => :number, :default => 1
+
+   def register
+     # I've moved the validation of the items into custom classes
+     # to prepare for the new config validation that will be part of the core, so the core can
+     # be moved easily.
+     unless @prefix.empty?
+       if !PathValidator.valid?(prefix)
+         raise LogStash::ConfigurationError, "Prefix must not contain: #{PathValidator::INVALID_CHARACTERS}"
+       end
+     end
+
+     if !WritableDirectoryValidator.valid?(@temporary_directory)
+       raise LogStash::ConfigurationError, "Logstash must have the permissions to write to the temporary directory: #{@temporary_directory}"
+     end
+
+     if @validate_credentials_on_root_bucket && !WriteBucketPermissionValidator.new(@logger).valid?(bucket_resource, upload_options)
+       raise LogStash::ConfigurationError, "Logstash must have the privileges to write to root bucket `#{@bucket}`, check your credentials or your permissions."
+     end
+
+     if @time_file.nil? && @size_file.nil? || @size_file == 0 && @time_file == 0
+       raise LogStash::ConfigurationError, "The S3 plugin must have at least one of time_file or size_file set to a value greater than 0"
+     end
+
+     @file_repository = FileRepository.new(@tags, @encoding, @temporary_directory)
+
+     @rotation = rotation_strategy
+
+     executor = Concurrent::ThreadPoolExecutor.new({ :min_threads => 1,
+                                                     :max_threads => @upload_workers_count,
+                                                     :max_queue => @upload_queue_size,
+                                                     :fallback_policy => :caller_runs })
+
+     @uploader = Uploader.new(bucket_resource, @logger, executor, retry_count: @retry_count, retry_delay: @retry_delay)
+
+     # Restoring from a crash will use a new threadpool to slowly recover;
+     # new events should have more priority.
+     restore_from_crash if @restore
+
+     # If we need time-based rotation we need to do a periodic check on the files
+     # to take care of files that were not updated recently.
+     start_periodic_check if @rotation.needs_periodic?
+   end
+
+   def multi_receive_encoded(events_and_encoded)
+     prefix_written_to = Set.new
+
+     events_and_encoded.each do |event, encoded|
+       prefix_key = normalize_key(event.sprintf(@prefix))
+       prefix_written_to << prefix_key
+
+       begin
+         @file_repository.get_file(prefix_key) { |file| file.write(encoded) }
+       # The output should stop accepting new events coming in, since it cannot do anything with them anymore.
+       # Log the error and rethrow it.
+       rescue Errno::ENOSPC => e
+         @logger.error("S3: No space left in temporary directory", :temporary_directory => @temporary_directory)
+         raise e
+       end
+     end
+
+     # Groups IO calls to optimize fstat checks
+     rotate_if_needed(prefix_written_to)
+   end
+
+   def close
+     stop_periodic_check if @rotation.needs_periodic?
+
+     @logger.debug("Uploading current workspace")
+
+     @file_repository.shutdown # stop stale sweeps
+
+     # The plugin has stopped receiving new events, but we still have
+     # data on disk; let's make sure it gets to S3.
+     # If Logstash gets interrupted, the `restore_from_crash` method (when `restore` is set to true)
+     # will pick up the content in the temporary directory and upload it.
+     # This will block the shutdown until all uploads are done or the user force quits.
+     @file_repository.each_files do |file|
+       upload_file(file)
+     end
+
+     @uploader.stop # wait until all the current uploads are complete
+     @crash_uploader.stop if @restore # we might still have work to do for recovery, so wait until we are done
+   end
+
+   def full_options
+     options = aws_options_hash || {}
+     options[:signature_version] = @signature_version if @signature_version
+     symbolized_settings.merge(options)
+   end
+
+   def symbolized_settings
+     @symbolized_settings ||= symbolize_keys_and_cast_true_false(@additional_settings)
+   end
+
+   def symbolize_keys_and_cast_true_false(hash)
+     case hash
+     when Hash
+       symbolized = {}
+       hash.each { |key, value| symbolized[key.to_sym] = symbolize_keys_and_cast_true_false(value) }
+       symbolized
+     when 'true'
+       true
+     when 'false'
+       false
+     else
+       hash
+     end
+   end
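+   # For example (an illustrative sketch, not part of the original source), the helper above turns
+   #   { "force_path_style" => "true", "region" => "us-east-1" }
+   # into
+   #   { :force_path_style => true, :region => "us-east-1" }
+   # so string-valued booleans from the Logstash config become real booleans for the AWS SDK.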
+
+   def normalize_key(prefix_key)
+     prefix_key.gsub(PathValidator.matches_re, PREFIX_KEY_NORMALIZE_CHARACTER)
+   end
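+   # In other words, any character PathValidator rejects in a prefix is replaced with "_"
+   # (PREFIX_KEY_NORMALIZE_CHARACTER). As a rough sketch, and assuming PathValidator flags
+   # characters such as "^" (as the upstream S3 plugin does), an interpolated prefix like
+   # "logs/app^1" would be normalized to "logs/app_1".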
+
+   def upload_options
+     {
+       :acl => @canned_acl,
+       :server_side_encryption => @server_side_encryption ? @server_side_encryption_algorithm : nil,
+       :ssekms_key_id => @server_side_encryption_algorithm == "aws:kms" ? @ssekms_key_id : nil,
+       :storage_class => @storage_class,
+       :content_encoding => @encoding == GZIP_ENCODING ? GZIP_ENCODING : nil,
+       :multipart_threshold => @upload_multipart_threshold
+     }
+   end
+
+   private
+   # We start a background task to check for stale files and make sure we rotate them to S3 if needed.
+   def start_periodic_check
+     @logger.debug("Start periodic rotation check")
+
+     @periodic_check = Concurrent::TimerTask.new(:execution_interval => PERIODIC_CHECK_INTERVAL_IN_SECONDS) do
+       @logger.debug("Periodic check for stale files")
+
+       rotate_if_needed(@file_repository.keys)
+     end
+
+     @periodic_check.execute
+   end
+
+   def stop_periodic_check
+     @periodic_check.shutdown
+   end
+
+   def bucket_resource
+     Aws::S3::Bucket.new(@bucket, full_options)
+   end
+
+   def rotate_if_needed(prefixes)
+     # Each file access is thread-safe; while a rotation is in progress
+     # only one thread has access to the resource.
+     @file_repository.each_factory(prefixes) do |factory|
+       # we have exclusive access to the one-and-only
+       # prefix WRAPPER for this factory.
+       temp_file = factory.current
+
+       if @rotation.rotate?(temp_file)
+         @logger.debug? && @logger.debug("Rotate file",
+                                         :key => temp_file.key,
+                                         :path => temp_file.path,
+                                         :strategy => @rotation.class.name)
+
+         upload_file(temp_file) # may be async or blocking
+         factory.rotate!
+       end
+     end
+   end
+
+   def upload_file(temp_file)
+     @logger.debug? && @logger.debug("Queue for upload", :path => temp_file.path)
+
+     # if the queue is full the calling thread will be used to upload
+     temp_file.close # make sure the content is on disk
+     if temp_file.size > 0
+       @uploader.upload_async(temp_file,
+                              :on_complete => method(:clean_temporary_file),
+                              :upload_options => upload_options)
+     end
+   end
+
+   def rotation_strategy
+     case @rotation_strategy
+     when "size"
+       SizeRotationPolicy.new(size_file)
+     when "time"
+       TimeRotationPolicy.new(time_file)
+     when "size_and_time"
+       SizeAndTimeRotationPolicy.new(size_file, time_file)
+     end
+   end
+
+   def clean_temporary_file(file)
+     @logger.debug? && @logger.debug("Removing temporary file", :path => file.path)
+     file.delete!
+   end
+
+   # The upload process will use a separate uploader/threadpool with fewer resources allocated to it,
+   # but it will use an unbounded queue for the work, so it may take some time before all the older files get processed.
+   def restore_from_crash
+     @crash_uploader = Uploader.new(bucket_resource, @logger, CRASH_RECOVERY_THREADPOOL)
+
+     temp_folder_path = Pathname.new(@temporary_directory)
+     files = Dir.glob(::File.join(@temporary_directory, "**/*"))
+                .select { |file_path| ::File.file?(file_path) }
+     under_recovery_files = get_under_recovery_files(files)
+
+     files.each do |file_path|
+       # when encoding is GZIP, if the file is already recovering or recovered and uploading to S3, log and skip
+       if under_recovery_files.include?(file_path)
+         unless file_path.include?(TemporaryFile.gzip_extension)
+           @logger.warn("The #{file_path} file is either under the recovery process or failed to recover before.")
+         end
+       else
+         temp_file = TemporaryFile.create_from_existing_file(file_path, temp_folder_path)
+         # do not remove or upload if Logstash tried to recover the file but failed
+         if temp_file.recoverable?
+           if temp_file.size > 0
+             @logger.debug? && @logger.debug("Recovering from crash and uploading", :path => temp_file.path)
+             @crash_uploader.upload_async(temp_file,
+                                          :on_complete => method(:clean_temporary_file),
+                                          :upload_options => upload_options)
+           else
+             clean_temporary_file(temp_file)
+           end
+         end
+       end
+     end
+   end
+
+   # Figures out which files are under recovery and
+   # creates a skip list so they are ignored for the rest of the process.
+   def get_under_recovery_files(files)
+     skip_files = Set.new
+     return skip_files unless @encoding == GZIP_ENCODING
+
+     files.each do |file_path|
+       if file_path.include?(TemporaryFile.recovery_file_name_tag)
+         skip_files << file_path
+         if file_path.include?(TemporaryFile.gzip_extension)
+           # also include the original corrupted gzip file
+           skip_files << file_path.gsub(TemporaryFile.recovery_file_name_tag, "")
+         end
+       end
+     end
+     skip_files
+   end
+ end
@@ -0,0 +1,15 @@
+ # encoding: utf-8
+ require "jars/installer"
+ require "fileutils"
+
+ task :vendor do
+   exit(1) unless system './gradlew vendor'
+   version = File.read("VERSION").strip
+ end
+
+ desc "clean"
+ task :clean do
+   ["build", "vendor/jar-dependencies", "Gemfile.lock"].each do |p|
+     FileUtils.rm_rf(p)
+   end
+ end
@@ -0,0 +1,30 @@
+ Gem::Specification.new do |s|
+   s.name = 'logstash-output-s3-zst'
+   s.version = '1.0.0'
+   s.licenses = ['Apache-2.0']
+   s.summary = "Sends Logstash events to the Amazon Simple Storage Service and compresses them with zstd"
+   s.description = "This gem is a Logstash plugin required to be installed on top of the Logstash core pipeline using $LS_HOME/bin/logstash-plugin install gemname. This gem is not a stand-alone program"
+   s.authors = ["Evan Vinciguerra", "Elastic"]
+   s.email = 'evanv511@gmail.com'
+   s.homepage = "https://github.com/evanvin/logstash-output-s3-zst"
+   s.require_paths = ["lib", "vendor/jar-dependencies"]
+
+   # Files
+   s.files = Dir["lib/**/*","spec/**/*","*.gemspec","*.md","CONTRIBUTORS","Gemfile","LICENSE","NOTICE.TXT", "vendor/jar-dependencies/**/*.jar", "vendor/jar-dependencies/**/*.rb", "VERSION", "docs/**/*"]
+
+   # Tests
+   s.test_files = s.files.grep(%r{^(test|spec|features)/})
+
+   # Special flag to let us know this is actually a logstash plugin
+   s.metadata = { "logstash_plugin" => "true", "logstash_group" => "output" }
+
+   # Gem dependencies
+   s.add_runtime_dependency "logstash-core-plugin-api", ">= 1.60", "<= 2.99"
+   s.add_runtime_dependency 'logstash-mixin-aws', '>= 4.3.0'
+   s.add_runtime_dependency "concurrent-ruby"
+   s.add_runtime_dependency 'stud', '~> 0.0.22'
+   s.add_runtime_dependency 'zstd', '~> 1.1', '>= 1.1.2.1'
+   s.add_development_dependency 'logstash-devutils'
+   s.add_development_dependency 'logstash-input-generator'
+   s.add_development_dependency 'logstash-codec-line'
+ end
@@ -0,0 +1,11 @@
+ # encoding: utf-8
+ require "logstash/outputs/s3"
+ require "logstash/event"
+ require "logstash/codecs/line"
+ require "stud/temporary"
+
+ describe LogStash::Outputs::S3 do
+   describe "Main Test" do
+     it "passes a trivial sanity check" do
+       expect(1).to eq(1)
+     end
+   end
+ end
@@ -0,0 +1,6 @@
+ # encoding: utf-8
+ require "logstash/devutils/rspec/spec_helper"
+ require_relative "supports/helpers"
+ require "logstash/logging/logger"
+
+ LogStash::Logging::Logger::configure_logging("debug") if ENV["DEBUG"]