logstash-output-s3-zst 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,445 @@
+ # encoding: utf-8
+ require "logstash/outputs/base"
+ require "logstash/namespace"
+ require "logstash/plugin_mixins/aws_config"
+ require "stud/temporary"
+ require "stud/task"
+ require "concurrent"
+ require "socket"
+ require "thread"
+ require "tmpdir"
+ require "fileutils"
+ require "set"
+ require "pathname"
+ require "aws-sdk"
+ require "logstash/outputs/s3/patch"
+
+ Aws.eager_autoload!
+
+ # INFORMATION:
+ #
+ # This plugin batches and uploads logstash events into Amazon Simple Storage Service (Amazon S3).
+ #
+ # Requirements:
+ # * Amazon S3 Bucket and S3 Access Permissions (Typically access_key_id and secret_access_key)
+ # * S3 PutObject permission
+ #
+ # S3 outputs create temporary files in the OS' temporary directory; you can specify where to save them using the `temporary_directory` option.
+ #
+ # S3 output files have the following format:
+ #
+ # ls.s3.312bc026-2f5d-49bc-ae9f-5940cf4ad9a6.2013-04-18T10.00.tag_hello.part0.txt
+ #
+ #
+ # |=======
+ # | ls.s3 | indicates the logstash s3 plugin |
+ # | 312bc026-2f5d-49bc-ae9f-5940cf4ad9a6 | a new, random uuid per file |
+ # | 2013-04-18T10.00 | the rotation timestamp, present whenever you specify time_file |
+ # | tag_hello | the event's tag |
+ # | part0 | if you specify size_file, more parts are generated once the file size exceeds size_file. When a file is full it is pushed to the bucket and then deleted from the temporary directory. Empty files are never pushed; they are simply deleted |
+ # |=======
+ #
+ # Crash Recovery:
+ # * This plugin will recover and upload temporary log files after a crash/abnormal termination when `restore` is set to true
+ #
+ ## [Note regarding time_file and size_file]:
+ #
+ ## Both the time_file and size_file settings can trigger a log "file rotation".
+ ## A log rotation pushes the current log "part" to S3 and deletes it from local temporary storage.
+ #
+ ## If you specify BOTH size_file and time_file, one file per tag (if specified) is created.
+ ## When EITHER time_file minutes have elapsed OR the log file size exceeds size_file, a log rotation is triggered.
+ ##
+ ## If you ONLY specify time_file but NOT size_file, one file per tag (if specified) is created.
+ ## When time_file minutes elapse, a log rotation is triggered.
+ #
+ ## If you ONLY specify size_file but NOT time_file, one file per tag (if specified) is created.
+ ## When the size of a log file part exceeds size_file, a log rotation is triggered.
+ #
+ ## If NEITHER size_file nor time_file is specified, ONLY one file per tag (if specified) is created.
+ ## WARNING: Since no log rotation is triggered, the S3 upload only occurs when logstash restarts.
+ #
+ #
+ # #### Usage:
+ # This is an example of logstash config:
+ # [source,ruby]
+ # output {
+ #   s3-zst {
+ #     access_key_id => "crazy_key"             (required)
+ #     secret_access_key => "monkey_access_key" (required)
+ #     region => "eu-west-1"                    (optional, default = "us-east-1")
+ #     bucket => "your_bucket"                  (required)
+ #     size_file => 2048                        (optional) - Bytes
+ #     time_file => 5                           (optional) - Minutes
+ #     codec => "plain"                         (optional)
+ #     canned_acl => "private"                  (optional. Options are "private", "public-read", "public-read-write", "authenticated-read", "aws-exec-read", "bucket-owner-read", "bucket-owner-full-control", "log-delivery-write". Defaults to "private")
+ #   }
+ # }
+ #
+ class LogStash::Outputs::S3ZST < LogStash::Outputs::Base
+   require "logstash/outputs/s3/writable_directory_validator"
+   require "logstash/outputs/s3/path_validator"
+   require "logstash/outputs/s3/write_bucket_permission_validator"
+   require "logstash/outputs/s3/size_rotation_policy"
+   require "logstash/outputs/s3/time_rotation_policy"
+   require "logstash/outputs/s3/size_and_time_rotation_policy"
+   require "logstash/outputs/s3/temporary_file"
+   require "logstash/outputs/s3/temporary_file_factory"
+   require "logstash/outputs/s3/uploader"
+   require "logstash/outputs/s3/file_repository"
+
+   include LogStash::PluginMixins::AwsConfig::V2
+
+   PREFIX_KEY_NORMALIZE_CHARACTER = "_"
+   PERIODIC_CHECK_INTERVAL_IN_SECONDS = 15
+   CRASH_RECOVERY_THREADPOOL = Concurrent::ThreadPoolExecutor.new({
+     :min_threads => 1,
+     :max_threads => 2,
+     :fallback_policy => :caller_runs
+   })
+
+   GZIP_ENCODING = "gzip"
+
+   config_name "s3-zst"
+   default :codec, "line"
+
+   concurrency :shared
+
+   # S3 bucket
+   config :bucket, :validate => :string, :required => true
+
+   config :additional_settings, :validate => :hash, :default => {}
+
+   # Set the file size in bytes; when the data for a bucket would exceed size_file, it is stored in two or more files.
+   # If you have tags, a file of the given size is generated for every tag.
+   #
+   # NOTE: setting a file size is recommended, because the plugin generates a local temporary file on disk before putting it in the bucket.
+   config :size_file, :validate => :number, :default => 1024 * 1024 * 5
+
+   # Set the time, in MINUTES, to close the current sub_time_section of the bucket.
+   # If you also define size_file, you get a number of files related to the section and the current tag.
+   # If it is set to 0 and rotation_strategy is 'time' or 'size_and_time', the plugin raises a configuration error.
+   config :time_file, :validate => :number, :default => 15
+
+   # If `restore => false` is specified and Logstash crashes, the unprocessed files are not sent to the bucket.
+   #
+   # NOTE: the `restore => true` default assumes that multiple S3 outputs each set a unique `temporary_directory => ...`;
+   # if they do not, then only a single S3 output is safe to recover (since left-over files are processed and deleted).
+   config :restore, :validate => :boolean, :default => true
+
+   # The S3 canned ACL to use when putting the file. Defaults to "private".
+   config :canned_acl, :validate => ["private", "public-read", "public-read-write", "authenticated-read", "aws-exec-read", "bucket-owner-read", "bucket-owner-full-control", "log-delivery-write"],
+          :default => "private"
+
+   # Specifies whether or not to use S3's server side encryption. Defaults to no encryption.
+   config :server_side_encryption, :validate => :boolean, :default => false
+
+   # Specifies what type of encryption to use when SSE is enabled.
+   config :server_side_encryption_algorithm, :validate => ["AES256", "aws:kms"], :default => "AES256"
+
+   # The key to use when specified along with server_side_encryption => aws:kms.
+   # If server_side_encryption => aws:kms is set but this is not, the default KMS key is used.
+   # http://docs.aws.amazon.com/AmazonS3/latest/dev/UsingKMSEncryption.html
+   config :ssekms_key_id, :validate => :string
+
+   # Specifies what S3 storage class to use when uploading the file.
+   # More information about the different storage classes can be found at:
+   # http://docs.aws.amazon.com/AmazonS3/latest/dev/storage-class-intro.html
+   # Defaults to STANDARD.
+   config :storage_class, :validate => ["STANDARD", "REDUCED_REDUNDANCY", "STANDARD_IA", "ONEZONE_IA"], :default => "STANDARD"
+
+   # Set the directory where logstash will store the tmp files before sending them to S3.
+   # Defaults to the OS temporary directory, e.g. /tmp/logstash on Linux.
+   #
+   # NOTE: the reason we do not have a unique (isolated) temporary directory as a default, to support multiple plugin instances,
+   # is that we would have to rely on something static that does not change between restarts (e.g. a user-set id => ...).
+   config :temporary_directory, :validate => :string, :default => File.join(Dir.tmpdir, "logstash")
+
+   # Specify a prefix for the uploaded filename; this can simulate directories on S3. The prefix does not require a leading slash.
+   # This option supports string interpolation; be warned that this can create a lot of temporary local files.
+   config :prefix, :validate => :string, :default => ''
+
+   # Specify how many workers to use to upload the files to S3
+   config :upload_workers_count, :validate => :number, :default => (Concurrent.processor_count * 0.5).ceil
+
+   # Number of items we can keep in the local queue before uploading them
+   config :upload_queue_size, :validate => :number, :default => 2 * (Concurrent.processor_count * 0.25).ceil
+
+   # Files larger than this number are uploaded using the S3 multipart APIs. Default threshold is 15MB.
+   config :upload_multipart_threshold, :validate => :number, :default => 15 * 1024 * 1024
+
+   # The version of the S3 signature hash to use. Normally uses the internal client default; it can be explicitly
+   # specified here.
+   config :signature_version, :validate => ['v2', 'v4']
+
+   # Define tags to be appended to the file on the S3 bucket.
+   #
+   # Example:
+   # tags => ["elasticsearch", "logstash", "kibana"]
+   #
+   # Will generate this file:
+   # "ls.s3.logstash.local.2015-01-01T00.00.tag_elasticsearch.logstash.kibana.part0.txt"
+   #
+   config :tags, :validate => :array, :default => []
+
+   # Specify the content encoding. Supports "gzip". Defaults to "none".
+   config :encoding, :validate => ["none", GZIP_ENCODING], :default => "none"
+
+   # Define the strategy used to decide when to rotate the file and push it to S3.
+   # The default strategy is to check both size and time; the first one to match triggers the rotation.
+   config :rotation_strategy, :validate => ["size_and_time", "size", "time"], :default => "size_and_time"
+
+   # The common use case is to define permissions on the root bucket and give Logstash full access to write its logs.
+   # In some circumstances you need finer-grained permissions on a subfolder; this option allows you to disable the check at startup.
+   config :validate_credentials_on_root_bucket, :validate => :boolean, :default => true
+
+   # The number of times to retry a failed S3 upload.
+   config :retry_count, :validate => :number, :default => Float::INFINITY
+
+   # The amount of time to wait in seconds before attempting to retry a failed upload.
+   config :retry_delay, :validate => :number, :default => 1
+
+   def register
+     # The validation of the items has been moved into custom classes
+     # to prepare for the new config validation that will be part of the core, so the core can
+     # be moved easily.
+     unless @prefix.empty?
+       if !PathValidator.valid?(prefix)
+         raise LogStash::ConfigurationError, "Prefix must not contain: #{PathValidator::INVALID_CHARACTERS}"
+       end
+     end
+
+     if !WritableDirectoryValidator.valid?(@temporary_directory)
+       raise LogStash::ConfigurationError, "Logstash must have the permissions to write to the temporary directory: #{@temporary_directory}"
+     end
+
+     if @validate_credentials_on_root_bucket && !WriteBucketPermissionValidator.new(@logger).valid?(bucket_resource, upload_options)
+       raise LogStash::ConfigurationError, "Logstash must have the privileges to write to root bucket `#{@bucket}`, check your credentials or your permissions."
+     end
+
+     if @time_file.nil? && @size_file.nil? || @size_file == 0 && @time_file == 0
+       raise LogStash::ConfigurationError, "The S3 plugin must have at least one of time_file or size_file set to a value greater than 0"
+     end
+
+     @file_repository = FileRepository.new(@tags, @encoding, @temporary_directory)
+
+     @rotation = rotation_strategy
+
+     executor = Concurrent::ThreadPoolExecutor.new({ :min_threads => 1,
+                                                     :max_threads => @upload_workers_count,
+                                                     :max_queue => @upload_queue_size,
+                                                     :fallback_policy => :caller_runs })
+
+     @uploader = Uploader.new(bucket_resource, @logger, executor, retry_count: @retry_count, retry_delay: @retry_delay)
+
+     # Restoring from a crash uses a separate threadpool to recover slowly;
+     # new events should have more priority.
+     restore_from_crash if @restore
+
+     # If we need time-based rotation, we periodically check the files
+     # to take care of files that were not updated recently.
+     start_periodic_check if @rotation.needs_periodic?
+   end
+
+   def multi_receive_encoded(events_and_encoded)
+     prefix_written_to = Set.new
+
+     events_and_encoded.each do |event, encoded|
+       prefix_key = normalize_key(event.sprintf(@prefix))
+       prefix_written_to << prefix_key
+
+       begin
+         @file_repository.get_file(prefix_key) { |file| file.write(encoded) }
+       # The output should stop accepting new events coming in, since it cannot do anything with them anymore.
+       # Log the error and rethrow it.
+       rescue Errno::ENOSPC => e
+         @logger.error("S3: No space left in temporary directory", :temporary_directory => @temporary_directory)
+         raise e
+       end
+     end
+
+     # Groups IO calls to optimize fstat checks
+     rotate_if_needed(prefix_written_to)
+   end
+
+   def close
+     stop_periodic_check if @rotation.needs_periodic?
+
+     @logger.debug("Uploading current workspace")
+
+     @file_repository.shutdown # stop stale sweeps
+
+     # The plugin has stopped receiving new events, but we still have
+     # data on disk; let's make sure it gets to S3.
+     # If Logstash gets interrupted, the `restore_from_crash` method (when `restore` is set to true) will pick up
+     # the content in the temporary directory and upload it.
+     # This will block the shutdown until all uploads are done or the user force quits.
+     @file_repository.each_files do |file|
+       upload_file(file)
+     end
+
+     @uploader.stop # wait until all the current uploads are complete
+     @crash_uploader.stop if @restore # we might still have work to do for recovery, so wait until we are done
+   end
+
+   def full_options
+     options = aws_options_hash || {}
+     options[:signature_version] = @signature_version if @signature_version
+     symbolized_settings.merge(options)
+   end
+
+   def symbolized_settings
+     @symbolized_settings ||= symbolize_keys_and_cast_true_false(@additional_settings)
+   end
+
+   def symbolize_keys_and_cast_true_false(hash)
+     case hash
+     when Hash
+       symbolized = {}
+       hash.each { |key, value| symbolized[key.to_sym] = symbolize_keys_and_cast_true_false(value) }
+       symbolized
+     when 'true'
+       true
+     when 'false'
+       false
+     else
+       hash
+     end
+   end
+
+   def normalize_key(prefix_key)
+     prefix_key.gsub(PathValidator.matches_re, PREFIX_KEY_NORMALIZE_CHARACTER)
+   end
+
+   def upload_options
+     {
+       :acl => @canned_acl,
+       :server_side_encryption => @server_side_encryption ? @server_side_encryption_algorithm : nil,
+       :ssekms_key_id => @server_side_encryption_algorithm == "aws:kms" ? @ssekms_key_id : nil,
+       :storage_class => @storage_class,
+       :content_encoding => @encoding == GZIP_ENCODING ? GZIP_ENCODING : nil,
+       :multipart_threshold => @upload_multipart_threshold
+     }
+   end
+
+   private
+   # We start a background task to check for stale files and rotate them to S3 if needed.
+   def start_periodic_check
+     @logger.debug("Start periodic rotation check")
+
+     @periodic_check = Concurrent::TimerTask.new(:execution_interval => PERIODIC_CHECK_INTERVAL_IN_SECONDS) do
+       @logger.debug("Periodic check for stale files")
+
+       rotate_if_needed(@file_repository.keys)
+     end
+
+     @periodic_check.execute
+   end
+
+   def stop_periodic_check
+     @periodic_check.shutdown
+   end
+
+   def bucket_resource
+     Aws::S3::Bucket.new(@bucket, full_options)
+   end
+
+   def rotate_if_needed(prefixes)
+     # Each file access is thread safe;
+     # until the rotation is done, only
+     # one thread has access to the resource.
+     @file_repository.each_factory(prefixes) do |factory|
+       # we have exclusive access to the one-and-only
+       # prefix WRAPPER for this factory.
+       temp_file = factory.current
+
+       if @rotation.rotate?(temp_file)
+         @logger.debug? && @logger.debug("Rotate file",
+                                         :key => temp_file.key,
+                                         :path => temp_file.path,
+                                         :strategy => @rotation.class.name)
+
+         upload_file(temp_file) # may be async or blocking
+         factory.rotate!
+       end
+     end
+   end
+
+   def upload_file(temp_file)
+     @logger.debug? && @logger.debug("Queue for upload", :path => temp_file.path)
+
+     # if the queue is full, the calling thread will be used to upload
+     temp_file.close # make sure the content is on disk
+     if temp_file.size > 0
+       @uploader.upload_async(temp_file,
+                              :on_complete => method(:clean_temporary_file),
+                              :upload_options => upload_options)
+     end
+   end
+
+   def rotation_strategy
+     case @rotation_strategy
+     when "size"
+       SizeRotationPolicy.new(size_file)
+     when "time"
+       TimeRotationPolicy.new(time_file)
+     when "size_and_time"
+       SizeAndTimeRotationPolicy.new(size_file, time_file)
+     end
+   end
+
+   def clean_temporary_file(file)
+     @logger.debug? && @logger.debug("Removing temporary file", :path => file.path)
+     file.delete!
+   end
+
+   # The upload process will use a separate uploader/threadpool with fewer resources allocated to it,
+   # and it will use an unbounded queue for the work, so it may take some time before all the older files get processed.
+   def restore_from_crash
+     @crash_uploader = Uploader.new(bucket_resource, @logger, CRASH_RECOVERY_THREADPOOL)
+
+     temp_folder_path = Pathname.new(@temporary_directory)
+     files = Dir.glob(::File.join(@temporary_directory, "**/*"))
+                .select { |file_path| ::File.file?(file_path) }
+     under_recovery_files = get_under_recovery_files(files)
+
+     files.each do |file_path|
+       # when encoding is GZIP, if the file is already recovering, or recovered and uploading to S3, log and skip
+       if under_recovery_files.include?(file_path)
+         unless file_path.include?(TemporaryFile.gzip_extension)
+           @logger.warn("The #{file_path} file is either under the recovery process or failed to recover before.")
+         end
+       else
+         temp_file = TemporaryFile.create_from_existing_file(file_path, temp_folder_path)
+         # do not remove or upload if Logstash tries to recover the file but fails
+         if temp_file.recoverable?
+           if temp_file.size > 0
+             @logger.debug? && @logger.debug("Recovering from crash and uploading", :path => temp_file.path)
+             @crash_uploader.upload_async(temp_file,
+                                          :on_complete => method(:clean_temporary_file),
+                                          :upload_options => upload_options)
+           else
+             clean_temporary_file(temp_file)
+           end
+         end
+       end
+     end
+   end
+
+   # Figures out which files are being recovered and
+   # creates a skip list so the rest of the process ignores them.
+   def get_under_recovery_files(files)
+     skip_files = Set.new
+     return skip_files unless @encoding == GZIP_ENCODING
+
+     files.each do |file_path|
+       if file_path.include?(TemporaryFile.recovery_file_name_tag)
+         skip_files << file_path
+         if file_path.include?(TemporaryFile.gzip_extension)
+           # also include the original corrupted gzip file
+           skip_files << file_path.gsub(TemporaryFile.recovery_file_name_tag, "")
+         end
+       end
+     end
+     skip_files
+   end
+ end
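
For reference, a minimal pipeline configuration for this output could look like the sketch below. It is not an example shipped with the package: the bucket name and prefix are placeholders, and it assumes the plugin is installed so that the dashed `s3-zst` name declared by `config_name` resolves in the config parser.

```ruby
output {
  s3-zst {
    bucket => "my-log-archive"          # placeholder bucket name
    region => "us-east-1"
    prefix => "logs/%{+YYYY}/%{+MM}/"   # interpolated per event; each distinct prefix gets its own temporary file
    size_file => 5242880                # rotate at 5 MB...
    time_file => 15                     # ...or after 15 minutes, whichever matches first (the size_and_time default)
    encoding => "gzip"                  # the only compressed content_encoding the code above supports
  }
}
```

Credentials are resolved through the AwsConfig mixin, so they can come from `access_key_id`/`secret_access_key` settings, the environment, or an instance profile.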
@@ -0,0 +1,15 @@
+ # encoding: utf-8
+ require "jars/installer"
+ require "fileutils"
+
+ task :vendor do
+   exit(1) unless system './gradlew vendor'
+   version = File.read("VERSION").strip # NOTE: read but currently unused
+ end
+
+ desc "clean"
+ task :clean do
+   ["build", "vendor/jar-dependencies", "Gemfile.lock"].each do |p|
+     FileUtils.rm_rf(p)
+   end
+ end
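
Note that the Rakefile requires `jars/installer` but the `vendor` task shells out to Gradle instead of using it. For comparison, the conventional jar-dependencies route seen in many Logstash plugin Rakefiles looks like the following sketch; whether this gem's jars are resolvable that way is an assumption.

```ruby
# encoding: utf-8
require "jars/installer"

desc "vendor jar dependencies"
task :vendor do
  # Resolves the jar requirements declared by the gem and copies them
  # under vendor/jar-dependencies for loading at runtime.
  Jars::Installer.vendor_jars!
end
```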
@@ -0,0 +1,30 @@
+ Gem::Specification.new do |s|
+   s.name = 'logstash-output-s3-zst'
+   s.version = '1.0.0'
+   s.licenses = ['Apache-2.0']
+   s.summary = "Sends Logstash events to the Amazon Simple Storage Service, compressed with zstd"
+   s.description = "This gem is a Logstash plugin required to be installed on top of the Logstash core pipeline using $LS_HOME/bin/logstash-plugin install gemname. This gem is not a stand-alone program"
+   s.authors = ["Evan Vinciguerra", "Elastic"]
+   s.email = 'evanv511@gmail.com'
+   s.homepage = "https://github.com/evanvin/logstash-output-s3-zst"
+   s.require_paths = ["lib", "vendor/jar-dependencies"]
+
+   # Files
+   s.files = Dir["lib/**/*","spec/**/*","*.gemspec","*.md","CONTRIBUTORS","Gemfile","LICENSE","NOTICE.TXT", "vendor/jar-dependencies/**/*.jar", "vendor/jar-dependencies/**/*.rb", "VERSION", "docs/**/*"]
+
+   # Tests
+   s.test_files = s.files.grep(%r{^(test|spec|features)/})
+
+   # Special flag to let us know this is actually a logstash plugin
+   s.metadata = { "logstash_plugin" => "true", "logstash_group" => "output" }
+
+   # Gem dependencies
+   s.add_runtime_dependency "logstash-core-plugin-api", ">= 1.60", "<= 2.99"
+   s.add_runtime_dependency 'logstash-mixin-aws', '>= 4.3.0'
+   s.add_runtime_dependency "concurrent-ruby"
+   s.add_runtime_dependency 'stud', '~> 0.0.22'
+   s.add_runtime_dependency 'zstd', '~> 1.1', '>= 1.1.2.1'
+   s.add_development_dependency 'logstash-devutils'
+   s.add_development_dependency 'logstash-input-generator'
+   s.add_development_dependency 'logstash-codec-line'
+ end
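
The `zstd` runtime dependency is what gives the plugin its name. The pinned gem's exact interface is not shown in this diff, so the round-trip sketch below assumes a module-level `Zstd.compress`/`Zstd.decompress` API of the shape common Ruby zstd bindings expose; treat those calls as assumptions.

```ruby
require "zstd"  # the gemspec pins zstd ~> 1.1, >= 1.1.2.1

data = "some log line\n" * 1_000

compressed = Zstd.compress(data)      # assumed API shape
restored   = Zstd.decompress(compressed)

raise "zstd round-trip failed" unless restored == data
puts "#{data.bytesize} -> #{compressed.bytesize} bytes"
```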
@@ -0,0 +1,11 @@
+ # encoding: utf-8
+ require "logstash/outputs/s3"
+ require "logstash/event"
+ require "logstash/codecs/line"
+ require "stud/temporary"
+
+ describe LogStash::Outputs::S3ZST do
+   describe "Main Test" do
+     it "passes a sanity check" do
+       expect(1).to eq(1)
+     end
+   end
+ end
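
A slightly more meaningful spec, sketched below, would assert that the plugin resolves through the registry under the `s3-zst` name declared by `config_name`. It assumes the requires above load the plugin class; `LogStash::Plugin.lookup` is the standard core lookup API.

```ruby
describe "plugin registration" do
  it "resolves through the plugin registry" do
    expect(LogStash::Plugin.lookup("output", "s3-zst")).to eq(LogStash::Outputs::S3ZST)
  end
end
```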
@@ -0,0 +1,6 @@
+ # encoding: utf-8
+ require "logstash/devutils/rspec/spec_helper"
+ require_relative "supports/helpers"
+ require "logstash/logging/logger"
+
+ LogStash::Logging::Logger::configure_logging("debug") if ENV["DEBUG"]
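
Debug-level logging is opt-in via the `DEBUG` environment variable. Forcing the same behavior from Ruby is a one-liner; the require path below is illustrative.

```ruby
# Equivalent to launching the suite with DEBUG=1:
ENV["DEBUG"] = "1"
require_relative "spec_helper"  # illustrative path to the helper above
```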