logstash-input-azureblob-offline 0.9.13.1-java

# encoding: utf-8
require "logstash/inputs/base"
require "logstash/namespace"

# Azure Storage SDK for Ruby
require "azure/storage"
require 'json' # for registry content
require "securerandom" # for generating uuid.
require "set" # for the blob sets built in list_all_blobs.
require "stud/interval" # for Stud.stoppable_sleep.

require "com/microsoft/json-parser"

#require Dir[ File.dirname(__FILE__) + "/../../*_jars.rb" ].first
# Registry item to coordinate between multiple clients
class LogStash::Inputs::RegistryItem
  attr_accessor :file_path, :etag, :offset, :reader, :gen
  # Allow json serialization.
  def as_json(options={})
    {
      file_path: @file_path,
      etag: @etag,
      reader: @reader,
      offset: @offset,
      gen: @gen
    }
  end # as_json

  def to_json(*options)
    as_json(*options).to_json(*options)
  end # to_json

  def initialize(file_path, etag, reader, offset = 0, gen = 0)
    @file_path = file_path
    @etag = etag
    @reader = reader
    @offset = offset
    @gen = gen
  end # initialize
end # class RegistryItem

# Logstash input plugin for Azure Blobs
#
# This logstash plugin gathers data from Microsoft Azure Blobs
class LogStash::Inputs::LogstashInputAzureblob < LogStash::Inputs::Base
  config_name 'azureblob-offline'

  # If undefined, Logstash will complain, even if codec is unused.
  default :codec, 'json_lines'

  # Set the account name for the azure storage account.
  config :storage_account_name, :validate => :string

  # Set the key to access the storage account.
  config :storage_access_key, :validate => :string

  # Set the container of the blobs.
  config :container, :validate => :string

  # The path(s) to the file(s) to use as an input. By default it will
  # watch all files in the storage container.
  # You can use filename patterns here, such as `logs/*.log`.
  # If you use a pattern like `logs/**/*.log`, a recursive search
  # of `logs` will be done for all `*.log` files.
  # Do not include a leading `/`, as Azure paths look like this:
  # `path/to/blob/file.txt`
  #
  # You may also configure multiple paths. See an example
  # on the <<array,Logstash configuration page>>.
  config :path_filters, :validate => :array, :default => [], :required => false
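
  # A hedged illustration of how these patterns behave in `list_all_blobs`
  # below, using Ruby's File.fnmatch? with the same flags the plugin passes
  # (the blob names are invented):
  #
  #   flags = File::FNM_PATHNAME | File::FNM_EXTGLOB
  #   File.fnmatch?('logs/*.log', 'logs/app.log', flags)          # => true
  #   File.fnmatch?('logs/*.log', 'logs/2020/app.log', flags)     # => false (`*` does not cross `/`)
  #   File.fnmatch?('logs/**/*.log', 'logs/2020/app.log', flags)  # => true
  #   File.fnmatch?('logs/app{1,2}.log', 'logs/app1.log', flags)  # => true (brace expansion)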

  # Set the endpoint for the blobs.
  #
  # The default, `core.windows.net`, targets public Azure.
  config :endpoint, :validate => :string, :default => 'core.windows.net'

  # Set the value of using backup mode.
  config :backupmode, :validate => :boolean, :default => false, :deprecated => true, :obsolete => 'This option is obsolete and its setting will be ignored.'

  # Set the value for the registry file.
  #
  # The default, `data/registry`, is used to coordinate readings for various instances of the clients.
  config :registry_path, :validate => :string, :default => 'data/registry'

  # Sets the value for the registry file lock duration in seconds. It must be set to -1, or to a value between 15 and 60 inclusive.
  #
  # The default, `15`, means the registry file will be locked for at most 15 seconds. This should usually be sufficient to
  # read the content of the registry. This configuration exists so that the lease can expire in case a client crashes
  # before it gets a chance to release the lease on the registry.
  config :registry_lease_duration, :validate => :number, :default => 15

  # Set how many seconds to stay idle before checking for new logs.
  #
  # The default, `30`, means a read of the logs is triggered every 30 seconds once the plugin is idle.
  config :interval, :validate => :number, :default => 30

  # Set the registry create mode.
  #
  # The default, `resume`, means that when the registry is initially created, it assumes all logs have been handled.
  # When set to `start_over`, it will read all log files from the beginning.
  config :registry_create_policy, :validate => :string, :default => 'resume'

  # Sets the header of the file that does not repeat over records. Usually, these are json opening tags.
  config :file_head_bytes, :validate => :number, :default => 0

  # Sets the tail of the file that does not repeat over records. Usually, these are json closing tags.
  config :file_tail_bytes, :validate => :number, :default => 0
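
  # A hedged sketch of how head/tail bytes map onto a blob that holds one big
  # JSON document (content invented for illustration):
  #
  #   {"records":[   <- 12 bytes that never repeat: file_head_bytes = 12
  #   {"id":1},...   <- record section, read in chunks
  #   ]}             <- 2 bytes that never repeat: file_tail_bytes = 2
  #
  # Each chunk handed to the codec is reassembled as head + chunk + tail in
  # `enqueue_content` below, so every batch is well-formed JSON on its own.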

  # Sets how to break json
  #
  # Only works when the codec is set to `json`. Sets the policy for breaking the json objects in an array into small events.
  # Breaking the json into small sections is not as efficient as keeping it whole, but it reduces memory usage.
  # Possible options: `do_not_break`, `with_head_tail`, `without_head_tail`
  config :break_json_down_policy, :validate => :string, :default => 'do_not_break', :obsolete => 'This option is obsolete and its setting will be ignored.'

  # Sets how many json objects are put in one batch when json breaking happens.
  config :break_json_batch_count, :validate => :number, :default => 10, :obsolete => 'This option is obsolete and its setting will be ignored.'

  # Sets the page size for returned blob items. Too big a number can exhaust the heap; too small a number leads to too many requests.
  #
  # The default, `100`, is good for the default heap size of 1 GB.
  config :blob_list_page_size, :validate => :number, :default => 100

  # Sets the chunk size, in bytes, used when reading blob content. The default is 4 MB.
  config :file_chunk_size_bytes, :validate => :number, :default => 4 * 1024 * 1024

  # When true, adds the blob's path to each event under the field named by `azure_blob_file_path_field_name`.
  config :azure_blob_file_path_field, :validate => :boolean, :default => false

  config :azure_blob_file_path_field_name, :validate => :string, :default => "azureblobfilepath"
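
  # A hedged usage sketch for this plugin. The account name, key and container
  # below are placeholders; only the plugin name `azureblob-offline` and the
  # option names come from this file.
  #
  #   input {
  #     azureblob-offline {
  #       storage_account_name => "mystorageaccount"      # placeholder
  #       storage_access_key => "PLACEHOLDER_BASE64_KEY"  # placeholder
  #       container => "mycontainer"                      # placeholder
  #       path_filters => ["logs/**/*.log"]
  #       registry_create_policy => "start_over"
  #     }
  #   }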

  # Constant of max integer: [42].pack('i').size is the native int width in
  # bytes (4 on common platforms), so this evaluates to 2**62 - 1, the largest
  # Fixnum on a 64-bit machine.
  MAX = 2**([42].pack('i').size * 16 - 2) - 1

  # Update the registry offset each time after this number of entries have been processed.
  UPDATE_REGISTRY_COUNT = 100

  public
  def register
    user_agent = 'logstash-input-azureblob-offline'

    # This is the reader ID for this specific instance.
    @reader = SecureRandom.uuid

    # Set up a specific instance of an Azure::Storage::Client.
    client = Azure::Storage::Client.create(:storage_account_name => @storage_account_name, :storage_access_key => @storage_access_key, :storage_blob_host => "https://#{@storage_account_name}.blob.#{@endpoint}", :user_agent_prefix => user_agent)
    # Get an Azure storage blob service object from the client.
    @azure_blob = client.blob_client
    # Add a retry filter to the service object.
    @azure_blob.with_filter(Azure::Storage::Core::Filter::ExponentialRetryPolicyFilter.new)
  end # def register

  def run(queue)
    # We can abort the loop if stop? becomes true.
    while !stop?
      process(queue)
      @logger.debug("Hitting interval of #{@interval}s . . .")
      Stud.stoppable_sleep(@interval) { stop? }
    end # loop
  end # def run

  def stop
    cleanup_registry
  end # def stop

  # Start processing the next item.
  def process(queue)
    begin
      @processed_entries = 0
      blob, start_index, gen = register_for_read

      unless blob.nil?
        begin
          blob_name = blob.name
          @logger.debug("Processing blob #{blob.name}")
          blob_size = blob.properties[:content_length]
          # Work-around: after being returned by get_blob, the etag contains quotes.
          new_etag = blob.properties[:etag]
          # ~ Work-around

          # Read the non-repeating head and tail sections once, so they can be
          # wrapped around every batch handed to the codec.
          header = nil
          tail = nil
          blob, header = @azure_blob.get_blob(@container, blob_name, {:end_range => (@file_head_bytes - 1)}) unless @file_head_bytes.nil? || @file_head_bytes <= 0
          blob, tail = @azure_blob.get_blob(@container, blob_name, {:start_range => blob_size - @file_tail_bytes}) unless @file_tail_bytes.nil? || @file_tail_bytes <= 0

          if start_index == 0
            # Skip the header since it is already read.
            start_index = @file_head_bytes
          end

          @logger.debug("start index: #{start_index} blob size: #{blob_size}")

          content_length = 0
          blob_reader = BlobReader.new(@logger, @azure_blob, @container, blob_name, @file_chunk_size_bytes, start_index, blob_size - 1 - @file_tail_bytes)

          is_json_codec = (defined?(LogStash::Codecs::JSON) == 'constant') && (@codec.is_a? LogStash::Codecs::JSON)
          if is_json_codec
            parser = JsonParser.new(@logger, blob_reader)

            parser.parse(->(json_content) {
              content_length += json_content.length

              enqueue_content(queue, json_content, header, tail, blob_name)

              on_entry_processed(start_index, content_length, blob_name, new_etag, gen)
            }, ->(malformed_json) {
              @logger.debug("Skipping #{malformed_json.length} malformed bytes")
              content_length += malformed_json.length

              on_entry_processed(start_index, content_length, blob_name, new_etag, gen)
            })
          else
            begin
              content, are_more_bytes_available = blob_reader.read

              # Guard against a nil chunk, which the reader returns once the
              # byte range is exhausted.
              unless content.nil?
                content_length += content.length
                enqueue_content(queue, content, header, tail, blob_name)

                on_entry_processed(start_index, content_length, blob_name, new_etag, gen)
              end
            end until !are_more_bytes_available || content.nil?
          end # if
        ensure
          # Make sure the registry is updated, and the reader released, even
          # when an exception is raised.
          request_registry_update(start_index, content_length, blob_name, new_etag, gen)
        end # begin
      end # unless
    rescue => e
      @logger.error("Oh My, An error occurred. Error:#{e}: Trace: #{e.backtrace}", :exception => e)
    end # begin
  end # process

  def enqueue_content(queue, content, header, tail, blob_name)
    if (header.nil? || header.length == 0) && (tail.nil? || tail.length == 0)
      # Skip some unnecessary copying.
      full_content = content
    else
      full_content = ''
      full_content << header unless header.nil? || header.length == 0
      full_content << content
      full_content << tail unless tail.nil? || tail.length == 0
    end

    @codec.decode(full_content) do |event|
      if @azure_blob_file_path_field
        event.set(@azure_blob_file_path_field_name, blob_name)
      end
      decorate(event)
      queue << event
    end
  end

  def on_entry_processed(start_index, content_length, blob_name, new_etag, gen)
    @processed_entries += 1
    request_registry_update(start_index, content_length, blob_name, new_etag, gen) if @processed_entries % UPDATE_REGISTRY_COUNT == 0
  end

  def request_registry_update(start_index, content_length, blob_name, new_etag, gen)
    new_offset = start_index
    new_offset += content_length unless content_length.nil?
    @logger.debug("New registry offset: #{new_offset}")
    new_registry_item = LogStash::Inputs::RegistryItem.new(blob_name, new_etag, nil, new_offset, gen)
    update_registry(new_registry_item)
  end

  # Deserialize the registry hash from a json string.
  def deserialize_registry_hash(json_string)
    result = Hash.new
    temp_hash = JSON.parse(json_string)
    temp_hash.values.each { |kvp|
      result[kvp['file_path']] = LogStash::Inputs::RegistryItem.new(kvp['file_path'], kvp['etag'], kvp['reader'], kvp['offset'], kvp['gen'])
    }
    return result
  end # deserialize_registry_hash
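
  # A hedged example of the registry round trip (blob name and etag invented);
  # the shape of each entry follows RegistryItem#as_json above.
  #
  #   json = '{"logs/a.log":{"file_path":"logs/a.log","etag":"0x1","reader":null,"offset":1024,"gen":0}}'
  #   hash = deserialize_registry_hash(json)
  #   hash['logs/a.log'].offset # => 1024
  #   JSON.generate(hash)       # round-trips via RegistryItem#to_json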

  # List all the blobs in the given container.
  def list_all_blobs
    blobs = Set.new []
    continuation_token = nil
    @blob_list_page_size = 100 if @blob_list_page_size <= 0
    # Also match the registry blob; using a local copy avoids mutating
    # @path_filters on every page.
    filters = @path_filters + [@registry_path]
    loop do
      # Limit the number of returned entries to avoid an out-of-memory exception.
      entries = @azure_blob.list_blobs(@container, { :timeout => 60, :marker => continuation_token, :max_results => @blob_list_page_size })
      if @path_filters.empty?
        entries.each do |entry|
          blobs << entry
        end # each
      else
        entries.each do |entry|
          # FNM_PATHNAME is required so that "**/test" can match "test" at the root folder
          # FNM_EXTGLOB allows you to use "test{a,b,c}" to match either "testa", "testb" or "testc" (closer to shell behavior)
          matched = filters.any? {|path| File.fnmatch?(path, entry.name, File::FNM_PATHNAME | File::FNM_EXTGLOB)}
          blobs << entry if matched
        end # each
      end
      continuation_token = entries.continuation_token
      break if continuation_token.empty?
    end # loop
    return blobs
  end # def list_all_blobs

  # Raise the generation for a blob in the registry.
  def raise_gen(registry_hash, file_path)
    begin
      target_item = registry_hash[file_path]
      begin
        target_item.gen += 1
        # Protect gen from overflow.
        target_item.gen = target_item.gen / 2 if target_item.gen == MAX
      rescue StandardError => e
        @logger.error("Fail to get the next generation for target item #{target_item}.", :exception => e)
        target_item.gen = 0
      end

      min_gen_item = registry_hash.values.min_by { |x| x.gen }
      while min_gen_item.gen > 0
        registry_hash.values.each { |value|
          value.gen -= 1
        }
        min_gen_item = registry_hash.values.min_by { |x| x.gen }
      end
    end
  end # raise_gen
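
  # A hedged worked example of the generation bookkeeping above (names and
  # numbers invented). `gen` acts as a fairness counter: register_for_read
  # picks the candidate with the lowest gen, i.e. the blob read least recently.
  #
  #   gens before raise_gen('b.log'):   a.log => 1, b.log => 1, c.log => 2
  #   after the increment:              a.log => 1, b.log => 2, c.log => 2
  #   after re-normalizing to min 0:    a.log => 0, b.log => 1, c.log => 1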

  # Acquire a lease on a blob item with retries.
  #
  # By default, it will retry 60 times with a 1 second interval.
  def acquire_lease(blob_name, retry_times = 60, interval_sec = 1)
    lease = nil
    retried = 0
    while lease.nil? do
      begin
        lease = @azure_blob.acquire_blob_lease(@container, blob_name, { :timeout => 60, :duration => @registry_lease_duration })
      rescue StandardError => e
        if (e.class.name.include? 'LeaseAlreadyPresent')
          if (retried > retry_times)
            raise
          end
          retried += 1
          sleep interval_sec
        else
          # If anything other than 'LeaseAlreadyPresent' happened, break the lease. This is a work-around for the behavior that when
          # a timeout exception is hit, somehow an infinite lease gets put on the lock file.
          @azure_blob.break_blob_lease(@container, blob_name, { :break_period => 30 })
        end
      end
    end # while
    return lease
  end # acquire_lease

  # Return the next blob for reading as well as the start index.
  def register_for_read
    begin
      all_blobs = list_all_blobs
      registry = all_blobs.find { |item| item.name.downcase == @registry_path }

      candidate_blobs = all_blobs.select { |item| (item.name.downcase != @registry_path) }

      start_index = 0
      gen = 0
      lease = nil

      if registry.nil?
        registry_hash = create_registry(candidate_blobs)
        lease = acquire_lease(@registry_path)
      else
        lease = acquire_lease(@registry_path)
        registry_hash = load_registry
      end # if

      picked_blobs = Set.new []
      # Pick up the next candidates.
      picked_blob = nil
      candidate_blobs.each { |candidate_blob|
        @logger.debug("candidate_blob: #{candidate_blob.name} content length: #{candidate_blob.properties[:content_length]}")
        registry_item = registry_hash[candidate_blob.name]

        # Append items that don't exist in the hash table yet.
        if registry_item.nil?
          registry_item = LogStash::Inputs::RegistryItem.new(candidate_blob.name, candidate_blob.properties[:etag], nil, 0, 0)
          registry_hash[candidate_blob.name] = registry_item
        end # if
        @logger.debug("registry_item offset: #{registry_item.offset}")
        # A blob is a candidate if it has unread bytes and is not claimed by another reader.
        if ((registry_item.offset < candidate_blob.properties[:content_length]) && (registry_item.reader.nil? || registry_item.reader == @reader))
          @logger.debug("candidate_blob picked: #{candidate_blob.name} content length: #{candidate_blob.properties[:content_length]}")
          picked_blobs << candidate_blob
        end
      }

      # Of all candidates, pick the one read least recently (lowest gen).
      picked_blob = picked_blobs.min_by { |b| registry_hash[b.name].gen }
      unless picked_blob.nil?
        registry_item = registry_hash[picked_blob.name]
        registry_item.reader = @reader
        registry_hash[picked_blob.name] = registry_item
        start_index = registry_item.offset
        raise_gen(registry_hash, picked_blob.name)
        gen = registry_item.gen
      end # unless

      # Save the change to the registry.
      save_registry(registry_hash, lease)

      @azure_blob.release_blob_lease(@container, @registry_path, lease)
      lease = nil

      return picked_blob, start_index, gen
    rescue StandardError => e
      @logger.error("Oh My, An error occurred. #{e}: #{e.backtrace}", :exception => e)
      return nil, nil, nil
    ensure
      @azure_blob.release_blob_lease(@container, @registry_path, lease) unless lease.nil?
      lease = nil
    end # begin
  end # register_for_read

  # Update the registry.
  def update_registry(registry_item)
    begin
      lease = nil
      lease = acquire_lease(@registry_path)
      registry_hash = load_registry
      registry_hash[registry_item.file_path] = registry_item
      save_registry(registry_hash, lease)
      @azure_blob.release_blob_lease(@container, @registry_path, lease)
      lease = nil
    rescue StandardError => e
      @logger.error("Oh My, An error occurred. #{e}:\n#{e.backtrace}", :exception => e)
    ensure
      @azure_blob.release_blob_lease(@container, @registry_path, lease) unless lease.nil?
      lease = nil
    end # begin
  end # def update_registry

  # Clean up the registry.
  def cleanup_registry
    begin
      @logger.debug("azureblob : start cleanup_registry")
      lease = nil
      lease = acquire_lease(@registry_path)
      registry_hash = load_registry
      registry_hash.each { |_key, registry_item|
        registry_item.reader = nil if registry_item.reader == @reader
      }
      save_registry(registry_hash, lease)
      @azure_blob.release_blob_lease(@container, @registry_path, lease)
      lease = nil
    rescue StandardError => e
      @logger.error("Oh My, An error occurred. #{e}:\n#{e.backtrace}", :exception => e)
    ensure
      @azure_blob.release_blob_lease(@container, @registry_path, lease) unless lease.nil?
      lease = nil
    end # begin
    @logger.debug("azureblob : end of cleanup_registry")
  end # def cleanup_registry

  # Create a registry file to coordinate between multiple azure blob inputs.
  def create_registry(blob_items)
    @azure_blob.create_block_blob(@container, @registry_path, '')
    lease = acquire_lease(@registry_path)
    registry_hash = Hash.new
    blob_items.each do |blob_item|
      initial_offset = 0
      initial_offset = blob_item.properties[:content_length] if @registry_create_policy == 'resume'
      registry_item = LogStash::Inputs::RegistryItem.new(blob_item.name, blob_item.properties[:etag], nil, initial_offset, 0)
      registry_hash[blob_item.name] = registry_item
    end # each
    save_registry(registry_hash, lease)
    @azure_blob.release_blob_lease(@container, @registry_path, lease)
    registry_hash
  end # create_registry

  # Load the content of the registry into the registry hash and return it.
  def load_registry
    # Get the registry content.
    _registry_blob, registry_blob_body = @azure_blob.get_blob(@container, @registry_path)
    registry_hash = deserialize_registry_hash(registry_blob_body)
    registry_hash
  end # def load_registry

  # Serialize the registry hash and save it.
  def save_registry(registry_hash, lease_id)
    # Serialize the hash to json.
    registry_hash_json = JSON.generate(registry_hash)

    # Upload the registry to the blob.
    @azure_blob.create_block_blob(@container, @registry_path, registry_hash_json, lease_id: lease_id)
  end # def save_registry
end # class LogStash::Inputs::LogstashInputAzureblob

# Reads a byte range of a blob in fixed-size chunks.
class BlobReader < LinearReader
  def initialize(logger, azure_blob, container, blob_name, chunk_size, blob_start_index, blob_end_index)
    @logger = logger
    @azure_blob = azure_blob
    @container = container
    @blob_name = blob_name
    @blob_start_index = blob_start_index
    @blob_end_index = blob_end_index
    @chunk_size = chunk_size
  end

  def read
    if @blob_end_index < @blob_start_index
      return nil, false
    end

    are_more_bytes_available = false

    if @blob_end_index >= @blob_start_index + @chunk_size
      end_index = @blob_start_index + @chunk_size - 1
      are_more_bytes_available = true
    else
      end_index = @blob_end_index
    end
    content = read_from_blob(@blob_start_index, end_index)

    @blob_start_index = end_index + 1
    return content, are_more_bytes_available
  end

  private

  def read_from_blob(start_index, end_index)
    _blob, content = @azure_blob.get_blob(@container, @blob_name, {:start_range => start_index, :end_range => end_index})
    return content
  end
end # class BlobReader
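
# A hedged worked example of BlobReader's chunking (values invented): with
# blob_start_index = 0, blob_end_index = 9 and chunk_size = 4, successive
# calls to #read fetch the ranges [0,3], [4,7], [8,9]. The first two return
# are_more_bytes_available = true, the last returns false, and any further
# call returns [nil, false] since the start index has passed the end index.
#
#   reader = BlobReader.new(logger, azure_blob, container, 'logs/a.log', 4, 0, 9)
#   loop do
#     content, more = reader.read
#     break if content.nil?
#     # ... feed content to a codec ...
#     break unless more
#   end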