logstash-input-azureblob-offline 0.9.13.1-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,525 @@
+ # encoding: utf-8
+ require "logstash/inputs/base"
+ require "logstash/namespace"
+
+ # Azure Storage SDK for Ruby
+ require "azure/storage"
+ require "json" # for registry content
+ require "securerandom" # for generating the reader uuid
+ require "set" # for blob listings
+ require "stud/interval" # for stoppable sleep between polls
+
+ require "com/microsoft/json-parser"
+
+ #require Dir[ File.dirname(__FILE__) + "/../../*_jars.rb" ].first
+ # Registry item used to coordinate between multiple clients
+ class LogStash::Inputs::RegistryItem
+   attr_accessor :file_path, :etag, :offset, :reader, :gen
+
+   # Allow JSON serialization.
+   def as_json(options = {})
+     {
+       file_path: @file_path,
+       etag: @etag,
+       reader: @reader,
+       offset: @offset,
+       gen: @gen
+     }
+   end # as_json
+
+   def to_json(*options)
+     as_json(*options).to_json(*options)
+   end # to_json
+
+   def initialize(file_path, etag, reader, offset = 0, gen = 0)
+     @file_path = file_path
+     @etag = etag
+     @reader = reader
+     @offset = offset
+     @gen = gen
+   end # initialize
+ end # class RegistryItem
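For reference, a registry item round-trips through plain JSON via `as_json`/`to_json`. A minimal sketch (the etag and offset values are illustrative only):

    item = LogStash::Inputs::RegistryItem.new('logs/app.log', '0x1', nil, 1024, 0)
    item.to_json
    # => {"file_path":"logs/app.log","etag":"0x1","reader":null,"offset":1024,"gen":0}
    fields = JSON.parse(item.to_json)
    restored = LogStash::Inputs::RegistryItem.new(
      fields['file_path'], fields['etag'], fields['reader'], fields['offset'], fields['gen'])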
+
+ # Logstash input plugin for Azure Blobs
+ #
+ # This logstash plugin gathers data from Microsoft Azure Blobs
+ class LogStash::Inputs::LogstashInputAzureblob < LogStash::Inputs::Base
+   config_name 'azureblob-offline'
+
+   # If undefined, Logstash will complain, even if codec is unused.
+   default :codec, 'json_lines'
+
+   # Set the account name for the Azure storage account.
+   config :storage_account_name, :validate => :string
+
+   # Set the key to access the storage account.
+   config :storage_access_key, :validate => :string
+
+   # Set the container of the blobs.
+   config :container, :validate => :string
+
+   # The path(s) to the file(s) to use as an input. By default it will
+   # watch every file in the storage container.
+   # You can use filename patterns here, such as `logs/*.log`.
+   # If you use a pattern like `logs/**/*.log`, a recursive search
+   # of `logs` will be done for all `*.log` files.
+   # Do not include a leading `/`, as Azure paths look like this:
+   # `path/to/blob/file.txt`
+   #
+   # You may also configure multiple paths. See an example
+   # on the <<array,Logstash configuration page>>.
+   config :path_filters, :validate => :array, :default => [], :required => false
+
+   # Set the endpoint for the blobs.
+   #
+   # The default, `core.windows.net`, targets the public Azure cloud.
+   config :endpoint, :validate => :string, :default => 'core.windows.net'
+
+   # Set whether to use backup mode.
+   config :backupmode, :validate => :boolean, :default => false, :deprecated => true, :obsolete => 'This option is obsolete and its setting will be ignored.'
+
+   # Set the path of the registry file.
+   #
+   # The default, `data/registry`, is used to coordinate reads across multiple client instances.
+   config :registry_path, :validate => :string, :default => 'data/registry'
+
+   # Sets the registry file lock duration in seconds. It must be -1, or between 15 and 60 inclusive.
+   #
+   # The default, `15`, means the registry file is locked for at most 15 seconds, which is usually enough
+   # time to read the registry content. The duration is bounded so that the lease eventually expires if a
+   # client crashes before it gets a chance to release the lease on the registry.
+   config :registry_lease_duration, :validate => :number, :default => 15
+
+   # Set how many seconds to stay idle before checking for new logs.
+   #
+   # The default, `30`, triggers a read of the logs every 30 seconds after entering idle.
+   config :interval, :validate => :number, :default => 30
+
+   # Set the registry create mode.
+   #
+   # The default, `resume`, assumes that all existing logs have already been handled when the registry is
+   # initially created. When set to `start_over`, all log files are read from the beginning.
+   config :registry_create_policy, :validate => :string, :default => 'resume'
+
+   # Sets the size, in bytes, of the file header that does not repeat across records. Usually these are JSON opening tags.
+   config :file_head_bytes, :validate => :number, :default => 0
+
+   # Sets the size, in bytes, of the file tail that does not repeat across records. Usually these are JSON closing tags.
+   config :file_tail_bytes, :validate => :number, :default => 0
+
+   # Sets how to break JSON content into events.
+   #
+   # Only applies when the codec is set to `json`. Sets the policy for breaking the JSON objects in an
+   # array into small events. Breaking the JSON into small sections is less efficient than keeping it
+   # whole, but reduces memory usage.
+   # Possible options: `do_not_break`, `with_head_tail`, `without_head_tail`
+   config :break_json_down_policy, :validate => :string, :default => 'do_not_break', :obsolete => 'This option is obsolete and its setting will be ignored.'
+
+   # Sets how many JSON objects are put into one batch when JSON breaking is enabled.
+   config :break_json_batch_count, :validate => :number, :default => 10, :obsolete => 'This option is obsolete and its setting will be ignored.'
+
+   # Sets the page size for returned blob items. Too large a value can exhaust the heap; too small a value leads to too many requests.
+   #
+   # The default, `100`, works well with the default heap size of 1 GB.
+   config :blob_list_page_size, :validate => :number, :default => 100
+
+   # Sets the chunk size, in bytes, used when reading blob content. The default is 4 MB.
+   config :file_chunk_size_bytes, :validate => :number, :default => 4 * 1024 * 1024
+
+   # When true, adds the blob's file path to each event in the field named by `azure_blob_file_path_field_name`.
+   config :azure_blob_file_path_field, :validate => :boolean, :default => false
+
+   # Sets the name of the event field that holds the blob's file path.
+   config :azure_blob_file_path_field_name, :validate => :string, :default => "azureblobfilepath"
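Putting the main options together, a minimal pipeline configuration might look like the sketch below. The account name, key, container, and filter values are placeholders, not working credentials:

    input {
      azureblob-offline {
        storage_account_name => "mystorageaccount"
        storage_access_key => "<base64-encoded access key>"
        container => "mycontainer"
        path_filters => ["logs/**/*.log"]
        registry_create_policy => "start_over"
      }
    }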
+
+   # Maximum integer on this platform, used to cap the generation counter.
+   MAX = 2**([42].pack('i').size * 16 - 2) - 1
+
+   # Update the registry offset each time after this number of entries have been processed.
+   UPDATE_REGISTRY_COUNT = 100
+
+   public
+   def register
+     user_agent = 'logstash-input-azureblob-offline'
+
+     # This is the reader ID for this specific instance.
+     @reader = SecureRandom.uuid
+
+     # Set up a specific instance of an Azure::Storage::Client.
+     client = Azure::Storage::Client.create(
+       :storage_account_name => @storage_account_name,
+       :storage_access_key => @storage_access_key,
+       :storage_blob_host => "https://#{@storage_account_name}.blob.#{@endpoint}",
+       :user_agent_prefix => user_agent)
+     # Get an Azure storage blob service object from the client.
+     @azure_blob = client.blob_client
+     # Add a retry filter to the service object.
+     @azure_blob.with_filter(Azure::Storage::Core::Filter::ExponentialRetryPolicyFilter.new)
+   end # def register
+
+   def run(queue)
+     # We can abort the loop if stop? becomes true.
+     while !stop?
+       process(queue)
+       @logger.debug("Hitting interval of #{@interval}s . . .")
+       Stud.stoppable_sleep(@interval) { stop? }
+     end # loop
+   end # def run
+
+   def stop
+     cleanup_registry
+   end # def stop
+
+   # Start processing the next item.
+   def process(queue)
+     begin
+       @processed_entries = 0
+       blob, start_index, gen = register_for_read
+
+       unless blob.nil?
+         begin
+           blob_name = blob.name
+           @logger.debug("Processing blob #{blob.name}")
+           blob_size = blob.properties[:content_length]
+           # Work-around: after it is returned by get_blob, the etag contains quotes.
+           new_etag = blob.properties[:etag]
+           # ~ Work-around
+
+           header = nil
+           tail = nil
+           blob, header = @azure_blob.get_blob(@container, blob_name, {:end_range => (@file_head_bytes - 1)}) unless @file_head_bytes.nil? || @file_head_bytes <= 0
+
+           blob, tail = @azure_blob.get_blob(@container, blob_name, {:start_range => blob_size - @file_tail_bytes}) unless @file_tail_bytes.nil? || @file_tail_bytes <= 0
+
+           if start_index == 0
+             # Skip the header since it has already been read.
+             start_index = @file_head_bytes
+           end
+
+           @logger.debug("start index: #{start_index} blob size: #{blob_size}")
+
+           content_length = 0
+           blob_reader = BlobReader.new(@logger, @azure_blob, @container, blob_name, @file_chunk_size_bytes, start_index, blob_size - 1 - @file_tail_bytes)
+
+           is_json_codec = (defined?(LogStash::Codecs::JSON) == 'constant') && (@codec.is_a? LogStash::Codecs::JSON)
+           if is_json_codec
+             parser = JsonParser.new(@logger, blob_reader)
+
+             parser.parse(->(json_content) {
+               content_length += json_content.length
+
+               enqueue_content(queue, json_content, header, tail, blob_name)
+
+               on_entry_processed(start_index, content_length, blob_name, new_etag, gen)
+             }, ->(malformed_json) {
+               @logger.debug("Skipping #{malformed_json.length} malformed bytes")
+               content_length = content_length + malformed_json.length
+
+               on_entry_processed(start_index, content_length, blob_name, new_etag, gen)
+             })
+           else
+             begin
+               content, are_more_bytes_available = blob_reader.read
+
+               content_length += content.length
+               enqueue_content(queue, content, header, tail, blob_name)
+
+               on_entry_processed(start_index, content_length, blob_name, new_etag, gen)
+             end until !are_more_bytes_available || content.nil?
+           end # if
+         ensure
+           # Make sure the reader is removed from the registry even when there is an exception.
+           request_registry_update(start_index, content_length, blob_name, new_etag, gen)
+         end # begin
+       end # unless
+     rescue => e
+       @logger.error("Oh My, An error occurred. Error: #{e}. Trace: #{e.backtrace}", :exception => e)
+     end # begin
+   end # process
+
+   def enqueue_content(queue, content, header, tail, blob_name)
+     if (header.nil? || header.length == 0) && (tail.nil? || tail.length == 0)
+       # Skip some unnecessary copying.
+       full_content = content
+     else
+       full_content = ''
+       full_content << header unless header.nil? || header.length == 0
+       full_content << content
+       full_content << tail unless tail.nil? || tail.length == 0
+     end
+
+     @codec.decode(full_content) do |event|
+       if @azure_blob_file_path_field
+         event.set(@azure_blob_file_path_field_name, blob_name)
+       end
+       decorate(event)
+       queue << event
+     end
+   end
+
+   def on_entry_processed(start_index, content_length, blob_name, new_etag, gen)
+     @processed_entries += 1
+     request_registry_update(start_index, content_length, blob_name, new_etag, gen) if @processed_entries % UPDATE_REGISTRY_COUNT == 0
+   end
+
+   def request_registry_update(start_index, content_length, blob_name, new_etag, gen)
+     new_offset = start_index
+     new_offset += content_length unless content_length.nil?
+     @logger.debug("New registry offset: #{new_offset}")
+     new_registry_item = LogStash::Inputs::RegistryItem.new(blob_name, new_etag, nil, new_offset, gen)
+     update_registry(new_registry_item)
+   end
+
+   # Deserialize the registry hash from a JSON string.
+   def deserialize_registry_hash(json_string)
+     result = Hash.new
+     temp_hash = JSON.parse(json_string)
+     temp_hash.values.each { |kvp|
+       result[kvp['file_path']] = LogStash::Inputs::RegistryItem.new(kvp['file_path'], kvp['etag'], kvp['reader'], kvp['offset'], kvp['gen'])
+     }
+     return result
+   end # deserialize_registry_hash
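The registry blob stores a JSON object keyed by blob path, with one serialized `RegistryItem` per entry. A minimal sketch of the expected shape and how it deserializes (the values are illustrative):

    sample = '{"logs/a.log":{"file_path":"logs/a.log","etag":"0x1","reader":null,"offset":12,"gen":0}}'
    registry = deserialize_registry_hash(sample)
    registry['logs/a.log'].offset # => 12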
+
+   # List all the blobs in the given container.
+   def list_all_blobs
+     blobs = Set.new []
+     continuation_token = nil
+     @blob_list_page_size = 100 if @blob_list_page_size <= 0
+     # Build the filter list once per call; the registry blob always counts as a match.
+     filters = @path_filters + [@registry_path]
+     loop do
+       # Limit the number of returned entries to avoid an out-of-memory exception.
+       entries = @azure_blob.list_blobs(@container, { :timeout => 60, :marker => continuation_token, :max_results => @blob_list_page_size })
+       if @path_filters.empty?
+         entries.each do |entry|
+           blobs << entry
+         end # each
+       else
+         entries.each do |entry|
+           # FNM_PATHNAME is required so that "**/test" can match "test" at the root folder
+           # FNM_EXTGLOB allows you to use "test{a,b,c}" to match either "testa", "testb" or "testc" (closer to shell behavior)
+           matched = filters.any? {|path| File.fnmatch?(path, entry.name, File::FNM_PATHNAME | File::FNM_EXTGLOB)}
+           blobs << entry if matched
+         end # each
+       end
+       continuation_token = entries.continuation_token
+       break if continuation_token.empty?
+     end # loop
+     return blobs
+   end # def list_all_blobs
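The flag combination above gives shell-like globbing. A quick sketch of what matches, using standard `File.fnmatch?` behavior:

    flags = File::FNM_PATHNAME | File::FNM_EXTGLOB
    File.fnmatch?('logs/**/*.log', 'logs/2017/01/app.log', flags) # => true
    File.fnmatch?('logs/*.log',    'logs/2017/01/app.log', flags) # => false (FNM_PATHNAME stops * at '/')
    File.fnmatch?('test{a,b}.log', 'testa.log', flags)            # => true (FNM_EXTGLOB brace expansion)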
+
+   # Raise the generation for a blob in the registry.
+   def raise_gen(registry_hash, file_path)
+     begin
+       target_item = registry_hash[file_path]
+       begin
+         target_item.gen += 1
+         # Protect gen from overflow.
+         target_item.gen = target_item.gen / 2 if target_item.gen == MAX
+       rescue StandardError => e
+         @logger.error("Failed to get the next generation for target item #{target_item}.", :exception => e)
+         target_item.gen = 0
+       end
+
+       # Normalize: shift all generations down until the minimum is zero.
+       min_gen_item = registry_hash.values.min_by { |x| x.gen }
+       while min_gen_item.gen > 0
+         registry_hash.values.each { |value|
+           value.gen -= 1
+         }
+         min_gen_item = registry_hash.values.min_by { |x| x.gen }
+       end
+     end
+   end # raise_gen
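In effect, the generation counter is a least-recently-read ranking: picking a blob bumps its gen, and the normalization loop keeps the minimum at zero so the values never drift upward together. A small standalone sketch of the arithmetic:

    gens = { 'a.log' => 2, 'b.log' => 1 }
    gens['a.log'] += 1                               # a.log was just picked: {"a.log"=>3, "b.log"=>1}
    gens.transform_values! { |g| g - 1 } while gens.values.min > 0
    gens # => {"a.log"=>2, "b.log"=>0} -- b.log now sorts first for the next pick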
+
+   # Acquire a lease on a blob item with retries.
+   #
+   # By default, it will retry 60 times with a 1 second interval.
+   def acquire_lease(blob_name, retry_times = 60, interval_sec = 1)
+     lease = nil
+     retried = 0
+     while lease.nil? do
+       begin
+         lease = @azure_blob.acquire_blob_lease(@container, blob_name, { :timeout => 60, :duration => @registry_lease_duration })
+       rescue StandardError => e
+         if (e.class.name.include? 'LeaseAlreadyPresent')
+           if (retried > retry_times)
+             raise
+           end
+           retried += 1
+           sleep interval_sec
+         else
+           # If anything other than 'LeaseAlreadyPresent' happened, break the lease. This is a work-around
+           # for the behavior that when a timeout exception is hit, somehow an infinite lease is put on the
+           # lock file.
+           @azure_blob.break_blob_lease(@container, blob_name, { :break_period => 30 })
+         end
+       end
+     end # while
+     return lease
+   end # acquire_lease
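Callers pair this with an explicit release, guarded by `ensure` so the lease is freed even on failure. A minimal sketch of the pattern used throughout this class:

    lease = nil
    begin
      lease = acquire_lease(@registry_path)
      # ... read or write the registry blob under the lease ...
    ensure
      @azure_blob.release_blob_lease(@container, @registry_path, lease) unless lease.nil?
      lease = nil
    end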
+
+   # Return the next blob for reading as well as the start index.
+   def register_for_read
+     begin
+       all_blobs = list_all_blobs
+       registry = all_blobs.find { |item| item.name.downcase == @registry_path }
+
+       candidate_blobs = all_blobs.select { |item| (item.name.downcase != @registry_path) }
+
+       start_index = 0
+       gen = 0
+       lease = nil
+
+       if registry.nil?
+         registry_hash = create_registry(candidate_blobs)
+         lease = acquire_lease(@registry_path)
+       else
+         lease = acquire_lease(@registry_path)
+         registry_hash = load_registry
+       end # if
+
+       picked_blobs = Set.new []
+       # Pick up the next candidate.
+       picked_blob = nil
+       candidate_blobs.each { |candidate_blob|
+         @logger.debug("candidate_blob: #{candidate_blob.name} content length: #{candidate_blob.properties[:content_length]}")
+         registry_item = registry_hash[candidate_blob.name]
+
+         # Append items that do not exist in the hash table yet.
+         if registry_item.nil?
+           registry_item = LogStash::Inputs::RegistryItem.new(candidate_blob.name, candidate_blob.properties[:etag], nil, 0, 0)
+           registry_hash[candidate_blob.name] = registry_item
+         end # if
+         @logger.debug("registry_item offset: #{registry_item.offset}")
+         if ((registry_item.offset < candidate_blob.properties[:content_length]) && (registry_item.reader.nil? || registry_item.reader == @reader))
+           @logger.debug("candidate_blob picked: #{candidate_blob.name} content length: #{candidate_blob.properties[:content_length]}")
+           picked_blobs << candidate_blob
+         end
+       }
+
+       picked_blob = picked_blobs.min_by { |b| registry_hash[b.name].gen }
+       unless picked_blob.nil?
+         registry_item = registry_hash[picked_blob.name]
+         registry_item.reader = @reader
+         registry_hash[picked_blob.name] = registry_item
+         start_index = registry_item.offset
+         raise_gen(registry_hash, picked_blob.name)
+         gen = registry_item.gen
+       end # unless
+
+       # Save the change to the registry.
+       save_registry(registry_hash, lease)
+
+       @azure_blob.release_blob_lease(@container, @registry_path, lease)
+       lease = nil
+
+       return picked_blob, start_index, gen
+     rescue StandardError => e
+       @logger.error("Oh My, An error occurred. #{e}: #{e.backtrace}", :exception => e)
+       return nil, nil, nil
+     ensure
+       @azure_blob.release_blob_lease(@container, @registry_path, lease) unless lease.nil?
+       lease = nil
+     end # rescue
+   end # register_for_read
+
+   # Update the registry.
+   def update_registry(registry_item)
+     begin
+       lease = nil
+       lease = acquire_lease(@registry_path)
+       registry_hash = load_registry
+       registry_hash[registry_item.file_path] = registry_item
+       save_registry(registry_hash, lease)
+       @azure_blob.release_blob_lease(@container, @registry_path, lease)
+       lease = nil
+     rescue StandardError => e
+       @logger.error("Oh My, An error occurred. #{e}:\n#{e.backtrace}", :exception => e)
+     ensure
+       @azure_blob.release_blob_lease(@container, @registry_path, lease) unless lease.nil?
+       lease = nil
+     end # rescue
+   end # def update_registry
+
+   # Clean up the registry.
+   def cleanup_registry
+     begin
+       @logger.debug("azureblob : start cleanup_registry")
+       lease = nil
+       lease = acquire_lease(@registry_path)
+       registry_hash = load_registry
+       registry_hash.each { |key, registry_item|
+         registry_item.reader = nil if registry_item.reader == @reader
+       }
+       save_registry(registry_hash, lease)
+       @azure_blob.release_blob_lease(@container, @registry_path, lease)
+       lease = nil
+     rescue StandardError => e
+       @logger.error("Oh My, An error occurred. #{e}:\n#{e.backtrace}", :exception => e)
+     ensure
+       @azure_blob.release_blob_lease(@container, @registry_path, lease) unless lease.nil?
+       lease = nil
+     end # rescue
+     @logger.debug("azureblob : end of cleanup_registry")
+   end # def cleanup_registry
+
+   # Create a registry file to coordinate between multiple azure blob inputs.
+   def create_registry(blob_items)
+     @azure_blob.create_block_blob(@container, @registry_path, '')
+     lease = acquire_lease(@registry_path)
+     registry_hash = Hash.new
+     blob_items.each do |blob_item|
+       initial_offset = 0
+       initial_offset = blob_item.properties[:content_length] if @registry_create_policy == 'resume'
+       registry_item = LogStash::Inputs::RegistryItem.new(blob_item.name, blob_item.properties[:etag], nil, initial_offset, 0)
+       registry_hash[blob_item.name] = registry_item
+     end # each
+     save_registry(registry_hash, lease)
+     @azure_blob.release_blob_lease(@container, @registry_path, lease)
+     registry_hash
+   end # create_registry
+
+   # Load the content of the registry into the registry hash and return it.
+   def load_registry
+     # Get the registry content.
+     _registry_blob, registry_blob_body = @azure_blob.get_blob(@container, @registry_path)
+     registry_hash = deserialize_registry_hash(registry_blob_body)
+     registry_hash
+   end # def load_registry
+
+   # Serialize the registry hash and save it.
+   def save_registry(registry_hash, lease_id)
+     # Serialize the hash to JSON.
+     registry_hash_json = JSON.generate(registry_hash)
+
+     # Upload the registry to the blob.
+     @azure_blob.create_block_blob(@container, @registry_path, registry_hash_json, lease_id: lease_id)
+   end # def save_registry
+ end # class LogStash::Inputs::LogstashInputAzureblob
+
+ # Reads a blob in fixed-size chunks between a start index and an end index.
+ class BlobReader < LinearReader
+   def initialize(logger, azure_blob, container, blob_name, chunk_size, blob_start_index, blob_end_index)
+     @logger = logger
+     @azure_blob = azure_blob
+     @container = container
+     @blob_name = blob_name
+     @blob_start_index = blob_start_index
+     @blob_end_index = blob_end_index
+     @chunk_size = chunk_size
+   end
+
+   def read
+     if @blob_end_index < @blob_start_index
+       return nil, false
+     end
+
+     are_more_bytes_available = false
+
+     if @blob_end_index >= @blob_start_index + @chunk_size
+       end_index = @blob_start_index + @chunk_size - 1
+       are_more_bytes_available = true
+     else
+       end_index = @blob_end_index
+     end
+     content = read_from_blob(@blob_start_index, end_index)
+
+     @blob_start_index = end_index + 1
+     return content, are_more_bytes_available
+   end
+
+   private
+
+   def read_from_blob(start_index, end_index)
+     _blob, content = @azure_blob.get_blob(@container, @blob_name, {:start_range => start_index, :end_range => end_index })
+     return content
+   end
+ end # class BlobReader
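A minimal sketch of driving the reader to the end of a blob, mirroring the non-JSON path in `process`. The container and blob names are placeholders, and `handle` stands in for whatever consumes each chunk:

    reader = BlobReader.new(logger, azure_blob, 'my-container', 'logs/app.log', 4 * 1024 * 1024, 0, blob_size - 1)
    loop do
      content, more = reader.read
      break if content.nil?
      handle(content)   # hypothetical consumer
      break unless more
    end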