logstash-input-azureblob 0.9.12-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,10 @@
+ # this is a generated file, to avoid over-writing it just delete this comment
+ begin
+   require 'jar_dependencies'
+ rescue LoadError
+   require 'org/glassfish/javax.json/1.1/javax.json-1.1.jar'
+ end
+
+ if defined? Jars
+   require_jar( 'org.glassfish', 'javax.json', '1.1' )
+ end
@@ -0,0 +1,500 @@
+ # encoding: utf-8
+ require "logstash/inputs/base"
+ require "logstash/namespace"
+
+ # Azure Storage SDK for Ruby
+ require "azure/storage"
+ require 'json' # for registry content
+ require "securerandom" # for generating uuid.
+ require "set" # for the blob sets built while listing and picking blobs
+
+ require "com/microsoft/json-parser"
+
+ #require Dir[ File.dirname(__FILE__) + "/../../*_jars.rb" ].first
+ # Registry item to coordinate between multiple clients
+ class LogStash::Inputs::RegistryItem
+   attr_accessor :file_path, :etag, :offset, :reader, :gen
+   # Allow json serialization.
+   def as_json(options={})
+     {
+       file_path: @file_path,
+       etag: @etag,
+       reader: @reader,
+       offset: @offset,
+       gen: @gen
+     }
+   end # as_json
+
+   def to_json(*options)
+     as_json(*options).to_json(*options)
+   end # to_json
+
+   def initialize(file_path, etag, reader, offset = 0, gen = 0)
+     @file_path = file_path
+     @etag = etag
+     @reader = reader
+     @offset = offset
+     @gen = gen
+   end # initialize
+ end # class RegistryItem
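
For reference, each registry entry serializes to a flat JSON object through as_json/to_json. A minimal sketch with purely hypothetical values:

    item = LogStash::Inputs::RegistryItem.new('logs/2017/06/01/PT1H.json', '0x8D4EXAMPLE', nil, 2048, 3)
    item.to_json
    # => '{"file_path":"logs/2017/06/01/PT1H.json","etag":"0x8D4EXAMPLE","reader":null,"offset":2048,"gen":3}'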
+
+
+ # Logstash input plugin for Azure Blobs
+ #
+ # This logstash plugin gathers data from Microsoft Azure Blobs
+ class LogStash::Inputs::LogstashInputAzureblob < LogStash::Inputs::Base
+   config_name "azureblob"
+
+   # If undefined, Logstash will complain, even if codec is unused.
+   default :codec, "json_lines"
+
+   # Set the account name for the azure storage account.
+   config :storage_account_name, :validate => :string
+
+   # Set the key to access the storage account.
+   config :storage_access_key, :validate => :string
+
+   # Set the container of the blobs.
+   config :container, :validate => :string
+
+   # Set the endpoint for the blobs.
+   #
+   # The default, `core.windows.net`, targets the public Azure cloud.
+   config :endpoint, :validate => :string, :default => 'core.windows.net'
+
+   # Set whether to use backup mode.
+   config :backupmode, :validate => :boolean, :default => false, :deprecated => true, :obsolete => 'This option is obsolete and its setting will be ignored.'
+
+   # Set the path of the registry file.
+   #
+   # The default, `data/registry`, is used to coordinate readings between multiple instances of the clients.
+   config :registry_path, :validate => :string, :default => 'data/registry'
+
+   # Sets the registry file lock duration in seconds. It must be set to -1, or to a value between 15 and 60 inclusive.
+   #
+   # The default, `15`, means the registry file will be locked for at most 15 seconds. This should usually be sufficient to
+   # read the content of the registry. This setting also allows the lease to expire in case a client crashes before it
+   # gets a chance to release the lease on the registry.
+   config :registry_lease_duration, :validate => :number, :default => 15
+
+   # Set how many seconds to stay idle before checking for new logs.
+   #
+   # The default, `30`, means a read of the logs is triggered every 30 seconds after entering idle.
+   config :interval, :validate => :number, :default => 30
+
+   # Set the registry create mode.
+   #
+   # The default, `resume`, means that when the registry is initially created, it assumes all existing logs have already been handled.
+   # When set to `start_over`, all log files are read from the beginning.
+   config :registry_create_policy, :validate => :string, :default => 'resume'
+
+   # Sets the header of the file that does not repeat over records. Usually, these are json opening tags.
+   config :file_head_bytes, :validate => :number, :default => 0
+
+   # Sets the tail of the file that does not repeat over records. Usually, these are json closing tags.
+   config :file_tail_bytes, :validate => :number, :default => 0
+
+   # Sets how to break up json.
+   #
+   # Only works when the codec is set to `json`. Sets the policy for breaking the json objects in the array into small events.
+   # Breaking the json into small sections is not as efficient as keeping it whole, but it reduces memory usage.
+   # Possible options: `do_not_break`, `with_head_tail`, `without_head_tail`
+   config :break_json_down_policy, :validate => :string, :default => 'do_not_break', :obsolete => 'This option is obsolete and its setting will be ignored.'
+
+   # Sets how many json objects are put in one batch when breaking the json down.
+   config :break_json_batch_count, :validate => :number, :default => 10, :obsolete => 'This option is obsolete and its setting will be ignored.'
+
+   # Sets the page size for returned blob items. Too large a number will exhaust the heap; too small a number leads to too many requests.
+   #
+   # The default, `100`, is good for the default heap size of 1 GB.
+   config :blob_list_page_size, :validate => :number, :default => 100
+
+   # Sets the size, in bytes, of each chunk read from a blob. The default is 4 MB.
+   config :file_chunk_size_bytes, :validate => :number, :default => 4 * 1024 * 1024
+
+   # Constant for the largest native integer value.
+   MAX = 2**([42].pack('i').size * 16 - 2) - 1
+
+   # Update the registry offset each time after this number of entries has been processed.
+   UPDATE_REGISTRY_COUNT = 100
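
As a rough illustration of how these options fit together, the plugin can be instantiated from plain Ruby (for example in a spec) with a params hash; all values below are hypothetical placeholders, not defaults shipped with the plugin:

    plugin = LogStash::Inputs::LogstashInputAzureblob.new(
      "storage_account_name"   => "examplestorageaccount", # hypothetical
      "storage_access_key"     => "bWFkZS11cC1rZXk=",      # hypothetical
      "container"              => "insights-logs",         # hypothetical
      "registry_create_policy" => "start_over"
    )
    plugin.register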
+
+   public
+   def register
+     user_agent = "logstash-input-azureblob"
+     user_agent << "/" << Gem.latest_spec_for("logstash-input-azureblob").version.to_s
+
+     # This is the reader id for this specific instance.
+     @reader = SecureRandom.uuid
+     @registry_locker = "#{@registry_path}.lock"
+
+     # Set up a specific instance of an Azure::Storage::Client.
+     client = Azure::Storage::Client.create(:storage_account_name => @storage_account_name, :storage_access_key => @storage_access_key, :storage_blob_host => "https://#{@storage_account_name}.blob.#{@endpoint}", :user_agent_prefix => user_agent)
+     # Get an azure storage blob service object from a specific instance of an Azure::Storage::Client.
+     @azure_blob = client.blob_client
+     # Add a retry filter to the service object.
+     @azure_blob.with_filter(Azure::Storage::Core::Filter::ExponentialRetryPolicyFilter.new)
+   end # def register
+
+   def run(queue)
+     # We can abort the loop if stop? becomes true.
+     while !stop?
+       process(queue)
+       @logger.debug("Hitting interval of #{@interval} seconds . . .")
+       Stud.stoppable_sleep(@interval) { stop? }
+     end # loop
+   end # def run
+
+   def stop
+     cleanup_registry
+   end # def stop
+
+   # Start processing the next item.
+   def process(queue)
+     begin
+       @processed_entries = 0
+       blob, start_index, gen = register_for_read
+
+       if (!blob.nil?)
+         begin
+           blob_name = blob.name
+           @logger.debug("Processing blob #{blob.name}")
+           blob_size = blob.properties[:content_length]
+           # Work-around: after being returned by get_blob, the etag contains quotes.
+           new_etag = blob.properties[:etag]
+           # ~ Work-around
+
+           blob, header = @azure_blob.get_blob(@container, blob_name, { :end_range => (@file_head_bytes - 1) }) if header.nil? unless @file_head_bytes.nil? or @file_head_bytes <= 0
+
+           blob, tail = @azure_blob.get_blob(@container, blob_name, { :start_range => blob_size - @file_tail_bytes }) if tail.nil? unless @file_tail_bytes.nil? or @file_tail_bytes <= 0
+
+           if start_index == 0
+             # Skip the header since it is already read.
+             start_index = @file_head_bytes
+           end
+
+           @logger.debug("start index: #{start_index} blob size: #{blob_size}")
+
+           content_length = 0
+           blob_reader = BlobReader.new(@logger, @azure_blob, @container, blob_name, @file_chunk_size_bytes, start_index, blob_size - 1 - @file_tail_bytes)
+
+           is_json_codec = (defined?(LogStash::Codecs::JSON) == 'constant') && (@codec.is_a? LogStash::Codecs::JSON)
+           if is_json_codec
+             parser = JsonParser.new(@logger, blob_reader)
+
+             parser.parse(->(json_content) {
+               content_length = content_length + json_content.length
+
+               enqueue_content(queue, json_content, header, tail)
+
+               on_entry_processed(start_index, content_length, blob_name, new_etag, gen)
+             }, ->(malformed_json) {
+               @logger.debug("Skipping #{malformed_json.length} malformed bytes")
+               content_length = content_length + malformed_json.length
+
+               on_entry_processed(start_index, content_length, blob_name, new_etag, gen)
+             })
+           else
+             begin
+               content, are_more_bytes_available = blob_reader.read
+
+               content_length = content_length + content.length
+               enqueue_content(queue, content, header, tail)
+
+               on_entry_processed(start_index, content_length, blob_name, new_etag, gen)
+             end until !are_more_bytes_available || content.nil?
+
+           end # if
+         ensure
+           # Make sure the reader is removed from the registry even when there is an exception.
+           request_registry_update(start_index, content_length, blob_name, new_etag, gen)
+         end # begin
+       end # if
+     rescue => e
+       @logger.error("Oh My, An error occurred. Error:#{e}: Trace: #{e.backtrace}", :exception => e)
+     end # begin
+   end # process
+
+   def enqueue_content(queue, content, header, tail)
+     if (header.nil? || header.length == 0) && (tail.nil? || tail.length == 0)
+       # Skip some unnecessary copying.
+       full_content = content
+     else
+       full_content = ""
+       full_content << header unless header.nil? || header.length == 0
+       full_content << content
+       full_content << tail unless tail.nil? || tail.length == 0
+     end
+
+     @codec.decode(full_content) do |event|
+       decorate(event)
+       queue << event
+     end
+   end
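
To make the header/tail stitching concrete: when `file_head_bytes` and `file_tail_bytes` are sized to cover the opening and closing wrapper of a JSON blob (for example, an Azure diagnostics blob wrapped in `{"records":[ ... ]}`), each extracted record is re-wrapped into a self-contained JSON document before the codec decodes it. A small sketch with hypothetical data:

    header  = '{"records":['
    tail    = ']}'
    content = '{"time":"2017-06-01T00:00:00Z","operationName":"NetworkSecurityGroupFlowEvents"}' # hypothetical record
    full_content = header + content + tail
    # => '{"records":[{"time":"2017-06-01T00:00:00Z","operationName":"NetworkSecurityGroupFlowEvents"}]}'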
+
+   def on_entry_processed(start_index, content_length, blob_name, new_etag, gen)
+     @processed_entries = @processed_entries + 1
+     if @processed_entries % UPDATE_REGISTRY_COUNT == 0
+       request_registry_update(start_index, content_length, blob_name, new_etag, gen)
+     end
+   end
+
+   def request_registry_update(start_index, content_length, blob_name, new_etag, gen)
+     new_offset = start_index
+     new_offset = new_offset + content_length unless content_length.nil?
+     @logger.debug("New registry offset: #{new_offset}")
+     new_registry_item = LogStash::Inputs::RegistryItem.new(blob_name, new_etag, nil, new_offset, gen)
+     update_registry(new_registry_item)
+   end
+
+   # Deserialize registry hash from json string.
+   def deserialize_registry_hash(json_string)
+     result = Hash.new
+     temp_hash = JSON.parse(json_string)
+     temp_hash.values.each { |kvp|
+       result[kvp['file_path']] = LogStash::Inputs::RegistryItem.new(kvp['file_path'], kvp['etag'], kvp['reader'], kvp['offset'], kvp['gen'])
+     }
+     return result
+   end # deserialize_registry_hash
+
+   # List all the blobs in the given container.
+   def list_all_blobs
+     blobs = Set.new []
+     continuation_token = nil
+     @blob_list_page_size = 100 if @blob_list_page_size <= 0
+     loop do
+       # Limit the number of returned entries to avoid an out-of-memory exception.
+       entries = @azure_blob.list_blobs(@container, { :timeout => 60, :marker => continuation_token, :max_results => @blob_list_page_size })
+       entries.each do |entry|
+         blobs << entry
+       end # each
+       continuation_token = entries.continuation_token
+       break if continuation_token.empty?
+     end # loop
+     return blobs
+   end # def list_all_blobs
+
+   # Raise the generation for a blob in the registry.
+   def raise_gen(registry_hash, file_path)
+     begin
+       target_item = registry_hash[file_path]
+       begin
+         target_item.gen += 1
+         # Protect gen from overflow.
+         target_item.gen = target_item.gen / 2 if target_item.gen == MAX
+       rescue StandardError => e
+         @logger.error("Failed to get the next generation for target item #{target_item}.", :exception => e)
+         target_item.gen = 0
+       end
+
+       min_gen_item = registry_hash.values.min_by { |x| x.gen }
+       while min_gen_item.gen > 0
+         registry_hash.values.each { |value|
+           value.gen -= 1
+         }
+         min_gen_item = registry_hash.values.min_by { |x| x.gen }
+       end
+     end
+   end # raise_gen
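
A quick worked example of the generation bookkeeping above, with hypothetical registry contents: if three blobs currently sit at gens 2, 1, and 1 and the first one is picked again, its gen is raised to 3; the normalization loop then shifts every gen down until the minimum is 0 again, preserving the ordering that keeps the least-recently-picked blobs at the front of the queue.

    # Before picking 'a.log' again (hypothetical):     gens = { 'a.log' => 2, 'b.log' => 1, 'c.log' => 1 }
    # After raise_gen increments the picked item:      gens = { 'a.log' => 3, 'b.log' => 1, 'c.log' => 1 }
    # After the normalization loop shifts everything:  gens = { 'a.log' => 2, 'b.log' => 0, 'c.log' => 0 }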
+
+   # Acquire a lease on a blob item with retries.
+   #
+   # By default, it will retry 60 times with a 1 second interval.
+   def acquire_lease(blob_name, retry_times = 60, interval_sec = 1)
+     lease = nil
+     retried = 0
+     while lease.nil? do
+       begin
+         lease = @azure_blob.acquire_blob_lease(@container, blob_name, { :timeout => 60, :duration => @registry_lease_duration })
+       rescue StandardError => e
+         if (e.respond_to?(:type) && e.type == 'LeaseAlreadyPresent')
+           if (retried > retry_times)
+             raise
+           end
+           retried += 1
+           sleep interval_sec
+         else
+           # Anything other than 'LeaseAlreadyPresent' happened: break the lease. This is a work-around for the behavior that,
+           # when a timeout exception is hit, somehow an infinite lease gets put on the lock file.
+           @azure_blob.break_blob_lease(@container, blob_name, { :break_period => 30 })
+         end
+       end
+     end # while
+     return lease
+   end # acquire_lease
+
+   # Return the next blob for reading as well as the start index.
+   def register_for_read
+     begin
+       all_blobs = list_all_blobs
+       registry = all_blobs.find { |item| item.name.downcase == @registry_path }
+       registry_locker = all_blobs.find { |item| item.name.downcase == @registry_locker }
+
+       candidate_blobs = all_blobs.select { |item| (item.name.downcase != @registry_path) && (item.name.downcase != @registry_locker) }
+
+       start_index = 0
+       gen = 0
+       lease = nil
+
+       # Put the lease on the locker file rather than the registry file, so the registry itself can still be updated.
+       # This is a workaround for Azure Storage Ruby SDK issue #16: https://github.com/Azure/azure-storage-ruby/issues/16
+       registry_locker = @azure_blob.create_block_blob(@container, @registry_locker, @reader) if registry_locker.nil?
+       lease = acquire_lease(@registry_locker)
+       # ~ Workaround
+
+       if (registry.nil?)
+         registry_hash = create_registry(candidate_blobs)
+       else
+         registry_hash = load_registry
+       end # if
+
+       picked_blobs = Set.new []
+       # Pick up the next candidate.
+       picked_blob = nil
+       candidate_blobs.each { |candidate_blob|
+         @logger.debug("candidate_blob: #{candidate_blob.name} content length: #{candidate_blob.properties[:content_length]}")
+         registry_item = registry_hash[candidate_blob.name]
+
+         # Append items that don't exist in the hash table yet.
+         if registry_item.nil?
+           registry_item = LogStash::Inputs::RegistryItem.new(candidate_blob.name, candidate_blob.properties[:etag], nil, 0, 0)
+           registry_hash[candidate_blob.name] = registry_item
+         end # if
+         @logger.debug("registry_item offset: #{registry_item.offset}")
+         if ((registry_item.offset < candidate_blob.properties[:content_length]) && (registry_item.reader.nil? || registry_item.reader == @reader))
+           @logger.debug("candidate_blob picked: #{candidate_blob.name} content length: #{candidate_blob.properties[:content_length]}")
+           picked_blobs << candidate_blob
+         end
+       }
+
+       picked_blob = picked_blobs.min_by { |b| registry_hash[b.name].gen }
+       if !picked_blob.nil?
+         registry_item = registry_hash[picked_blob.name]
+         registry_item.reader = @reader
+         registry_hash[picked_blob.name] = registry_item
+         start_index = registry_item.offset
+         raise_gen(registry_hash, picked_blob.name)
+         gen = registry_item.gen
+       end # if
+
+       # Save the change to the registry.
+       save_registry(registry_hash)
+
+       @azure_blob.release_blob_lease(@container, @registry_locker, lease)
+       lease = nil
+
+       return picked_blob, start_index, gen
+     rescue StandardError => e
+       @logger.error("Oh My, An error occurred. #{e}: #{e.backtrace}", :exception => e)
+       return nil, nil, nil
+     ensure
+       @azure_blob.release_blob_lease(@container, @registry_locker, lease) unless lease.nil?
+       lease = nil
+     end # rescue
+   end # register_for_read
+
+   # Update the registry.
+   def update_registry(registry_item)
+     begin
+       lease = nil
+       lease = acquire_lease(@registry_locker)
+       registry_hash = load_registry
+       registry_hash[registry_item.file_path] = registry_item
+       save_registry(registry_hash)
+       @azure_blob.release_blob_lease(@container, @registry_locker, lease)
+       lease = nil
+     rescue StandardError => e
+       @logger.error("Oh My, An error occurred. #{e}:\n#{e.backtrace}", :exception => e)
+     ensure
+       @azure_blob.release_blob_lease(@container, @registry_locker, lease) unless lease.nil?
+       lease = nil
+     end # rescue
+   end # def update_registry
+
+   # Clean up the registry.
+   def cleanup_registry
+     begin
+       lease = nil
+       lease = acquire_lease(@registry_locker)
+       registry_hash = load_registry
+       registry_hash.each { |key, registry_item|
+         registry_item.reader = nil if registry_item.reader == @reader
+       }
+       save_registry(registry_hash)
+       @azure_blob.release_blob_lease(@container, @registry_locker, lease)
+       lease = nil
+     rescue StandardError => e
+       @logger.error("Oh My, An error occurred. #{e}:\n#{e.backtrace}", :exception => e)
+     ensure
+       @azure_blob.release_blob_lease(@container, @registry_locker, lease) unless lease.nil?
+       lease = nil
+     end # rescue
+   end # def cleanup_registry
+
+   # Create a registry file to coordinate between multiple azure blob inputs.
+   def create_registry(blob_items)
+     registry_hash = Hash.new
+
+     blob_items.each do |blob_item|
+       initial_offset = 0
+       initial_offset = blob_item.properties[:content_length] if @registry_create_policy == 'resume'
+       registry_item = LogStash::Inputs::RegistryItem.new(blob_item.name, blob_item.properties[:etag], nil, initial_offset, 0)
+       registry_hash[blob_item.name] = registry_item
+     end # each
+     save_registry(registry_hash)
+     return registry_hash
+   end # create_registry
+
+   # Load the content of the registry into the registry hash and return it.
+   def load_registry
+     # Get content
+     registry_blob, registry_blob_body = @azure_blob.get_blob(@container, @registry_path)
+     registry_hash = deserialize_registry_hash(registry_blob_body)
+     return registry_hash
+   end # def load_registry
+
+   # Serialize the registry hash and save it.
+   def save_registry(registry_hash)
+     # Serialize hash to json
+     registry_hash_json = JSON.generate(registry_hash)
+
+     # Upload registry to blob
+     @azure_blob.create_block_blob(@container, @registry_path, registry_hash_json)
+   end # def save_registry
+ end # class LogStash::Inputs::LogstashInputAzureblob
+
+ class BlobReader < LinearReader
+   def initialize(logger, azure_blob, container, blob_name, chunk_size, blob_start_index, blob_end_index)
+     @logger = logger
+     @azure_blob = azure_blob
+     @container = container
+     @blob_name = blob_name
+     @blob_start_index = blob_start_index
+     @blob_end_index = blob_end_index
+     @chunk_size = chunk_size
+   end
+
+   def read
+     if @blob_end_index < @blob_start_index
+       return nil, false
+     end
+
+     are_more_bytes_available = false
+
+     if @blob_end_index >= @blob_start_index + @chunk_size
+       end_index = @blob_start_index + @chunk_size - 1
+       are_more_bytes_available = true
+     else
+       end_index = @blob_end_index
+     end
+     content = read_from_blob(@blob_start_index, end_index)
+
+     @blob_start_index = end_index + 1
+     return content, are_more_bytes_available
+   end
+
+   private
+
+   def read_from_blob(start_index, end_index)
+     blob, content = @azure_blob.get_blob(@container, @blob_name, { :start_range => start_index, :end_range => end_index })
+     return content
+   end
+ end # class BlobReader
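
To illustrate the chunked reads above with hypothetical numbers: a 10 MB blob read with the default 4 MB `file_chunk_size_bytes` takes three calls to `read`, and only the last one reports that no more bytes are available.

    # chunk_size = 4 * 1024 * 1024, blob_start_index = 0, blob_end_index = 10_485_759 (hypothetical 10 MB blob)
    # read 1: bytes 0..4_194_303           -> are_more_bytes_available == true
    # read 2: bytes 4_194_304..8_388_607   -> are_more_bytes_available == true
    # read 3: bytes 8_388_608..10_485_759  -> are_more_bytes_available == false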