logstash-input-azureblob 0.9.12-java

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,10 @@
1
# this is a generated file, to avoid over-writing it just delete this comment
# Loads the javax.json jar: prefer jar_dependencies (JRuby with the gem
# installed); otherwise fall back to requiring the vendored jar path directly.
begin
  require 'jar_dependencies'
rescue LoadError
  require 'org/glassfish/javax.json/1.1/javax.json-1.1.jar'
end

# jar_dependencies defines the Jars module when available; use its
# require_jar helper so version resolution stays centralized.
if defined? Jars
  require_jar( 'org.glassfish', 'javax.json', '1.1' )
end
@@ -0,0 +1,500 @@
1
+ # encoding: utf-8
2
+ require "logstash/inputs/base"
3
+ require "logstash/namespace"
4
+
5
+ # Azure Storage SDK for Ruby
6
+ require "azure/storage"
7
+ require 'json' # for registry content
8
+ require "securerandom" # for generating uuid.
9
+
10
+ require "com/microsoft/json-parser"
11
+
12
+ #require Dir[ File.dirname(__FILE__) + "/../../*_jars.rb" ].first
13
# Registry item to coordinate between multiple clients
class LogStash::Inputs::RegistryItem
  attr_accessor :file_path, :etag, :offset, :reader, :gen

  # Builds one registry entry.
  #
  # file_path - blob name this entry tracks
  # etag      - blob etag observed when the entry was recorded
  # reader    - id of the client currently reading the blob (nil when free)
  # offset    - number of bytes already processed
  # gen       - generation counter used to pick the least-recently-read blob
  def initialize(file_path, etag, reader, offset = 0, gen = 0)
    @file_path = file_path
    @etag = etag
    @reader = reader
    @offset = offset
    @gen = gen
  end # initialize

  # Hash form consumed by JSON serialization. Key order is part of the
  # on-disk format; do not reorder.
  def as_json(options = {})
    {
      file_path: @file_path,
      etag: @etag,
      reader: @reader,
      offset: @offset,
      gen: @gen
    }
  end # as_json

  def to_json(*options)
    as_json(*options).to_json(*options)
  end # to_json
end # class RegistryItem
39
+
40
+
41
# Logstash input plugin for Azure Blobs
#
# This logstash plugin gathers data from Microsoft Azure Blobs.
# Multiple plugin instances coordinate through a registry blob (plus a lock
# blob) stored in the same container, so each log blob is read by one client.
class LogStash::Inputs::LogstashInputAzureblob < LogStash::Inputs::Base
  config_name "azureblob"

  # If undefined, Logstash will complain, even if codec is unused.
  default :codec, "json_lines"

  # Set the account name for the azure storage account.
  config :storage_account_name, :validate => :string

  # Set the key to access the storage account.
  config :storage_access_key, :validate => :string

  # Set the container of the blobs.
  config :container, :validate => :string

  # Set the endpoint for the blobs.
  #
  # The default, `core.windows.net` targets the public azure.
  config :endpoint, :validate => :string, :default => 'core.windows.net'

  # Set the value of using backup mode.
  config :backupmode, :validate => :boolean, :default => false, :deprecated => true, :obsolete => 'This option is obsoleted and the settings will be ignored.'

  # Set the value for the registry file.
  #
  # The default, `data/registry`, is used to coordinate readings for various instances of the clients.
  config :registry_path, :validate => :string, :default => 'data/registry'

  # Sets the value for registry file lock duration in seconds. It must be set to -1, or between 15 to 60 inclusively.
  #
  # The default, `15` means the registry file will be locked for at most 15 seconds. This should usually be sufficient to
  # read the content of registry. Having this configuration here to allow lease expired in case the client crashed that
  # never got a chance to release the lease for the registry.
  config :registry_lease_duration, :validate => :number, :default => 15

  # Set how many seconds to keep idle before checking for new logs.
  #
  # The default, `30`, means trigger a reading for the log every 30 seconds after entering idle.
  config :interval, :validate => :number, :default => 30

  # Set the registry create mode
  #
  # The default, `resume`, means when the registry is initially created, it assumes all logs has been handled.
  # When set to `start_over`, it will read all log files from begining.
  config :registry_create_policy, :validate => :string, :default => 'resume'

  # Sets the header of the file that does not repeat over records. Usually, these are json opening tags.
  config :file_head_bytes, :validate => :number, :default => 0

  # Sets the tail of the file that does not repeat over records. Usually, these are json closing tags.
  config :file_tail_bytes, :validate => :number, :default => 0

  # Sets how to break json
  #
  # Only works when the codec is set to `json`. Sets the policy to break the json object in the array into small events.
  # Break json into small sections will not be as efficient as keep it as a whole, but will reduce the usage of
  # the memory.
  # Possible options: `do_not_break`, `with_head_tail`, `without_head_tail`
  config :break_json_down_policy, :validate => :string, :default => 'do_not_break', :obsolete => 'This option is obsoleted and the settings will be ignored.'

  # Sets when break json happens, how many json object will be put in 1 batch
  config :break_json_batch_count, :validate => :number, :default => 10, :obsolete => 'This option is obsoleted and the settings will be ignored.'

  # Sets the page-size for returned blob items. Too big number will hit heap overflow; Too small number will leads to too many requests.
  #
  # The default, `100` is good for default heap size of 1G.
  config :blob_list_page_size, :validate => :number, :default => 100

  # The default is 4 MB
  config :file_chunk_size_bytes, :validate => :number, :default => 4 * 1024 * 1024

  # Constant of max integer (used as the generation-counter ceiling).
  MAX = 2**([42].pack('i').size * 16 - 2) - 1

  # Update the registry offset each time after this number of entries have been processed
  UPDATE_REGISTRY_COUNT = 100

  public

  # Creates the Azure blob service client and the per-instance reader id.
  def register
    user_agent = "logstash-input-azureblob"
    user_agent << "/" << Gem.latest_spec_for("logstash-input-azureblob").version.to_s

    # this is the reader # for this specific instance.
    @reader = SecureRandom.uuid
    # Lock blob used to serialize registry updates across client instances.
    @registry_locker = "#{@registry_path}.lock"

    # Setup a specific instance of an Azure::Storage::Client
    client = Azure::Storage::Client.create(:storage_account_name => @storage_account_name, :storage_access_key => @storage_access_key, :storage_blob_host => "https://#{@storage_account_name}.blob.#{@endpoint}", :user_agent_prefix => user_agent)
    # Get an azure storage blob service object from a specific instance of an Azure::Storage::Client
    @azure_blob = client.blob_client
    # Add retry filter to the service object
    @azure_blob.with_filter(Azure::Storage::Core::Filter::ExponentialRetryPolicyFilter.new)
  end # def register

  # Main loop: process available blobs, then idle for `interval` seconds.
  def run(queue)
    # we can abort the loop if stop? becomes true
    while !stop?
      process(queue)
      # BUGFIX: @interval is in seconds (Stud.stoppable_sleep takes seconds);
      # the old message reported it as milliseconds.
      @logger.debug("Hitting interval of #{@interval} seconds . . .")
      Stud.stoppable_sleep(@interval) { stop? }
    end # loop
  end # def run

  def stop
    cleanup_registry
  end # def stop

  # Start processing the next item.
  def process(queue)
    begin
      @processed_entries = 0
      blob, start_index, gen = register_for_read

      unless blob.nil?
        begin
          blob_name = blob.name
          @logger.debug("Processing blob #{blob.name}")
          blob_size = blob.properties[:content_length]
          # Work-around: After returned by get_blob, the etag will contains quotes.
          new_etag = blob.properties[:etag]
          # ~ Work-around

          # Fetch the non-repeating head/tail sections once, when configured.
          # (The previous `... if header.nil? unless ...` double modifiers were
          # equivalent: header/tail are always nil at this point.)
          header = nil
          tail = nil
          if !@file_head_bytes.nil? && @file_head_bytes > 0
            blob, header = @azure_blob.get_blob(@container, blob_name, { :end_range => (@file_head_bytes - 1) })
          end
          if !@file_tail_bytes.nil? && @file_tail_bytes > 0
            blob, tail = @azure_blob.get_blob(@container, blob_name, { :start_range => blob_size - @file_tail_bytes })
          end

          if start_index == 0
            # Skip the header since it is already read.
            start_index = @file_head_bytes
          end

          @logger.debug("start index: #{start_index} blob size: #{blob_size}")

          content_length = 0
          blob_reader = BlobReader.new(@logger, @azure_blob, @container, blob_name, @file_chunk_size_bytes, start_index, blob_size - 1 - @file_tail_bytes)

          is_json_codec = (defined?(LogStash::Codecs::JSON) == 'constant') && (@codec.is_a? LogStash::Codecs::JSON)
          if is_json_codec
            # Stream well-formed json objects to the queue; skip malformed spans
            # while still advancing the registry offset past them.
            parser = JsonParser.new(@logger, blob_reader)

            parser.parse(->(json_content) {
              content_length = content_length + json_content.length

              enqueue_content(queue, json_content, header, tail)

              on_entry_processed(start_index, content_length, blob_name, new_etag, gen)
            }, ->(malformed_json) {
              @logger.debug("Skipping #{malformed_json.length} malformed bytes")
              content_length = content_length + malformed_json.length

              on_entry_processed(start_index, content_length, blob_name, new_etag, gen)
            })
          else
            # Chunked raw read for non-json codecs.
            begin
              content, are_more_bytes_available = blob_reader.read

              content_length = content_length + content.length
              enqueue_content(queue, content, header, tail)

              on_entry_processed(start_index, content_length, blob_name, new_etag, gen)
            end until !are_more_bytes_available || content.nil?
          end # if
        ensure
          # Making sure the reader is removed from the registry even when there's exception.
          request_registry_update(start_index, content_length, blob_name, new_etag, gen)
        end # begin
      end # unless
    rescue => e
      @logger.error("Oh My, An error occurred. Error:#{e}: Trace: #{e.backtrace}", :exception => e)
    end # begin
  end # process

  # Concatenates header + content + tail (when present) and pushes decoded
  # events into the queue.
  def enqueue_content(queue, content, header, tail)
    if (header.nil? || header.length == 0) && (tail.nil? || tail.length == 0)
      # skip some unnecessary copying
      full_content = content
    else
      full_content = ""
      full_content << header unless header.nil? || header.length == 0
      full_content << content
      full_content << tail unless tail.nil? || tail.length == 0
    end

    @codec.decode(full_content) do |event|
      decorate(event)
      queue << event
    end
  end

  # Persists the read offset every UPDATE_REGISTRY_COUNT entries so progress
  # survives a crash.
  def on_entry_processed(start_index, content_length, blob_name, new_etag, gen)
    @processed_entries = @processed_entries + 1
    if @processed_entries % UPDATE_REGISTRY_COUNT == 0
      request_registry_update(start_index, content_length, blob_name, new_etag, gen)
    end
  end

  # Writes the current offset for blob_name back to the registry with the
  # reader mark cleared (nil), allowing other clients to pick the blob up.
  def request_registry_update(start_index, content_length, blob_name, new_etag, gen)
    new_offset = start_index
    new_offset = new_offset + content_length unless content_length.nil?
    @logger.debug("New registry offset: #{new_offset}")
    new_registry_item = LogStash::Inputs::RegistryItem.new(blob_name, new_etag, nil, new_offset, gen)
    update_registry(new_registry_item)
  end

  # Deserialize registry hash from json string.
  def deserialize_registry_hash(json_string)
    result = Hash.new
    temp_hash = JSON.parse(json_string)
    temp_hash.values.each { |kvp|
      result[kvp['file_path']] = LogStash::Inputs::RegistryItem.new(kvp['file_path'], kvp['etag'], kvp['reader'], kvp['offset'], kvp['gen'])
    }
    return result
  end # deserialize_registry_hash

  # List all the blobs in the given container.
  def list_all_blobs
    blobs = Set.new []
    # BUGFIX: use `nil`, not the deprecated `NIL` constant (removed in Ruby 3).
    continuation_token = nil
    @blob_list_page_size = 100 if @blob_list_page_size <= 0
    loop do
      # Need to limit the returned number of the returned entries to avoid out of memory exception.
      entries = @azure_blob.list_blobs(@container, { :timeout => 60, :marker => continuation_token, :max_results => @blob_list_page_size })
      entries.each do |entry|
        blobs << entry
      end # each
      continuation_token = entries.continuation_token
      break if continuation_token.empty?
    end # loop
    return blobs
  end # def list_all_blobs

  # Raise generation for blob in registry.
  # After bumping the picked blob's gen, shifts all gens down so the minimum
  # stays at 0, keeping the counters bounded.
  def raise_gen(registry_hash, file_path)
    begin
      target_item = registry_hash[file_path]
      begin
        target_item.gen += 1
        # Protect gen from overflow.
        target_item.gen = target_item.gen / 2 if target_item.gen == MAX
      rescue StandardError => e
        @logger.error("Fail to get the next generation for target item #{target_item}.", :exception => e)
        target_item.gen = 0
      end

      min_gen_item = registry_hash.values.min_by { |x| x.gen }
      while min_gen_item.gen > 0
        registry_hash.values.each { |value|
          value.gen -= 1
        }
        min_gen_item = registry_hash.values.min_by { |x| x.gen }
      end
    end
  end # raise_gen

  # Acquire a lease on a blob item with retries.
  #
  # By default, it will retry 60 times with 1 second interval.
  def acquire_lease(blob_name, retry_times = 60, interval_sec = 1)
    lease = nil
    retried = 0
    while lease.nil? do
      begin
        lease = @azure_blob.acquire_blob_lease(@container, blob_name, { :timeout => 60, :duration => @registry_lease_duration })
      rescue StandardError => e
        # BUGFIX: guard with respond_to? — a plain StandardError has no #type,
        # which previously raised NoMethodError inside this rescue.
        if e.respond_to?(:type) && e.type == 'LeaseAlreadyPresent'
          if retried > retry_times
            raise
          end
          retried += 1
          sleep interval_sec
        else
          # Anything else happend other than 'LeaseAlreadyPresent', break the lease. This is a work-around for the behavior that when
          # timeout exception is hit, somehow, a infinite lease will be put on the lock file.
          # BUGFIX: was `blob`, an undefined local variable; the parameter is blob_name.
          @azure_blob.break_blob_lease(@container, blob_name, { :break_period => 30 })
        end
      end
    end # while
    return lease
  end # acquire_lease

  # Return the next blob for reading as well as the start index.
  def register_for_read
    begin
      all_blobs = list_all_blobs
      registry = all_blobs.find { |item| item.name.downcase == @registry_path }
      registry_locker = all_blobs.find { |item| item.name.downcase == @registry_locker }

      candidate_blobs = all_blobs.select { |item| (item.name.downcase != @registry_path) && (item.name.downcase != @registry_locker) }

      start_index = 0
      gen = 0
      lease = nil

      # Put lease on locker file than the registry file to allow update of the registry as a workaround for Azure Storage Ruby SDK issue # 16.
      # Workaround: https://github.com/Azure/azure-storage-ruby/issues/16
      registry_locker = @azure_blob.create_block_blob(@container, @registry_locker, @reader) if registry_locker.nil?
      lease = acquire_lease(@registry_locker)
      # ~ Workaround

      if registry.nil?
        registry_hash = create_registry(candidate_blobs)
      else
        registry_hash = load_registry
      end # if

      picked_blobs = Set.new []
      # Pick up the next candidate
      picked_blob = nil
      candidate_blobs.each { |candidate_blob|
        @logger.debug("candidate_blob: #{candidate_blob.name} content length: #{candidate_blob.properties[:content_length]}")
        registry_item = registry_hash[candidate_blob.name]

        # Appending items that doesn't exist in the hash table
        if registry_item.nil?
          registry_item = LogStash::Inputs::RegistryItem.new(candidate_blob.name, candidate_blob.properties[:etag], nil, 0, 0)
          registry_hash[candidate_blob.name] = registry_item
        end # if
        @logger.debug("registry_item offset: #{registry_item.offset}")
        # Eligible when there are unread bytes and the blob is not claimed by
        # another reader.
        if ((registry_item.offset < candidate_blob.properties[:content_length]) && (registry_item.reader.nil? || registry_item.reader == @reader))
          @logger.debug("candidate_blob picked: #{candidate_blob.name} content length: #{candidate_blob.properties[:content_length]}")
          picked_blobs << candidate_blob
        end
      }

      # Prefer the blob with the lowest generation (least recently read).
      picked_blob = picked_blobs.min_by { |b| registry_hash[b.name].gen }
      if !picked_blob.nil?
        registry_item = registry_hash[picked_blob.name]
        registry_item.reader = @reader
        registry_hash[picked_blob.name] = registry_item
        start_index = registry_item.offset
        raise_gen(registry_hash, picked_blob.name)
        gen = registry_item.gen
      end # if

      # Save the change for the registry
      save_registry(registry_hash)

      @azure_blob.release_blob_lease(@container, @registry_locker, lease)
      lease = nil

      return picked_blob, start_index, gen
    rescue StandardError => e
      @logger.error("Oh My, An error occurred. #{e}: #{e.backtrace}", :exception => e)
      return nil, nil, nil
    ensure
      @azure_blob.release_blob_lease(@container, @registry_locker, lease) unless lease.nil?
      lease = nil
    end # rescue
  end # register_for_read

  # Update the registry
  def update_registry(registry_item)
    begin
      lease = nil
      lease = acquire_lease(@registry_locker)
      registry_hash = load_registry
      registry_hash[registry_item.file_path] = registry_item
      save_registry(registry_hash)
      @azure_blob.release_blob_lease(@container, @registry_locker, lease)
      lease = nil
    rescue StandardError => e
      @logger.error("Oh My, An error occurred. #{e}:\n#{e.backtrace}", :exception => e)
    ensure
      @azure_blob.release_blob_lease(@container, @registry_locker, lease) unless lease.nil?
      lease = nil
    end # rescue
  end # def update_registry

  # Clean up the registry: releases this instance's claim on every blob so a
  # restart (or another client) can resume them.
  def cleanup_registry
    begin
      lease = nil
      lease = acquire_lease(@registry_locker)
      registry_hash = load_registry
      registry_hash.each { |key, registry_item|
        registry_item.reader = nil if registry_item.reader == @reader
      }
      save_registry(registry_hash)
      @azure_blob.release_blob_lease(@container, @registry_locker, lease)
      lease = nil
    rescue StandardError => e
      @logger.error("Oh My, An error occurred. #{e}:\n#{e.backtrace}", :exception => e)
    ensure
      @azure_blob.release_blob_lease(@container, @registry_locker, lease) unless lease.nil?
      lease = nil
    end # rescue
  end # def cleanup_registry

  # Create a registry file to coordinate between multiple azure blob inputs.
  def create_registry(blob_items)
    registry_hash = Hash.new

    blob_items.each do |blob_item|
      initial_offset = 0
      # `resume` policy: treat existing content as already handled.
      initial_offset = blob_item.properties[:content_length] if @registry_create_policy == 'resume'
      registry_item = LogStash::Inputs::RegistryItem.new(blob_item.name, blob_item.properties[:etag], nil, initial_offset, 0)
      registry_hash[blob_item.name] = registry_item
    end # each
    save_registry(registry_hash)
    return registry_hash
  end # create_registry

  # Load the content of the registry into the registry hash and return it.
  def load_registry
    # Get content
    registry_blob, registry_blob_body = @azure_blob.get_blob(@container, @registry_path)
    registry_hash = deserialize_registry_hash(registry_blob_body)
    return registry_hash
  end # def load_registry

  # Serialize the registry hash and save it.
  def save_registry(registry_hash)
    # Serialize hash to json
    registry_hash_json = JSON.generate(registry_hash)

    # Upload registry to blob
    @azure_blob.create_block_blob(@container, @registry_path, registry_hash_json)
  end # def save_registry
end # class LogStash::Inputs::LogstashInputAzureblob
464
+
465
# Reads a byte range of a blob in fixed-size chunks, presenting the
# LinearReader interface consumed by JsonParser.
class BlobReader < LinearReader
  def initialize(logger, azure_blob, container, blob_name, chunk_size, blob_start_index, blob_end_index)
    @logger = logger
    @azure_blob = azure_blob
    @container = container
    @blob_name = blob_name
    @blob_start_index = blob_start_index
    @blob_end_index = blob_end_index
    @chunk_size = chunk_size
  end

  # Returns [content, more_available]; content is nil once the configured
  # range has been exhausted.
  def read
    return nil, false if @blob_end_index < @blob_start_index

    # Clamp the chunk so it never runs past the end of the readable range.
    chunk_last = [@blob_start_index + @chunk_size - 1, @blob_end_index].min
    more_available = chunk_last < @blob_end_index

    chunk = read_from_blob(@blob_start_index, chunk_last)
    @blob_start_index = chunk_last + 1

    return chunk, more_available
  end

  private

  # Downloads the inclusive byte range [start_index, end_index] of the blob.
  def read_from_blob(start_index, end_index)
    _blob, content = @azure_blob.get_blob(@container, @blob_name, { :start_range => start_index, :end_range => end_index })
    return content
  end
end # class BlobReader