RubyGems - logstash-input-azure_blob_storage - Versions diffs - 0.10.7 → 0.11.0 - Mend

logstash-input-azure_blob_storage 0.10.7 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

checksums.yaml +4 -4
data/lib/logstash/inputs/azure_blob_storage.rb +132 -105
data/logstash-input-azure_blob_storage.gemspec +3 -3
metadata +4 -4

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 1d724246f3b086f02716a7eafba64cbf18f07553623ca3d98f70d48cd6bdcb5f
-  data.tar.gz: 5631a24d37b769b8efbf372bb90304c06c9f25fc43b80be70047c3477499dca7
+  metadata.gz: cb93fc423babf6bc4cd7b13b1280a27ea6156fa2aeebe69ab172d8e925940d2c
+  data.tar.gz: ae92a22e56d87cc9d4d0f0b615460a8da33afb822aaa3f0d109cd3b86137ee53
 SHA512:
-  metadata.gz: c43a79fe53cc185dead527b0705dce6d68754f068e035fc068c57c8b9b79a9a9ae7f3e6bf89c342c76b3035ea73309d86709d677736981fe6b02fec85e311e4e
-  data.tar.gz: e658c02488625a4accfd60a647cf3839bb49bbbad3b9f37534808af1fbab860d90f4aebf4f984f3eaedbb1cf7e25ad26adc8463d1ab174bc002250a997e9fd3e
+  metadata.gz: f5a311f322b04740a98182271e3074171feea7b7e899e3ec712b457182d91f34033cc73b1219042bad727f0c6566b1c3cf0ce362e5cc9d2b1e4d09a2029d5456
+  data.tar.gz: 53072a976feddc171ad02960fdcec6612099caaade967c28a501ba1ca413f1dccdc071e050e322eefa84a266b0c9a9ed487a6d98a36c62480e573421b2fc27b7

data/lib/logstash/inputs/azure_blob_storage.rb CHANGED Viewed

@@ -1,80 +1,80 @@
 # encoding: utf-8
-require "logstash/inputs/base"
-require "stud/interval"
+require 'logstash/inputs/base'
+require 'stud/interval'
 require 'azure/storage/blob'
 require 'json'
 # This is a Logstash input plugin for files in Azure Blob Storage. There is a storage explorer in the portal and an application with the same name, https://storageexplorer.com. A storage account has by default a globally unique name, {storageaccount}.blob.core.windows.net, which is a CNAME to Azure's blob servers blob.*.store.core.windows.net. A storage account has containers, and those have directories and blobs (like files). Blobs have one or more blocks. After writing the blocks, they can be committed. Some Azure diagnostics can send events to an EventHub that can be parsed through the plugin logstash-input-azure_event_hubs, but for the events that are only stored in a storage account, use this plugin. The original logstash-input-azureblob from azure-diagnostics-tools is great for low volumes, but it suffers from an outdated client, slow reads, lease locking issues and JSON parse errors.
 # https://azure.microsoft.com/en-us/services/storage/blobs/
 class LogStash::Inputs::AzureBlobStorage < LogStash::Inputs::Base
-  config_name "azure_blob_storage"
+config_name "azure_blob_storage"
-  # If undefined, Logstash will complain, even if codec is unused. The codec for nsgflowlog is "json" and the codec for WADIIS and APPSERVICE is "line".
-  default :codec, "json"
+# If undefined, Logstash will complain, even if codec is unused. The codec for nsgflowlog is "json" and the codec for WADIIS and APPSERVICE is "line".
+default :codec, "json"
-  # logtype can be nsgflowlog, wadiis, appservice or raw. The default is raw, where files are read and added as one event. If the file grows, the next interval the file is read from the offset, so that the delta is sent as another event. In raw mode, further processing has to be done in the filter block. If the logtype is specified, this plugin will split and mutate and add individual events to the queue.
-  config :logtype, :validate => ['nsgflowlog','wadiis','appservice','raw'], :default => 'raw'
+# logtype can be nsgflowlog, wadiis, appservice or raw. The default is raw, where files are read and added as one event. If the file grows, the next interval the file is read from the offset, so that the delta is sent as another event. In raw mode, further processing has to be done in the filter block. If the logtype is specified, this plugin will split and mutate and add individual events to the queue.
+config :logtype, :validate => ['nsgflowlog','wadiis','appservice','raw'], :default => 'raw'
-  # The storage account is accessed through Azure::Storage::Blob::BlobService, it needs either a sas_token, connection string or a storageaccount/access_key pair.
-  # https://github.com/Azure/azure-storage-ruby/blob/master/blob/lib/azure/storage/blob/blob_service.rb#L42
-  config :connection_string, :validate => :password, :required => false
+# The storage account is accessed through Azure::Storage::Blob::BlobService, it needs either a sas_token, connection string or a storageaccount/access_key pair.
+# https://github.com/Azure/azure-storage-ruby/blob/master/blob/lib/azure/storage/blob/blob_service.rb#L42
+config :connection_string, :validate => :password, :required => false
-  # The storage account name for the azure storage account.
-  config :storageaccount, :validate => :string, :required => false
+# The storage account name for the azure storage account.
+config :storageaccount, :validate => :string, :required => false
-  # DNS Suffix other than blob.core.windows.net
-  config :dns_suffix, :validate => :string, :required => false, :default => 'core.windows.net'
+# DNS Suffix other than blob.core.windows.net
+config :dns_suffix, :validate => :string, :required => false, :default => 'core.windows.net'
-  # The (primary or secondary) Access Key for the storage account. The key can be found in the portal.azure.com or through the azure api StorageAccounts/ListKeys. For example the PowerShell command Get-AzStorageAccountKey.
-  config :access_key, :validate => :password, :required => false
+# The (primary or secondary) Access Key for the storage account. The key can be found in the portal.azure.com or through the azure api StorageAccounts/ListKeys. For example the PowerShell command Get-AzStorageAccountKey.
+config :access_key, :validate => :password, :required => false
-  # SAS is the Shared Access Signature, that provides restricted access rights. If the sas_token is absent, the access_key is used instead.
-  config :sas_token, :validate => :password, :required => false
+# SAS is the Shared Access Signature, that provides restricted access rights. If the sas_token is absent, the access_key is used instead.
+config :sas_token, :validate => :password, :required => false
-  # The container of the blobs.
-  config :container, :validate => :string, :default => 'insights-logs-networksecuritygroupflowevent'
+# The container of the blobs.
+config :container, :validate => :string, :default => 'insights-logs-networksecuritygroupflowevent'
-  # The registry file keeps track of the files that have been processed and up to which offset in bytes. It's similar in function
-  #
-  # The default, `data/registry.dat`, contains a Ruby Marshal serialized Hash of the filename, the offset read so far and the file length at the last time a file listing was done.
-  config :registry_path, :validate => :string, :required => false, :default => 'data/registry.dat'
+# The registry file keeps track of the files that have been processed and up to which offset in bytes. It's similar in function
+#
+# The default, `data/registry.dat`, contains a Ruby Marshal serialized Hash of the filename, the offset read so far and the file length at the last time a file listing was done.
+config :registry_path, :validate => :string, :required => false, :default => 'data/registry.dat'
-  # The default, `resume`, will load the registry offsets and will start processing files from the offsets.
-  # When set to `start_over`, all log files are processed from the beginning.
-  # When set to `start_fresh`, it will read log files that are created or appended since the start of the pipeline.
-  config :registry_create_policy, :validate => ['resume','start_over','start_fresh'], :required => false, :default => 'resume'
+# The default, `resume`, will load the registry offsets and will start processing files from the offsets.
+# When set to `start_over`, all log files are processed from the beginning.
+# When set to `start_fresh`, it will read log files that are created or appended since the start of the pipeline.
+config :registry_create_policy, :validate => ['resume','start_over','start_fresh'], :required => false, :default => 'resume'
-  # The registry keeps track of the files that were already processed. The interval is used to save the registry regularly, when new events have been processed. It is also used to wait before listing the files again and subtracting the registry of already processed files to determine the worklist.
-  #
-  # waiting time in seconds until processing the next batch. NSGFLOWLOGS append a block per minute, so use multiples of 60 seconds, 300 for 5 minutes, 600 for 10 minutes. The registry is also saved after every interval.
-  # Partial reading starts from the offset and reads until the end, so the starting tag is prepended
-  #
-  # A00000000000000000000000000000000 12    {"records":[
-  # D672f4bbd95a04209b00dc05d899e3cce 2576  json objects for 1st minute
-  # D7fe0d4f275a84c32982795b0e5c7d3a1 2312  json objects for 2nd minute
-  # Z00000000000000000000000000000000 2     ]}
-  config :interval, :validate => :number, :default => 60
+# The registry keeps track of the files that were already processed. The interval is used to save the registry regularly, when new events have been processed. It is also used to wait before listing the files again and subtracting the registry of already processed files to determine the worklist.
+#
+# waiting time in seconds until processing the next batch. NSGFLOWLOGS append a block per minute, so use multiples of 60 seconds, 300 for 5 minutes, 600 for 10 minutes. The registry is also saved after every interval.
+# Partial reading starts from the offset and reads until the end, so the starting tag is prepended
+#
+# A00000000000000000000000000000000 12    {"records":[
+# D672f4bbd95a04209b00dc05d899e3cce 2576  json objects for 1st minute
+# D7fe0d4f275a84c32982795b0e5c7d3a1 2312  json objects for 2nd minute
+# Z00000000000000000000000000000000 2     ]}
+config :interval, :validate => :number, :default => 60
-  # WAD IIS Grok Pattern
-  #config :grokpattern, :validate => :string, :required => false, :default => '%{TIMESTAMP_ISO8601:log_timestamp} %{NOTSPACE:instanceId} %{NOTSPACE:instanceId2} %{IPORHOST:ServerIP} %{WORD:httpMethod} %{URIPATH:requestUri} %{NOTSPACE:requestQuery} %{NUMBER:port} %{NOTSPACE:username} %{IPORHOST:clientIP} %{NOTSPACE:httpVersion} %{NOTSPACE:userAgent} %{NOTSPACE:cookie} %{NOTSPACE:referer} %{NOTSPACE:host} %{NUMBER:httpStatus} %{NUMBER:subresponse} %{NUMBER:win32response} %{NUMBER:sentBytes:int} %{NUMBER:receivedBytes:int} %{NUMBER:timeTaken:int}'
+# WAD IIS Grok Pattern
+#config :grokpattern, :validate => :string, :required => false, :default => '%{TIMESTAMP_ISO8601:log_timestamp} %{NOTSPACE:instanceId} %{NOTSPACE:instanceId2} %{IPORHOST:ServerIP} %{WORD:httpMethod} %{URIPATH:requestUri} %{NOTSPACE:requestQuery} %{NUMBER:port} %{NOTSPACE:username} %{IPORHOST:clientIP} %{NOTSPACE:httpVersion} %{NOTSPACE:userAgent} %{NOTSPACE:cookie} %{NOTSPACE:referer} %{NOTSPACE:host} %{NUMBER:httpStatus} %{NUMBER:subresponse} %{NUMBER:win32response} %{NUMBER:sentBytes:int} %{NUMBER:receivedBytes:int} %{NUMBER:timeTaken:int}'
-  # The string that starts the JSON. Only needed when the codec is JSON. When partial files are read, the result will not be valid JSON unless the start and end are put back. The file_head and file_tail are learned at startup by reading the first file in the blob_list and taking the first and last block; this works for blobs that are appended, like nsgflowlogs. The configuration can be set to override the learning. In case learning fails and the option is not set, the default is to use the 'records' as set by nsgflowlogs.
-  config :file_head, :validate => :string, :required => false, :default => '{"records":['
-  # The string that ends the JSON
-  config :file_tail, :validate => :string, :required => false, :default => ']}'
+# The string that starts the JSON. Only needed when the codec is JSON. When partial files are read, the result will not be valid JSON unless the start and end are put back. The file_head and file_tail are learned at startup by reading the first file in the blob_list and taking the first and last block; this works for blobs that are appended, like nsgflowlogs. The configuration can be set to override the learning. In case learning fails and the option is not set, the default is to use the 'records' as set by nsgflowlogs.
+config :file_head, :validate => :string, :required => false, :default => '{"records":['
+# The string that ends the JSON
+config :file_tail, :validate => :string, :required => false, :default => ']}'
-  # The path(s) to the file(s) to use as an input. By default it will
-  # watch every files in the storage container.
-  # You can use filename patterns here, such as `logs/*.log`.
-  # If you use a pattern like `logs/**/*.log`, a recursive search
-  # of `logs` will be done for all `*.log` files.
-  # Do not include a leading `/`, as Azure path look like this:
-  # `path/to/blob/file.txt`
-  #
-  # You may also configure multiple paths. See an example
-  # on the <<array,Logstash configuration page>>.
-  # For NSGFLOWLOGS a path starts with "resourceId=/", but this would only be needed to exclude other files that may be written in the same container.
-  config :prefix, :validate => :string, :required => false
+# The path(s) to the file(s) to use as an input. By default it will
+# watch every files in the storage container.
+# You can use filename patterns here, such as `logs/*.log`.
+# If you use a pattern like `logs/**/*.log`, a recursive search
+# of `logs` will be done for all `*.log` files.
+# Do not include a leading `/`, as Azure path look like this:
+# `path/to/blob/file.txt`
+#
+# You may also configure multiple paths. See an example
+# on the <<array,Logstash configuration page>>.
+# For NSGFLOWLOGS a path starts with "resourceId=/", but this would only be needed to exclude other files that may be written in the same container.
+config :prefix, :validate => :string, :required => false
@@ -91,7 +91,7 @@ def register
     @processed = 0
     @regsaved = @processed
-    @buffer = FileWatch::BufferedTokenizer.new('\n')
+    #@buffer = FileWatch::BufferedTokenizer.new('\n')
     # Try in this order to access the storageaccount
     # 1. storageaccount / sas_token
@@ -119,19 +119,23 @@ def register
     end
     @registry = Hash.new
-    unless registry_create_policy == "start_over"
+    if registry_create_policy == "resume"
       begin
+        @logger.info(@pipe_id+" resuming from registry")
         @registry = Marshal.load(@blob_client.get_blob(container, registry_path)[1])
         #[0] headers [1] responsebody
       rescue
         @registry.clear
+        @logger.error(@pipe_id+" loading registry failed, starting over")
       end
     end
     # read filelist and set offsets to file length to mark all the old files as done
     if registry_create_policy == "start_fresh"
-        @registry.each do |name, file|
-            @registry.store(name, { :offset => file[:length], :length => file[:length] })
-        end
+        @logger.info(@pipe_id+" starting fresh")
+        @registry = list_blobs(true)
+        #tempreg.each do |name, file|
+        #   @registry.store(name, { :offset => file[:length], :length => file[:length] })
+        #end
     end
     @is_json = false
@@ -158,34 +162,47 @@ end # def register
 def run(queue)
+    newreg   = Hash.new
     filelist = Hash.new
+    worklist = Hash.new
     # we can abort the loop if stop? becomes true
     while !stop?
         chrono = Time.now.to_i
         # load the registry, compare its offsets to the file list, set offset to 0 for new files, process the whole list and, if finished within the interval, wait for the next loop
         # TODO: sort by timestamp
         #filelist.sort_by(|k,v|resource(k)[:date])
-        filelist = list_blobs()
-        save_registry(filelist)
-        @registry = filelist
+	worklist.clear
+	filelist.clear
+        newreg.clear
+        filelist = list_blobs(false)
+	# registry.merge(filelist) {|key, :offset, :length| :offset.merge :length }
+        filelist.each do |name, file|
+            off = 0
+            begin
+                off = @registry[name][:offset]
+            rescue
+                off = 0
+            end
+            newreg.store(name, { :offset => off, :length => file[:length] })
+	end
         # Worklist is the subset of files where the already read offset is smaller than the file size
-        worklist = filelist.select {|name,file| file[:offset] < file[:length]}
-        @logger.debug(@pipe_id+" worklist contains #{worklist.size} blobs to process")
+	worklist.clear
+	worklist = newreg.select {|name,file| file[:offset] < file[:length]}
         # This would be ideal for threading since it's IO intensive, would be nice with a ruby native ThreadPool
         worklist.each do |name, file|
-            res = resource(name)
+            #res = resource(name)
+            @logger.info(@pipe_id+" processing #{name} from #{file[:offset]} to #{file[:length]}")
+            size = 0
             if file[:offset] == 0
                 chunk = full_read(name)
-                # this may read more than originally listed
-                file[:length]=chunk.size
+                size=chunk.size
             else
                 chunk = partial_read_json(name, file[:offset], file[:length])
                 @logger.debug(@pipe_id+" partial file #{name} from #{file[:offset]} to #{file[:length]}")
             end
             if logtype == "nsgflowlog" && @is_json
+                res = resource(name)
                 begin
 		    fingjson = JSON.parse(chunk)
                     @processed += nsgflowlog(queue, fingjson)
@@ -193,9 +210,10 @@ def run(queue)
                 rescue JSON::ParserError
                     @logger.error(@pipe_id+" parse error on #{res[:nsg]} [#{res[:date]}] offset: #{file[:offset]} length: #{file[:length]}")
                 end
-            # TODO Convert this to line based grokking.
-	    elsif logtype == "wadiis" && !@is_json
-                @processed += wadiislog(queue, file[:name])
+            # TODO: Convert this to line based grokking.
+            # TODO: ECS Compliance?
+            elsif logtype == "wadiis" && !@is_json
+                @processed += wadiislog(queue, name)
             else
                 counter = 0
                 @codec.decode(chunk) do |event|
@@ -205,7 +223,8 @@ def run(queue)
                 end
                 @processed += counter
             end
-            @registry.store(name, { :offset => file[:length], :length => file[:length] })
+            @registry.store(name, { :offset => size, :length => file[:length] })
+	    #@logger.info(@pipe_id+" name #{name} size #{size} len #{file[:length]}")
             # if stop? good moment to stop what we're doing
             if stop?
                 return
@@ -300,51 +319,59 @@ def wadiislog(lines)
 end
 # list all blobs in the blobstore, set the offsets from the registry and return the filelist
-def list_blobs()
+# inspired by: https://github.com/Azure-Samples/storage-blobs-ruby-quickstart/blob/master/example.rb
+def list_blobs(fill)
     files = Hash.new
     nextMarker = nil
+    counter = 0
     loop do
-        blobs = @blob_client.list_blobs(@container, { marker: nextMarker, prefix: @prefix })
-        blobs.each do |blob|
-            # exclude the registry itself
-            unless blob.name == @registry_path
-                offset = 0
-                length = blob.properties[:content_length].to_i
-                off = @registry[blob.name]
-                unless off.nil?
-                    @logger.debug(@pipe_id+" seen #{blob.name} which is #{length} with offset #{offset}")
-                    offset = off[:offset]
-                end
-                files.store(blob.name, { :offset => offset, :length => length })
-            end
-        end
-        nextMarker = blobs.continuation_token
-        break unless nextMarker && !nextMarker.empty?
+      begin
+         if (counter > 10)
+             @logger.error(@pipe_id+" lets try again for the 10th time, why don't faraday and azure storage accounts not play nice together? it has something to do with follow_redirect and a missing authorization header?")
+         end
+         blobs = @blob_client.list_blobs(container, { marker: nextMarker, prefix: @prefix})
+         blobs.each do |blob|
+             # exclude the registry itself
+             unless blob.name == registry_path
+                 length = blob.properties[:content_length].to_i
+		 offset = 0
+                 if fill
+                     offset = length
+		 end
+                 files.store(blob.name, { :offset => offset, :length => length })
+             end
+         end
+         nextMarker = blobs.continuation_token
+         break unless nextMarker && !nextMarker.empty?
+      rescue Exception => e
+        @logger.error(@pipe_id+" caught: #{e.message}")
+	counter += 1
+      end
     end
-    @logger.debug(@pipe_id+" list_blobs found #{files.size} blobs")
     return files
 end
 # When events were processed after the last registry save, start a thread to update the registry file.
 def save_registry(filelist)
-     # TODO because of threading, processed values and regsaved are not thread safe, they can change as instance variable @!
-     unless @processed == @regsaved
-         @regsaved = @processed
-         @logger.info(@pipe_id+" processed #{@processed} events, saving #{filelist.size} blobs and offsets to registry #{registry_path}")
-         Thread.new {
-           begin
-             @blob_client.create_block_blob(container, registry_path, Marshal.dump(filelist))
-           rescue
-             @logger.error(@pipe_id+" Oh my, registry write failed, do you have write access?")
-           end
-         }
-      end
+    # TODO because of threading, processed values and regsaved are not thread safe, they can change as instance variable @!
+    unless @processed == @regsaved
+        @regsaved = @processed
+        @logger.info(@pipe_id+" processed #{@processed} events, saving #{filelist.size} blobs and offsets to registry #{registry_path}")
+        Thread.new {
+            begin
+                @blob_client.create_block_blob(container, registry_path, Marshal.dump(filelist))
+            rescue
+                @logger.error(@pipe_id+" Oh my, registry write failed, do you have write access?")
+            end
+        }
+    end
 end
 def learn_encapsulation
     # From one file, read first block and last block to learn head and tail
     # If the blob storage can't be found, an error from the Faraday middleware will come with the text
     # org.jruby.ext.set.RubySet cannot be cast to class org.jruby.RubyFixnum
+    # implement options ... prefix may not exist!
     blob = @blob_client.list_blobs(container, { maxresults: 1, prefix: @prefix }).first
     return if blob.nil?
     blocks = @blob_client.list_blob_blocks(container, blob.name)[:committed]

data/logstash-input-azure_blob_storage.gemspec CHANGED Viewed

@@ -1,6 +1,6 @@
 Gem::Specification.new do |s|
   s.name          = 'logstash-input-azure_blob_storage'
-  s.version       = '0.10.7'
+  s.version       = '0.11.0'
   s.licenses      = ['Apache-2.0']
   s.summary       = 'This logstash plugin reads and parses data from Azure Storage Blobs.'
   s.description   = <<-EOF
@@ -20,8 +20,8 @@ EOF
   s.metadata = { "logstash_plugin" => "true", "logstash_group" => "input" }
   # Gem dependencies
-  s.add_runtime_dependency "logstash-core-plugin-api", "~> 2.1"
+  s.add_runtime_dependency 'logstash-core-plugin-api', '~> 2.1'
   s.add_runtime_dependency 'stud', '~> 0.0.23'
-  s.add_runtime_dependency 'azure-storage-blob', '~> 1.1'
+  s.add_runtime_dependency 'azure-storage-blob', '~> 1.0'
   s.add_development_dependency 'logstash-devutils', '~> 1.0', '>= 1.0.0'
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: logstash-input-azure_blob_storage
 version: !ruby/object:Gem::Version
-  version: 0.10.7
+  version: 0.11.0
 platform: ruby
 authors:
 - Jan Geertsma
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2019-11-07 00:00:00.000000000 Z
+date: 2019-11-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement
@@ -43,7 +43,7 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.1'
+        version: '1.0'
   name: azure-storage-blob
   prerelease: false
   type: :runtime
@@ -51,7 +51,7 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.1'
+        version: '1.0'
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement
     requirements: