logstash-input-azure_blob_storage 0.11.4 → 0.11.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -1
- data/README.md +7 -2
- data/lib/logstash/inputs/azure_blob_storage.rb +60 -22
- data/logstash-input-azure_blob_storage.gemspec +2 -2
- metadata +2 -22
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3d446aed971a95e6e17a27ed1e9ec8b141f939b53697fb9c332cfb130404745a
+  data.tar.gz: 4a1321f6c6a30f6787d2133642ca23840371d6f4e18102cb775d345b09eb176a
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b4f48a0bebcd6e3594584a4473b223838359d44e9ef591f958aa4c80c4c22953f6b0f708b19faeaf0517c66f47185bda4de75ab4e3618b23e2e7f23f71cb4bee
+  data.tar.gz: 508cd39ea159a4655e590f46ad0108c3b6e6de95ed575c4456da0230bae73fb384ecb7697ed710e7afb1542fe01cbd8a62130acedcbf0ba9c3040ace1f9d76d0
data/CHANGELOG.md
CHANGED
@@ -1,6 +1,13 @@
+## 0.11.5
+- Added optional filename into the message
+- plumbing for emulator, start_over not learning from registry
+
 ## 0.11.4
 - fixed listing 3 times, rather than retrying to list max 3 times
-- added
+- added option to migrate/save to using local registry
+- rewrote interval timing
+- reduced saving of registry to maximum once per interval, protect duplicate simultanious writes
+- added debug_timer for better tracing how long operations take
 - removing pipeline name from logfiles, logstash 7.6 and up have this in the log4j2 by default now
 - moved initialization from register to run. should make logs more readable
 
data/README.md
CHANGED
@@ -40,7 +40,11 @@ The registry_create_policy is used when the pipeline is started to either resume
 
 interval defines the minimum time the registry should be saved to the registry file (by default 'data/registry.dat'), this is only needed in case the pipeline dies unexpectedly. During a normal shutdown the registry is also saved.
 
-
+When registry_local_path is set to a directory, the registry is save on the logstash server in that directory. The filename is the pipe.id
+
+with registry_create_policy set to resume and the registry_local_path set to a directory where the registry isn't yet created, should load from the storage account and save the registry on the local server
+
+During the pipeline start for JSON codec, the plugin uses one file to learn how the JSON header and tail look like, they can also be configured manually.
 
 ## Running the pipeline
 The pipeline can be started in several ways.
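To illustrate the option introduced above, a minimal input block that combines a local registry with the resume policy might look like the sketch below; the account name, key, container and path are placeholders, not values taken from this diff.

  input {
      azure_blob_storage {
          storageaccount => "examplestorageaccount"
          access_key => "<access key from the portal>"
          container => "insights-logs-networksecuritygroupflowevent"
          registry_create_policy => "resume"
          registry_local_path => "/usr/share/logstash/plugin"
      }
  }

On the first run with these settings the plugin would fall back to the registry stored in the storage account (if any) and then keep the local copy up to date, which is the migration path the changelog describes.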
@@ -91,6 +95,7 @@ The log level of the plugin can be put into DEBUG through
 curl -XPUT 'localhost:9600/_node/logging?pretty' -H 'Content-Type: application/json' -d'{"logger.logstash.inputs.azureblobstorage" : "DEBUG"}'
 ```
 
+because debug also makes logstash chatty, there are also debug_timer and debug_until that can be used to print additional informantion on what the pipeline is doing and how long it takes. debug_until is for the number of events until debug is disabled.
 
 ## Other Configuration Examples
 For nsgflowlogs, a simple configuration looks like this
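As a small hedged illustration of those two options (the values are arbitrary, and treating debug_timer as a boolean is an assumption based on the changelog entry above, since its config line is not part of this diff), both go into the same input block:

  azure_blob_storage {
      # ... other required options (storageaccount, access_key, container) ...
      debug_timer => true
      debug_until => 100
  }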
@@ -176,7 +181,7 @@ filter {
 remove_field => ["subresponse"]
 remove_field => ["username"]
 remove_field => ["clientPort"]
-remove_field => ["port"]
+remove_field => ["port"]:0
 remove_field => ["timestamp"]
 }
 }
data/lib/logstash/inputs/azure_blob_storage.rb
CHANGED
@@ -25,6 +25,9 @@ config :storageaccount, :validate => :string, :required => false
 # DNS Suffix other then blob.core.windows.net
 config :dns_suffix, :validate => :string, :required => false, :default => 'core.windows.net'
 
+# For development this can be used to emulate an accountstorage when not available from azure
+#config :use_development_storage, :validate => :boolean, :required => false
+
 # The (primary or secondary) Access Key for the the storage account. The key can be found in the portal.azure.com or through the azure api StorageAccounts/ListKeys. For example the PowerShell command Get-AzStorageAccountKey.
 config :access_key, :validate => :password, :required => false
 
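Since use_development_storage is still commented out here (only plumbing, per the changelog), one way to test against a local emulator such as Azurite today is to go through the plugin's existing connection-string path, which the run method below hands to create_from_connection_string. A sketch under that assumption; the endpoint and the well-known devstoreaccount1 key are Azurite's published defaults, not values from this diff:

  input {
      azure_blob_storage {
          # Assumes the plugin's connection_string option; endpoint and key are Azurite's public defaults.
          connection_string => "DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;"
          container => "test-container"
      }
  }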
@@ -58,6 +61,7 @@ config :registry_create_policy, :validate => ['resume','start_over','start_fresh
 # Z00000000000000000000000000000000 2 ]}
 config :interval, :validate => :number, :default => 60
 
+config :addfilename, :validate => :boolean, :default => false, :required => false
 # debug_until will for a maximum amount of processed messages shows 3 types of log printouts including processed filenames. This is a lightweight alternative to switching the loglevel from info to debug or even trace
 config :debug_until, :validate => :number, :default => 0, :required => false
 
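The new addfilename flag defaults to false; enabling it makes every event carry the blob it came from, set via event.set('filename', name) on the codec path and merged as :filename for nsgflowlog events, as the later hunks in this file show. Usage is a single extra line in the input block:

  azure_blob_storage {
      # ... other options ...
      addfilename => true
  }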
@@ -127,11 +131,15 @@ def run(queue)
 unless conn.nil?
 @blob_client = Azure::Storage::Blob::BlobService.create_from_connection_string(conn)
 else
+# unless use_development_storage?
 @blob_client = Azure::Storage::Blob::BlobService.create(
 storage_account_name: storageaccount,
 storage_dns_suffix: dns_suffix,
 storage_access_key: access_key.value,
 )
+# else
+# @logger.info("not yet implemented")
+# end
 end
 
 @registry = Hash.new
@@ -167,7 +175,7 @@ def run(queue)
 if registry_create_policy == "start_fresh"
 @registry = list_blobs(true)
 save_registry(@registry)
-@logger.info("starting fresh,
+@logger.info("starting fresh, writing a clean the registry to contain #{@registry.size} blobs/files")
 end
 
 @is_json = false
@@ -223,6 +231,7 @@ def run(queue)
 newreg.store(name, { :offset => off, :length => file[:length] })
 if (@debug_until > @processed) then @logger.info("2: adding offsets: #{name} #{off} #{file[:length]}") end
 end
+# size nilClass when the list doesn't grow?!
 # Worklist is the subset of files where the already read offset is smaller than the file size
 worklist.clear
 worklist = newreg.select {|name,file| file[:offset] < file[:length]}
@@ -230,13 +239,19 @@
 
 # Start of processing
 # This would be ideal for threading since it's IO intensive, would be nice with a ruby native ThreadPool
-worklist.
+if (worklist.size > 0) then
+worklist.each do |name, file|
 start = Time.now.to_i
 if (@debug_until > @processed) then @logger.info("3: processing #{name} from #{file[:offset]} to #{file[:length]}") end
 size = 0
 if file[:offset] == 0
-
-
+# This is where Sera4000 issue starts
+begin
+chunk = full_read(name)
+size=chunk.size
+rescue Exception => e
+@logger.error("Failed to read #{name} because of: #{e.message} .. will continue and pretend this never happened")
+end
 else
 chunk = partial_read_json(name, file[:offset], file[:length])
 @logger.debug("partial file #{name} from #{file[:offset]} to #{file[:length]}")
@@ -245,7 +260,7 @@ def run(queue)
 res = resource(name)
 begin
 fingjson = JSON.parse(chunk)
-@processed += nsgflowlog(queue, fingjson)
+@processed += nsgflowlog(queue, fingjson, name)
 @logger.debug("Processed #{res[:nsg]} [#{res[:date]}] #{@processed} events")
 rescue JSON::ParserError
 @logger.error("parse error on #{res[:nsg]} [#{res[:date]}] offset: #{file[:offset]} length: #{file[:length]}")
@@ -259,6 +274,9 @@ def run(queue)
 begin
 @codec.decode(chunk) do |event|
 counter += 1
+if @addfilename
+event.set('filename', name)
+end
 decorate(event)
 queue << event
 end
@@ -279,6 +297,7 @@ def run(queue)
 if ((Time.now.to_i - @last) > @interval)
 save_registry(@registry)
 end
+end
 end
 # The files that got processed after the last registry save need to be saved too, in case the worklist is empty for some intervals.
 now = Time.now.to_i
@@ -326,8 +345,7 @@ def strip_comma(str)
 end
 
 
-
-def nsgflowlog(queue, json)
+def nsgflowlog(queue, json, name)
 count=0
 json["records"].each do |record|
 res = resource(record["resourceId"])
@@ -340,9 +358,16 @@ def nsgflowlog(queue, json)
 tups = tup.split(',')
 ev = rule.merge({:unixtimestamp => tups[0], :src_ip => tups[1], :dst_ip => tups[2], :src_port => tups[3], :dst_port => tups[4], :protocol => tups[5], :direction => tups[6], :decision => tups[7]})
 if (record["properties"]["Version"]==2)
+tups[9] = 0 if tups[9].nil?
+tups[10] = 0 if tups[10].nil?
+tups[11] = 0 if tups[11].nil?
+tups[12] = 0 if tups[12].nil?
 ev.merge!( {:flowstate => tups[8], :src_pack => tups[9], :src_bytes => tups[10], :dst_pack => tups[11], :dst_bytes => tups[12]} )
 end
 @logger.trace(ev.to_s)
+if @addfilename
+ev.merge!( {:filename => name } )
+end
 event = LogStash::Event.new('message' => ev.to_json)
 decorate(event)
 queue << event
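To make the tuple indices in the hunk above concrete, here is a self-contained Ruby sketch with an invented version 2 flow tuple; the field order (timestamp, source/destination IP and port, protocol, direction, decision, then flow state and packet/byte counters) follows the publicly documented NSG flow log v2 format, and the counters in positions 9 to 12 are exactly the ones the nil-guards default to 0, since tuples in flow state "B" carry no counters yet.

  require 'json'

  # Invented NSG flow log v2 tuple, comma separated (not taken from this diff).
  tup  = "1542110377,10.0.0.4,13.67.143.118,44931,443,T,O,A,E,25,4242,30,3546"
  tups = tup.split(',')
  ev = { :unixtimestamp => tups[0], :src_ip => tups[1], :dst_ip => tups[2],
         :src_port => tups[3], :dst_port => tups[4], :protocol => tups[5],
         :direction => tups[6], :decision => tups[7] }
  # Version 2 appends flow state plus packet/byte counters; state "B" tuples have none,
  # which is why the plugin defaults tups[9]..tups[12] to 0 before the merge.
  ev.merge!( { :flowstate => tups[8], :src_pack => tups[9], :src_bytes => tups[10],
               :dst_pack => tups[11], :dst_bytes => tups[12] } )
  puts ev.to_json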
@@ -429,10 +454,10 @@ def save_registry(filelist)
 @busy_writing_registry = true
 unless (@registry_local_path)
 @blob_client.create_block_blob(container, registry_path, Marshal.dump(filelist))
-@logger.info("processed #{@processed} events, saving #{filelist.size} blobs and offsets to registry #{registry_path}")
+@logger.info("processed #{@processed} events, saving #{filelist.size} blobs and offsets to remote registry #{registry_path}")
 else
 File.open(@registry_local_path+"/"+@pipe_id, 'w') { |file| file.write(Marshal.dump(filelist)) }
-@logger.info("processed #{@processed} events, saving #{filelist.size} blobs and offsets to registry #{registry_local_path+"/"+@pipe_id}")
+@logger.info("processed #{@processed} events, saving #{filelist.size} blobs and offsets to local registry #{registry_local_path+"/"+@pipe_id}")
 end
 @busy_writing_registry = false
 @last = Time.now.to_i
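Because both branches above write the registry with Marshal.dump, and the local file is named after the pipeline id (per the README change earlier in this diff), the local registry can be inspected from an ordinary Ruby console. A small sketch; the path is a placeholder built from registry_local_path plus the pipeline id:

  # Path is a placeholder: <registry_local_path>/<pipeline id>.
  registry = Marshal.load(File.binread("/usr/share/logstash/plugin/main"))
  registry.each do |name, file|
    puts "#{name}: read #{file[:offset]} of #{file[:length]} bytes"
  end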
@@ -446,21 +471,34 @@ def save_registry(filelist)
 end
 end
 
+
 def learn_encapsulation
 # From one file, read first block and last block to learn head and tail
-
-
-
-
-
-
-
-
-
-
-
-
-
+begin
+blobs = @blob_client.list_blobs(container, { maxresults: 3, prefix: @prefix})
+blobs.each do |blob|
+unless blob.name == registry_path
+begin
+blocks = @blob_client.list_blob_blocks(container, blob.name)[:committed]
+if blocks.first.name.start_with?('A00')
+@logger.debug("using #{blob.name}/#{blocks.first.name} to learn the json header")
+@head = @blob_client.get_blob(container, blob.name, start_range: 0, end_range: blocks.first.size-1)[1]
+end
+if blocks.last.name.start_with?('Z00')
+@logger.debug("using #{blob.name}/#{blocks.last.name} to learn the json footer")
+length = blob.properties[:content_length].to_i
+offset = length - blocks.last.size
+@tail = @blob_client.get_blob(container, blob.name, start_range: offset, end_range: length-1)[1]
+@logger.debug("learned tail: #{@tail}")
+end
+rescue Exception => e
+@logger.info("learn json one of the attempts failed #{e.message}")
+end
+end
+end
+rescue Exception => e
+@logger.info("learn json header and footer failed because #{e.message}")
+end
 end
 
 def resource(str)
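If this block-based learning finds no A00/Z00 block names, the JSON wrapper can be configured by hand instead, as the README notes. A hedged sketch in which the file_head and file_tail option names (and their nsgflowlog-style defaults) are assumptions taken from the plugin's documentation rather than from this diff:

  azure_blob_storage {
      # ... other options ...
      codec => "json"
      file_head => '{"records":['
      file_tail => ']}'
  }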
data/logstash-input-azure_blob_storage.gemspec
CHANGED
@@ -1,6 +1,6 @@
 Gem::Specification.new do |s|
 s.name = 'logstash-input-azure_blob_storage'
-s.version = '0.11.
+s.version = '0.11.5'
 s.licenses = ['Apache-2.0']
 s.summary = 'This logstash plugin reads and parses data from Azure Storage Blobs.'
 s.description = <<-EOF
@@ -23,5 +23,5 @@ EOF
 s.add_runtime_dependency 'logstash-core-plugin-api', '~> 2.1'
 s.add_runtime_dependency 'stud', '~> 0.0.23'
 s.add_runtime_dependency 'azure-storage-blob', '~> 1.1'
-s.add_development_dependency 'logstash-devutils', '~>
+#s.add_development_dependency 'logstash-devutils', '~> 2'
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: logstash-input-azure_blob_storage
 version: !ruby/object:Gem::Version
-version: 0.11.
+version: 0.11.5
 platform: ruby
 authors:
 - Jan Geertsma
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-
+date: 2020-12-19 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
 requirement: !ruby/object:Gem::Requirement
@@ -52,26 +52,6 @@ dependencies:
 - - "~>"
 - !ruby/object:Gem::Version
 version: '1.1'
-- !ruby/object:Gem::Dependency
-requirement: !ruby/object:Gem::Requirement
-requirements:
-- - ">="
-- !ruby/object:Gem::Version
-version: 1.0.0
-- - "~>"
-- !ruby/object:Gem::Version
-version: '1.0'
-name: logstash-devutils
-type: :development
-prerelease: false
-version_requirements: !ruby/object:Gem::Requirement
-requirements:
-- - ">="
-- !ruby/object:Gem::Version
-version: 1.0.0
-- - "~>"
-- !ruby/object:Gem::Version
-version: '1.0'
 description: " This gem is a Logstash plugin. It reads and parses data from Azure\
 \ Storage Blobs. The azure_blob_storage is a reimplementation to replace azureblob\
 \ from azure-diagnostics-tools/Logstash. It can deal with larger volumes and partial\