fluent-plugin-cmdaa-stat 0.1.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: 601c84dcbb48331b0571ca1d25e915b51b1dd4a255b73367b0fdba2ce41e441d
+   data.tar.gz: 71943fc238a5d8bef0dc75cfa334ec34281710aa39569b50cdbc614e9c16a614
+ SHA512:
+   metadata.gz: 6be0631ff00754659053159cdb0b6794d391c6aa5c74f9399f3f62866fe5f5e78640fb99850f4d9f00e56f4a37b6e9cc71e18594c1776cdad27b2b958a6c11b8
+   data.tar.gz: f52fa8ca95de31cf0cf4a1e99b3b6b738ff740d7e3d3252cd8930695e5ea9820846861b1c5e1153efc1e53cee5ffab8708d4c4efbc60e914a9fafcd5827f2ca4
@@ -0,0 +1,614 @@
+ #
+ # Copyright 2018- Mark Pohl
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ require "fluent/plugin/filter"
+ require "fluent/event"
+ require "statistics2"
+ require "digest"
+ require "rest-client"
+ require "json" # JSON.parse of REST responses
+ require "uri"  # URI.escape for encoding request parameters
+ require "time" # Time#iso8601 in debug logging
+
+ module Fluent::Plugin
+   class CmdaaStatFilter < Filter
+     Fluent::Plugin.register_filter("cmdaa_stat", self)
+
+     helpers :compat_parameters, :inject
+
+     desc "The length of the total data-collection window, in seconds."
+     config_param :total_time, :integer, default: 1800
+     desc "The length of one time slice, in seconds."
+     config_param :time_slice, :integer, default: 30
+     desc "The URL for connecting to the database."
+     config_param :db_url, :string, default: "http://localhost:8080"
+     desc "The q-value threshold for flagging anomalous log entries."
+     config_param :q_value, :float, default: 0.95
+     desc "Whether the process being monitored runs in a Docker container."
+     config_param :docker_container, :bool, default: true
+
+     # We keep a hash keyed by tag so that we only have to go to the
+     # database once per log source; it is set up in filter().
+     $log.info("CmdaaStatFilter object version 0.1.14 is started!")
+
+     def configure(conf)
+       compat_parameters_convert(conf, :inject)
+       super
+       $log.info("CmdaaStatFilter has run configure. url value is #{db_url}")
+       $log.debug("In configure: time slice is #{time_slice}, total time is #{total_time}")
+     end
+
+     def start
+       super
+       $log.info("CmdaaStatFilter has run start")
+
+       # The cutoff for measuring the uniqueness of a log line. We might
+       # want this to be configurable in a database.
+       @qValue = q_value
+
+       # Seed the random number generator, which is used in calculating the
+       # time offset to wait before updating the database with the current
+       # data.
+       srand
+       # Get the number of servers in this system.
+       @serverCount = get_server_count
+
+       # For our purposes we don't want the server count to be lower than
+       # 10, since we use the value only for timing database reads/writes
+       # and coordinating them so that the servers don't all request a
+       # read/write at the same time.
+       if @serverCount < 10
+         @serverCount = 10
+       end
+
+       # Might want this in a database so that it can be configured per
+       # installation.
+       @updateTimeCost = 3 # seconds it takes to update the database
+
+       # Structure of the per-tag bookkeeping:
+       #
+       # Top-level hash
+       #   tag => hash containing "logId", "newDataArray", and "oldData"
+       # Next level
+       #   "logId"        => numeric identifier for this log file
+       #   "newDataArray" => array with (total_time / time_slice) elements;
+       #                     each element is a hash of counts per template
+       #                     identifier (md5Id) for that time slice
+       #   "oldData"      => hash of historical counts keyed by the md5
+       #                     hash from the database
+       # Next level (both the old and the new data hashes)
+       #   md5 of the log line's template => count for that md5Id
+
+       @logInfoHash = Hash.new(0)
+       # @logInfoHash["0"] = 0
+       # The begin time for the current time span.
+       @startTime = Time.new
+       # The end time for the current time span. When the span is shifted,
+       # this becomes the new startTime and we create a new stopTime by
+       # adding total_time to it.
+       @stopTime = @startTime + total_time
+
+       @updateTime = @stopTime + get_update_time_offset()
+       @stopTimeSlice = @startTime + time_slice
+       @sliceIndex = 0
+       @totalIncrements = total_time / time_slice
+
+       # We need to know whether we have just started, because the arrays
+       # and hashes are not fully populated until the second pass. This
+       # affects the algorithm, so we track it and set it to false after we
+       # have gone through the first full iteration of time slices.
+       @firstIteration = true
+     end
+
+     def shutdown
+       super
+       $log.info("CmdaaStatFilter has run shutdown.")
+     end
+
+     def getChiSqrScore(countNew, totalNew, countOld, totalOld)
+       countNew = countNew.to_f
+       countOld = countOld.to_f
+       totalNew = totalNew.to_f
+       totalOld = totalOld.to_f
+
+       p = countNew / totalNew
+       q = countOld / totalOld
+
+       if p < q
+         return 0
+       end
+
+       t = (countNew + countOld) / (totalOld + totalNew)
+
+       if t == 0
+         return nil
+       end
+
+       v = countNew * Math::log(p/t) + countOld * Math::log(q/t)
+
+       if t == 1
+         # Return the absolute value: if countOld is zero it seems like we
+         # should return 1 instead of -1, because a countOld of zero means
+         # we have never seen this value before!
+         return Statistics2.chi2X_(1, 2*v).abs
+       end
+
+       if p < 1
+         v = v + ((totalNew - countNew) * Math::log((1-p)/(1-t)))
+       end
+
+       if q < 1
+         v = v + ((totalOld - countOld) * Math::log((1-q)/(1-t)))
+       end
+       # Return the absolute value here too, for the same reason as above.
+       return Statistics2.chi2X_(1, 2*v).abs
+     end
+
+     # Get the number of servers in the system.
+     def get_server_count
+       get_server_count_url = db_url + '/server/find-all-servers'
+       serverResponse = RestClient.get get_server_count_url
+       serverCount = 0
+       if serverResponse.code != 200
+         # just return a zero
+         serverCount = 0
+       else
+         serverList = serverResponse.body
+         jsonArray = JSON.parse(serverList)
+         serverCount = jsonArray.size
+       end
+
+       return serverCount
+     end
+
+     def resetTimeInterval(stopTime)
+       @startTime = stopTime
+       @stopTime = @startTime + total_time
+     end
+
+     # We need a way to create a "hash" for the regex. Since the grok_parser
+     # doesn't let us tag individual groks, the best approach is to
+     # concatenate all of the field names and compute an MD5 hash from that
+     # string. This should work no matter how the grok is configured and
+     # should be unique for each log source. We will need to ensure that the
+     # groks have unique field names. Can we assign each grok a number and
+     # add that number to the field name to ensure the grok id is unique?
+
+     # This method collects the relevant keys from the message (the tag,
+     # minus the "cmdaa." prefix, is stripped first). Since the keys are not
+     # guaranteed to be in order, they are lowercased and sorted to ensure a
+     # consistent MD5 is computed each time. Then the MD5 of the
+     # corresponding values is calculated and returned.
+     def create_line_hash(tag, message)
+       tag = tag.gsub('cmdaa.', '')
+       all_keys = Array[]
+       all_values = Array[]
+       # all_keys.push(tag)
+       message.each do |key, value|
+         if key.start_with?("CMDA_XXXX")
+           all_keys.push(key)
+         end
+       end
+       # If we define the keys ourselves we don't really need the downcase
+       # operation, but keep it for now. Note sort_by! (not sort_by), so the
+       # array is actually sorted in place rather than the result discarded.
+       all_keys.sort_by! { |word| word.downcase }
+       # $log.debug("all_keys in order is #{all_keys}")
+       all_keys.each { |key| all_values.push(message[key]) }
+       # $log.debug("all_values in order is #{all_values.to_s}")
+       Digest::MD5.hexdigest(all_values.to_s())
+     end
+
+     # The function above is not really needed now that the grok maker
+     # allows us to label the grok. Here is a much simpler version that
+     # still makes an MD5 value, since that is what the database expects,
+     # but we create it from the grok_name value.
+     def create_md5_from_grok_name(message)
+       Digest::MD5.hexdigest(message["grok_name"])
+     end
+
+     # This is a method so that we can easily change it to get constants
+     # from the database if we need to. The serverCount value could be
+     # stored in the database and populated when the system was set up. For
+     # now we assume at least 10 servers and that any update to the
+     # database takes 3 seconds or less.
+     def get_update_time_offset
+       timeOffset = @serverCount.to_f / @updateTimeCost.to_f
+       # Return a random number between 0 and timeOffset.to_i. This random
+       # selection tries to reduce the number of systems trying to update
+       # the database at the same time.
+       return rand(timeOffset.to_i)
+     end
+
+     # Get the logId using the log name, so we only need to call the
+     # database once; the logId is then passed to the other db methods.
+     def get_logid(tag)
+       # We parse the log name from the tag; the tag has the file and path
+       # name after the text 'cmdaa'. Since lots of filenames have a '.' in
+       # them (i.e. .log) we can't know whether the last '.' is part of the
+       # file name or a replacement for the last '/'. This code builds a
+       # string with a '_' in place of the last '.', which lets the SQL LIKE
+       # operation find the file/path in either case.
+
+       # Proper placement of the underscore requires some splitting and
+       # array operations. The tag starts with 'cmdaa' and each element is
+       # separated by a '.'; all elements after 'cmdaa' are directory levels
+       # except the last one, which should be the filename.
+       tmpStrArr = tag.gsub('cmdaa.', '').split('.')
+
+       # Remove the last path segment and save it!
+       lastStr = tmpStrArr.pop
+
+       if docker_container
+         # Pop two more elements to get the one that is unique for Docker
+         # logs. Kubernetes logs are usually stored in directories with this
+         # structure: /var/log/pods/containerid/pod-name/0.log (it is
+         # unclear whether the zero is constant or increments). Also note
+         # that in Kubernetes there are multiple levels of file links before
+         # you get to the actual file: the top-level link is
+         # /var/log/containers, which points to /var/log/pods/, which points
+         # to /var/lib/docker/containers, where the actual file resides.
+         lastStr = tmpStrArr.pop
+         lastStr = tmpStrArr.pop
+         fileName = "/VAR/LOG/PODS/%/" + lastStr.upcase + "/%LOG"
+       else
+         # If the environment is not Docker, Kubernetes, or Rancher, do it
+         # this way.
+         fileName = ''
+         tmpStrArr.each { |str| fileName = fileName + '/' + str }
+
+         # Add the underscore with the last path segment after it!
+         fileName = fileName + '_' + lastStr
+       end
+       $log.info("Filename to search for in db is " + fileName)
+       # $log.debug("db_url is #{db_url} and fileName is #{fileName}")
+       # Create the URL for the REST call; /log-file/find-by-filename-like
+       # is the endpoint in the REST service.
+       get_log_url = db_url + '/log-file/find-by-filename-like/'
+       # $log.debug("input to rest call is " + get_log_url + URI.escape('{"fileName":"' + fileName + '"}', Regexp.new("[^#{URI::PATTERN::UNRESERVED}]")))
+       logIdResponse = RestClient.get get_log_url + URI.escape('{"fileName":"' + fileName + '"}', Regexp.new("[^#{URI::PATTERN::UNRESERVED}]"))
+       # logId = RestClient.get "http://192.168.88.137:8001/api/v1/namespaces/default/services/cmdaa:8080/proxy/log-file/find-by-filename-like/%7B%22fileName%22%3A%22%2Ffluentd%2Ftest%2Ftest_log%22%7D"
+       # $log.debug("logIdResponse code is #{logIdResponse.code}")
+       # $log.debug("logIdResponse body is #{logIdResponse.body}")
+       if logIdResponse.code != 200
+         # return something empty
+         logId = 0
+       else
+         logId = logIdResponse.body
+       end
+
+       return logId
+     end
+
+     # When this process is started, get the data from the database service
+     # for this log source.
+     def get_old_counts(logId)
+       # Use logId to get the list of existing shift counts via a REST call.
+       get_data_url = db_url + "/log-line-count/find-all-lines-by-log-file/#{logId}"
+       $log.debug("input to rest call for get_old_counts is #{get_data_url}")
+       old_counts_list = RestClient.get get_data_url
+       if old_counts_list.code != 200
+         # create something empty to return
+         oldHashData = Hash.new(0)
+       else
+         # Process the JSON and create a new hash for oldHashData.
+         oldHashData = Hash.new(0)
+         jsonArray = JSON.parse(old_counts_list.body)
+         # Make sure we found something. If this is empty we still return a
+         # hash; it will just be empty.
+         if jsonArray.size > 0
+           jsonArray.each { |value|
+             oldHashData[value["md5Id"]] = value["count"]
+             $log.debug("oldData key is #{value["md5Id"]} oldData count is #{oldHashData[value["md5Id"]]}")
+           }
+         end
+       end
+
+       # If there are no counts available, this returns an empty hash; if
+       # counts are available, it returns the list from the REST call
+       # reformatted as a hash with the md5 value as key and the count as
+       # value.
+       return oldHashData
+     end
+
+     # Calculate the total number of counts from the individual counts in
+     # the hash passed in.
+     def get_total(countHash)
+       totalCounts = 0
+       countHash.each_value { |value| totalCounts = totalCounts + value }
+
+       return totalCounts
+     end
+
+     # Call a web service to add the current data counts to the database.
+     def update_counts_db(logId, countList)
+       currentTime = Time.now
+       $log.debug("in update_counts_db, time before db call is #{currentTime.utc.iso8601}")
+
+       insertHash = Hash.new(0)
+       jsonOut = ""
+
+       # $log.debug("countList as a string is #{countList.to_s}")
+
+       # Go through the countList hash and construct a string in JSON
+       # format for sending all records to the database.
+       countList.each { |key, value|
+         insertHash["logId"] = logId
+         insertHash["md5Id"] = key
+         insertHash["count"] = value
+
+         # If this is not the first time through this loop, append the new
+         # record to the end of the string; otherwise just assign the new
+         # JSON string to jsonOut.
+         if jsonOut != ""
+           jsonOut = jsonOut + "," + insertHash.to_json
+         else
+           jsonOut = insertHash.to_json
+         end
+       }
+
+       # Add brackets to the beginning and end of the JSON string, since
+       # the service expects a JSON array.
+       jsonOut = "[" + jsonOut + "]"
+       # $log.debug("jsonOut as a string is #{jsonOut}")
+
+       # Construct the URL for sending the data to the log-line-count
+       # service. The data is sent as a datatype with 3 elements. Inside
+       # the service these elements are put into an insert/update
+       # statement: the SQL first tries an insert, and if the record
+       # already exists an update is issued instead.
+       post_log_url = db_url + '/log-line-count/update-transaction/'
+       # $log.debug("url for post is #{post_log_url}")
+       # RestClient.post expects at least two parameters, but our service
+       # takes no named parameters, so we pass an empty second argument;
+       # this seems to work. The URI.escape call is the Ruby way to encode
+       # a URL.
+       return_value = RestClient.post post_log_url + URI.escape(jsonOut, Regexp.new("[^#{URI::PATTERN::UNRESERVED}]")), ""
+
+       currentTime = Time.now
+       $log.debug("in update_counts_db, time after db call is #{currentTime.utc.iso8601}")
+     end
+
+     def filter(tag, time, record)
+       # Checking the tag lets us handle multiple logs at once: a fluentd
+       # cmdaa_stat configuration might tail more than one log. By checking
+       # the tag here we keep a separate hash, and therefore a completely
+       # separate data set, for each log file tracked under the same
+       # cmdaa_stat <filter> configuration.
+       $log.debug("In filter: time slice is #{time_slice}, total time is #{total_time}, tag is #{tag}")
+       if @logInfoHash[tag] == 0
+         @logInfoHash[tag] = Hash.new(0)
+         # Get the logId from the database. Use the tag value to create a
+         # file or path with a wildcard entry that should find the log file
+         # entry in the database and return the logId that identifies it.
+         @logInfoHash[tag]["logId"] = get_logid(tag)
+
+         # Create a new array with an element for each time slice (i.e.
+         # increment). Each element is a hash whose keys are MD5_IDs and
+         # whose values count each occurrence of that MD5_ID in the log
+         # data coming through this filter.
+         @logInfoHash[tag]["newDataArray"] = Array.new(@totalIncrements) { Hash.new(0) }
+
+         # Create an array whose elements count the total number of log
+         # lines seen in each time slice (i.e. increment).
+         @logInfoHash[tag]["newIncrementTotals"] = Array.new(@totalIncrements, 0)
+
+         # Create a hash holding the total number of log lines that match a
+         # particular MD5_ID over the entire time span (total_time). The
+         # key is the MD5_ID; the value is how many times we have seen a
+         # log line that computes to that MD5_ID.
+         @logInfoHash[tag]["newTotal"] = Hash.new(0)
+
+         # Initialize a new total count for this time span and this log
+         # file. Every time we see a log line for this file we increment
+         # this value by 1; when we are done with this time span
+         # (total_time) we reset it to 0.
+         @logInfoHash[tag]["newTotalAll"] = 0
+
+         # Get the data currently in the database for this logFileId.
+         @logInfoHash[tag]["oldData"] = get_old_counts(@logInfoHash[tag]["logId"])
+
+         # Get the total for all the values in the oldData.
+         @logInfoHash[tag]["oldDataTotal"] = get_total(@logInfoHash[tag]["oldData"])
+
+         # Create a new array whose elements hold the updates to be sent to
+         # the database: one index holds the data currently being sent
+         # (sendingIndex) and the other holds the data currently being
+         # counted (summingIndex).
+         @logInfoHash[tag]["updateGlobal"] = Array.new(2) { Hash.new(0) }
+
+         @summingIndex = 0
+         @sendingIndex = 1
+       end
+
+       if @logInfoHash[tag]["oldData"].size == 0
+         oldDataPresent = false
+       else
+         oldDataPresent = true
+       end
+
+       # Get the current time.
+       currentTime = Time.new
+
+       # If the current time slice has expired, move to the next slice and
+       # reset the expiration time for the next one. We also need to check
+       # whether we are past the last slice; if so, we start again at zero
+       # and fold the oldest slice into the old data.
+       if (currentTime - @stopTimeSlice) >= 0
+         # Increment the slice index since the time has expired for this
+         # time slice.
+         @sliceIndex = @sliceIndex + 1
+
+         # If we have gone past the total number of increments we need to
+         # do some housekeeping, like setting @sliceIndex back to zero so
+         # that we start over again with our index.
+         if @sliceIndex >= @totalIncrements
+           @sliceIndex = 0
+
+           # We need to know whether this is the first time through the
+           # time span, since the first pass is handled differently: on the
+           # first pass there is no data in the "old" time slice elements,
+           # so we can't add them to anything.
+           if @firstIteration == true
+             @firstIteration = false
+           end
+         end
+
+         # Set the new time point for the end of the current time slice.
+         @stopTimeSlice = @stopTimeSlice + time_slice
+
+         # NOTE: This seems strange, but after we have iterated through all
+         # the time slices and start again, the oldest time slice is the
+         # one we are about to overwrite. It is also the one we want to add
+         # to the oldData and subtract from the new data. Therefore we use
+         # @sliceIndex to set @oldestSliceIndex and read those values
+         # before they are overwritten. This differs from how Jeff's
+         # original algorithm was written, but I think this is correct.
+         #
+         # Also, in the first iteration there is no data in the "oldData"
+         # arrays and hashes, so we skip this code during the first pass
+         # through the time slices.
+         if @firstIteration == false
+           @oldestSliceIndex = @sliceIndex
+           @logInfoHash[tag]["newTotalAll"] = @logInfoHash[tag]["newTotalAll"] - @logInfoHash[tag]["newIncrementTotals"][@oldestSliceIndex]
+           @logInfoHash[tag]["oldDataTotal"] = @logInfoHash[tag]["oldDataTotal"] + @logInfoHash[tag]["newIncrementTotals"][@oldestSliceIndex]
+           @logInfoHash[tag]["newDataArray"][@oldestSliceIndex].each { |key, value|
+             @logInfoHash[tag]["newTotal"][key] = @logInfoHash[tag]["newTotal"][key] - value
+             if oldDataPresent
+               @logInfoHash[tag]["oldData"][key] = @logInfoHash[tag]["oldData"][key] + value
+             end
+             @logInfoHash[tag]["newDataArray"][@oldestSliceIndex][key] = 0
+           }
+           @logInfoHash[tag]["newIncrementTotals"][@oldestSliceIndex] = 0
+         end
+         # Add the values for slice (@sliceIndex - 1) to the updateGlobal
+         # array/hash. We always do this; we don't skip the first
+         # iteration. The -1 is because we added 1 to @sliceIndex above,
+         # and we need the values that were just counted.
+         @logInfoHash[tag]["newDataArray"][@sliceIndex - 1].each { |key, value|
+           @logInfoHash[tag]["updateGlobal"][@summingIndex][key] = @logInfoHash[tag]["updateGlobal"][@summingIndex][key] + value
+         }
+       end
+
+       # $log.debug(" Outside Time if...logId from logInfoHash is #{@logInfoHash.to_s}")
+       # Not sure about this code and how it fits in with the algorithm;
+       # it needs a bit more research. As far as I can tell there is some
+       # overlap between the usefulness of this larger time interval (i.e.
+       # total_time) and the interval we need/want for the global_counts
+       # update. Might need to check with Jeff on this.
+       if (currentTime - @stopTime) >= 0
+         resetTimeInterval(currentTime)
+         # Do all the other things that need to be done here: write records
+         # to the database for all hash values for all logIds, shift the
+         # current counts to be old counts, and get ready for the new
+         # counts.
+         @logInfoHash.each_key { |key|
+           # Copy counts from the summingIndex hashes to the sendingIndex
+           # hashes and reset the summingIndex hash values to 0.
+           @logInfoHash[key]["updateGlobal"][@summingIndex].each { |key2, value2|
+             @logInfoHash[key]["updateGlobal"][@sendingIndex][key2] = value2
+             @logInfoHash[key]["updateGlobal"][@summingIndex][key2] = 0
+           }
+           # Refresh the old data from the database. This can be done at
+           # almost any time, but this looks like the most logical place.
+           @logInfoHash[key]["oldData"] = get_old_counts(@logInfoHash[key]["logId"])
+
+           # Get the total for all the values in the oldData.
+           @logInfoHash[key]["oldDataTotal"] = get_total(@logInfoHash[key]["oldData"])
+         }
+       end
+
+       if (currentTime - @updateTime) >= 0
+         # Send the update to the database and then reset @updateTime. The
+         # update runs in a subprocess so we get "multi-threading"; we do
+         # not check for a return value, since that would require a wait.
+         $log.debug("Before @updateTime has been reset it is #{@updateTime.utc.iso8601}")
+         @updateTime = @stopTime + get_update_time_offset()
+         $log.debug("After @updateTime has been reset it is #{@updateTime.utc.iso8601}")
+
+         # Fork a subprocess to do these updates so there is no waiting.
+         pid = fork do
+           @logInfoHash.each_key { |key|
+             update_counts_db(@logInfoHash[key]["logId"], @logInfoHash[key]["updateGlobal"][@sendingIndex])
+           }
+         end
+         # Tell Ruby that we don't want to hold on to this subprocess once
+         # it completes; when it finishes, just let it exit quietly.
+         # Supposedly one can use the thread returned by Process.detach to
+         # get the status of the subprocess after it has completed, but I
+         # can't seem to get that to work.
+         Process.detach(pid)
+       end
+
+       # We use the grok_name key, so we don't need to build a string
+       # before the MD5 transformation.
+       # rec_hash = create_line_hash(tag, record)
+       rec_hash = create_md5_from_grok_name(record)
+       $log.debug("md5 hash is #{rec_hash}. tag is #{tag}")
+       $log.debug("logFileId is #{@logInfoHash[tag]["logId"]}")
+
+       @logInfoHash[tag]["newDataArray"][@sliceIndex][rec_hash] = @logInfoHash[tag]["newDataArray"][@sliceIndex][rec_hash] + 1
+       @logInfoHash[tag]["newTotal"][rec_hash] = @logInfoHash[tag]["newTotal"][rec_hash] + 1
+       @logInfoHash[tag]["newIncrementTotals"][@sliceIndex] = @logInfoHash[tag]["newIncrementTotals"][@sliceIndex] + 1
+       @logInfoHash[tag]["newTotalAll"] = @logInfoHash[tag]["newTotalAll"] + 1
+
+       # We can only calculate a score if we have old data!
+       if oldDataPresent
+         $log.debug("parameters passed into getChiSqrScore are #{@logInfoHash[tag]["newDataArray"][@sliceIndex][rec_hash]} and #{@logInfoHash[tag]["newTotalAll"]} and #{@logInfoHash[tag]["oldData"][rec_hash]} and #{@logInfoHash[tag]["oldDataTotal"]}")
+         # This score gives us something like (or maybe exactly like) a
+         # q-value. Typically anything >= 0.95 is considered significant,
+         # but we might want that parameterized.
+         score = getChiSqrScore(@logInfoHash[tag]["newDataArray"][@sliceIndex][rec_hash],
+                                @logInfoHash[tag]["newTotalAll"],
+                                @logInfoHash[tag]["oldData"][rec_hash],
+                                @logInfoHash[tag]["oldDataTotal"])
+         $log.debug("for message: #{record.to_s}")
+         $log.debug("the score is #{score}")
+         if score >= @qValue
+           $log.info("Found an anomaly!! score is #{score}")
+           $log.info("Anomaly is found for message: #{record.to_s}")
+         end
+       end
+
+       # A filter must return the record so it continues down the pipeline;
+       # without this, every record would be dropped.
+       record
+     end
+   end
+ end
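For orientation before the next file in the diff: the five config_param entries above map one-to-one onto a fluentd <filter> block. Below is a minimal sketch of how the filter might be declared, assuming an upstream source that tags records with a "cmdaa."-prefixed file path and a grok parser that sets a grok_name field (both are assumptions based on what the code reads, not a shipped example):

<filter cmdaa.**>
  @type cmdaa_stat
  # whole counting window and one ring-buffer slice, in seconds
  total_time 1800
  time_slice 30
  # REST service that stores the historical per-template counts
  db_url http://localhost:8080
  # chi-squared score threshold for flagging a line as anomalous
  q_value 0.95
  # whether tags name Kubernetes/Docker log paths
  docker_container true
</filter>

With the defaults, the filter keeps 1800 / 30 = 60 slice hashes per tag and shifts the oldest slice into the historical counts each time a slice expires.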
@@ -0,0 +1,45 @@
+
+ require "statistics2"
+
+ def getChiSqrScore(countNew, totalNew, countOld, totalOld)
+   countNew = countNew.to_f
+   countOld = countOld.to_f
+   totalNew = totalNew.to_f
+   totalOld = totalOld.to_f
+
+   p = countNew / totalNew
+   q = countOld / totalOld
+
+   if p < q
+     return 0
+   end
+
+   t = (countNew + countOld) / (totalOld + totalNew)
+
+   if t == 0
+     return nil
+   end
+
+   v = countNew * Math::log(p/t) + countOld * Math::log(q/t)
+
+   if t == 1
+     return Statistics2.chi2X_(1, 2*v)
+   end
+
+   if p < 1
+     v = v + ((totalNew - countNew) * Math::log((1-p)/(1-t)))
+   end
+
+   if q < 1
+     v = v + ((totalOld - countOld) * Math::log((1-q)/(1-t)))
+   end
+
+   return Statistics2.chi2X_(1, 2*v)
+ end
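A quick illustration of the scorer's behavior before the test scaffolding. The counts below are invented and the require path is hypothetical (the file ships as lib/fluent/plugin/log-likelihood.rb); the exact score depends on statistics2's chi2X_, but the early-return branches are fixed by the code above:

require_relative "log-likelihood"

# Invented counts: a template seen 40 times in 100 recent lines versus
# 5 times in 1000 historical lines -- a large proportional jump, so the
# p < q early return is not taken and a chi-squared score is computed.
score = getChiSqrScore(40, 100, 5, 1000)
puts "score = #{score}"
# The plugin treats score >= q_value (default 0.95) as anomalous.
puts "anomaly!" if score && score >= 0.95

# When the recent rate is no higher than the historical rate (p < q),
# the function returns 0 and nothing is flagged.
puts getChiSqrScore(5, 1000, 40, 100)   # => 0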
@@ -0,0 +1,8 @@
+ $LOAD_PATH.unshift(File.expand_path("../../", __FILE__))
+ require "test-unit"
+ require "fluent/test"
+ require "fluent/test/driver/filter"
+ require "fluent/test/helpers"
+
+ Test::Unit::TestCase.include(Fluent::Test::Helpers)
+ Test::Unit::TestCase.extend(Fluent::Test::Helpers)
@@ -0,0 +1,18 @@
+ require "helper"
+ require "fluent/plugin/filter_cmda_stat.rb"
+
+ class CmdaaStatFilterTest < Test::Unit::TestCase
+   setup do
+     Fluent::Test.setup
+   end
+
+   # Placeholder from the plugin generator; always fails until real tests
+   # are written.
+   test "failure" do
+     flunk
+   end
+
+   private
+
+   def create_driver(conf)
+     Fluent::Test::Driver::Filter.new(Fluent::Plugin::CmdaaStatFilter).configure(conf)
+   end
+ end
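The "failure" test above is generator scaffolding that always flunks. When real tests replace it, create_driver would be exercised along these lines (a sketch; the parameter values are illustrative). Note that configure does not touch the network, so such a test runs offline; get_server_count is only called from start:

test "configure sets the window parameters" do
  d = create_driver(%[
    total_time 600
    time_slice 10
  ])
  assert_equal 600, d.instance.total_time
  assert_equal 10, d.instance.time_slice
end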
metadata ADDED
@@ -0,0 +1,112 @@
+ --- !ruby/object:Gem::Specification
+ name: fluent-plugin-cmdaa-stat
+ version: !ruby/object:Gem::Version
+   version: 0.1.14
+ platform: ruby
+ authors:
+ - Mark Pohl
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2019-03-28 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.14'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.14'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '12.0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '12.0'
+ - !ruby/object:Gem::Dependency
+   name: test-unit
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.0'
+ - !ruby/object:Gem::Dependency
+   name: fluentd
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 0.14.10
+     - - "<"
+       - !ruby/object:Gem::Version
+         version: '2'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 0.14.10
+     - - "<"
+       - !ruby/object:Gem::Version
+         version: '2'
+ description:
+ email:
+ - mark.g.pohl@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/fluent/plugin/filter_cmda_stat.rb
+ - lib/fluent/plugin/log-likelihood.rb
+ - test/helper.rb
+ - test/plugin/test_out_cdma_stat.rb
+ homepage: http://rubygems.com/cdmaa
+ licenses:
+ - Apache-2.0
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.7.6
+ signing_key:
+ specification_version: 4
+ summary: CMDA plugin to process log data and save stats to a database
+ test_files:
+ - test/helper.rb
+ - test/plugin/test_out_cdma_stat.rb