logstash-filter-translate 3.1.0 → 3.2.0
This diff shows the changes between these two publicly released package versions, as published to their public registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/docs/index.asciidoc +173 -33
- data/lib/logstash/filters/array_of_maps_value_update.rb +44 -0
- data/lib/logstash/filters/array_of_values_update.rb +37 -0
- data/lib/logstash/filters/dictionary/csv_file.rb +25 -0
- data/lib/logstash/filters/dictionary/file.rb +140 -0
- data/lib/logstash/filters/dictionary/json_file.rb +87 -0
- data/lib/logstash/filters/dictionary/memory.rb +31 -0
- data/lib/logstash/filters/dictionary/yaml_file.rb +24 -0
- data/lib/logstash/filters/dictionary/yaml_visitor.rb +42 -0
- data/lib/logstash/filters/fetch_strategy/file.rb +81 -0
- data/lib/logstash/filters/fetch_strategy/memory.rb +52 -0
- data/lib/logstash/filters/single_value_update.rb +33 -0
- data/lib/logstash/filters/translate.rb +54 -155
- data/logstash-filter-translate.gemspec +5 -1
- data/spec/filters/benchmark_rspec.rb +69 -0
- data/spec/filters/scheduling_spec.rb +200 -0
- data/spec/filters/translate_spec.rb +238 -45
- data/spec/filters/yaml_visitor_spec.rb +16 -0
- data/spec/fixtures/regex_dict.csv +4 -0
- data/spec/fixtures/regex_union_dict.csv +4 -0
- data/spec/fixtures/tag-map-dict.yml +21 -0
- data/spec/fixtures/tag-omap-dict.yml +21 -0
- data/spec/support/build_huge_dictionaries.rb +33 -0
- data/spec/support/rspec_wait_handler_helper.rb +38 -0
- metadata +87 -2
data/lib/logstash/filters/dictionary/file.rb
@@ -0,0 +1,140 @@
+# encoding: utf-8
+require 'concurrent/atomic/atomic_boolean'
+require 'rufus-scheduler'
+require "logstash/util/loggable"
+require "logstash/filters/fetch_strategy/file"
+
+java_import 'java.util.concurrent.locks.ReentrantReadWriteLock'
+
+module LogStash module Filters module Dictionary
+  class DictionaryFileError < StandardError; end
+
+  class File
+    def self.create(path, refresh_interval, refresh_behaviour, exact, regex)
+      if /\.y[a]?ml$/.match(path)
+        instance = YamlFile.new(path, refresh_interval, exact, regex)
+      elsif path.end_with?(".json")
+        instance = JsonFile.new(path, refresh_interval, exact, regex)
+      elsif path.end_with?(".csv")
+        instance = CsvFile.new(path, refresh_interval, exact, regex)
+      else
+        raise "Translate: Dictionary #{path} has a non valid format"
+      end
+      if refresh_behaviour == 'merge'
+        instance.set_update_strategy(:merge_dictionary)
+      elsif refresh_behaviour == 'replace'
+        instance.set_update_strategy(:replace_dictionary)
+      else
+        # we really should never get here
+        raise(LogStash::ConfigurationError, "Unknown value for refresh_behaviour=#{refresh_behaviour.to_s}")
+      end
+    end
+
+    include LogStash::Util::Loggable
+    attr_reader :dictionary, :fetch_strategy
+
+    def initialize(path, refresh_interval, exact, regex)
+      @dictionary_path = path
+      @refresh_interval = refresh_interval
+      @short_refresh = @refresh_interval <= 300
+      @stopping = Concurrent::AtomicBoolean.new # ported from jdbc_static, need a way to prevent a scheduled execution from running a load.
+      rw_lock = java.util.concurrent.locks.ReentrantReadWriteLock.new
+      @write_lock = rw_lock.writeLock
+      @dictionary = Hash.new
+      @update_method = method(:merge_dictionary)
+      initialize_for_file_type
+      args = [@dictionary, rw_lock]
+      if exact
+        @fetch_strategy = regex ? FetchStrategy::File::ExactRegex.new(*args) : FetchStrategy::File::Exact.new(*args)
+      else
+        @fetch_strategy = FetchStrategy::File::RegexUnion.new(*args)
+      end
+      load_dictionary(raise_exception = true)
+      stop_scheduler(initial = true)
+      start_scheduler unless @refresh_interval <= 0 # disabled, a scheduler interval of zero makes no sense
+    end
+
+    def stop_scheduler(initial = false)
+      @stopping.make_true unless initial
+      @scheduler.shutdown(:wait) if @scheduler
+    end
+
+    def load_dictionary(raise_exception=false)
+      begin
+        @dictionary_mtime = ::File.mtime(@dictionary_path).to_f
+        @update_method.call
+      rescue Errno::ENOENT
+        @logger.warn("dictionary file read failure, continuing with old dictionary", :path => @dictionary_path)
+      rescue => e
+        loading_exception(e, raise_exception)
+      end
+    end
+
+    def set_update_strategy(method_sym)
+      @update_method = method(method_sym)
+      self
+    end
+
+    protected
+
+    def initialize_for_file_type
+      # sub class specific initializer
+    end
+
+    def read_file_into_dictionary
+      # defined in csv_file, yaml_file and json_file
+    end
+
+    private
+
+    def start_scheduler
+      @scheduler = Rufus::Scheduler.new
+      @scheduler.interval("#{@refresh_interval}s", :overlap => false) do
+        reload_dictionary
+      end
+    end
+
+    def merge_dictionary
+      @write_lock.lock
+      begin
+        read_file_into_dictionary
+        @fetch_strategy.dictionary_updated
+      ensure
+        @write_lock.unlock
+      end
+    end
+
+    def replace_dictionary
+      @write_lock.lock
+      begin
+        @dictionary.clear
+        read_file_into_dictionary
+        @fetch_strategy.dictionary_updated
+      ensure
+        @write_lock.unlock
+      end
+    end
+
+    def reload_dictionary
+      return if @stopping.true?
+      if @short_refresh
+        load_dictionary if needs_refresh?
+      else
+        load_dictionary
+      end
+    end
+
+    def needs_refresh?
+      @dictionary_mtime != ::File.mtime(@dictionary_path).to_f
+    end
+
+    def loading_exception(e, raise_exception)
+      msg = "Translate: #{e.message} when loading dictionary file at #{@dictionary_path}"
+      if raise_exception
+        raise DictionaryFileError.new(msg)
+      else
+        @logger.warn("#{msg}, continuing with old dictionary", :dictionary_path => @dictionary_path)
+      end
+    end
+  end
+end end end
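`File.create` is a factory: the file extension picks the subclass (`YamlFile`, `JsonFile`, `CsvFile`), `refresh_behaviour` picks the update strategy, and the constructor wires up a fetch strategy plus an optional rufus-scheduler interval that reloads the file in the background. A minimal usage sketch, assuming a Logstash runtime with the plugin on the load path; the path, dictionary contents, and option values are illustrative, not from the diff:

```ruby
require "logstash/filters/dictionary/file"

# Suppose /tmp/dict.yml contains lines such as "error: alert".
# Reload every 300 s, merging new entries over old ones,
# with exact (non-regex) key lookups.
dict = LogStash::Filters::Dictionary::File.create(
  "/tmp/dict.yml", # extension dispatch -> YamlFile
  300,             # refresh_interval in seconds
  "merge",         # refresh_behaviour -> set_update_strategy(:merge_dictionary)
  true,            # exact
  false            # regex
)

# Callers go through the strategy, never the hash directly:
results = [true, nil]                 # [matched?, translated value]
dict.fetch_strategy.fetch("error", results)
puts results[1] if results[0]         # => "alert"
dict.stop_scheduler                   # stop the background reload thread
```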
data/lib/logstash/filters/dictionary/json_file.rb
@@ -0,0 +1,87 @@
+# encoding: utf-8
+require "json"
+
+module LogStash module Filters module Dictionary
+  class JsonFile < File
+
+    protected
+
+    def initialize_for_file_type
+    end
+
+    def read_file_into_dictionary
+      content = IO.read(@dictionary_path, :mode => 'r:bom|utf-8')
+      @dictionary.update(LogStash::Json.load(content)) unless content.nil? || content.empty?
+    end
+  end
+end end end
+
+__END__
+Preserving the text below for near term prosperity...
+
+I tried hard to find a stream parsing solution with JrJackson and sc_load
+but it was no faster than the above code.
+The idea is for each line to be read into the streaming parse that will update
+the @dictionary as each key/value is found.
+It will be lower on memory consumption because the JSON string is not read into memory
+and then a Ruby Hash created and merged into @dictionary.
+I decided to trade speed for memory. Side Note, it seems that
+the json gem has become quite speedy lately.
+
+e.g.
+require_relative 'json_handler'
+...
+def initialize_for_file_type
+  @handler = JsonHandler.new(@dictionary)
+end
+
+def read_file_into_dictionary
+  ::File.open(@dictionary_path, "r:bom|utf-8") do |io|
+    JrJackson::Json.sc_load(@handler, io, {raw: true})
+  end
+end
+...
+where JsonHandler is:
+
+require 'jrjackson'
+
+module LogStash module Filters module Dictionary
+  class JsonHandler
+    def initialize(dictionary)
+      @dictionary = dictionary
+      @map_depth = 0
+    end
+
+    def hash_start()
+      @map_depth = @map_depth.succ
+      @map_depth == 1 ? @dictionary : {}
+    end
+
+    def hash_end()
+      @map_depth = @map_depth.pred
+    end
+
+    def hash_key(key)
+      key
+    end
+
+    def array_start()
+      []
+    end
+
+    def array_end()
+    end
+
+    def add_value(value)
+      # @result = value
+    end
+
+    def hash_set(h, key, value)
+      h[key] = value
+    end
+
+    def array_append(a, value)
+      a.push(value)
+    end
+  end
+end end end
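`read_file_into_dictionary` above is deliberately simple: parse the whole document, then merge it with `Hash#update`, so existing keys are overwritten and unrelated keys survive. The same semantics in plain Ruby, with the stdlib `JSON` standing in for `LogStash::Json` and illustrative data:

```ruby
require "json"

dictionary = { "old" => "kept", "code" => "stale" }
content = '{"code": "fresh", "extra": "added"}' # the on-disk dictionary

# Whole-document parse, single merge; guarded like the plugin code.
dictionary.update(JSON.parse(content)) unless content.nil? || content.empty?
dictionary # => {"old"=>"kept", "code"=>"fresh", "extra"=>"added"}
```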
data/lib/logstash/filters/dictionary/memory.rb
@@ -0,0 +1,31 @@
+# encoding: utf-8
+require "logstash/filters/fetch_strategy/memory"
+
+module LogStash module Filters module Dictionary
+  class Memory
+
+    attr_reader :dictionary, :fetch_strategy
+
+    def initialize(hash, exact, regex)
+      if exact
+        @fetch_strategy = regex ? FetchStrategy::Memory::ExactRegex.new(hash) : FetchStrategy::Memory::Exact.new(hash)
+      else
+        @fetch_strategy = FetchStrategy::Memory::RegexUnion.new(hash)
+      end
+    end
+
+    def stop_scheduler
+      # noop
+    end
+
+    private
+
+    def needs_refresh?
+      false
+    end
+
+    def load_dictionary(raise_exception=false)
+      # noop
+    end
+  end
+end end end
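`Memory` deliberately mirrors the `File` interface (`fetch_strategy`, `stop_scheduler`, a no-op `load_dictionary`), so the filter can drive an inline dictionary and a file-backed one through the same calls. A usage sketch, again assuming a Logstash runtime and illustrative data:

```ruby
require "logstash/filters/dictionary/memory"

dict = LogStash::Filters::Dictionary::Memory.new(
  { "200" => "OK", "404" => "Not Found" }, # inline `dictionary` setting
  true,                                    # exact
  false                                    # regex
)

results = [true, nil]
dict.fetch_strategy.fetch("404", results) # results => [true, "Not Found"]
dict.stop_scheduler                       # no-op; nothing to reload
```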
data/lib/logstash/filters/dictionary/yaml_file.rb
@@ -0,0 +1,24 @@
+# encoding: utf-8
+
+require_relative "yaml_visitor"
+
+module LogStash module Filters module Dictionary
+  class YamlFile < File
+
+    protected
+
+    def initialize_for_file_type
+      @visitor = YamlVisitor.create
+    end
+
+    def read_file_into_dictionary
+      # low level YAML read that tries to create as
+      # few intermediate objects as possible
+      # this overwrites the value at key
+      @visitor.accept_with_dictionary(
+        @dictionary, Psych.parse_stream(
+          IO.read(@dictionary_path, :mode => 'r:bom|utf-8')
+      ))
+    end
+  end
+end end end
data/lib/logstash/filters/dictionary/yaml_visitor.rb
@@ -0,0 +1,42 @@
+# encoding: utf-8
+
+require 'psych/visitors/to_ruby'
+require 'psych/exception'
+
+unless defined?(Regexp::NOENCODING)
+  Regexp::NOENCODING = 32
+end
+
+module LogStash module Filters module Dictionary
+  class YamlVisitor < Psych::Visitors::ToRuby
+
+    TAG_MAP_TABLE = Hash.new(false)
+    TAG_MAP_TABLE[nil] = true
+    TAG_MAP_TABLE["tag:yaml.org,2002:map"] = true
+    TAG_MAP_TABLE["tag:yaml.org,2002:omap"] = true
+
+    def accept_with_dictionary(dictionary, target)
+      @dictionary = dictionary
+      @map_depth = 0
+      accept(target)
+    end
+
+    def visit_Psych_Nodes_Mapping(o)
+      if Psych.load_tags[o.tag]
+        return revive(resolve_class(Psych.load_tags[o.tag]), o)
+      end
+
+      target_hash = @map_depth == 0 ? @dictionary : {}
+      @map_depth = @map_depth.succ
+
+      if TAG_MAP_TABLE[o.tag]
+        result = revive_hash(register(o, target_hash), o)
+      else
+        result = super(o)
+      end
+
+      @map_depth = @map_depth.pred
+      result
+    end
+  end
+end end end
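The point of the visitor is the depth counter: the mapping at depth 0 is revived straight into the long-lived `@dictionary` (via `register(o, target_hash)`), so a reload mutates the shared hash in place rather than building a new one; nested mappings still get fresh `{}` hashes. A standalone sketch of walking the same `Psych.parse_stream` AST with plain Psych, handling only flat scalar mappings:

```ruby
require "psych"

yaml = "200: OK\n404: Not Found\n"
tree = Psych.parse_stream(yaml)        # AST; no Ruby objects revived yet

target = {}                            # plays the role of @dictionary
tree.children.each do |doc|            # one node per YAML document
  doc.root.children.each_slice(2) do |key_node, value_node|
    target[key_node.value] = value_node.value # scalars only, as strings
  end
end
target # => {"200"=>"OK", "404"=>"Not Found"}
```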
data/lib/logstash/filters/fetch_strategy/file.rb
@@ -0,0 +1,81 @@
+# encoding: utf-8
+
+module LogStash module Filters module FetchStrategy module File
+  class Exact
+    def initialize(dictionary, rw_lock)
+      @dictionary = dictionary
+      @read_lock = rw_lock.readLock
+    end
+
+    def dictionary_updated
+    end
+
+    def fetch(source, results)
+      @read_lock.lock
+      begin
+        if @dictionary.include?(source)
+          results[1] = LogStash::Util.deep_clone(@dictionary[source])
+        else
+          results[0] = false
+        end
+      ensure
+        @read_lock.unlock
+      end
+    end
+  end
+
+  class ExactRegex
+    def initialize(dictionary, rw_lock)
+      @keys_regex = Hash.new()
+      @dictionary = dictionary
+      @read_lock = rw_lock.readLock
+    end
+
+    def dictionary_updated
+      @keys_regex.clear
+      # rebuilding the regex map is time expensive
+      # 100 000 keys takes 0.5 seconds on a high spec Macbook Pro
+      # at least we are not doing it for every event like before
+      @dictionary.keys.each{|k| @keys_regex[k] = Regexp.new(k)}
+    end
+
+    def fetch(source, results)
+      @read_lock.lock
+      begin
+        key = @dictionary.keys.detect{|k| source.match(@keys_regex[k])}
+        if key.nil?
+          results[0] = false
+        else
+          results[1] = LogStash::Util.deep_clone(@dictionary[key])
+        end
+      ensure
+        @read_lock.unlock
+      end
+    end
+  end
+
+  class RegexUnion
+    def initialize(dictionary, rw_lock)
+      @dictionary = dictionary
+      @read_lock = rw_lock.readLock
+    end
+
+    def dictionary_updated
+      @union_regex_keys = Regexp.union(@dictionary.keys)
+    end
+
+    def fetch(source, results)
+      @read_lock.lock
+      begin
+        value = source.gsub(@union_regex_keys, @dictionary)
+        if source == value
+          results[0] = false
+        else
+          results[1] = LogStash::Util.deep_clone(value)
+        end
+      ensure
+        @read_lock.unlock
+      end
+    end
+  end
+end end end end
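All strategies share one contract: `results` is a two-slot array where `results[0]` is flipped to `false` on a miss and `results[1]` receives the (deep-cloned) translation on a hit; the read lock only pins the dictionary during the lookup. `RegexUnion` additionally leans on Ruby's `String#gsub(pattern, hash)`, which replaces each match with `hash[matched_text]`. A standalone sketch with illustrative data:

```ruby
dictionary = { "404" => "Not Found", "500" => "Server Error" }
union = Regexp.union(dictionary.keys)   # rebuilt once per dictionary update

source = "status 404 then 500"
value  = source.gsub(union, dictionary) # each match looked up in the hash
# => "status Not Found then Server Error"

results = [true, nil]
if source == value                      # no key matched anywhere -> miss
  results[0] = false
else
  results[1] = value
end
```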
data/lib/logstash/filters/fetch_strategy/memory.rb
@@ -0,0 +1,52 @@
+# encoding: utf-8
+
+module LogStash module Filters module FetchStrategy module Memory
+  class Exact
+    def initialize(dictionary)
+      @dictionary = dictionary
+    end
+
+    def fetch(source, results)
+      if @dictionary.include?(source)
+        results[1] = LogStash::Util.deep_clone(@dictionary[source])
+      else
+        results[0] = false
+      end
+    end
+  end
+
+  class ExactRegex
+    def initialize(dictionary)
+      @keys_regex = Hash.new()
+      @dictionary = dictionary
+      @dictionary.keys.each{|k| @keys_regex[k] = Regexp.new(k)}
+    end
+
+    def fetch(source, results)
+      key = @dictionary.keys.detect{|k| source.match(@keys_regex[k])}
+      if key.nil?
+        results[0] = false
+      else
+        results[1] = LogStash::Util.deep_clone(@dictionary[key])
+      end
+    end
+  end
+
+  class RegexUnion
+    def initialize(dictionary)
+      @dictionary = dictionary
+      @union_regex_keys = Regexp.union(@dictionary.keys)
+    end
+
+    def fetch(source, results)
+      value = source.gsub(@union_regex_keys, @dictionary)
+      if source == value
+        results[0] = false
+      else
+        results[1] = LogStash::Util.deep_clone(value)
+      end
+    end
+  end
+end end end end
+
+
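The memory variants are the same lookups without locks, since an inline dictionary never changes after startup. One behaviour worth noting in `ExactRegex`: `detect` returns the first key whose compiled pattern matches, so when several patterns match, dictionary order wins. A standalone sketch:

```ruby
dictionary = { "^4\\d\\d$" => "client error", "^404$" => "not found" }
keys_regex = {}
dictionary.keys.each { |k| keys_regex[k] = Regexp.new(k) }

source = "404"
key = dictionary.keys.detect { |k| source.match(keys_regex[k]) }
key && dictionary[key] # => "client error" -- the first pattern wins
```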