RubyGems - embulk-input-splunk - Versions diffs - 0.2.0 → 0.2.1 - Mend

embulk-input-splunk 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: daf345bda2bb42c7ae1945b66cd0d70ebdd6c02cb1137c83c9ee34916716e6e4
-  data.tar.gz: 503aabbc4ff9e9c5b0674f4bb910a0b40f1f01d73d53cbf0f89a4702935641ae
+  metadata.gz: 83626f940cf8546f4efce77990d9bc3d2cc4a7accc2e4c7d81eb512ee5d4b7a1
+  data.tar.gz: d2bbd85c9286e6f24b32f67332883c421091878dfa61b2163221ece6abde7f09
 SHA512:
-  metadata.gz: 78f10b454a736fccfb373419c35897c7fcd9fc76970d4116e8101dd536d2e5b9fb108d29a2bebd7c3077d88ef93ef8253dcb3507f77327ddca3e9ab0ddb3de7b
-  data.tar.gz: 398aa35c57d07675a6ad2cf1e0f71b731304d3942644002b28789c898e454c29df171e3ce4ada0d3681434c071b95c0e04412ae48c2e44f5f52809f00de1b556
+  metadata.gz: 4b628c93a3417e01f4fae427225cc69e92cb9a96de7d1fb91ba7e93cfd1bba4d0b26b7b373bd7e269a5a7c91730d0409811dc5d27a5774dcccf3a193c37ab9b5
+  data.tar.gz: 062b9db314f9540c3617cb5adba8401a5677775f51ee55df4f86ac7b8450f8826445415942a08d369cdc17312e605d3fc31d18bebf6b34fbccef80d597ff9c8f

data/.gitignore CHANGED Viewed

@@ -3,3 +3,5 @@
 /tmp/
 /.bundle/
 /Gemfile.lock
+config.yml
+config_diff.yml

data/README.md CHANGED Viewed

@@ -25,6 +25,7 @@ In addition, as a column we treat `_time` as a String, but only because we could
 - **username**: splunk username (string, required)
 - **password**: splunk password (string, required)
 - **port**: splunk API port (integer, default: 8089)
+- **max_results**: API flag to limit the number of results returned per request. Set to zero for (theoretically) no limit; however, the Splunk server config will generally cap this at 50,000. Setting this to a non-zero value will cause the plugin to keep fetching results in batches of `max_results` (pagination) (integer, default: 50000)
 - **query**: the query you wish to run. It should be prefixed with "search" (string, required)
 - **earliest_time**: the earliest time for the splunk search. (string, default: nil, which is unbounded)
 - **latest_time**: the latest time for the splunk search. (string, default: nil, which is unbounded)
@@ -102,11 +103,30 @@ in:
     - {name: "bar", type: "long"}
 ```
+### Max results
+The query below is assumed to return 100 rows, but `max_results` is set to 10. This will cause the plugin to loop 10 times, returning 10 results each time. In the end, you will receive the full 100 events.
+```yaml
+in:
+  type: splunk
+  host: splunk.example.com
+  username: splunk_user
+  password: abc123
+  port: 8089
+  max_results: 10
+  query: search index="main" | head 100
+  table:
+    - {name: "_time", type: "string"}
+    - {name: "foo", type: "string"}
+    - {name: "bar", type: "long"}
+```
 ### Complex Searches
-For those unfamiliar with YAML, the pipe (|) indicates a multiline value. In Splunk the pipe operator is used for creating multi-step processing.
+For those unfamiliar with YAML, `>` or `|` indicates a multiline string. In Splunk the pipe operator is also used for creating multi-step processing.
-For non-trivial Splunk queries, you should leverage the YAML pipe alongside Splunk pipes for easier to read queries.
+For non-trivial Splunk queries, you should leverage the YAML `|` or `>` alongside Splunk pipes for easier-to-read queries.
 ```yaml
 in:
@@ -127,6 +147,27 @@ in:
     - {name: "foo", type: "string"} # Uses foo from the above query
 ```
+Or with the greater than symbol:
+```yaml
+in:
+  type: splunk
+  host: splunk.example.com
+  username: splunk_user
+  password: abc123
+  port: 8089
+  query: >
+    search index="main" |
+    eval foo=bar |
+    where like(bar, "%baz%") |
+    head 100
+  earliest_time: 2017-01-18T19:23:08.237+11:00
+  latest_time: 2018-01-18T19:23:08.237+11:00
+  table:
+    - {name: "_time", type: "string"}
+    - {name: "foo", type: "string"} # Uses foo from the above query
+```
 ## Build
 ```

data/embulk-input-splunk.gemspec CHANGED Viewed

@@ -1,6 +1,6 @@
 Gem::Specification.new do |spec|
   spec.name          = "embulk-input-splunk"
-  spec.version       = "0.2.0"
+  spec.version       = "0.2.1"
   spec.authors       = ["Scott Arbeitman"]
   spec.summary       = "Splunk input plugin for Embulk"
   spec.description   = "Loads records from a Splunk query."

data/lib/embulk/input/splunk.rb CHANGED Viewed

@@ -9,30 +9,13 @@ module Embulk
     class Splunk < InputPlugin
       Plugin.register_input("splunk", self)
-      # Zero means unlimited results. Splunk's default is 100.
-      SPLUNK_UNLIMITED_RESULTS = 0
       SPLUNK_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%L%:z"
-      SPLUNK_OUTPUT_FORMAT = "json"
       SPLUNK_DEFAULT_TIME_FIELD = "_time"
-      SPLUNK_TIME_FIELD = { "name" => SPLUNK_DEFAULT_TIME_FIELD, "type" => "string" }
+      SPLUNK_TIME_FIELD = { "name" => SPLUNK_DEFAULT_TIME_FIELD, "type" => "string" }.freeze
       def self.transaction(config, &control)
-        task = {
-          "scheme" => config.param("scheme", :string, default: "https"),
-          "host" => config.param("host", :string),
-          "port" => config.param("port", :integer, default: 8089),
-          "username" => config.param("username", :string),
-          "password" => config.param("password", :string),
-          "query" => config.param("query", :string),
-          "earliest_time" => config.param(:earliest_time, :string, default: nil),
-          "latest_time" => config.param(:latest_time, :string, default: nil),
-          "incremental" => config.param("incremental", :bool, default: false),
-          "table" => config.param("table", :array, default: [])
-        }
+        task = task_from_config(config)
         if task["incremental"] && task["latest_time"]
           Embulk.logger.warn "Incremental is 'true' and latest_time is set. This may have unexpected results."
@@ -54,75 +37,118 @@ module Embulk
         task_reports = yield(task, columns, count)
         next_config_diff = {}
+        # This will work with multiple threads
+        latest_time_in_results = task_reports.collect do |report|
+          report[:latest_time_in_results].to_i
+        end.max
-        latest_time_in_results = task_reports.first[:latest_time_in_results]
         if task["incremental"] && latest_time_in_results.present?
-          next_config_diff[:earliest_time] = latest_time_in_results
+          next_config_diff[:earliest_time] = DateTime.strptime(latest_time_in_results.to_s, "%Q").strftime(SPLUNK_TIME_FORMAT)
         end
         return next_config_diff
       end
-      def init
-        splunk_config = {
+      def self.task_from_config(config)
+        task = {
+          "scheme" => config.param("scheme", :string, default: "https"),
+          "host" => config.param("host", :string),
+          "port" => config.param("port", :integer, default: 8089),
+          "username" => config.param("username", :string),
+          "password" => config.param("password", :string),
+          "max_results" => config.param("max_results", :integer, default: 50_000),
+          "query" => config.param("query", :string),
+          "earliest_time" => config.param(:earliest_time, :string, default: nil),
+          "latest_time" => config.param(:latest_time, :string, default: nil),
+          "incremental" => config.param("incremental", :bool, default: false),
+          "table" => config.param("table", :array, default: [])
+        }
+      end
+      protected
+      def build_query(query)
+        %Q{
+          #{query}
+          | sort #{SPLUNK_DEFAULT_TIME_FIELD}
+          | table #{ @fields.join(", ") }
+        }
+      end
+      def splunk_config
+        {
           :scheme => task[:scheme],
           :host => task[:host],
           :port => task[:port],
           :username => task[:username],
           :password => task[:password]
         }
+      end
+      public
+      def init
+        @max_results = task[:max_results]
         @earliest_time, @latest_time = task[:earliest_time], task[:latest_time]
-        Embulk.logger.info "Earliest time:  #{@earliest_time} / Latest time: #{@latest_time}"
         @fields = task["table"].collect { |entry| entry["name"] }
-        Embulk.logger.info "Using fields #{@fields.join', '} in query"
         @query = build_query( task[:query] )
-        Embulk.logger.info "Establishing connection to Splunk"
-        @service = ::Splunk::connect(splunk_config)
-      end
-      def build_query(query)
-        # Append table expression to query. Even if already present in the query, this should do no harm.
-        "#{query} | table #{ @fields.join(", ") } "
       end
       def run
-        Embulk.logger.info "Running query `#{@query}`"
-        stream = @service.create_oneshot(@query,
-                                         count: SPLUNK_UNLIMITED_RESULTS,
-                                         output_format: SPLUNK_OUTPUT_FORMAT,
-                                         earliest_time: @earliest_time,
-                                         latest_time: @latest_time)
-        reader = ::Splunk::ResultsReader.new(stream)
+        Embulk.logger.debug "Establishing connection to Splunk"
+        service = ::Splunk::connect(splunk_config)
         latest_time = nil
-        reader.each do |result|
-          #We convert event_time to Ruby time for comparison only.
-          event_time = Time.strptime( result[SPLUNK_DEFAULT_TIME_FIELD], SPLUNK_TIME_FORMAT )
+        loop_count = 0
+        # There is a limit to how many results Splunk API will return.
+        # To avoid silently dropping results, we need to iterate until there are no more results.
+        loop do
+          number_of_results = 0
-          #We need to keep track of latest time for incremental loads.
-          # Unfortunately, Splunk was not respecting our sort requests, so we need to do a comparison for each row.
-          latest_time = latest_time.nil? ? event_time : [latest_time, event_time].max
+          query_options = {
+            count:  @max_results,
+            offset: loop_count * @max_results,
+            earliest_time: @earliest_time,
+            latest_time: @latest_time,
+          }
+          Embulk.logger.debug "Running query `#{@query}` with options #{query_options} in loop #{loop_count}"
+          stream = service.create_oneshot(@query, query_options)
-          row = @fields.map { |field| result[ field ] }
-          page_builder.add( row )
+          reader = ::Splunk::ResultsReader.new(stream)
+          reader.each do |result|
+            number_of_results += 1
+            # We convert event_time to an integer (epoch milliseconds) for easy comparison only.
+            event_time = DateTime.strptime( result[SPLUNK_DEFAULT_TIME_FIELD], SPLUNK_TIME_FORMAT ).strftime("%Q").to_i
+            # We need to keep track of latest time for incremental loads.
+            # Unfortunately, Splunk was not respecting our sort requests, so we need to do a comparison for each row.
+            latest_time = latest_time.nil? ? event_time : [latest_time, event_time].max
+            row = @fields.map { |field| result[ field ] }
+            page_builder.add( row )
+          end
+          break if (number_of_results < @max_results) || (@max_results == 0)
+          loop_count += 1
         end
         page_builder.finish
-        task_result = {
-          latest_time_in_results: latest_time.strftime(SPLUNK_TIME_FORMAT)
-        }
-        return task_result
+        return { latest_time_in_results: latest_time }
       end
     end
   end
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: embulk-input-splunk
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.2.1
 platform: ruby
 authors:
 - Scott Arbeitman
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2018-02-21 00:00:00.000000000 Z
+date: 2018-02-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement