embulk-input-splunk 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: daf345bda2bb42c7ae1945b66cd0d70ebdd6c02cb1137c83c9ee34916716e6e4
4
- data.tar.gz: 503aabbc4ff9e9c5b0674f4bb910a0b40f1f01d73d53cbf0f89a4702935641ae
3
+ metadata.gz: 83626f940cf8546f4efce77990d9bc3d2cc4a7accc2e4c7d81eb512ee5d4b7a1
4
+ data.tar.gz: d2bbd85c9286e6f24b32f67332883c421091878dfa61b2163221ece6abde7f09
5
5
  SHA512:
6
- metadata.gz: 78f10b454a736fccfb373419c35897c7fcd9fc76970d4116e8101dd536d2e5b9fb108d29a2bebd7c3077d88ef93ef8253dcb3507f77327ddca3e9ab0ddb3de7b
7
- data.tar.gz: 398aa35c57d07675a6ad2cf1e0f71b731304d3942644002b28789c898e454c29df171e3ce4ada0d3681434c071b95c0e04412ae48c2e44f5f52809f00de1b556
6
+ metadata.gz: 4b628c93a3417e01f4fae427225cc69e92cb9a96de7d1fb91ba7e93cfd1bba4d0b26b7b373bd7e269a5a7c91730d0409811dc5d27a5774dcccf3a193c37ab9b5
7
+ data.tar.gz: 062b9db314f9540c3617cb5adba8401a5677775f51ee55df4f86ac7b8450f8826445415942a08d369cdc17312e605d3fc31d18bebf6b34fbccef80d597ff9c8f
data/.gitignore CHANGED
@@ -3,3 +3,5 @@
3
3
  /tmp/
4
4
  /.bundle/
5
5
  /Gemfile.lock
6
+ config.yml
7
+ config_diff.yml
data/README.md CHANGED
@@ -25,6 +25,7 @@ In addition, as a column we treat `_time` as a String, but only because we could
25
25
  - **username**: splunk username (string, required)
26
26
  - **password**: splunk password (string, required)
27
27
  - **port**: splunk API port (integer, default: 8089)
28
+ - **max_results**: maximum number of results requested from the API per call. Set to zero for theoretically no limit; however, Splunk server configuration will generally cap this at 50,000. Setting this to a non-zero value will cause the plugin to keep fetching results in batches of `max_results` (pagination) until no more are returned (integer, default: 50000)
28
29
  - **query**: the query you wish to run. It should be prefixed with "search" (string, required)
29
30
  - **earliest_time**: the earliest time for the splunk search. (string, default: nil, which is unbounded)
30
31
  - **latest_time**: the latest time for the splunk search. (string, default: nil, which is unbounded)
@@ -102,11 +103,30 @@ in:
102
103
  - {name: "bar", type: "long"}
103
104
  ```
104
105
 
106
+ ### Max results
107
+
108
+ The query below is expected to return 100 rows, but `max_results` is set to 10. This will cause the plugin to fetch the results in 10 batches of 10 results each. In the end, you will receive the full 100 events.
109
+
110
+ ```yaml
111
+ in:
112
+ type: splunk
113
+ host: splunk.example.com
114
+ username: splunk_user
115
+ password: abc123
116
+ port: 8089
117
+ max_results: 10
118
+ query: search index="main" | head 100
119
+ table:
120
+ - {name: "_time", type: "string"}
121
+ - {name: "foo", type: "string"}
122
+ - {name: "bar", type: "long"}
123
+ ```
124
+
105
125
  ### Complex Searches
106
126
 
107
- For those unfamiliar with YAML, the pipe (|) indicates a multiline value. In Splunk the pipe operator is used for creating multi-step processing.
127
+ For those unfamiliar with YAML, `>` or `|` indicates a multiline string. In Splunk the pipe operator is also used for creating multi-step processing.
108
128
 
109
- For non-trivial Splunk queries, you should leverage the YAML pipe alongside Splunk pipes for easier to read queries.
129
+ For non-trivial Splunk queries, you should leverage a YAML multiline string (`|` or `>`) alongside Splunk pipes to make queries easier to read.
110
130
 
111
131
  ```yaml
112
132
  in:
@@ -127,6 +147,27 @@ in:
127
147
  - {name: "foo", type: "string"} # Uses foo from the above query
128
148
  ```
129
149
 
150
+ Or with the greater than symbol:
151
+
152
+ ```yaml
153
+ in:
154
+ type: splunk
155
+ host: splunk.example.com
156
+ username: splunk_user
157
+ password: abc123
158
+ port: 8089
159
+ query: >
160
+ search index="main" |
161
+ eval foo=bar |
162
+ where like(bar, "%baz%") |
163
+ head 100
164
+ earliest_time: 2017-01-18T19:23:08.237+11:00
165
+ latest_time: 2018-01-18T19:23:08.237+11:00
166
+ table:
167
+ - {name: "_time", type: "string"}
168
+ - {name: "foo", type: "string"} # Uses foo from the above query
169
+ ```
170
+
130
171
  ## Build
131
172
 
132
173
  ```
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |spec|
2
2
  spec.name = "embulk-input-splunk"
3
- spec.version = "0.2.0"
3
+ spec.version = "0.2.1"
4
4
  spec.authors = ["Scott Arbeitman"]
5
5
  spec.summary = "Splunk input plugin for Embulk"
6
6
  spec.description = "Loads records from a Splunk query."
@@ -9,30 +9,13 @@ module Embulk
9
9
  class Splunk < InputPlugin
10
10
  Plugin.register_input("splunk", self)
11
11
 
12
- # Zero means unlimited results. Splunk's default is 100.
13
- SPLUNK_UNLIMITED_RESULTS = 0
14
12
  SPLUNK_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%L%:z"
15
- SPLUNK_OUTPUT_FORMAT = "json"
16
13
  SPLUNK_DEFAULT_TIME_FIELD = "_time"
17
- SPLUNK_TIME_FIELD = { "name" => SPLUNK_DEFAULT_TIME_FIELD, "type" => "string" }
14
+ SPLUNK_TIME_FIELD = { "name" => SPLUNK_DEFAULT_TIME_FIELD, "type" => "string" }.freeze
18
15
 
19
16
  def self.transaction(config, &control)
20
17
 
21
- task = {
22
- "scheme" => config.param("scheme", :string, default: "https"),
23
- "host" => config.param("host", :string),
24
- "port" => config.param("port", :integer, default: 8089),
25
- "username" => config.param("username", :string),
26
- "password" => config.param("password", :string),
27
-
28
- "query" => config.param("query", :string),
29
-
30
- "earliest_time" => config.param(:earliest_time, :string, default: nil),
31
- "latest_time" => config.param(:latest_time, :string, default: nil),
32
-
33
- "incremental" => config.param("incremental", :bool, default: false),
34
- "table" => config.param("table", :array, default: [])
35
- }
18
+ task = task_from_config(config)
36
19
 
37
20
  if task["incremental"] && task["latest_time"]
38
21
  Embulk.logger.warn "Incremental is 'true' and latest_time is set. This may have unexpected results."
@@ -54,75 +37,118 @@ module Embulk
54
37
  task_reports = yield(task, columns, count)
55
38
 
56
39
  next_config_diff = {}
40
+
41
+ # This will work with multiple threads
42
+ latest_time_in_results = task_reports.collect do |report|
43
+ report[:latest_time_in_results].to_i
44
+ end.max
57
45
 
58
- latest_time_in_results = task_reports.first[:latest_time_in_results]
59
46
 
60
47
  if task["incremental"] && latest_time_in_results.present?
61
- next_config_diff[:earliest_time] = latest_time_in_results
48
+ next_config_diff[:earliest_time] = DateTime.strptime(latest_time_in_results.to_s, "%Q").strftime(SPLUNK_TIME_FORMAT)
62
49
  end
63
50
 
64
51
  return next_config_diff
65
52
  end
66
53
 
67
- def init
68
- splunk_config = {
54
+ def self.task_from_config(config)
55
+ task = {
56
+ "scheme" => config.param("scheme", :string, default: "https"),
57
+ "host" => config.param("host", :string),
58
+ "port" => config.param("port", :integer, default: 8089),
59
+ "username" => config.param("username", :string),
60
+ "password" => config.param("password", :string),
61
+
62
+ "max_results" => config.param("max_results", :integer, default: 50_000),
63
+
64
+ "query" => config.param("query", :string),
65
+
66
+ "earliest_time" => config.param(:earliest_time, :string, default: nil),
67
+ "latest_time" => config.param(:latest_time, :string, default: nil),
68
+
69
+ "incremental" => config.param("incremental", :bool, default: false),
70
+ "table" => config.param("table", :array, default: [])
71
+ }
72
+ end
73
+
74
+ protected
75
+
76
+ def build_query(query)
77
+ %Q{
78
+ #{query}
79
+ | sort #{SPLUNK_DEFAULT_TIME_FIELD}
80
+ | table #{ @fields.join(", ") }
81
+ }
82
+ end
83
+
84
+ def splunk_config
85
+ {
69
86
  :scheme => task[:scheme],
70
87
  :host => task[:host],
71
88
  :port => task[:port],
72
89
  :username => task[:username],
73
90
  :password => task[:password]
74
91
  }
92
+ end
93
+
94
+ public
95
+
96
+ def init
97
+ @max_results = task[:max_results]
75
98
  @earliest_time, @latest_time = task[:earliest_time], task[:latest_time]
76
- Embulk.logger.info "Earliest time: #{@earliest_time} / Latest time: #{@latest_time}"
77
-
78
99
  @fields = task["table"].collect { |entry| entry["name"] }
79
- Embulk.logger.info "Using fields #{@fields.join', '} in query"
80
-
81
100
  @query = build_query( task[:query] )
82
-
83
- Embulk.logger.info "Establishing connection to Splunk"
84
- @service = ::Splunk::connect(splunk_config)
85
- end
86
-
87
- def build_query(query)
88
- # Append table expression to query. Even if already present in the query, this should do no harm.
89
- "#{query} | table #{ @fields.join(", ") } "
90
101
  end
91
102
 
92
103
  def run
93
- Embulk.logger.info "Running query `#{@query}`"
94
-
95
- stream = @service.create_oneshot(@query,
96
- count: SPLUNK_UNLIMITED_RESULTS,
97
- output_format: SPLUNK_OUTPUT_FORMAT,
98
- earliest_time: @earliest_time,
99
- latest_time: @latest_time)
100
-
101
- reader = ::Splunk::ResultsReader.new(stream)
104
+ Embulk.logger.debug "Establishing connection to Splunk"
105
+ service = ::Splunk::connect(splunk_config)
102
106
 
103
107
  latest_time = nil
104
-
105
- reader.each do |result|
106
- #We convert event_time to Ruby time for comparison only.
107
- event_time = Time.strptime( result[SPLUNK_DEFAULT_TIME_FIELD], SPLUNK_TIME_FORMAT )
108
+ loop_count = 0
109
+
110
+ # There is a limit to how many results Splunk API will return.
111
+ # To avoid silently dropping results, we need to iterate until there are no more results.
112
+ loop do
113
+ number_of_results = 0
108
114
 
109
- #We need to keep track of latest time for incremental loads.
110
- # Unfortunately, Splunk was not respecting our sort requests, so we need to do a comparison for each row.
111
- latest_time = latest_time.nil? ? event_time : [latest_time, event_time].max
115
+ query_options = {
116
+ count: @max_results,
117
+ offset: loop_count * @max_results,
118
+ earliest_time: @earliest_time,
119
+ latest_time: @latest_time,
120
+ }
121
+
122
+ Embulk.logger.debug "Running query `#{@query}` with options #{query_options} in loop #{loop_count}"
123
+ stream = service.create_oneshot(@query, query_options)
112
124
 
113
- row = @fields.map { |field| result[ field ] }
114
- page_builder.add( row )
125
+ reader = ::Splunk::ResultsReader.new(stream)
126
+
127
+ reader.each do |result|
128
+ number_of_results += 1
129
+
130
+ # We convert event_time to an integer (epoch milliseconds) for easy comparison only.
131
+ event_time = DateTime.strptime( result[SPLUNK_DEFAULT_TIME_FIELD], SPLUNK_TIME_FORMAT ).strftime("%Q").to_i
132
+
133
+ # We need to keep track of latest time for incremental loads.
134
+ # Unfortunately, Splunk was not respecting our sort requests, so we need to do a comparison for each row.
135
+ latest_time = latest_time.nil? ? event_time : [latest_time, event_time].max
136
+
137
+ row = @fields.map { |field| result[ field ] }
138
+ page_builder.add( row )
139
+ end
140
+
141
+ break if (number_of_results < @max_results) || (@max_results == 0)
142
+
143
+ loop_count += 1
115
144
  end
116
145
 
117
146
  page_builder.finish
118
147
 
119
- task_result = {
120
- latest_time_in_results: latest_time.strftime(SPLUNK_TIME_FORMAT)
121
- }
122
-
123
- return task_result
148
+ return { latest_time_in_results: latest_time }
124
149
  end
125
150
  end
151
+
126
152
 
127
153
  end
128
154
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-input-splunk
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Scott Arbeitman
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-02-21 00:00:00.000000000 Z
11
+ date: 2018-02-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement