embulk-input-splunk 0.2.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: daf345bda2bb42c7ae1945b66cd0d70ebdd6c02cb1137c83c9ee34916716e6e4
4
- data.tar.gz: 503aabbc4ff9e9c5b0674f4bb910a0b40f1f01d73d53cbf0f89a4702935641ae
3
+ metadata.gz: 83626f940cf8546f4efce77990d9bc3d2cc4a7accc2e4c7d81eb512ee5d4b7a1
4
+ data.tar.gz: d2bbd85c9286e6f24b32f67332883c421091878dfa61b2163221ece6abde7f09
5
5
  SHA512:
6
- metadata.gz: 78f10b454a736fccfb373419c35897c7fcd9fc76970d4116e8101dd536d2e5b9fb108d29a2bebd7c3077d88ef93ef8253dcb3507f77327ddca3e9ab0ddb3de7b
7
- data.tar.gz: 398aa35c57d07675a6ad2cf1e0f71b731304d3942644002b28789c898e454c29df171e3ce4ada0d3681434c071b95c0e04412ae48c2e44f5f52809f00de1b556
6
+ metadata.gz: 4b628c93a3417e01f4fae427225cc69e92cb9a96de7d1fb91ba7e93cfd1bba4d0b26b7b373bd7e269a5a7c91730d0409811dc5d27a5774dcccf3a193c37ab9b5
7
+ data.tar.gz: 062b9db314f9540c3617cb5adba8401a5677775f51ee55df4f86ac7b8450f8826445415942a08d369cdc17312e605d3fc31d18bebf6b34fbccef80d597ff9c8f
data/.gitignore CHANGED
@@ -3,3 +3,5 @@
3
3
  /tmp/
4
4
  /.bundle/
5
5
  /Gemfile.lock
6
+ config.yml
7
+ config_diff.yml
data/README.md CHANGED
@@ -25,6 +25,7 @@ In addition, as a column we treat `_time` as a String, but only because we could
25
25
  - **username**: splunk username (string, required)
26
26
  - **password**: splunk password (string, required)
27
27
  - **port**: splunk API port (integer, default: 8089)
28
+ - **max_results**: API flag to limit the number of results returned per request. Set to zero for (theoretically) no limit; however, the Splunk server config will generally cap this at 50,000. Setting this to a non-zero value will cause the plugin to keep fetching results in `max_results`-sized batches (pagination) (integer, default: 50000)
28
29
  - **query**: the query you wish to run. It should be prefixed with "search" (string required)
29
30
  - **earliest_time**: the earliest time for the splunk search. (string, default: nil, which is unbounded)
30
31
  - **latest_time**: the latest time for the splunk search. (string, default: nil, which is unbounded)
@@ -102,11 +103,30 @@ in:
102
103
  - {name: "bar", type: "long"}
103
104
  ```
104
105
 
106
+ ### Max results
107
+
108
+ The query below is assumed to return 100 rows, but `max_results` is set to 10. This will cause the plugin to fetch the results in 10 batches of 10 results each. In the end, you will receive the full 100 events.
109
+
110
+ ```yaml
111
+ in:
112
+ type: splunk
113
+ host: splunk.example.com
114
+ username: splunk_user
115
+ password: abc123
116
+ port: 8089
117
+ max_results: 10
118
+ query: search index="main" | head 100
119
+ table:
120
+ - {name: "_time", type: "string"}
121
+ - {name: "foo", type: "string"}
122
+ - {name: "bar", type: "long"}
123
+ ```
124
+
105
125
  ### Complex Searches
106
126
 
107
- For those unfamiliar with YAML, the pipe (|) indicates a multiline value. In Splunk the pipe operator is used for creating multi-step processing.
127
+ For those unfamiliar with YAML, `>` or `|` indicates a multiline string. In Splunk the pipe operator is also used for creating multi-step processing.
108
128
 
109
- For non-trivial Splunk queries, you should leverage the YAML pipe alongside Splunk pipes for easier to read queries.
129
+ For non-trivial Splunk queries, you should leverage the YAML `|` or `>` alongside Splunk pipes for easier-to-read queries.
110
130
 
111
131
  ```yaml
112
132
  in:
@@ -127,6 +147,27 @@ in:
127
147
  - {name: "foo", type: "string"} # Uses foo from the above query
128
148
  ```
129
149
 
150
+ Or with the greater than symbol:
151
+
152
+ ```yaml
153
+ in:
154
+ type: splunk
155
+ host: splunk.example.com
156
+ username: splunk_user
157
+ password: abc123
158
+ port: 8089
159
+ query: >
160
+ search index="main" |
161
+ eval foo=bar |
162
+ where like(bar, "%baz%") |
163
+ head 100
164
+ earliest_time: 2017-01-18T19:23:08.237+11:00
165
+ latest_time: 2018-01-18T19:23:08.237+11:00
166
+ table:
167
+ - {name: "_time", type: "string"}
168
+ - {name: "foo", type: "string"} # Uses foo from the above query
169
+ ```
170
+
130
171
  ## Build
131
172
 
132
173
  ```
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |spec|
2
2
  spec.name = "embulk-input-splunk"
3
- spec.version = "0.2.0"
3
+ spec.version = "0.2.1"
4
4
  spec.authors = ["Scott Arbeitman"]
5
5
  spec.summary = "Splunk input plugin for Embulk"
6
6
  spec.description = "Loads records from a Splunk query."
@@ -9,30 +9,13 @@ module Embulk
9
9
  class Splunk < InputPlugin
10
10
  Plugin.register_input("splunk", self)
11
11
 
12
- # Zero means unlimited results. Splunk's default is 100.
13
- SPLUNK_UNLIMITED_RESULTS = 0
14
12
  SPLUNK_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%L%:z"
15
- SPLUNK_OUTPUT_FORMAT = "json"
16
13
  SPLUNK_DEFAULT_TIME_FIELD = "_time"
17
- SPLUNK_TIME_FIELD = { "name" => SPLUNK_DEFAULT_TIME_FIELD, "type" => "string" }
14
+ SPLUNK_TIME_FIELD = { "name" => SPLUNK_DEFAULT_TIME_FIELD, "type" => "string" }.freeze
18
15
 
19
16
  def self.transaction(config, &control)
20
17
 
21
- task = {
22
- "scheme" => config.param("scheme", :string, default: "https"),
23
- "host" => config.param("host", :string),
24
- "port" => config.param("port", :integer, default: 8089),
25
- "username" => config.param("username", :string),
26
- "password" => config.param("password", :string),
27
-
28
- "query" => config.param("query", :string),
29
-
30
- "earliest_time" => config.param(:earliest_time, :string, default: nil),
31
- "latest_time" => config.param(:latest_time, :string, default: nil),
32
-
33
- "incremental" => config.param("incremental", :bool, default: false),
34
- "table" => config.param("table", :array, default: [])
35
- }
18
+ task = task_from_config(config)
36
19
 
37
20
  if task["incremental"] && task["latest_time"]
38
21
  Embulk.logger.warn "Incremental is 'true' and latest_time is set. This may have unexpected results."
@@ -54,75 +37,118 @@ module Embulk
54
37
  task_reports = yield(task, columns, count)
55
38
 
56
39
  next_config_diff = {}
40
+
41
+ # This will work with multiple threads
42
+ latest_time_in_results = task_reports.collect do |report|
43
+ report[:latest_time_in_results].to_i
44
+ end.max
57
45
 
58
- latest_time_in_results = task_reports.first[:latest_time_in_results]
59
46
 
60
47
  if task["incremental"] && latest_time_in_results.present?
61
- next_config_diff[:earliest_time] = latest_time_in_results
48
+ next_config_diff[:earliest_time] = DateTime.strptime(latest_time_in_results.to_s, "%Q").strftime(SPLUNK_TIME_FORMAT)
62
49
  end
63
50
 
64
51
  return next_config_diff
65
52
  end
66
53
 
67
- def init
68
- splunk_config = {
54
+ def self.task_from_config(config)
55
+ task = {
56
+ "scheme" => config.param("scheme", :string, default: "https"),
57
+ "host" => config.param("host", :string),
58
+ "port" => config.param("port", :integer, default: 8089),
59
+ "username" => config.param("username", :string),
60
+ "password" => config.param("password", :string),
61
+
62
+ "max_results" => config.param("max_results", :integer, default: 50_000),
63
+
64
+ "query" => config.param("query", :string),
65
+
66
+ "earliest_time" => config.param(:earliest_time, :string, default: nil),
67
+ "latest_time" => config.param(:latest_time, :string, default: nil),
68
+
69
+ "incremental" => config.param("incremental", :bool, default: false),
70
+ "table" => config.param("table", :array, default: [])
71
+ }
72
+ end
73
+
74
+ protected
75
+
76
+ def build_query(query)
77
+ %Q{
78
+ #{query}
79
+ | sort #{SPLUNK_DEFAULT_TIME_FIELD}
80
+ | table #{ @fields.join(", ") }
81
+ }
82
+ end
83
+
84
+ def splunk_config
85
+ {
69
86
  :scheme => task[:scheme],
70
87
  :host => task[:host],
71
88
  :port => task[:port],
72
89
  :username => task[:username],
73
90
  :password => task[:password]
74
91
  }
92
+ end
93
+
94
+ public
95
+
96
+ def init
97
+ @max_results = task[:max_results]
75
98
  @earliest_time, @latest_time = task[:earliest_time], task[:latest_time]
76
- Embulk.logger.info "Earliest time: #{@earliest_time} / Latest time: #{@latest_time}"
77
-
78
99
  @fields = task["table"].collect { |entry| entry["name"] }
79
- Embulk.logger.info "Using fields #{@fields.join', '} in query"
80
-
81
100
  @query = build_query( task[:query] )
82
-
83
- Embulk.logger.info "Establishing connection to Splunk"
84
- @service = ::Splunk::connect(splunk_config)
85
- end
86
-
87
- def build_query(query)
88
- # Append table expression to query. Even if already present in the query, this should do no harm.
89
- "#{query} | table #{ @fields.join(", ") } "
90
101
  end
91
102
 
92
103
  def run
93
- Embulk.logger.info "Running query `#{@query}`"
94
-
95
- stream = @service.create_oneshot(@query,
96
- count: SPLUNK_UNLIMITED_RESULTS,
97
- output_format: SPLUNK_OUTPUT_FORMAT,
98
- earliest_time: @earliest_time,
99
- latest_time: @latest_time)
100
-
101
- reader = ::Splunk::ResultsReader.new(stream)
104
+ Embulk.logger.debug "Establishing connection to Splunk"
105
+ service = ::Splunk::connect(splunk_config)
102
106
 
103
107
  latest_time = nil
104
-
105
- reader.each do |result|
106
- #We convert event_time to Ruby time for comparison only.
107
- event_time = Time.strptime( result[SPLUNK_DEFAULT_TIME_FIELD], SPLUNK_TIME_FORMAT )
108
+ loop_count = 0
109
+
110
+ # There is a limit to how many results Splunk API will return.
111
+ # To avoid silently dropping results, we need to iterate until there are no more results.
112
+ loop do
113
+ number_of_results = 0
108
114
 
109
- #We need to keep track of latest time for incremental loads.
110
- # Unfortunately, Splunk was not respecting our sort requests, so we need to do a comparison for each row.
111
- latest_time = latest_time.nil? ? event_time : [latest_time, event_time].max
115
+ query_options = {
116
+ count: @max_results,
117
+ offset: loop_count * @max_results,
118
+ earliest_time: @earliest_time,
119
+ latest_time: @latest_time,
120
+ }
121
+
122
+ Embulk.logger.debug "Running query `#{@query}` with options #{query_options} in loop #{loop_count}"
123
+ stream = service.create_oneshot(@query, query_options)
112
124
 
113
- row = @fields.map { |field| result[ field ] }
114
- page_builder.add( row )
125
+ reader = ::Splunk::ResultsReader.new(stream)
126
+
127
+ reader.each do |result|
128
+ number_of_results += 1
129
+
130
+ # We convert event_time to an integer (epoch milliseconds) for easy comparison only.
131
+ event_time = DateTime.strptime( result[SPLUNK_DEFAULT_TIME_FIELD], SPLUNK_TIME_FORMAT ).strftime("%Q").to_i
132
+
133
+ # We need to keep track of latest time for incremental loads.
134
+ # Unfortunately, Splunk was not respecting our sort requests, so we need to do a comparison for each row.
135
+ latest_time = latest_time.nil? ? event_time : [latest_time, event_time].max
136
+
137
+ row = @fields.map { |field| result[ field ] }
138
+ page_builder.add( row )
139
+ end
140
+
141
+ break if (number_of_results < @max_results) || (@max_results == 0)
142
+
143
+ loop_count += 1
115
144
  end
116
145
 
117
146
  page_builder.finish
118
147
 
119
- task_result = {
120
- latest_time_in_results: latest_time.strftime(SPLUNK_TIME_FORMAT)
121
- }
122
-
123
- return task_result
148
+ return { latest_time_in_results: latest_time }
124
149
  end
125
150
  end
151
+
126
152
 
127
153
  end
128
154
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-input-splunk
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Scott Arbeitman
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-02-21 00:00:00.000000000 Z
11
+ date: 2018-02-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement