embulk-input-splunk 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/README.md +43 -2
- data/embulk-input-splunk.gemspec +1 -1
- data/lib/embulk/input/splunk.rb +84 -58
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 83626f940cf8546f4efce77990d9bc3d2cc4a7accc2e4c7d81eb512ee5d4b7a1
|
4
|
+
data.tar.gz: d2bbd85c9286e6f24b32f67332883c421091878dfa61b2163221ece6abde7f09
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4b628c93a3417e01f4fae427225cc69e92cb9a96de7d1fb91ba7e93cfd1bba4d0b26b7b373bd7e269a5a7c91730d0409811dc5d27a5774dcccf3a193c37ab9b5
|
7
|
+
data.tar.gz: 062b9db314f9540c3617cb5adba8401a5677775f51ee55df4f86ac7b8450f8826445415942a08d369cdc17312e605d3fc31d18bebf6b34fbccef80d597ff9c8f
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -25,6 +25,7 @@ In addition, as a column we treat `_time` as a String, but only because we could
|
|
25
25
|
- **username**: splunk username (string, required)
|
26
26
|
- **password**: splunk password (string, required)
|
27
27
|
- **port**: splunk API port (integer, default: 8089)
|
28
|
+
- **max_results**: API flag to limit the number of results returned. Set to zero for (theoretically) no limit; however, Splunk server configuration will generally cap this at 50,000. Setting this to a non-zero value will cause the plugin to keep fetching results in batches of `max_results` (pagination) (integer, default: 50000)
|
28
29
|
- **query**: the query you wish to run. It should be prefixed with "search" (string, required)
|
29
30
|
- **earliest_time**: the earliest time for the splunk search. (string, default: nil, which is unbounded)
|
30
31
|
- **latest_time**: the latest time for the splunk search. (string, default: nil, which is unbounded)
|
@@ -102,11 +103,30 @@ in:
|
|
102
103
|
- {name: "bar", type: "long"}
|
103
104
|
```
|
104
105
|
|
106
|
+
### Max results
|
107
|
+
|
108
|
+
The query below is expected to return 100 rows, but `max_results` is set to 10. This will cause the plugin to loop 10 times, returning 10 results each time. In the end, you will receive the full 100 events.
|
109
|
+
|
110
|
+
```yaml
|
111
|
+
in:
|
112
|
+
type: splunk
|
113
|
+
host: splunk.example.com
|
114
|
+
username: splunk_user
|
115
|
+
password: abc123
|
116
|
+
port: 8089
|
117
|
+
max_results: 10
|
118
|
+
query: search index="main" | head 100
|
119
|
+
table:
|
120
|
+
- {name: "_time", type: "string"}
|
121
|
+
- {name: "foo", type: "string"}
|
122
|
+
- {name: "bar", type: "long"}
|
123
|
+
```
|
124
|
+
|
105
125
|
### Complex Searches
|
106
126
|
|
107
|
-
For those unfamiliar with YAML,
|
127
|
+
For those unfamiliar with YAML, `>` or `|` indicates a multiline string. In Splunk the pipe operator is also used for creating multi-step processing.
|
108
128
|
|
109
|
-
For non-trivial Splunk queries, you should leverage the YAML pipe alongside Splunk pipes for easier to read queries.
|
129
|
+
For non-trivial Splunk queries, you should leverage the YAML pipe (`|`) or `>` alongside Splunk pipes for easier-to-read queries.
|
110
130
|
|
111
131
|
```yaml
|
112
132
|
in:
|
@@ -127,6 +147,27 @@ in:
|
|
127
147
|
- {name: "foo", type: "string"} # Uses foo from the above query
|
128
148
|
```
|
129
149
|
|
150
|
+
Or with the greater than symbol:
|
151
|
+
|
152
|
+
```yaml
|
153
|
+
in:
|
154
|
+
type: splunk
|
155
|
+
host: splunk.example.com
|
156
|
+
username: splunk_user
|
157
|
+
password: abc123
|
158
|
+
port: 8089
|
159
|
+
query: >
|
160
|
+
search index="main" |
|
161
|
+
eval foo=bar |
|
162
|
+
where like(bar, "%baz%") |
|
163
|
+
head 100
|
164
|
+
earliest_time: 2017-01-18T19:23:08.237+11:00
|
165
|
+
latest_time: 2018-01-18T19:23:08.237+11:00
|
166
|
+
table:
|
167
|
+
- {name: "_time", type: "string"}
|
168
|
+
- {name: "foo", type: "string"} # Uses foo from the above query
|
169
|
+
```
|
170
|
+
|
130
171
|
## Build
|
131
172
|
|
132
173
|
```
|
data/embulk-input-splunk.gemspec
CHANGED
data/lib/embulk/input/splunk.rb
CHANGED
@@ -9,30 +9,13 @@ module Embulk
|
|
9
9
|
class Splunk < InputPlugin
|
10
10
|
Plugin.register_input("splunk", self)
|
11
11
|
|
12
|
-
# Zero means unlimited results. Splunk's default is 100.
|
13
|
-
SPLUNK_UNLIMITED_RESULTS = 0
|
14
12
|
SPLUNK_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%L%:z"
|
15
|
-
SPLUNK_OUTPUT_FORMAT = "json"
|
16
13
|
SPLUNK_DEFAULT_TIME_FIELD = "_time"
|
17
|
-
SPLUNK_TIME_FIELD = { "name" => SPLUNK_DEFAULT_TIME_FIELD, "type" => "string" }
|
14
|
+
SPLUNK_TIME_FIELD = { "name" => SPLUNK_DEFAULT_TIME_FIELD, "type" => "string" }.freeze
|
18
15
|
|
19
16
|
def self.transaction(config, &control)
|
20
17
|
|
21
|
-
task =
|
22
|
-
"scheme" => config.param("scheme", :string, default: "https"),
|
23
|
-
"host" => config.param("host", :string),
|
24
|
-
"port" => config.param("port", :integer, default: 8089),
|
25
|
-
"username" => config.param("username", :string),
|
26
|
-
"password" => config.param("password", :string),
|
27
|
-
|
28
|
-
"query" => config.param("query", :string),
|
29
|
-
|
30
|
-
"earliest_time" => config.param(:earliest_time, :string, default: nil),
|
31
|
-
"latest_time" => config.param(:latest_time, :string, default: nil),
|
32
|
-
|
33
|
-
"incremental" => config.param("incremental", :bool, default: false),
|
34
|
-
"table" => config.param("table", :array, default: [])
|
35
|
-
}
|
18
|
+
task = task_from_config(config)
|
36
19
|
|
37
20
|
if task["incremental"] && task["latest_time"]
|
38
21
|
Embulk.logger.warn "Incremental is 'true' and latest_time is set. This may have unexpected results."
|
@@ -54,75 +37,118 @@ module Embulk
|
|
54
37
|
task_reports = yield(task, columns, count)
|
55
38
|
|
56
39
|
next_config_diff = {}
|
40
|
+
|
41
|
+
# This will work with multiple threads
|
42
|
+
latest_time_in_results = task_reports.collect do |report|
|
43
|
+
report[:latest_time_in_results].to_i
|
44
|
+
end.max
|
57
45
|
|
58
|
-
latest_time_in_results = task_reports.first[:latest_time_in_results]
|
59
46
|
|
60
47
|
if task["incremental"] && latest_time_in_results.present?
|
61
|
-
next_config_diff[:earliest_time] = latest_time_in_results
|
48
|
+
next_config_diff[:earliest_time] = DateTime.strptime(latest_time_in_results.to_s, "%Q").strftime(SPLUNK_TIME_FORMAT)
|
62
49
|
end
|
63
50
|
|
64
51
|
return next_config_diff
|
65
52
|
end
|
66
53
|
|
67
|
-
def
|
68
|
-
|
54
|
+
def self.task_from_config(config)
|
55
|
+
task = {
|
56
|
+
"scheme" => config.param("scheme", :string, default: "https"),
|
57
|
+
"host" => config.param("host", :string),
|
58
|
+
"port" => config.param("port", :integer, default: 8089),
|
59
|
+
"username" => config.param("username", :string),
|
60
|
+
"password" => config.param("password", :string),
|
61
|
+
|
62
|
+
"max_results" => config.param("max_results", :integer, default: 50_000),
|
63
|
+
|
64
|
+
"query" => config.param("query", :string),
|
65
|
+
|
66
|
+
"earliest_time" => config.param(:earliest_time, :string, default: nil),
|
67
|
+
"latest_time" => config.param(:latest_time, :string, default: nil),
|
68
|
+
|
69
|
+
"incremental" => config.param("incremental", :bool, default: false),
|
70
|
+
"table" => config.param("table", :array, default: [])
|
71
|
+
}
|
72
|
+
end
|
73
|
+
|
74
|
+
protected
|
75
|
+
|
76
|
+
def build_query(query)
|
77
|
+
%Q{
|
78
|
+
#{query}
|
79
|
+
| sort #{SPLUNK_DEFAULT_TIME_FIELD}
|
80
|
+
| table #{ @fields.join(", ") }
|
81
|
+
}
|
82
|
+
end
|
83
|
+
|
84
|
+
def splunk_config
|
85
|
+
{
|
69
86
|
:scheme => task[:scheme],
|
70
87
|
:host => task[:host],
|
71
88
|
:port => task[:port],
|
72
89
|
:username => task[:username],
|
73
90
|
:password => task[:password]
|
74
91
|
}
|
92
|
+
end
|
93
|
+
|
94
|
+
public
|
95
|
+
|
96
|
+
def init
|
97
|
+
@max_results = task[:max_results]
|
75
98
|
@earliest_time, @latest_time = task[:earliest_time], task[:latest_time]
|
76
|
-
Embulk.logger.info "Earliest time: #{@earliest_time} / Latest time: #{@latest_time}"
|
77
|
-
|
78
99
|
@fields = task["table"].collect { |entry| entry["name"] }
|
79
|
-
Embulk.logger.info "Using fields #{@fields.join', '} in query"
|
80
|
-
|
81
100
|
@query = build_query( task[:query] )
|
82
|
-
|
83
|
-
Embulk.logger.info "Establishing connection to Splunk"
|
84
|
-
@service = ::Splunk::connect(splunk_config)
|
85
|
-
end
|
86
|
-
|
87
|
-
def build_query(query)
|
88
|
-
# Append table expression to query. Even if already present in the query, this should do no harm.
|
89
|
-
"#{query} | table #{ @fields.join(", ") } "
|
90
101
|
end
|
91
102
|
|
92
103
|
def run
|
93
|
-
Embulk.logger.
|
94
|
-
|
95
|
-
stream = @service.create_oneshot(@query,
|
96
|
-
count: SPLUNK_UNLIMITED_RESULTS,
|
97
|
-
output_format: SPLUNK_OUTPUT_FORMAT,
|
98
|
-
earliest_time: @earliest_time,
|
99
|
-
latest_time: @latest_time)
|
100
|
-
|
101
|
-
reader = ::Splunk::ResultsReader.new(stream)
|
104
|
+
Embulk.logger.debug "Establishing connection to Splunk"
|
105
|
+
service = ::Splunk::connect(splunk_config)
|
102
106
|
|
103
107
|
latest_time = nil
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
+
loop_count = 0
|
109
|
+
|
110
|
+
# There is a limit to how many results Splunk API will return.
|
111
|
+
# To avoid silently dropping results, we need to iterate until there are no more results.
|
112
|
+
loop do
|
113
|
+
number_of_results = 0
|
108
114
|
|
109
|
-
|
110
|
-
|
111
|
-
|
115
|
+
query_options = {
|
116
|
+
count: @max_results,
|
117
|
+
offset: loop_count * @max_results,
|
118
|
+
earliest_time: @earliest_time,
|
119
|
+
latest_time: @latest_time,
|
120
|
+
}
|
121
|
+
|
122
|
+
Embulk.logger.debug "Running query `#{@query}` with options #{query_options} in loop #{loop_count}"
|
123
|
+
stream = service.create_oneshot(@query, query_options)
|
112
124
|
|
113
|
-
|
114
|
-
|
125
|
+
reader = ::Splunk::ResultsReader.new(stream)
|
126
|
+
|
127
|
+
reader.each do |result|
|
128
|
+
number_of_results += 1
|
129
|
+
|
130
|
+
# We convert event_time to an integer for easy comparison only.
|
131
|
+
event_time = DateTime.strptime( result[SPLUNK_DEFAULT_TIME_FIELD], SPLUNK_TIME_FORMAT ).strftime("%Q").to_i
|
132
|
+
|
133
|
+
# We need to keep track of latest time for incremental loads.
|
134
|
+
# Unfortunately, Splunk was not respecting our sort requests, so we need to do a comparison for each row.
|
135
|
+
latest_time = latest_time.nil? ? event_time : [latest_time, event_time].max
|
136
|
+
|
137
|
+
row = @fields.map { |field| result[ field ] }
|
138
|
+
page_builder.add( row )
|
139
|
+
end
|
140
|
+
|
141
|
+
break if (number_of_results < @max_results) || (@max_results == 0)
|
142
|
+
|
143
|
+
loop_count += 1
|
115
144
|
end
|
116
145
|
|
117
146
|
page_builder.finish
|
118
147
|
|
119
|
-
|
120
|
-
latest_time_in_results: latest_time.strftime(SPLUNK_TIME_FORMAT)
|
121
|
-
}
|
122
|
-
|
123
|
-
return task_result
|
148
|
+
return { latest_time_in_results: latest_time }
|
124
149
|
end
|
125
150
|
end
|
151
|
+
|
126
152
|
|
127
153
|
end
|
128
154
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-input-splunk
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Scott Arbeitman
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-02-
|
11
|
+
date: 2018-02-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|