embulk-input-splunk 0.2.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/README.md +43 -2
- data/embulk-input-splunk.gemspec +1 -1
- data/lib/embulk/input/splunk.rb +84 -58
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 83626f940cf8546f4efce77990d9bc3d2cc4a7accc2e4c7d81eb512ee5d4b7a1
|
4
|
+
data.tar.gz: d2bbd85c9286e6f24b32f67332883c421091878dfa61b2163221ece6abde7f09
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4b628c93a3417e01f4fae427225cc69e92cb9a96de7d1fb91ba7e93cfd1bba4d0b26b7b373bd7e269a5a7c91730d0409811dc5d27a5774dcccf3a193c37ab9b5
|
7
|
+
data.tar.gz: 062b9db314f9540c3617cb5adba8401a5677775f51ee55df4f86ac7b8450f8826445415942a08d369cdc17312e605d3fc31d18bebf6b34fbccef80d597ff9c8f
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -25,6 +25,7 @@ In addition, as a column we treat `_time` as a String, but only because we could
|
|
25
25
|
- **username**: splunk username (string, required)
|
26
26
|
- **password**: splunk password (string, required)
|
27
27
|
- **port**: splunk API port (integer, default: 8089)
|
28
|
+
- **max_results**: API flag to limit the number of results returned. Set to zero for, in theory, no limit; however, the Splunk server configuration will generally cap this at 50,000. Setting this to a non-zero value will cause the plugin to keep fetching results in batches of `max_results` (pagination) (integer, default: 50000)
|
28
29
|
- **query**: the query you wish to run. It should be prefixed with "search" (string required)
|
29
30
|
- **earliest_time**: the earliest time for the splunk search. (string, default: nil, which is unbounded)
|
30
31
|
- **latest_time**: the latest time for the splunk search. (string, default: nil, which is unbounded)
|
@@ -102,11 +103,30 @@ in:
|
|
102
103
|
- {name: "bar", type: "long"}
|
103
104
|
```
|
104
105
|
|
106
|
+
### Max results
|
107
|
+
|
108
|
+
The query below is expected to return 100 rows, but `max_results` is set to 10. This will cause the plugin to loop 10 times, returning 10 results each time. In the end, you will receive the full 100 events.
|
109
|
+
|
110
|
+
```yaml
|
111
|
+
in:
|
112
|
+
type: splunk
|
113
|
+
host: splunk.example.com
|
114
|
+
username: splunk_user
|
115
|
+
password: abc123
|
116
|
+
port: 8089
|
117
|
+
max_results: 10
|
118
|
+
query: search index="main" | head 100
|
119
|
+
table:
|
120
|
+
- {name: "_time", type: "string"}
|
121
|
+
- {name: "foo", type: "string"}
|
122
|
+
- {name: "bar", type: "long"}
|
123
|
+
```
|
124
|
+
|
105
125
|
### Complex Searches
|
106
126
|
|
107
|
-
For those unfamiliar with YAML,
|
127
|
+
For those unfamiliar with YAML, `>` or `|` indicates a multiline string. In Splunk the pipe operator is also used for creating multi-step processing.
|
108
128
|
|
109
|
-
For non-trivial Splunk queries, you should leverage the YAML pipe alongside Splunk pipes for easier to read queries.
|
129
|
+
For non-trivial Splunk queries, you should leverage the YAML `|` or `>` alongside Splunk pipes for easier-to-read queries.
|
110
130
|
|
111
131
|
```yaml
|
112
132
|
in:
|
@@ -127,6 +147,27 @@ in:
|
|
127
147
|
- {name: "foo", type: "string"} # Uses foo from the above query
|
128
148
|
```
|
129
149
|
|
150
|
+
Or with the greater than symbol:
|
151
|
+
|
152
|
+
```yaml
|
153
|
+
in:
|
154
|
+
type: splunk
|
155
|
+
host: splunk.example.com
|
156
|
+
username: splunk_user
|
157
|
+
password: abc123
|
158
|
+
port: 8089
|
159
|
+
query: >
|
160
|
+
search index="main" |
|
161
|
+
eval foo=bar |
|
162
|
+
where like(bar, "%baz%") |
|
163
|
+
head 100
|
164
|
+
earliest_time: 2017-01-18T19:23:08.237+11:00
|
165
|
+
latest_time: 2018-01-18T19:23:08.237+11:00
|
166
|
+
table:
|
167
|
+
- {name: "_time", type: "string"}
|
168
|
+
- {name: "foo", type: "string"} # Uses foo from the above query
|
169
|
+
```
|
170
|
+
|
130
171
|
## Build
|
131
172
|
|
132
173
|
```
|
data/embulk-input-splunk.gemspec
CHANGED
data/lib/embulk/input/splunk.rb
CHANGED
@@ -9,30 +9,13 @@ module Embulk
|
|
9
9
|
class Splunk < InputPlugin
|
10
10
|
Plugin.register_input("splunk", self)
|
11
11
|
|
12
|
-
# Zero means unlimited results. Splunk's default is 100.
|
13
|
-
SPLUNK_UNLIMITED_RESULTS = 0
|
14
12
|
SPLUNK_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%L%:z"
|
15
|
-
SPLUNK_OUTPUT_FORMAT = "json"
|
16
13
|
SPLUNK_DEFAULT_TIME_FIELD = "_time"
|
17
|
-
SPLUNK_TIME_FIELD = { "name" => SPLUNK_DEFAULT_TIME_FIELD, "type" => "string" }
|
14
|
+
SPLUNK_TIME_FIELD = { "name" => SPLUNK_DEFAULT_TIME_FIELD, "type" => "string" }.freeze
|
18
15
|
|
19
16
|
def self.transaction(config, &control)
|
20
17
|
|
21
|
-
task =
|
22
|
-
"scheme" => config.param("scheme", :string, default: "https"),
|
23
|
-
"host" => config.param("host", :string),
|
24
|
-
"port" => config.param("port", :integer, default: 8089),
|
25
|
-
"username" => config.param("username", :string),
|
26
|
-
"password" => config.param("password", :string),
|
27
|
-
|
28
|
-
"query" => config.param("query", :string),
|
29
|
-
|
30
|
-
"earliest_time" => config.param(:earliest_time, :string, default: nil),
|
31
|
-
"latest_time" => config.param(:latest_time, :string, default: nil),
|
32
|
-
|
33
|
-
"incremental" => config.param("incremental", :bool, default: false),
|
34
|
-
"table" => config.param("table", :array, default: [])
|
35
|
-
}
|
18
|
+
task = task_from_config(config)
|
36
19
|
|
37
20
|
if task["incremental"] && task["latest_time"]
|
38
21
|
Embulk.logger.warn "Incremental is 'true' and latest_time is set. This may have unexpected results."
|
@@ -54,75 +37,118 @@ module Embulk
|
|
54
37
|
task_reports = yield(task, columns, count)
|
55
38
|
|
56
39
|
next_config_diff = {}
|
40
|
+
|
41
|
+
# This will work with multiple threads
|
42
|
+
latest_time_in_results = task_reports.collect do |report|
|
43
|
+
report[:latest_time_in_results].to_i
|
44
|
+
end.max
|
57
45
|
|
58
|
-
latest_time_in_results = task_reports.first[:latest_time_in_results]
|
59
46
|
|
60
47
|
if task["incremental"] && latest_time_in_results.present?
|
61
|
-
next_config_diff[:earliest_time] = latest_time_in_results
|
48
|
+
next_config_diff[:earliest_time] = DateTime.strptime(latest_time_in_results.to_s, "%Q").strftime(SPLUNK_TIME_FORMAT)
|
62
49
|
end
|
63
50
|
|
64
51
|
return next_config_diff
|
65
52
|
end
|
66
53
|
|
67
|
-
def
|
68
|
-
|
54
|
+
def self.task_from_config(config)
|
55
|
+
task = {
|
56
|
+
"scheme" => config.param("scheme", :string, default: "https"),
|
57
|
+
"host" => config.param("host", :string),
|
58
|
+
"port" => config.param("port", :integer, default: 8089),
|
59
|
+
"username" => config.param("username", :string),
|
60
|
+
"password" => config.param("password", :string),
|
61
|
+
|
62
|
+
"max_results" => config.param("max_results", :integer, default: 50_000),
|
63
|
+
|
64
|
+
"query" => config.param("query", :string),
|
65
|
+
|
66
|
+
"earliest_time" => config.param(:earliest_time, :string, default: nil),
|
67
|
+
"latest_time" => config.param(:latest_time, :string, default: nil),
|
68
|
+
|
69
|
+
"incremental" => config.param("incremental", :bool, default: false),
|
70
|
+
"table" => config.param("table", :array, default: [])
|
71
|
+
}
|
72
|
+
end
|
73
|
+
|
74
|
+
protected
|
75
|
+
|
76
|
+
def build_query(query)
|
77
|
+
%Q{
|
78
|
+
#{query}
|
79
|
+
| sort #{SPLUNK_DEFAULT_TIME_FIELD}
|
80
|
+
| table #{ @fields.join(", ") }
|
81
|
+
}
|
82
|
+
end
|
83
|
+
|
84
|
+
def splunk_config
|
85
|
+
{
|
69
86
|
:scheme => task[:scheme],
|
70
87
|
:host => task[:host],
|
71
88
|
:port => task[:port],
|
72
89
|
:username => task[:username],
|
73
90
|
:password => task[:password]
|
74
91
|
}
|
92
|
+
end
|
93
|
+
|
94
|
+
public
|
95
|
+
|
96
|
+
def init
|
97
|
+
@max_results = task[:max_results]
|
75
98
|
@earliest_time, @latest_time = task[:earliest_time], task[:latest_time]
|
76
|
-
Embulk.logger.info "Earliest time: #{@earliest_time} / Latest time: #{@latest_time}"
|
77
|
-
|
78
99
|
@fields = task["table"].collect { |entry| entry["name"] }
|
79
|
-
Embulk.logger.info "Using fields #{@fields.join', '} in query"
|
80
|
-
|
81
100
|
@query = build_query( task[:query] )
|
82
|
-
|
83
|
-
Embulk.logger.info "Establishing connection to Splunk"
|
84
|
-
@service = ::Splunk::connect(splunk_config)
|
85
|
-
end
|
86
|
-
|
87
|
-
def build_query(query)
|
88
|
-
# Append table expression to query. Even if already present in the query, this should do no harm.
|
89
|
-
"#{query} | table #{ @fields.join(", ") } "
|
90
101
|
end
|
91
102
|
|
92
103
|
def run
|
93
|
-
Embulk.logger.
|
94
|
-
|
95
|
-
stream = @service.create_oneshot(@query,
|
96
|
-
count: SPLUNK_UNLIMITED_RESULTS,
|
97
|
-
output_format: SPLUNK_OUTPUT_FORMAT,
|
98
|
-
earliest_time: @earliest_time,
|
99
|
-
latest_time: @latest_time)
|
100
|
-
|
101
|
-
reader = ::Splunk::ResultsReader.new(stream)
|
104
|
+
Embulk.logger.debug "Establishing connection to Splunk"
|
105
|
+
service = ::Splunk::connect(splunk_config)
|
102
106
|
|
103
107
|
latest_time = nil
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
+
loop_count = 0
|
109
|
+
|
110
|
+
# There is a limit to how many results Splunk API will return.
|
111
|
+
# To avoid silently dropping results, we need to iterate until there are no more results.
|
112
|
+
loop do
|
113
|
+
number_of_results = 0
|
108
114
|
|
109
|
-
|
110
|
-
|
111
|
-
|
115
|
+
query_options = {
|
116
|
+
count: @max_results,
|
117
|
+
offset: loop_count * @max_results,
|
118
|
+
earliest_time: @earliest_time,
|
119
|
+
latest_time: @latest_time,
|
120
|
+
}
|
121
|
+
|
122
|
+
Embulk.logger.debug "Running query `#{@query}` with options #{query_options} in loop #{loop_count}"
|
123
|
+
stream = service.create_oneshot(@query, query_options)
|
112
124
|
|
113
|
-
|
114
|
-
|
125
|
+
reader = ::Splunk::ResultsReader.new(stream)
|
126
|
+
|
127
|
+
reader.each do |result|
|
128
|
+
number_of_results += 1
|
129
|
+
|
130
|
+
# We convert event_time to an integer for easy comparison only.
|
131
|
+
event_time = DateTime.strptime( result[SPLUNK_DEFAULT_TIME_FIELD], SPLUNK_TIME_FORMAT ).strftime("%Q").to_i
|
132
|
+
|
133
|
+
# We need to keep track of latest time for incremental loads.
|
134
|
+
# Unfortunately, Splunk was not respecting our sort requests, so we need to do a comparison for each row.
|
135
|
+
latest_time = latest_time.nil? ? event_time : [latest_time, event_time].max
|
136
|
+
|
137
|
+
row = @fields.map { |field| result[ field ] }
|
138
|
+
page_builder.add( row )
|
139
|
+
end
|
140
|
+
|
141
|
+
break if (number_of_results < @max_results) || (@max_results == 0)
|
142
|
+
|
143
|
+
loop_count += 1
|
115
144
|
end
|
116
145
|
|
117
146
|
page_builder.finish
|
118
147
|
|
119
|
-
|
120
|
-
latest_time_in_results: latest_time.strftime(SPLUNK_TIME_FORMAT)
|
121
|
-
}
|
122
|
-
|
123
|
-
return task_result
|
148
|
+
return { latest_time_in_results: latest_time }
|
124
149
|
end
|
125
150
|
end
|
151
|
+
|
126
152
|
|
127
153
|
end
|
128
154
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-input-splunk
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Scott Arbeitman
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-02-
|
11
|
+
date: 2018-02-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|