logstash-input-azureblob 0.9.8 → 0.9.9
- checksums.yaml +4 -4
- data/README.md +185 -1
- data/lib/logstash/inputs/azureblob.rb +121 -7
- data/logstash-input-azureblob.gemspec +2 -2
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 56832bbecde64cfccd20199348a414c0738ed9c1
+  data.tar.gz: fcf9cf2cae426c7f4b4f94dd7151ebac0b2b2964
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: cae7c2d9420417e758cea143bd9852e112fc98aed2c0e7040923e15e7379c1acb149f94f7eab73c73b50e67dde8869233d10341dfa6f2594078e4104d46c294e
+  data.tar.gz: 0962291e17c6f9e9c3eea68e898cb3ccdd0ba8ded581b7d6d087958155f71100ac4013ea2306264b5b9d36d88567f40804a51cc77b7902c9b4daf257a4064049
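The two digests cover the archives packed inside the `.gem` file. A minimal Ruby sketch (not part of the gem; it assumes the `.gem` has been unpacked so `metadata.gz` and `data.tar.gz` sit in the working directory) of how the SHA1 entries can be verified:

```ruby
require 'digest'

# checksums.yaml pins the digests of the two archives inside the .gem.
expected = {
  'metadata.gz' => '56832bbecde64cfccd20199348a414c0738ed9c1',
  'data.tar.gz' => 'fcf9cf2cae426c7f4b4f94dd7151ebac0b2b2964'
}

expected.each do |file, sha1|
  actual = Digest::SHA1.file(file).hexdigest
  puts "#{file}: #{actual == sha1 ? 'OK' : 'MISMATCH'}"
end
```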
data/README.md
CHANGED
@@ -56,8 +56,37 @@ When set to `start_over`, it assumes none of the blob is consumed and it will re
 
 Offsets will be picked up from registry file whenever it exists.
 
+__*file_head_bytes*__
+
+Specifies the header of the file, in bytes, that does not repeat over records. Usually these are json opening tags. The default value is `0`.
+
+__*file_tail_bytes*__
+
+Specifies the tail of the file, in bytes, that does not repeat over records. Usually these are json closing tags. The default value is `0`.
+
+### Advanced tweaking parameters
+
+Keep these parameters at their defaults under normal situations. Tweak them when dealing with large-scale Azure blobs and logs.
+
+__*blob_list_page_size*__
+
+Specifies the page size for returned blob items. A value that is too big can cause a heap overflow; a value that is too small leads to too many requests. The default of `100` is good for a heap size of 1G.
+
+__*break_json_down_policy*__
+
+Only works when the codec is set to `json`. Sets the policy for breaking the json objects in the array into small events. Breaking the json down is not as efficient as keeping it whole, but it reduces memory usage. Possible options: `do_not_break`, `with_head_tail`, `without_head_tail`.
+
+The default value is `do_not_break`.
+
+__*break_json_batch_count*__
+
+Only works when the codec is set to `json`. Sets how many json objects are put into one batch when the json is broken down. The bigger this is, the more memory is used and the bigger the json handed to the codec. This is useful when a big json array needs to be broken into small pieces. Set to `1` to send the json objects in the array one by one.
+
 ### Examples
-```yaml
+
+* Bare-bone settings:
+
+```yaml
 input
 {
     azureblob
@@ -69,5 +98,160 @@ input
 }
 ```
 
+* Example for Wad-IIS
+
+```yaml
+input {
+    azureblob
+    {
+        storage_account_name => 'mystorageaccount'
+        storage_access_key => 'VGhpcyBpcyBhIGZha2Uga2V5Lg=='
+        container => 'wad-iis-logfiles'
+        codec => line
+    }
+}
+filter {
+  ## Ignore the comments that IIS will add to the start of the W3C logs
+  #
+  if [message] =~ "^#" {
+    drop {}
+  }
+
+  grok {
+    # https://grokdebug.herokuapp.com/
+    match => ["message", "%{TIMESTAMP_ISO8601:log_timestamp} %{WORD:sitename} %{WORD:computername} %{IP:server_ip} %{WORD:method} %{URIPATH:uriStem} %{NOTSPACE:uriQuery} %{NUMBER:port} %{NOTSPACE:username} %{IPORHOST:clientIP} %{NOTSPACE:protocolVersion} %{NOTSPACE:userAgent} %{NOTSPACE:cookie} %{NOTSPACE:referer} %{NOTSPACE:requestHost} %{NUMBER:response} %{NUMBER:subresponse} %{NUMBER:win32response} %{NUMBER:bytesSent} %{NUMBER:bytesReceived} %{NUMBER:timetaken}"]
+  }
+
+  ## Set the Event Timestamp from the log
+  #
+  date {
+    match => [ "log_timestamp", "YYYY-MM-dd HH:mm:ss" ]
+    timezone => "Etc/UTC"
+  }
+
+  ## If the log record has a value for 'bytesSent', then add a new field
+  # to the event that converts it to kilobytes
+  #
+  if [bytesSent] {
+    ruby {
+      code => "event['kilobytesSent'] = event['bytesSent'].to_i / 1024.0"
+    }
+  }
+
+  ## Do the same conversion for the bytes received value
+  #
+  if [bytesReceived] {
+    ruby {
+      code => "event['kilobytesReceived'] = event['bytesReceived'].to_i / 1024.0"
+    }
+  }
+
+  ## Perform some mutations on the records to prep them for Elastic
+  #
+  mutate {
+    ## Convert some fields from strings to integers
+    #
+    convert => ["bytesSent", "integer"]
+    convert => ["bytesReceived", "integer"]
+    convert => ["timetaken", "integer"]
+
+    ## Create a new field for the reverse DNS lookup below
+    #
+    add_field => { "clientHostname" => "%{clientIP}" }
+
+    ## Finally remove the original log_timestamp field since the event will
+    # have the proper date on it
+    #
+    remove_field => [ "log_timestamp" ]
+  }
+
+  ## Do a reverse lookup on the client IP to get their hostname.
+  #
+  dns {
+    ## Now that we've copied the clientIP into a new field we can
+    # simply replace it here using a reverse lookup
+    #
+    action => "replace"
+    reverse => ["clientHostname"]
+  }
+
+  ## Parse out the user agent
+  #
+  useragent {
+    source => "userAgent"
+    prefix => "browser"
+  }
+}
+output {
+  file {
+    path => '/var/tmp/logstash-file-output'
+    codec => rubydebug
+  }
+  stdout {
+    codec => rubydebug
+  }
+}
+```
+
+* NSG Logs
+
+```yaml
+input {
+    azureblob
+    {
+        storage_account_name => "mystorageaccount"
+        storage_access_key => "VGhpcyBpcyBhIGZha2Uga2V5Lg=="
+        container => "insights-logs-networksecuritygroupflowevent"
+        codec => "json"
+        # Refer https://docs.microsoft.com/en-us/azure/network-watcher/network-watcher-read-nsg-flow-logs
+        # Typical numbers could be 21/9 or 12/2, depending on the NSG log file types
+        file_head_bytes => 21
+        file_tail_bytes => 9
+        # Enable / tweak these settings when the event is too big for the codec to handle.
+        # break_json_down_policy => "with_head_tail"
+        # break_json_batch_count => 2
+    }
+}
+
+filter {
+  split { field => "[records]" }
+  split { field => "[records][properties][flows]" }
+  split { field => "[records][properties][flows][flows]" }
+  split { field => "[records][properties][flows][flows][flowTuples]" }
+
+  mutate {
+    split => { "[records][resourceId]" => "/" }
+    add_field => { "Subscription" => "%{[records][resourceId][2]}"
+                   "ResourceGroup" => "%{[records][resourceId][4]}"
+                   "NetworkSecurityGroup" => "%{[records][resourceId][8]}" }
+    convert => { "Subscription" => "string" }
+    convert => { "ResourceGroup" => "string" }
+    convert => { "NetworkSecurityGroup" => "string" }
+    split => { "[records][properties][flows][flows][flowTuples]" => "," }
+    add_field => {
+      "unixtimestamp" => "%{[records][properties][flows][flows][flowTuples][0]}"
+      "srcIp" => "%{[records][properties][flows][flows][flowTuples][1]}"
+      "destIp" => "%{[records][properties][flows][flows][flowTuples][2]}"
+      "srcPort" => "%{[records][properties][flows][flows][flowTuples][3]}"
+      "destPort" => "%{[records][properties][flows][flows][flowTuples][4]}"
+      "protocol" => "%{[records][properties][flows][flows][flowTuples][5]}"
+      "trafficflow" => "%{[records][properties][flows][flows][flowTuples][6]}"
+      "traffic" => "%{[records][properties][flows][flows][flowTuples][7]}"
+    }
+    convert => { "unixtimestamp" => "integer" }
+    convert => { "srcPort" => "integer" }
+    convert => { "destPort" => "integer" }
+  }
+
+  date {
+    match => ["unixtimestamp", "UNIX"]
+  }
+}
+
+output {
+  stdout { codec => rubydebug }
+}
+```
+
 ## More information
 The source code of this plugin is hosted in GitHub repo [Microsoft Azure Diagnostics with ELK](https://github.com/Azure/azure-diagnostics-tools). We welcome you to provide feedback and/or contribute to the project.
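The `file_head_bytes` / `file_tail_bytes` values used above are just the byte counts of the non-repeating envelope of one sample blob. A minimal Ruby sketch of how to derive them; the envelope strings are illustrative, not the exact NSG format:

```ruby
# Measure the opening and closing text that wraps the records in one
# sample blob; these byte counts feed file_head_bytes / file_tail_bytes.
head = '{"records":['   # everything before the first record (illustrative)
tail = ']}'             # everything after the last record (illustrative)

puts "file_head_bytes => #{head.bytesize}"
puts "file_tail_bytes => #{tail.bytesize}"
```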
data/lib/logstash/inputs/azureblob.rb
CHANGED
@@ -77,6 +77,28 @@ class LogStash::Inputs::LogstashInputAzureblob < LogStash::Inputs::Base
   # When set to `start_over`, it will read all log files from begining.
   config :registry_create_policy, :validate => :string, :default => 'resume'
 
+  # Sets the header of the file that does not repeat over records. Usually, these are json opening tags.
+  config :file_head_bytes, :validate => :number, :default => 0
+
+  # Sets the tail of the file that does not repeat over records. Usually, these are json closing tags.
+  config :file_tail_bytes, :validate => :number, :default => 0
+
+  # Sets how to break down the json
+  #
+  # Only works when the codec is set to `json`. Sets the policy for breaking the json objects in the array into small events.
+  # Breaking the json down is not as efficient as keeping it whole, but it reduces memory usage.
+  # Possible options: `do_not_break`, `with_head_tail`, `without_head_tail`
+  config :break_json_down_policy, :validate => :string, :default => 'do_not_break'
+
+  # Sets how many json objects are put into one batch when the json is broken down
+  config :break_json_batch_count, :validate => :number, :default => 10
+
+  # Sets the page size for returned blob items. A value that is too big can cause a heap overflow; a value that is too small leads to too many requests.
+  #
+  # The default, `100`, is good for the default heap size of 1G.
+  config :blob_list_page_size, :validate => :number, :default => 100
+
   # Constant of max integer
   MAX = 2 ** ([42].pack('i').size * 16 -2 ) -1
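As a sanity check on the `MAX` constant above: `[42].pack('i').size` is the byte width of a native int, so on common platforms the expression evaluates to `2**62 - 1`:

```ruby
int_bytes = [42].pack('i').size    # 4 bytes on common platforms
max = 2**(int_bytes * 16 - 2) - 1  # 2**62 - 1 = 4611686018427387903
puts max
```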
@@ -98,6 +120,7 @@ class LogStash::Inputs::LogstashInputAzureblob < LogStash::Inputs::Base
     # we can abort the loop if stop? becomes true
     while !stop?
       process(queue)
+      @logger.debug("Hitting interval of #{@interval}ms . . .")
       Stud.stoppable_sleep(@interval) { stop? }
     end # loop
   end # def run
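The surrounding `run` loop is the usual Stud polling pattern: do one unit of work, then sleep for `interval` seconds, waking early when the plugin is asked to stop. A minimal standalone sketch of that pattern, assuming only the `stud` gem:

```ruby
require 'stud/interval'

stopping = false

worker = Thread.new do
  until stopping
    puts 'process one batch...'           # stands in for process(queue)
    Stud.stoppable_sleep(5) { stopping }  # returns early once the flag flips
  end
end

sleep 1
stopping = true  # request shutdown; the sleep is interrupted
worker.join
```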
@@ -117,15 +140,63 @@ class LogStash::Inputs::LogstashInputAzureblob < LogStash::Inputs::Base
       # Work-around: After returned by get_blob, the etag will contains quotes.
       new_etag = blob.properties[:etag]
       # ~ Work-around
-
-
-
-
-
-
+
+      blob, header = @azure_blob.get_blob(@container, blob_name, {:end_range => (@file_head_bytes - 1)}) if header.nil? unless @file_head_bytes.nil? or @file_head_bytes <= 0
+
+      if start_index == 0
+        # Skip the header since it is already read.
+        start_index = @file_head_bytes
+      else
+        # Adjust the offset on reads after the first, then read till the end of the file, including the tail.
+        start_index = start_index - @file_tail_bytes
+        start_index = 0 if start_index < 0
+      end
+
+      blob, content = @azure_blob.get_blob(@container, blob_name, {:start_range => start_index})
+
+      # content will be used to calculate the new offset. Create a new variable for processed content.
+      processed_content = content
+
+      is_json_codec = (defined?(LogStash::Codecs::JSON) == 'constant') && (@codec.is_a? LogStash::Codecs::JSON)
+      if (is_json_codec)
+        skip = processed_content.index '{'
+        processed_content = processed_content[skip..-1] unless skip.nil?
+      end #if
+
+      if is_json_codec && (@break_json_down_policy != 'do_not_break')
+        @logger.debug("Codec is json and the policy is not do_not_break.")
+
+        @break_json_batch_count = 1 if @break_json_batch_count <= 0
+        tail = processed_content[-@file_tail_bytes..-1]
+        while (!processed_content.nil? && processed_content.length > @file_tail_bytes)
+          json_event, processed_content = get_jsons(processed_content, @break_json_batch_count)
+          @logger.debug("Got json: ========================")
+          @logger.debug("#{json_event[0..50]}...#{json_event[-50..-1]}")
+          @logger.debug("End got json: ========================")
+          @logger.debug("Processed content: #{processed_content[0..50]}...")
+          break if json_event.nil?
+          if @break_json_down_policy == 'with_head_tail'
+            @logger.debug("Adding json head/tails.")
+            json_event = "#{header}#{json_event}#{tail}"
+          end #if
+          @codec.decode(json_event) do |event|
+            decorate(event)
+            queue << event
+          end # decode
+        end
+      else
+        @logger.debug("Non-json codec or the policy is do_not_break.")
+        # Put the header, content and tail together before pushing into the event queue.
+        processed_content = "#{header}#{processed_content}" unless header.nil? || header.length == 0
+        @codec.decode(processed_content) do |event|
+          decorate(event)
+          queue << event
+        end # decode
+      end #if
     ensure
       # Making sure the reader is removed from the registry even when there's exception.
       new_offset = start_index
+      new_offset = 0 if start_index == @file_head_bytes && content.nil? # Reset the offset when nothing has been read.
       new_offset = new_offset + content.length unless content.nil?
       new_registry_item = LogStash::Inputs::RegistryItem.new(blob_name, new_etag, nil, new_offset, gen)
       update_registry(new_registry_item)
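The `with_head_tail` branch above re-wraps each extracted batch with the file's header and tail so the `json` codec still receives a syntactically complete document. A toy, standalone illustration:

```ruby
# Toy data: header/tail mimic a json envelope; the batch is a slice that
# get_jsons would produce from the middle of the array.
header = '{"records":['
tail   = ']}'
batch  = '{"a":1},{"b":2}'

json_event = "#{header}#{batch}#{tail}"
puts json_event  # => {"records":[{"a":1},{"b":2}]}
```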
@@ -135,6 +206,47 @@ class LogStash::Inputs::LogstashInputAzureblob < LogStash::Inputs::Base
       @logger.error("Oh My, An error occurred. \nError:#{e}:\nTrace:\n#{e.backtrace}", :exception => e)
     end # begin
   end # process
+
+  # Pulls up to batch_size complete json objects off the front of content; returns the batch, the remaining string and the number of objects found.
+  def get_jsons(content, batch_size)
+    return nil, content, 0 if content.nil? || content.length == 0
+    return nil, content, 0 if (content.index '{').nil?
+
+    hit = 0
+    count = 0
+    index = 0
+    first = content.index('{')
+    move_opening = true
+    move_closing = true
+    while (hit < batch_size)
+      inIndex = content.index('{', index) if move_opening
+      outIndex = content.index('}', index) if move_closing
+
+      # TODO: Fix the ending condition
+      break if count == 0 && (inIndex.nil? || outIndex.nil?)
+
+      if (inIndex.nil?)
+        index = outIndex
+      elsif (outIndex.nil?)
+        index = inIndex
+      else
+        index = [inIndex, outIndex].min
+      end #if
+      if content[index] == '{'
+        count += 1
+        move_opening = true
+        move_closing = false
+      elsif content[index] == '}'
+        count -= 1
+        move_closing = true
+        move_opening = false
+      end #if
+      index += 1
+      hit += 1 if count == 0
+    end
+
+    return content[first..index - 1], content[index..-1], hit
+  end #def get_jsons
 
   # Deserialize registry hash from json string.
   def deserialize_registry_hash (json_string)
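`get_jsons` is a plain brace-counting scanner: it walks the string tracking nesting depth and counts a hit each time the depth returns to zero. A standalone copy of the same logic, showing what it returns on a small array body:

```ruby
# Standalone copy of the brace-counting scanner used by the plugin.
def get_jsons(content, batch_size)
  return nil, content, 0 if content.nil? || content.empty?
  first = content.index('{')
  return nil, content, 0 if first.nil?

  hit = 0                 # complete top-level objects seen so far
  count = 0               # current brace nesting depth
  index = 0
  in_index = out_index = nil
  move_opening = move_closing = true

  while hit < batch_size
    in_index = content.index('{', index) if move_opening
    out_index = content.index('}', index) if move_closing
    break if count == 0 && (in_index.nil? || out_index.nil?)  # no more objects

    index = if in_index.nil?
              out_index
            elsif out_index.nil?
              in_index
            else
              [in_index, out_index].min
            end

    if content[index] == '{'
      count += 1
      move_opening = true
      move_closing = false
    else
      count -= 1
      move_closing = true
      move_opening = false
    end
    index += 1
    hit += 1 if count == 0  # depth back to zero: one object finished
  end

  return content[first..index - 1], content[index..-1], hit
end

batch, rest, found = get_jsons('[{"a":1},{"b":2},{"c":3}]', 2)
puts batch  # => {"a":1},{"b":2}
puts rest   # => ,{"c":3}]
puts found  # => 2
```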
@@ -150,8 +262,10 @@ class LogStash::Inputs::LogstashInputAzureblob < LogStash::Inputs::Base
   def list_all_blobs
     blobs = Set.new []
     continuation_token = NIL
+    @blob_list_page_size = 100 if @blob_list_page_size <= 0
     loop do
-
+      # Limit the number of returned entries to avoid an out-of-memory exception.
+      entries = @azure_blob.list_blobs(@container, { :timeout => 10, :marker => continuation_token, :max_results => @blob_list_page_size })
       entries.each do |entry|
         blobs << entry
       end # each
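The pagination added here is the usual continuation-token loop: request one page, remember the marker, and stop once no token comes back. A minimal sketch against a stand-in client (illustrative only; the real call is `azure-storage`'s `list_blobs`, and the token bookkeeping presumably follows just after this hunk):

```ruby
require 'set'

# client: anything that responds to list_blobs(container, options) and
# returns a page that is enumerable and responds to #continuation_token.
def list_all(client, container, page_size)
  blobs = Set.new
  token = nil
  loop do
    page = client.list_blobs(container, :marker => token, :max_results => page_size)
    page.each { |entry| blobs << entry }
    token = page.continuation_token
    break if token.nil? || token.empty?  # no token means the last page
  end
  blobs
end
```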
data/logstash-input-azureblob.gemspec
CHANGED
@@ -1,6 +1,6 @@
 Gem::Specification.new do |s|
   s.name = 'logstash-input-azureblob'
-  s.version = '0.9.8'
+  s.version = '0.9.9'
   s.licenses = ['Apache License (2.0)']
   s.summary = 'This plugin collects Microsoft Azure Diagnostics data from Azure Storage Blobs.'
   s.description = 'This gem is a Logstash plugin. It reads and parses data from Azure Storage Blobs.'
@@ -21,6 +21,6 @@ Gem::Specification.new do |s|
   s.add_runtime_dependency "logstash-core-plugin-api", '>= 1.60', '<= 2.99'
   s.add_runtime_dependency 'logstash-codec-json_lines'
   s.add_runtime_dependency 'stud', '>= 0.0.22'
-  s.add_runtime_dependency 'azure-storage', '~> 0.
+  s.add_runtime_dependency 'azure-storage', '~> 0.12.3.preview'
   s.add_development_dependency 'logstash-devutils'
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: logstash-input-azureblob
 version: !ruby/object:Gem::Version
-  version: 0.9.8
+  version: 0.9.9
 platform: ruby
 authors:
 - Microsoft Corporation
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2017-
+date: 2017-08-31 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement
@@ -63,7 +63,7 @@ dependencies:
   requirements:
   - - "~>"
     - !ruby/object:Gem::Version
-      version: 0.
+      version: 0.12.3.preview
   name: azure-storage
   prerelease: false
   type: :runtime
@@ -71,7 +71,7 @@ dependencies:
   requirements:
   - - "~>"
     - !ruby/object:Gem::Version
-      version: 0.
+      version: 0.12.3.preview
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement
   requirements: