logstash-input-azureblob 0.9.8 → 0.9.9

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: e72dbef9d73b6b872b786b1cd40ffd3434835ac4
- data.tar.gz: 9054b485b8073ca0dc9cce2e09c125630caec5fc
+ metadata.gz: 56832bbecde64cfccd20199348a414c0738ed9c1
+ data.tar.gz: fcf9cf2cae426c7f4b4f94dd7151ebac0b2b2964
  SHA512:
- metadata.gz: 917b7f0ad4b0a552091a6055c9e5fd5eeef38d6ce0ce85166eb432d2a5737a8d49f75c28579ce0119ed1887c952a0c372dc620cdf41bcdfdd9d30e9160e33c0a
- data.tar.gz: 3f513712c43563b6ea3407b23516e784b1d4d8fc706213247132b57827f7605ed55cbcacb9c357525fb84bc18920ab367a03e411b658d56e4da7506152560767
+ metadata.gz: cae7c2d9420417e758cea143bd9852e112fc98aed2c0e7040923e15e7379c1acb149f94f7eab73c73b50e67dde8869233d10341dfa6f2594078e4104d46c294e
+ data.tar.gz: 0962291e17c6f9e9c3eea68e898cb3ccdd0ba8ded581b7d6d087958155f71100ac4013ea2306264b5b9d36d88567f40804a51cc77b7902c9b4daf257a4064049
data/README.md CHANGED
@@ -56,8 +56,37 @@ When set to `start_over`, it assumes none of the blob is consumed and it will re

  Offsets will be picked up from registry file whenever it exists.

+ __*file_head_bytes*__
+
+ Specifies the header of the file, in bytes, that does not repeat over records. Usually these are json opening tags. The default value is `0`.
+
+ __*file_tail_bytes*__
+
+ Specifies the tail of the file, in bytes, that does not repeat over records. Usually these are json closing tags. The default value is `0`.
+
+ ### Advanced tweaking parameters
+
+ Keep these parameters at their defaults under normal circumstances. Tweak them when dealing with large-scale Azure blobs and logs.
+
+ __*blob_list_page_size*__
+
+ Specifies the page size for returned blob items. Too large a value can cause a heap overflow; too small a value leads to too many requests. The default of `100` works well with a heap size of 1 GB.
+
+ __*break_json_down_policy*__
+
+ Only works when the codec is set to `json`. Sets the policy for breaking the json objects in the array into small events. Breaking the json into small sections is not as efficient as keeping it whole, but it reduces memory usage. Possible options: `do_not_break`, `with_head_tail`, `without_head_tail`.
+
+ The default value is `do_not_break`.
+
+ __*break_json_batch_count*__
+
+ Only works when the codec is set to `json`. Sets how many json objects are put into one batch when the json is broken down. The larger the value, the more memory is used and the bigger the json handed to the codec. This is useful for breaking a big json array into small pieces. Set it to `1` to emit the json objects in the array one by one.
+
  ### Examples
- ```
+
+ * Bare-bone settings:
+
+ ```yaml
  input
  {
  azureblob
@@ -69,5 +98,160 @@ input
  }
  ```

+ * Example for Wad-IIS
+
+ ```yaml
+ input {
+ azureblob
+ {
+ storage_account_name => 'mystorageaccount'
+ storage_access_key => 'VGhpcyBpcyBhIGZha2Uga2V5Lg=='
+ container => 'wad-iis-logfiles'
+ codec => line
+ }
+ }
+ filter {
+ ## Ignore the comments that IIS will add to the start of the W3C logs
+ #
+ if [message] =~ "^#" {
+ drop {}
+ }
+
+ grok {
+ # https://grokdebug.herokuapp.com/
+ match => ["message", "%{TIMESTAMP_ISO8601:log_timestamp} %{WORD:sitename} %{WORD:computername} %{IP:server_ip} %{WORD:method} %{URIPATH:uriStem} %{NOTSPACE:uriQuery} %{NUMBER:port} %{NOTSPACE:username} %{IPORHOST:clientIP} %{NOTSPACE:protocolVersion} %{NOTSPACE:userAgent} %{NOTSPACE:cookie} %{NOTSPACE:referer} %{NOTSPACE:requestHost} %{NUMBER:response} %{NUMBER:subresponse} %{NUMBER:win32response} %{NUMBER:bytesSent} %{NUMBER:bytesReceived} %{NUMBER:timetaken}"]
+ }
+
+ ## Set the Event Timestamp from the log
+ #
+ date {
+ match => [ "log_timestamp", "YYYY-MM-dd HH:mm:ss" ]
+ timezone => "Etc/UTC"
+ }
+
+ ## If the log record has a value for 'bytesSent', then add a new field
+ # to the event that converts it to kilobytes
+ #
+ if [bytesSent] {
+ ruby {
+ code => "event['kilobytesSent'] = event['bytesSent'].to_i / 1024.0"
+ }
+ }
+
+ ## Do the same conversion for the bytes received value
+ #
+ if [bytesReceived] {
+ ruby {
+ code => "event['kilobytesReceived'] = event['bytesReceived'].to_i / 1024.0"
+ }
+ }
+
+ ## Perform some mutations on the records to prep them for Elastic
+ #
+ mutate {
+ ## Convert some fields from strings to integers
+ #
+ convert => ["bytesSent", "integer"]
+ convert => ["bytesReceived", "integer"]
+ convert => ["timetaken", "integer"]
+
+ ## Create a new field for the reverse DNS lookup below
+ #
+ add_field => { "clientHostname" => "%{clientIP}" }
+
+ ## Finally remove the original log_timestamp field since the event will
+ # have the proper date on it
+ #
+ remove_field => [ "log_timestamp"]
+ }
+
+ ## Do a reverse lookup on the client IP to get their hostname.
+ #
+ dns {
+ ## Now that we've copied the clientIP into a new field we can
+ # simply replace it here using a reverse lookup
+ #
+ action => "replace"
+ reverse => ["clientHostname"]
+ }
+
+ ## Parse out the user agent
+ #
+ useragent {
+ source => "userAgent"
+ prefix => "browser"
+ }
+ }
+ output {
+ file {
+ path => '/var/tmp/logstash-file-output'
+ codec => rubydebug
+ }
+ stdout {
+ codec => rubydebug
+ }
+ }
+ ```
+
+ * NSG Logs
+
+ ```yaml
+ input {
+ azureblob
+ {
+ storage_account_name => "mystorageaccount"
+ storage_access_key => "VGhpcyBpcyBhIGZha2Uga2V5Lg=="
+ container => "insights-logs-networksecuritygroupflowevent"
+ codec => "json"
+ # Refer to https://docs.microsoft.com/en-us/azure/network-watcher/network-watcher-read-nsg-flow-logs
+ # Typical numbers could be 21/9 or 12/2 depending on the nsg log file type
+ file_head_bytes => 21
+ file_tail_bytes => 9
+ # Enable / tweak these settings when the event is too big for the codec to handle.
+ # break_json_down_policy => "with_head_tail"
+ # break_json_batch_count => 2
+ }
+ }
+
+ filter {
+ split { field => "[records]" }
+ split { field => "[records][properties][flows]"}
+ split { field => "[records][properties][flows][flows]"}
+ split { field => "[records][properties][flows][flows][flowTuples]"}
+
+ mutate{
+ split => { "[records][resourceId]" => "/"}
+ add_field => {"Subscription" => "%{[records][resourceId][2]}"
+ "ResourceGroup" => "%{[records][resourceId][4]}"
+ "NetworkSecurityGroup" => "%{[records][resourceId][8]}"}
+ convert => {"Subscription" => "string"}
+ convert => {"ResourceGroup" => "string"}
+ convert => {"NetworkSecurityGroup" => "string"}
+ split => { "[records][properties][flows][flows][flowTuples]" => ","}
+ add_field => {
+ "unixtimestamp" => "%{[records][properties][flows][flows][flowTuples][0]}"
+ "srcIp" => "%{[records][properties][flows][flows][flowTuples][1]}"
+ "destIp" => "%{[records][properties][flows][flows][flowTuples][2]}"
+ "srcPort" => "%{[records][properties][flows][flows][flowTuples][3]}"
+ "destPort" => "%{[records][properties][flows][flows][flowTuples][4]}"
+ "protocol" => "%{[records][properties][flows][flows][flowTuples][5]}"
+ "trafficflow" => "%{[records][properties][flows][flows][flowTuples][6]}"
+ "traffic" => "%{[records][properties][flows][flows][flowTuples][7]}"
+ }
+ convert => {"unixtimestamp" => "integer"}
+ convert => {"srcPort" => "integer"}
+ convert => {"destPort" => "integer"}
+ }
+
+ date{
+ match => ["unixtimestamp" , "UNIX"]
+ }
+ }
+
+ output {
+ stdout { codec => rubydebug }
+ }
+ ```
+
  ## More information
  The source code of this plugin is hosted in GitHub repo [Microsoft Azure Diagnostics with ELK](https://github.com/Azure/azure-diagnostics-tools). We welcome you to provide feedback and/or contribute to the project.
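
The `file_head_bytes`, `file_tail_bytes`, and `break_json_down_policy` options described in the README changes above are easiest to picture on a toy payload. The following is a minimal sketch; the wrapper string and byte counts are invented for illustration and are not taken from the plugin or from real NSG logs. It shows how a blob splits into a non-repeating head, repeating records, and a non-repeating tail, and how `with_head_tail` re-wraps a batch so a json codec still sees a complete document:

```ruby
# Hypothetical blob content, used only for illustration.
blob = '{"records":[{"a":1},{"b":2},{"c":3}]}'

file_head_bytes = 12   # length of the opening '{"records":['
file_tail_bytes = 2    # length of the closing ']}'

head    = blob[0, file_head_bytes]                  # => '{"records":['
tail    = blob[-file_tail_bytes..-1]                # => ']}'
records = blob[file_head_bytes...-file_tail_bytes]  # => '{"a":1},{"b":2},{"c":3}'

# With break_json_down_policy => 'with_head_tail', a batch of records is
# re-wrapped between the head and the tail before being handed to the codec:
batch = '{"a":1},{"b":2}'
puts "#{head}#{batch}#{tail}"   # => {"records":[{"a":1},{"b":2}]}
```
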
@@ -77,6 +77,28 @@ class LogStash::Inputs::LogstashInputAzureblob < LogStash::Inputs::Base
  # When set to `start_over`, it will read all log files from begining.
  config :registry_create_policy, :validate => :string, :default => 'resume'

+ # Sets the header of the file, in bytes, that does not repeat over records. Usually these are json opening tags.
+ config :file_head_bytes, :validate => :number, :default => 0
+
+ # Sets the tail of the file, in bytes, that does not repeat over records. Usually these are json closing tags.
+ config :file_tail_bytes, :validate => :number, :default => 0
+
+ # Sets how to break the json down
+ #
+ # Only works when the codec is set to `json`. Sets the policy for breaking the json objects in the array into small events.
+ # Breaking the json into small sections is not as efficient as keeping it whole, but it reduces memory usage.
+ # Possible options: `do_not_break`, `with_head_tail`, `without_head_tail`
+ config :break_json_down_policy, :validate => :string, :default => 'do_not_break'
+
+ # Sets how many json objects are put into one batch when the json is broken down
+ config :break_json_batch_count, :validate => :number, :default => 10
+
+ # Sets the page size for returned blob items. Too large a value can cause a heap overflow; too small a value leads to too many requests.
+ #
+ # The default of `100` works well with the default heap size of 1 GB.
+ config :blob_list_page_size, :validate => :number, :default => 100
+
  # Constant of max integer
  MAX = 2 ** ([42].pack('i').size * 16 -2 ) -1

@@ -98,6 +120,7 @@ class LogStash::Inputs::LogstashInputAzureblob < LogStash::Inputs::Base
  # we can abort the loop if stop? becomes true
  while !stop?
  process(queue)
+ @logger.debug("Hitting interval of #{@interval}ms . . .")
  Stud.stoppable_sleep(@interval) { stop? }
  end # loop
  end # def run
@@ -117,15 +140,63 @@ class LogStash::Inputs::LogstashInputAzureblob < LogStash::Inputs::Base
  # Work-around: After returned by get_blob, the etag will contains quotes.
  new_etag = blob.properties[:etag]
  # ~ Work-around
- blob, content = @azure_blob.get_blob(@container, blob_name, {:start_range=>start_index} )
-
- @codec.decode(content) do |event|
- decorate(event)
- queue << event
- end # decode
+
+ blob, header = @azure_blob.get_blob(@container, blob_name, {:end_range => (@file_head_bytes-1) }) if header.nil? unless @file_head_bytes.nil? or @file_head_bytes <= 0
+
+ if start_index == 0
+ # Skip the header since it is already read.
+ start_index = @file_head_bytes
+ else
+ # Adjust the offset when it is other than first time, then read till the end of the file, including the tail.
+ start_index = start_index - @file_tail_bytes
+ start_index = 0 if start_index < 0
+ end
+
+ blob, content = @azure_blob.get_blob(@container, blob_name, {:start_range => start_index} )
+
+ # content will be used to calculate the new offset. Create a new variable for processed content.
+ processed_content = content
+
+ is_json_codec = (defined?(LogStash::Codecs::JSON) == 'constant') && (@codec.is_a? LogStash::Codecs::JSON)
+ if (is_json_codec)
+ skip = processed_content.index '{'
+ processed_content = processed_content[skip..-1] unless skip.nil?
+ end #if
+
+ if is_json_codec && (@break_json_down_policy != 'do_not_break')
+ @logger.debug("codec is json and policy is not do_not_break")
+
+ @break_json_batch_count = 1 if break_json_batch_count <= 0
+ tail = processed_content[-@file_tail_bytes..-1]
+ while (!processed_content.nil? && processed_content.length > @file_tail_bytes)
+ json_event, processed_content = get_jsons(processed_content, @break_json_batch_count)
+ @logger.debug("Got json: ========================")
+ @logger.debug("#{json_event[0..50]}...#{json_event[-50..-1]}")
+ @logger.debug("End got json: ========================")
+ @logger.debug("Processed content: #{processed_content[0..50]}...")
+ break if json_event.nil?
+ if @break_json_down_policy == 'with_head_tail'
+ @logger.debug("Adding json head/tails.")
+ json_event = "#{header}#{json_event}#{tail}"
+ end #if
+ @codec.decode(json_event) do |event|
+ decorate(event)
+ queue << event
+ end # decode
+ end
+ else
+ @logger.debug("Non-json codec or the policy is do not break")
+ # Putting header and content and tail together before pushing into event queue
+ processed_content = "#{header}#{processed_content}" unless header.nil? || header.length == 0
+ @codec.decode(processed_content) do |event|
+ decorate(event)
+ queue << event
+ end # decode
+ end #if
  ensure
  # Making sure the reader is removed from the registry even when there's exception.
  new_offset = start_index
+ new_offset = 0 if start_index == @file_head_bytes && content.nil? # Reset the offset when nothing has been read.
  new_offset = new_offset + content.length unless content.nil?
  new_registry_item = LogStash::Inputs::RegistryItem.new(blob_name, new_etag, nil, new_offset, gen)
  update_registry(new_registry_item)
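
For readers following the offset bookkeeping in the hunk above, here is a short walk-through with invented byte counts. It is a sketch of the arithmetic only, not the plugin's code, and the numbers are hypothetical:

```ruby
# Invented numbers, for illustration only.
file_head_bytes = 12
file_tail_bytes = 2

# First read of a blob (offset 0): the head was fetched separately,
# so the content read starts right after it.
start_index = 0
start_index = file_head_bytes if start_index == 0   # => 12

# Later read (registry offset 40): step back over the tail bytes before
# reading to the end of the file (per the comment in the hunk above),
# never going below zero.
start_index = 40
start_index -= file_tail_bytes                       # => 38
start_index = 0 if start_index < 0
```
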
@@ -135,6 +206,47 @@ class LogStash::Inputs::LogstashInputAzureblob < LogStash::Inputs::Base
  @logger.error("Oh My, An error occurred. \nError:#{e}:\nTrace:\n#{e.backtrace}", :exception => e)
  end # begin
  end # process
+
+ # Extract a batch of json objects from the front of a string; returns the batch, the rest of the string, and the number of objects found.
+ def get_jsons(content, batch_size)
+ return nil, content, 0 if content.nil? || content.length == 0
+ return nil, content, 0 if (content.index '{').nil?
+
+ hit = 0
+ count = 0
+ index = 0
+ first = content.index('{')
+ move_opening = true
+ move_closing = true
+ while(hit < batch_size)
+ inIndex = content.index('{', index) if move_opening
+ outIndex = content.index('}', index) if move_closing
+
+ # TODO: Fix the ending condition
+ break if count == 0 && (inIndex.nil? || outIndex.nil?)
+
+ if(inIndex.nil?)
+ index = outIndex
+ elsif(outIndex.nil?)
+ index = inIndex
+ else
+ index = [inIndex, outIndex].min
+ end #if
+ if content[index] == '{'
+ count += 1
+ move_opening = true
+ move_closing = false
+ elsif content[index] == '}'
+ count -= 1
+ move_closing = true
+ move_opening = false
+ end #if
+ index += 1
+ hit += 1 if count == 0
+ end
+
+ return content[first..index-1], content[index..-1], hit
+ end # def get_jsons

  # Deserialize registry hash from json string.
  def deserialize_registry_hash (json_string)
@@ -150,8 +262,10 @@ class LogStash::Inputs::LogstashInputAzureblob < LogStash::Inputs::Base
  def list_all_blobs
  blobs = Set.new []
  continuation_token = NIL
+ @blob_list_page_size = 100 if @blob_list_page_size <= 0
  loop do
- entries = @azure_blob.list_blobs(@container, { :timeout => 10, :marker => continuation_token})
+ # Limit the number of returned entries to avoid an out-of-memory exception.
+ entries = @azure_blob.list_blobs(@container, { :timeout => 10, :marker => continuation_token, :max_results => @blob_list_page_size })
  entries.each do |entry|
  blobs << entry
  end # each
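
The `get_jsons` helper added above walks the string counting `{` and `}` characters to cut a batch of balanced top-level objects off the front. The snippet below is a simplified, standalone sketch of that idea written for this note; `take_json_batch` is not part of the plugin, and like the original approach it appears to ignore braces that occur inside string values:

```ruby
# Simplified brace-counting sketch (hypothetical helper, illustration only).
def take_json_batch(content, batch_size)
  depth = 0
  taken = 0
  start = content.index('{')
  return nil, content if start.nil?

  i = start
  while i < content.length
    depth += 1 if content[i] == '{'
    if content[i] == '}'
      depth -= 1
      taken += 1 if depth == 0
    end
    # A full batch of balanced objects has been consumed: split the string here.
    return content[start..i], content[i + 1..-1] if depth == 0 && taken == batch_size
    i += 1
  end
  return nil, content  # input ran out before a full batch was balanced
end

batch, rest = take_json_batch('{"a":1},{"b":2},{"c":3}]}', 2)
# batch => '{"a":1},{"b":2}'
# rest  => ',{"c":3}]}'
```
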
@@ -1,6 +1,6 @@
  Gem::Specification.new do |s|
  s.name = 'logstash-input-azureblob'
- s.version = '0.9.8'
+ s.version = '0.9.9'
  s.licenses = ['Apache License (2.0)']
  s.summary = 'This plugin collects Microsoft Azure Diagnostics data from Azure Storage Blobs.'
  s.description = 'This gem is a Logstash plugin. It reads and parses data from Azure Storage Blobs.'
@@ -21,6 +21,6 @@ Gem::Specification.new do |s|
  s.add_runtime_dependency "logstash-core-plugin-api", '>= 1.60', '<= 2.99'
  s.add_runtime_dependency 'logstash-codec-json_lines'
  s.add_runtime_dependency 'stud', '>= 0.0.22'
- s.add_runtime_dependency 'azure-storage', '~> 0.11.4.preview'
+ s.add_runtime_dependency 'azure-storage', '~> 0.12.3.preview'
  s.add_development_dependency 'logstash-devutils'
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: logstash-input-azureblob
  version: !ruby/object:Gem::Version
- version: 0.9.8
+ version: 0.9.9
  platform: ruby
  authors:
  - Microsoft Corporation
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2017-07-30 00:00:00.000000000 Z
+ date: 2017-08-31 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  requirement: !ruby/object:Gem::Requirement
@@ -63,7 +63,7 @@ dependencies:
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: 0.11.4.preview
+ version: 0.12.3.preview
  name: azure-storage
  prerelease: false
  type: :runtime
@@ -71,7 +71,7 @@ dependencies:
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: 0.11.4.preview
+ version: 0.12.3.preview
  - !ruby/object:Gem::Dependency
  requirement: !ruby/object:Gem::Requirement
  requirements: