logstash-input-azure_blob_storage 0.10.7 → 0.11.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/logstash/inputs/azure_blob_storage.rb +132 -105
- data/logstash-input-azure_blob_storage.gemspec +3 -3
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cb93fc423babf6bc4cd7b13b1280a27ea6156fa2aeebe69ab172d8e925940d2c
|
4
|
+
data.tar.gz: ae92a22e56d87cc9d4d0f0b615460a8da33afb822aaa3f0d109cd3b86137ee53
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f5a311f322b04740a98182271e3074171feea7b7e899e3ec712b457182d91f34033cc73b1219042bad727f0c6566b1c3cf0ce362e5cc9d2b1e4d09a2029d5456
|
7
|
+
data.tar.gz: 53072a976feddc171ad02960fdcec6612099caaade967c28a501ba1ca413f1dccdc071e050e322eefa84a266b0c9a9ed487a6d98a36c62480e573421b2fc27b7
|
@@ -1,80 +1,80 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
-
require
|
3
|
-
require
|
2
|
+
require 'logstash/inputs/base'
|
3
|
+
require 'stud/interval'
|
4
4
|
require 'azure/storage/blob'
|
5
5
|
require 'json'
|
6
6
|
|
7
7
|
# This is a logstash input plugin for files in Azure Blob Storage. There is a storage explorer in the portal and an application with the same name https://storageexplorer.com. A storage account has by default a globally unique name, {storageaccount}.blob.core.windows.net, which is a CNAME to Azure's blob servers blob.*.store.core.windows.net. A storage account has containers, and those have directories and blobs (like files). Blobs have one or more blocks. After writing the blocks, they can be committed. Some Azure diagnostics can send events to an EventHub that can be parsed through the plugin logstash-input-azure_event_hubs, but for the events that are only stored in a storage account, use this plugin. The original logstash-input-azureblob from azure-diagnostics-tools is great for low volumes, but it suffers from an outdated client, slow reads, lease locking issues and json parse errors.
|
8
8
|
# https://azure.microsoft.com/en-us/services/storage/blobs/
|
9
9
|
class LogStash::Inputs::AzureBlobStorage < LogStash::Inputs::Base
|
10
|
-
|
10
|
+
config_name "azure_blob_storage"
|
11
11
|
|
12
|
-
|
13
|
-
|
12
|
+
# If undefined, Logstash will complain, even if codec is unused. The codec for nsgflowlog is "json" and for WADIIS and APPSERVICE it is "line".
|
13
|
+
default :codec, "json"
|
14
14
|
|
15
|
-
|
16
|
-
|
15
|
+
# logtype can be nsgflowlog, wadiis, appservice or raw. The default is raw, where files are read and added as one event. If the file grows, the next interval the file is read from the offset, so that the delta is sent as another event. In raw mode, further processing has to be done in the filter block. If the logtype is specified, this plugin will split and mutate and add individual events to the queue.
|
16
|
+
config :logtype, :validate => ['nsgflowlog','wadiis','appservice','raw'], :default => 'raw'
|
17
17
|
|
18
|
-
|
19
|
-
|
20
|
-
|
18
|
+
# The storage account is accessed through Azure::Storage::Blob::BlobService, it needs either a sas_token, connection string or a storageaccount/access_key pair.
|
19
|
+
# https://github.com/Azure/azure-storage-ruby/blob/master/blob/lib/azure/storage/blob/blob_service.rb#L42
|
20
|
+
config :connection_string, :validate => :password, :required => false
|
21
21
|
|
22
|
-
|
23
|
-
|
22
|
+
# The storage account name for the azure storage account.
|
23
|
+
config :storageaccount, :validate => :string, :required => false
|
24
24
|
|
25
|
-
|
26
|
-
|
25
|
+
# DNS suffix of the storage endpoint, for when it is other than the default core.windows.net
|
26
|
+
config :dns_suffix, :validate => :string, :required => false, :default => 'core.windows.net'
|
27
27
|
|
28
|
-
|
29
|
-
|
28
|
+
# The (primary or secondary) Access Key for the storage account. The key can be found in portal.azure.com or through the Azure API StorageAccounts/ListKeys, for example with the PowerShell command Get-AzStorageAccountKey.
|
29
|
+
config :access_key, :validate => :password, :required => false
|
30
30
|
|
31
|
-
|
32
|
-
|
31
|
+
# SAS is the Shared Access Signature, that provides restricted access rights. If the sas_token is absent, the access_key is used instead.
|
32
|
+
config :sas_token, :validate => :password, :required => false
|
33
33
|
|
34
|
-
|
35
|
-
|
34
|
+
# The container of the blobs.
|
35
|
+
config :container, :validate => :string, :default => 'insights-logs-networksecuritygroupflowevent'
|
36
36
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
37
|
+
# The registry file keeps track of the files that have been processed and until which offset in bytes. It's similar in function
|
38
|
+
#
|
39
|
+
# The default, `data/registry.dat`, contains a Ruby Marshal-serialized Hash keyed by filename, holding the offset read so far and the file length at the last time a file listing was done.
|
40
|
+
config :registry_path, :validate => :string, :required => false, :default => 'data/registry.dat'
|
41
41
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
42
|
+
# The default, `resume`, will load the registry offsets and will start processing files from the offsets.
|
43
|
+
# When set to `start_over`, all log files are processed from the beginning.
|
44
|
+
# When set to `start_fresh`, it will read log files that are created or appended since the start of the pipeline.
|
45
|
+
config :registry_create_policy, :validate => ['resume','start_over','start_fresh'], :required => false, :default => 'resume'
|
46
46
|
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
47
|
+
# The registry keeps track of the files that were already processed. The interval is used to save the registry regularly, when new events have been processed. It is also used to wait before listing the files again and subtracting the registry of already processed files to determine the worklist.
|
48
|
+
#
|
49
|
+
# waiting time in seconds until processing the next batch. NSGFLOWLOGS append a block per minute, so use multiples of 60 seconds, 300 for 5 minutes, 600 for 10 minutes. The registry is also saved after every interval.
|
50
|
+
# Partial reading starts from the offset and reads until the end, so the starting tag is prepended
|
51
|
+
#
|
52
|
+
# A00000000000000000000000000000000 12 {"records":[
|
53
|
+
# D672f4bbd95a04209b00dc05d899e3cce 2576 json objects for 1st minute
|
54
|
+
# D7fe0d4f275a84c32982795b0e5c7d3a1 2312 json objects for 2nd minute
|
55
|
+
# Z00000000000000000000000000000000 2 ]}
|
56
|
+
config :interval, :validate => :number, :default => 60
|
57
57
|
|
58
|
-
|
59
|
-
|
58
|
+
# WAD IIS Grok Pattern
|
59
|
+
#config :grokpattern, :validate => :string, :required => false, :default => '%{TIMESTAMP_ISO8601:log_timestamp} %{NOTSPACE:instanceId} %{NOTSPACE:instanceId2} %{IPORHOST:ServerIP} %{WORD:httpMethod} %{URIPATH:requestUri} %{NOTSPACE:requestQuery} %{NUMBER:port} %{NOTSPACE:username} %{IPORHOST:clientIP} %{NOTSPACE:httpVersion} %{NOTSPACE:userAgent} %{NOTSPACE:cookie} %{NOTSPACE:referer} %{NOTSPACE:host} %{NUMBER:httpStatus} %{NUMBER:subresponse} %{NUMBER:win32response} %{NUMBER:sentBytes:int} %{NUMBER:receivedBytes:int} %{NUMBER:timeTaken:int}'
|
60
60
|
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
61
|
+
# The string that starts the JSON. Only needed when the codec is JSON. When partial files are read, the result will not be valid JSON unless the start and end are put back. The file_head and file_tail are learned at startup, by reading the first file in the blob_list and taking the first and last block; this works for blobs that are appended, like nsgflowlogs. The configuration can be set to override the learning. In case learning fails and the option is not set, the default is to use the 'records' wrapper as set by nsgflowlogs.
|
62
|
+
config :file_head, :validate => :string, :required => false, :default => '{"records":['
|
63
|
+
# The string that ends the JSON
|
64
|
+
config :file_tail, :validate => :string, :required => false, :default => ']}'
|
65
65
|
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
66
|
+
# The path(s) to the file(s) to use as an input. By default it will
|
67
|
+
# watch every file in the storage container.
|
68
|
+
# You can use filename patterns here, such as `logs/*.log`.
|
69
|
+
# If you use a pattern like `logs/**/*.log`, a recursive search
|
70
|
+
# of `logs` will be done for all `*.log` files.
|
71
|
+
# Do not include a leading `/`, as Azure paths look like this:
|
72
|
+
# `path/to/blob/file.txt`
|
73
|
+
#
|
74
|
+
# You may also configure multiple paths. See an example
|
75
|
+
# on the <<array,Logstash configuration page>>.
|
76
|
+
# For NSGFLOWLOGS a path starts with "resourceId=/", but this would only be needed to exclude other files that may be written in the same container.
|
77
|
+
config :prefix, :validate => :string, :required => false
|
78
78
|
|
79
79
|
|
80
80
|
|
@@ -91,7 +91,7 @@ def register
|
|
91
91
|
@processed = 0
|
92
92
|
@regsaved = @processed
|
93
93
|
|
94
|
-
|
94
|
+
#@buffer = FileWatch::BufferedTokenizer.new('\n')
|
95
95
|
|
96
96
|
# Try in this order to access the storageaccount
|
97
97
|
# 1. storageaccount / sas_token
|
@@ -119,19 +119,23 @@ def register
|
|
119
119
|
end
|
120
120
|
|
121
121
|
@registry = Hash.new
|
122
|
-
|
122
|
+
if registry_create_policy == "resume"
|
123
123
|
begin
|
124
|
+
@logger.info(@pipe_id+" resuming from registry")
|
124
125
|
@registry = Marshal.load(@blob_client.get_blob(container, registry_path)[1])
|
125
126
|
#[0] headers [1] responsebody
|
126
127
|
rescue
|
127
128
|
@registry.clear
|
129
|
+
@logger.error(@pipe_id+" loading registry failed, starting over")
|
128
130
|
end
|
129
131
|
end
|
130
132
|
# read filelist and set offsets to file length to mark all the old files as done
|
131
133
|
if registry_create_policy == "start_fresh"
|
132
|
-
@
|
133
|
-
|
134
|
-
|
134
|
+
@logger.info(@pipe_id+" starting fresh")
|
135
|
+
@registry = list_blobs(true)
|
136
|
+
#tempreg.each do |name, file|
|
137
|
+
# @registry.store(name, { :offset => file[:length], :length => file[:length] })
|
138
|
+
#end
|
135
139
|
end
|
136
140
|
|
137
141
|
@is_json = false
|
@@ -158,34 +162,47 @@ end # def register
|
|
158
162
|
|
159
163
|
|
160
164
|
def run(queue)
|
165
|
+
newreg = Hash.new
|
161
166
|
filelist = Hash.new
|
162
|
-
|
167
|
+
worklist = Hash.new
|
163
168
|
# we can abort the loop if stop? becomes true
|
164
169
|
while !stop?
|
165
170
|
chrono = Time.now.to_i
|
166
171
|
# load the registry, compare its offsets to the file list, set offset to 0 for new files, process the whole list and if finished within the interval wait for next loop,
|
167
172
|
# TODO: sort by timestamp
|
168
173
|
#filelist.sort_by(|k,v|resource(k)[:date])
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
174
|
+
worklist.clear
|
175
|
+
filelist.clear
|
176
|
+
newreg.clear
|
177
|
+
filelist = list_blobs(false)
|
178
|
+
# registry.merge(filelist) {|key, :offset, :length| :offset.merge :length }
|
179
|
+
filelist.each do |name, file|
|
180
|
+
off = 0
|
181
|
+
begin
|
182
|
+
off = @registry[name][:offset]
|
183
|
+
rescue
|
184
|
+
off = 0
|
185
|
+
end
|
186
|
+
newreg.store(name, { :offset => off, :length => file[:length] })
|
187
|
+
end
|
173
188
|
|
174
189
|
# Worklist is the subset of files where the already read offset is smaller than the file size
|
175
|
-
|
176
|
-
|
190
|
+
worklist.clear
|
191
|
+
worklist = newreg.select {|name,file| file[:offset] < file[:length]}
|
177
192
|
# This would be ideal for threading since it's IO intensive, would be nice with a ruby native ThreadPool
|
178
193
|
worklist.each do |name, file|
|
179
|
-
res = resource(name)
|
194
|
+
#res = resource(name)
|
195
|
+
@logger.info(@pipe_id+" processing #{name} from #{file[:offset]} to #{file[:length]}")
|
196
|
+
size = 0
|
180
197
|
if file[:offset] == 0
|
181
198
|
chunk = full_read(name)
|
182
|
-
|
183
|
-
file[:length]=chunk.size
|
199
|
+
size=chunk.size
|
184
200
|
else
|
185
201
|
chunk = partial_read_json(name, file[:offset], file[:length])
|
186
202
|
@logger.debug(@pipe_id+" partial file #{name} from #{file[:offset]} to #{file[:length]}")
|
187
203
|
end
|
188
204
|
if logtype == "nsgflowlog" && @is_json
|
205
|
+
res = resource(name)
|
189
206
|
begin
|
190
207
|
fingjson = JSON.parse(chunk)
|
191
208
|
@processed += nsgflowlog(queue, fingjson)
|
@@ -193,9 +210,10 @@ def run(queue)
|
|
193
210
|
rescue JSON::ParserError
|
194
211
|
@logger.error(@pipe_id+" parse error on #{res[:nsg]} [#{res[:date]}] offset: #{file[:offset]} length: #{file[:length]}")
|
195
212
|
end
|
196
|
-
# TODO Convert this to line based grokking.
|
197
|
-
|
198
|
-
|
213
|
+
# TODO: Convert this to line based grokking.
|
214
|
+
# TODO: ECS Compliance?
|
215
|
+
elsif logtype == "wadiis" && !@is_json
|
216
|
+
@processed += wadiislog(queue, name)
|
199
217
|
else
|
200
218
|
counter = 0
|
201
219
|
@codec.decode(chunk) do |event|
|
@@ -205,7 +223,8 @@ def run(queue)
|
|
205
223
|
end
|
206
224
|
@processed += counter
|
207
225
|
end
|
208
|
-
@registry.store(name, { :offset =>
|
226
|
+
@registry.store(name, { :offset => size, :length => file[:length] })
|
227
|
+
#@logger.info(@pipe_id+" name #{name} size #{size} len #{file[:length]}")
|
209
228
|
# if stop? good moment to stop what we're doing
|
210
229
|
if stop?
|
211
230
|
return
|
@@ -300,51 +319,59 @@ def wadiislog(lines)
|
|
300
319
|
end
|
301
320
|
|
302
321
|
# list all blobs in the blobstore, set the offsets from the registry and return the filelist
|
303
|
-
|
322
|
+
# inspired by: https://github.com/Azure-Samples/storage-blobs-ruby-quickstart/blob/master/example.rb
|
323
|
+
# Lists the blobs in the configured container (filtered by @prefix when set)
# and returns them as a Hash of blob name => { :offset, :length }.
# The registry blob itself (registry_path) is never included in the result.
#
# The listing is paged: every response carries a continuation token that is
# passed back as the marker of the next request until the token is nil or empty.
#
# @param fill [Boolean] when true, each offset is set to the blob length, which
#   marks the blob as fully read; when false, each offset is 0.
# @return [Hash] blob name => { :offset => Integer, :length => Integer }
# @raise [StandardError] the last listing error, once 10 attempts in a row failed
def list_blobs(fill)
  files = {}
  next_marker = nil
  max_failures = 10
  failures = 0
  loop do
    begin
      blobs = @blob_client.list_blobs(container, { marker: next_marker, prefix: @prefix })
      blobs.each do |blob|
        # exclude the registry itself
        next if blob.name == registry_path

        length = blob.properties[:content_length].to_i
        files.store(blob.name, { :offset => (fill ? length : 0), :length => length })
      end
      # a page was read successfully, so earlier failures no longer count
      failures = 0
      next_marker = blobs.continuation_token
      break unless next_marker && !next_marker.empty?
    rescue StandardError => e
      # Only StandardError is rescued, so signals and exit requests still propagate.
      @logger.error(@pipe_id+" caught: #{e.message}")
      failures += 1
      if failures >= max_failures
        # Bounded retry: a persistent error must not spin forever and block shutdown.
        @logger.error(@pipe_id+" listing blobs failed #{failures} times in a row, giving up: #{e.message}")
        raise
      end
      # the same marker is requested again on the next iteration
    end
  end
  files
end
|
327
353
|
|
328
354
|
# When events were processed after the last registry save, start a thread to update the registry file.
|
329
355
|
# Uploads the registry to the container as a block blob at registry_path, but
# only when @processed differs from @regsaved, i.e. events were processed
# since the last save.
#
# The Hash is serialized with Marshal on the calling thread, so the upload is a
# snapshot taken at call time. The upload itself runs on a background thread;
# a failed upload is logged and never raised to the caller.
#
# @param filelist [Hash] blob name => { :offset, :length } entries to persist
# @return [Thread, nil] the upload thread, or nil when there was nothing to save
def save_registry(filelist)
  # TODO: @processed and @regsaved are instance variables that are not thread safe
  return if @processed == @regsaved

  @regsaved = @processed
  @logger.info(@pipe_id+" processed #{@processed} events, saving #{filelist.size} blobs and offsets to registry #{registry_path}")
  # Serialize before spawning the thread: the caller may keep mutating the Hash
  # while the upload is still in flight.
  payload = Marshal.dump(filelist)
  Thread.new do
    begin
      @blob_client.create_block_blob(container, registry_path, payload)
    rescue StandardError => e
      @logger.error(@pipe_id+" Oh my, registry write failed, do you have write access? #{e.message}")
    end
  end
end
|
343
369
|
|
344
370
|
def learn_encapsulation
|
345
371
|
# From one file, read first block and last block to learn head and tail
|
346
372
|
# If the blobstorage can't be found, an error from faraday middleware will come with the text
|
347
373
|
# org.jruby.ext.set.RubySet cannot be cast to class org.jruby.RubyFixnum
|
374
|
+
# implement options ... prefix may not exist!
|
348
375
|
blob = @blob_client.list_blobs(container, { maxresults: 1, prefix: @prefix }).first
|
349
376
|
return if blob.nil?
|
350
377
|
blocks = @blob_client.list_blob_blocks(container, blob.name)[:committed]
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Gem::Specification.new do |s|
|
2
2
|
s.name = 'logstash-input-azure_blob_storage'
|
3
|
-
s.version = '0.
|
3
|
+
s.version = '0.11.0'
|
4
4
|
s.licenses = ['Apache-2.0']
|
5
5
|
s.summary = 'This logstash plugin reads and parses data from Azure Storage Blobs.'
|
6
6
|
s.description = <<-EOF
|
@@ -20,8 +20,8 @@ EOF
|
|
20
20
|
s.metadata = { "logstash_plugin" => "true", "logstash_group" => "input" }
|
21
21
|
|
22
22
|
# Gem dependencies
|
23
|
-
s.add_runtime_dependency
|
23
|
+
s.add_runtime_dependency 'logstash-core-plugin-api', '~> 2.1'
|
24
24
|
s.add_runtime_dependency 'stud', '~> 0.0.23'
|
25
|
-
s.add_runtime_dependency 'azure-storage-blob', '~> 1.
|
25
|
+
s.add_runtime_dependency 'azure-storage-blob', '~> 1.0'
|
26
26
|
s.add_development_dependency 'logstash-devutils', '~> 1.0', '>= 1.0.0'
|
27
27
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: logstash-input-azure_blob_storage
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.11.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jan Geertsma
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-11-
|
11
|
+
date: 2019-11-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -43,7 +43,7 @@ dependencies:
|
|
43
43
|
requirements:
|
44
44
|
- - "~>"
|
45
45
|
- !ruby/object:Gem::Version
|
46
|
-
version: '1.
|
46
|
+
version: '1.0'
|
47
47
|
name: azure-storage-blob
|
48
48
|
prerelease: false
|
49
49
|
type: :runtime
|
@@ -51,7 +51,7 @@ dependencies:
|
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '1.
|
54
|
+
version: '1.0'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
requirement: !ruby/object:Gem::Requirement
|
57
57
|
requirements:
|