logstash-input-azure_blob_storage 0.12.7 → 0.12.8
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +26 -0
- data/lib/logstash/inputs/azure_blob_storage.rb +214 -132
- data/logstash-input-azure_blob_storage.gemspec +2 -2
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 6226b48f09b69ea1fe5d5e65197cf87daed475a2dff3aecc1ff30b1c921d4e7e
+  data.tar.gz: 9ac324158bddc908f107663925a27ff289eb7b264293da88218a825d66c74d74
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 6cdd2d17fd57adc43b0c8e7354cbf396243b4bf691e8ef12d757c2c9dc515f9711ecbe9c64495b0d6f50040a28af98af2b641224c03dc83c3c4db9919ef1fb77
+  data.tar.gz: e1a71cfbe35af0d878374dcce499096331c82de867d86fb6ea3f4c876e1cc24f8b0fb59087b112012989c358ec9f238159564d17d9433ca1899a776a1c311683
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,8 @@
+## 0.12.8
+- support append blobs (use codec json_lines and logtype raw)
+- change the default head and tail to an empty string, unless the logtype is nsgflowlog
+- cleanjson configuration parameter to clean the JSON stream of faulty characters to prevent parse errors
+- catch ContainerNotFound, print an error message in the log and sleep for the interval time
 
 ## 0.12.7
 - rewrote partial_read; the occasional JSON parse errors should now be fixed by reading only committed blocks
data/README.md
CHANGED
@@ -8,6 +8,14 @@ For problems or feature requests with this specific plugin, raise a github issue
 This plugin can read from Azure Storage Blobs, for instance JSON diagnostics logs for NSG flow logs or LINE based accesslogs from App Services.
 [Azure Blob Storage](https://azure.microsoft.com/en-us/services/storage/blobs/)
 
+## Alternatives
+This plugin was inspired by the Azure diagnostics tools, but should work better for larger numbers of files. The configurations are not compatible: azureblob refers to the diagnostics tools plugin, while this plugin is configured as azure_blob_storage.
+https://github.com/Azure/azure-diagnostics-tools/tree/master/Logstash/logstash-input-azureblob
+
+There is also a Filebeat input for Azure Blob Storage that may become an alternative in the future.
+https://www.elastic.co/guide/en/beats/filebeat/current/filebeat-input-azure-blob-storage.html
+
 ## Innerworking
 The plugin depends on the [Ruby library azure-storage-blob](https://rubygems.org/gems/azure-storage-blob/versions/1.1.0) from Microsoft, that depends on Faraday for the HTTPS connection to Azure.
 
 The plugin executes the following steps
@@ -184,6 +192,20 @@ output {
 }
 }
 ```
+
+Another example, for json_lines on append blobs:
+```
+input {
+    azure_blob_storage {
+        codec => json_lines {
+            delimiter => "\n"
+            charset => "UTF-8"
+        }
+        # below options are optional
+        logtype => "raw"
+        append => true
+        cleanjson => true
+    }
+}
+```
 The configuration documentation is in the first 100 lines of the code
 [GITHUB/janmg/logstash-input-azure_blob_storage/blob/master/lib/logstash/inputs/azure_blob_storage.rb](https://github.com/janmg/logstash-input-azure_blob_storage/blob/master/lib/logstash/inputs/azure_blob_storage.rb)
 
@@ -228,5 +250,9 @@ filter {
         remove_field => ["timestamp"]
     }
 }
+
+output {
+    stdout { codec => rubydebug }
+}
 ```
 
data/lib/logstash/inputs/azure_blob_storage.rb
CHANGED
@@ -26,7 +26,7 @@ require 'json'
 class LogStash::Inputs::AzureBlobStorage < LogStash::Inputs::Base
     config_name "azure_blob_storage"
 
-    # If undefined, Logstash will complain, even if codec is unused. The codec for nsgflowlog is "json" and the for WADIIS and APPSERVICE is "line".
+    # If undefined, Logstash will complain, even if codec is unused. The codec for nsgflowlog is "json", "json_line" works and the for WADIIS and APPSERVICE is "line".
     default :codec, "json"
 
     # logtype can be nsgflowlog, wadiis, appservice or raw. The default is raw, where files are read and added as one event. If the file grows, the next interval the file is read from the offset, so that the delta is sent as another event. In raw mode, further processing has to be done in the filter block. If the logtype is specified, this plugin will split and mutate and add individual events to the queue.
@@ -68,7 +68,7 @@ class LogStash::Inputs::AzureBlobStorage < LogStash::Inputs::Base
     # when set to `start_fresh`, it will read log files that are created or appended since this start of the pipeline.
     config :registry_create_policy, :validate => ['resume','start_over','start_fresh'], :required => false, :default => 'resume'
 
-
+    # The interval is used to save the registry regularly, when new events have have been processed. It is also used to wait before listing the files again and substracting the registry of already processed files to determine the worklist.
     # waiting time in seconds until processing the next batch. NSGFLOWLOGS append a block per minute, so use multiples of 60 seconds, 300 for 5 minutes, 600 for 10 minutes. The registry is also saved after every interval.
     # Partial reading starts from the offset and reads until the end, so the starting tag is prepended
     config :interval, :validate => :number, :default => 60
@@ -95,10 +95,14 @@ class LogStash::Inputs::AzureBlobStorage < LogStash::Inputs::Base
     config :skip_learning, :validate => :boolean, :default => false, :required => false
 
     # The string that starts the JSON. Only needed when the codec is JSON. When partial file are read, the result will not be valid JSON unless the start and end are put back. the file_head and file_tail are learned at startup, by reading the first file in the blob_list and taking the first and last block, this would work for blobs that are appended like nsgflowlogs. The configuration can be set to override the learning. In case learning fails and the option is not set, the default is to use the 'records' as set by nsgflowlogs.
-    config :file_head, :validate => :string, :required => false, :default => '
+    config :file_head, :validate => :string, :required => false, :default => ''
     # The string that ends the JSON
-    config :file_tail, :validate => :string, :required => false, :default => '
+    config :file_tail, :validate => :string, :required => false, :default => ''
 
+    # inspect the bytes and remove faulty characters
+    config :cleanjson, :validate => :boolean, :default => false, :required => false
+
+    config :append, :validate => :boolean, :default => false, :required => false
     # By default it will watch every file in the storage container. The prefix option is a simple filter that only processes files with a path that starts with that value.
     # For NSGFLOWLOGS a path starts with "resourceId=/". This would only be needed to exclude other paths that may be written in the same container. The registry file will be excluded.
     # You may also configure multiple paths. See an example on the <<array,Logstash configuration page>>.
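As an aside on the head/tail mechanism described in the comment above, here is a minimal Ruby sketch (with made-up values, not the plugin's own code) of how a partially read JSON fragment becomes parseable again once the head and tail are put back:

```ruby
require 'json'

# nsgflowlog defaults from this release; a partial read typically starts with a comma
head = '{"records":['
tail = ']}'
fragment = ',{"time":"2023-07-15T00:00:00Z","category":"NetworkSecurityGroupFlowEvent"}'

json = head + fragment.sub(/\A,/, '') + tail   # strip the leading comma and re-wrap
puts JSON.parse(json)["records"].length        # prints 1
```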
@@ -118,6 +122,7 @@ public
     @logger.info("If this plugin doesn't work, please raise an issue in https://github.com/janmg/logstash-input-azure_blob_storage")
     @busy_writing_registry = Mutex.new
     # TODO: consider multiple readers, so add pipeline @id or use logstash-to-logstash communication?
+    # For now it's difficult because the plugin would then have to synchronize the worklist
 end
 
 
@@ -128,41 +133,10 @@ public
     @regsaved = @processed
 
     connect
-
     @registry = Hash.new
-
-
-
-                if (!@registry_local_path.nil?)
-                    unless File.file?(@registry_local_path+"/"+@pipe_id)
-                        @registry = Marshal.load(@blob_client.get_blob(container, registry_path)[1])
-                        #[0] headers [1] responsebody
-                        @logger.info("migrating from remote registry #{registry_path}")
-                    else
-                        if !Dir.exist?(@registry_local_path)
-                            FileUtils.mkdir_p(@registry_local_path)
-                        end
-                        @registry = Marshal.load(File.read(@registry_local_path+"/"+@pipe_id))
-                        @logger.info("resuming from local registry #{registry_local_path+"/"+@pipe_id}")
-                    end
-                else
-                    @registry = Marshal.load(@blob_client.get_blob(container, registry_path)[1])
-                    #[0] headers [1] responsebody
-                    @logger.info("resuming from remote registry #{registry_path}")
-                end
-                break
-            rescue Exception => e
-                @logger.error("caught: #{e.message}")
-                @registry.clear
-                @logger.error("loading registry failed for attempt #{counter} of 3")
-            end
-        end
-    end
-    # read filelist and set offsets to file length to mark all the old files as done
-    if registry_create_policy == "start_fresh"
-        @registry = list_blobs(true)
-        save_registry()
-        @logger.info("starting fresh, writing a clean registry to contain #{@registry.size} blobs/files")
+    load_registry()
+    @registry.each do |name, file|
+        @logger.info("offset: #{file[:offset]} length: #{file[:length]}")
     end
 
     @is_json = false
@@ -174,22 +148,29 @@ public
         @is_json_line = true
     end
 end
+
+
 @head = ''
 @tail = ''
-# if codec=json sniff one files blocks A and Z to learn file_head and file_tail
 if @is_json
+    # if codec=json sniff one files blocks A and Z to learn file_head and file_tail
+    if @logtype == 'nsgflowlog'
+        @head = '{"records":['
+        @tail = ']}'
+    end
     if file_head
         @head = file_head
     end
     if file_tail
         @tail = file_tail
     end
-    if
+    if !skip_learning
         learn_encapsulation
     end
-    @logger.info("head will be: #{@head} and tail is set to #{@tail}")
+    @logger.info("head will be: '#{@head}' and tail is set to: '#{@tail}'")
 end
 
+
 filelist = Hash.new
 worklist = Hash.new
 @last = start = Time.now.to_i
@@ -206,24 +187,27 @@ public
     # load the registry, compare it's offsets to file list, set offset to 0 for new files, process the whole list and if finished within the interval wait for next loop,
     # TODO: sort by timestamp ?
     #filelist.sort_by(|k,v|resource(k)[:date])
-    worklist.clear
     filelist.clear
 
     # Listing all the files
     filelist = list_blobs(false)
+    if (@debug_until > @processed) then
+        @registry.each do |name, file|
+            @logger.info("#{name} offset: #{file[:offset]} length: #{file[:length]}")
+        end
+    end
     filelist.each do |name, file|
         off = 0
         if @registry.key?(name) then
-
-
-
-
-
+            begin
+                off = @registry[name][:offset]
+            rescue Exception => e
+                @logger.error("caught: #{e.message} while reading #{name}")
+            end
         end
         @registry.store(name, { :offset => off, :length => file[:length] })
         if (@debug_until > @processed) then @logger.info("2: adding offsets: #{name} #{off} #{file[:length]}") end
     end
-    # size nilClass when the list doesn't grow?!
 
     # clean registry of files that are not in the filelist
     @registry.each do |name,file|
@@ -242,14 +226,16 @@ public
 
     # Start of processing
     # This would be ideal for threading since it's IO intensive, would be nice with a ruby native ThreadPool
+    # pool = Concurrent::FixedThreadPool.new(5) # 5 threads
+    #pool.post do
+    #    some parallel work
+    #end
     if (worklist.size > 0) then
         worklist.each do |name, file|
             start = Time.now.to_i
             if (@debug_until > @processed) then @logger.info("3: processing #{name} from #{file[:offset]} to #{file[:length]}") end
             size = 0
             if file[:offset] == 0
-                # This is where Sera4000 issue starts
-                # For an append blob, reading full and crashing, retry, last_modified? ... lenght? ... committed? ...
                 # length and skip reg value
                 if (file[:length] > 0)
                     begin
@@ -272,49 +258,68 @@ public
     delta_size = chunk.size - @head.length - 1
 end
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+#
+# TODO! ... split out the logtypes and use individual methods
+# how does a byte array chuck from json_lines get translated to strings/json/events
+# should the byte array be converted to a multiline and then split? drawback need to know characterset and linefeed characters
+# how does the json_line decoder work on byte arrays?
+#
+# so many questions
+
+unless chunk.nil?
+    counter = 0
+    if @is_json
+        if logtype == "nsgflowlog"
+            res = resource(name)
+            begin
+                fingjson = JSON.parse(chunk)
+                @processed += nsgflowlog(queue, fingjson, name)
+                @logger.debug("Processed #{res[:nsg]} #{@processed} events")
+            rescue JSON::ParserError => e
+                @logger.error("parse error #{e.message} on #{res[:nsg]} offset: #{file[:offset]} length: #{file[:length]}")
+                if (@debug_until > @processed) then @logger.info("#{chunk}") end
+            end
+        else
+            begin
+                @codec.decode(chunk) do |event|
+                    counter += 1
+                    if @addfilename
+                        event.set('filename', name)
+                    end
+                    decorate(event)
+                    queue << event
+                end
+                @processed += counter
+            rescue Exception => e
+                @logger.error("codec exception: #{e.message} .. continue and pretend this never happened")
+            end
+        end
+    end
+
+    if logtype == "wadiis" && !@is_json
+        # TODO: Convert this to line based grokking.
+        @processed += wadiislog(queue, name)
     end
 
-
-
-
-
-
-
+    if @is_json_line
+        # parse one line at a time and dump it in the chunk?
+        lines = chunk.to_s
+        if cleanjson
+            @logger.info("cleaning in progress")
+            lines.chars.select(&:valid_encoding?).join
+            #lines.delete "\\"
+            #lines.scrub{|bytes| '<'+bytes.unpack('H*')[0]+'>' }
+        end
+        begin
+            @codec.decode(lines) do |event|
+                counter += 1
+                queue << event
             end
-
-
+            @processed += counter
+        rescue Exception => e
+            # todo: fix codec_lines exception: no implicit conversion of Array into String
+            @logger.error("json_lines codec exception: #{e.message} .. continue and pretend this never happened")
         end
-        @processed += counter
-    rescue Exception => e
-        @logger.error("codec exception: #{e.message} .. will continue and pretend this never happened")
-        @logger.debug("#{chunk}")
     end
 end
 
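For the cleanjson option used above, a small Ruby sketch of the underlying idea (assumption: "faulty characters" means byte sequences that are invalid in the chunk's encoding; this is an illustration, not the plugin's exact code path):

```ruby
# A chunk with one invalid UTF-8 byte (\xC3 not followed by a continuation byte)
raw = "{\"msg\":\"ok\"}\n{\"msg\":\"bad\xC3\"}\n"

# Drop every character that is not valid in the string's encoding
cleaned = raw.chars.select(&:valid_encoding?).join
# an alternative would be: cleaned = raw.scrub('')

cleaned.each_line { |line| puts line }   # both lines are now safe to hand to a JSON codec
```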
@@ -354,6 +359,24 @@ public
 
 
 private
+def list_files
+    filelist = list_blobs(false)
+    filelist.each do |name, file|
+        off = 0
+        if @registry.key?(name) then
+            begin
+                off = @registry[name][:offset]
+            rescue Exception => e
+                @logger.error("caught: #{e.message} while reading #{name}")
+            end
+        end
+        @registry.store(name, { :offset => off, :length => file[:length] })
+        if (@debug_until > @processed) then @logger.info("2: adding offsets: #{name} #{off} #{file[:length]}") end
+    end
+    return filelist
+end
+# size nilClass when the list doesn't grow?!
+
 def connect
     # Try in this order to access the storageaccount
     # 1. storageaccount / sas_token
@@ -384,11 +407,48 @@ private
 # end
 end
 end
+# @registry_create_policy,@registry_local_path,@container,@registry_path
+def load_registry()
+    if @registry_create_policy == "resume"
+        for counter in 1..3
+            begin
+                if (!@registry_local_path.nil?)
+                    unless File.file?(@registry_local_path+"/"+@pipe_id)
+                        @registry = Marshal.load(@blob_client.get_blob(@container, path)[1])
+                        #[0] headers [1] responsebody
+                        @logger.info("migrating from remote registry #{path}")
+                    else
+                        if !Dir.exist?(@registry_local_path)
+                            FileUtils.mkdir_p(@registry_local_path)
+                        end
+                        @registry = Marshal.load(File.read(@registry_local_path+"/"+@pipe_id))
+                        @logger.info("resuming from local registry #{@registry_local_path+"/"+@pipe_id}")
+                    end
+                else
+                    @registry = Marshal.load(@blob_client.get_blob(container, path)[1])
+                    #[0] headers [1] responsebody
+                    @logger.info("resuming from remote registry #{path}")
+                end
+                break
+            rescue Exception => e
+                @logger.error("caught: #{e.message}")
+                @registry.clear
+                @logger.error("loading registry failed for attempt #{counter} of 3")
+            end
+        end
+    end
+    # read filelist and set offsets to file length to mark all the old files as done
+    if @registry_create_policy == "start_fresh"
+        @registry = list_blobs(true)
+        #save_registry()
+        @logger.info("starting fresh, with a clean registry containing #{@registry.size} blobs/files")
+    end
+end
 
 def full_read(filename)
     tries ||= 2
     begin
-        return @blob_client.get_blob(container, filename)[1]
+        return @blob_client.get_blob(@container, filename)[1]
     rescue Exception => e
         @logger.error("caught: #{e.message} for full_read")
         if (tries -= 1) > 0
@@ -399,7 +459,7 @@ private
         end
     end
     begin
-        chuck = @blob_client.get_blob(container, filename)[1]
+        chuck = @blob_client.get_blob(@container, filename)[1]
     end
     return chuck
 end
@@ -410,29 +470,45 @@ private
     # 3. strip comma
     # if json strip comma and fix head and tail
     size = 0
-    blocks = @blob_client.list_blob_blocks(container, blobname)
-    blocks[:committed].each do |block|
-        size += block.size
-    end
-    # read the new blob blocks from the offset to the last committed size.
-    # if it is json, fix the head and tail
-    # crap committed block at the end is the tail, so must be substracted from the read and then comma stripped and tail added.
-    # but why did I need a -1 for the length?? probably the offset starts at 0 and ends at size-1
 
-
-
+    begin
+        if @append
+            return @blob_client.get_blob(@container, blobname, start_range: offset-1)[1]
+        end
+        blocks = @blob_client.list_blob_blocks(@container, blobname)
+        blocks[:committed].each do |block|
+            size += block.size
+        end
+        # read the new blob blocks from the offset to the last committed size.
+        # if it is json, fix the head and tail
+        # crap committed block at the end is the tail, so must be substracted from the read and then comma stripped and tail added.
+        # but why did I need a -1 for the length?? probably the offset starts at 0 and ends at size-1
+
+        # should first check commit, read and the check committed again? no, only read the commited size
+        # should read the full content and then substract json tail
 
-
-
-        if content.end_with?(@tail)
-            return @head + strip_comma(content)
+        unless @is_json
+            return @blob_client.get_blob(@container, blobname, start_range: offset, end_range: size-1)[1]
         else
-            @
-
-
+            content = @blob_client.get_blob(@container, blobname, start_range: offset-1, end_range: size-1)[1]
+            if content.end_with?(@tail)
+                return @head + strip_comma(content)
+            else
+                @logger.info("Fixed a tail! probably new committed blocks started appearing!")
+                # substract the length of the tail and add the tail, because the file grew.size was calculated as the block boundary, so replacing the last bytes with the tail should fix the problem
+                return @head + strip_comma(content[0...-@tail.length]) + @tail
+            end
         end
-
-
+    rescue InvalidBlobType => ibt
+        @logger.error("caught #{ibt.message}. Setting BlobType to append")
+        @append = true
+        retry
+    rescue NoMethodError => nme
+        @logger.error("caught #{nme.message}. Setting append to true")
+        @append = true
+        retry
+    rescue Exception => e
+        @logger.error("caught #{e.message}")
     end
 end
 
@@ -532,26 +608,31 @@ private
     nextMarker = nil
     counter = 1
     loop do
-
-
-
-
-
-
-
-
-
-
+        begin
+            blobs = @blob_client.list_blobs(@container, { marker: nextMarker, prefix: @prefix})
+            blobs.each do |blob|
+                # FNM_PATHNAME is required so that "**/test" can match "test" at the root folder
+                # FNM_EXTGLOB allows you to use "test{a,b,c}" to match either "testa", "testb" or "testc" (closer to shell behavior)
+                unless blob.name == registry_path
+                    if @path_filters.any? {|path| File.fnmatch?(path, blob.name, File::FNM_PATHNAME | File::FNM_EXTGLOB)}
+                        length = blob.properties[:content_length].to_i
+                        offset = 0
+                        if fill
+                            offset = length
+                        end
+                        files.store(blob.name, { :offset => offset, :length => length })
+                        if (@debug_until > @processed) then @logger.info("1: list_blobs #{blob.name} #{offset} #{length}") end
                     end
-                    files.store(blob.name, { :offset => offset, :length => length })
-                    if (@debug_until > @processed) then @logger.info("1: list_blobs #{blob.name} #{offset} #{length}") end
                 end
             end
+            nextMarker = blobs.continuation_token
+            break unless nextMarker && !nextMarker.empty?
+            if (counter % 10 == 0) then @logger.info(" listing #{counter * 50000} files") end
+            counter+=1
+        rescue Exception => e
+            @logger.error("caught: #{e.message} while trying to list blobs")
+            return files
        end
-        nextMarker = blobs.continuation_token
-        break unless nextMarker && !nextMarker.empty?
-        if (counter % 10 == 0) then @logger.info(" listing #{counter * 50000} files") end
-        counter+=1
     end
     if @debug_timer
         @logger.info("list_blobs took #{Time.now.to_i - chrono} sec")
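To illustrate the matching flags mentioned in the comments of this hunk, a few File.fnmatch? calls as they would be evaluated against path_filters entries (the blob names here are made up):

```ruby
flags = File::FNM_PATHNAME | File::FNM_EXTGLOB

File.fnmatch?('**/*.json', 'resourceId=/SUBSCRIPTIONS/ID/PT1H.json', flags)  # => true
File.fnmatch?('**/*.json', 'PT1H.json', flags)                               # => true, also matches at the root
File.fnmatch?('*.json',    'subdir/PT1H.json', flags)                        # => false, '*' does not cross '/'
File.fnmatch?('test{a,b,c}', 'testb', flags)                                 # => true, FNM_EXTGLOB alternatives
```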
@@ -571,7 +652,7 @@ private
     begin
         @busy_writing_registry.lock
         unless (@registry_local_path)
-            @blob_client.create_block_blob(container, registry_path, regdump)
+            @blob_client.create_block_blob(@container, registry_path, regdump)
             @logger.info("processed #{@processed} events, saving #{regsize} blobs and offsets to remote registry #{registry_path}")
         else
             File.open(@registry_local_path+"/"+@pipe_id, 'w') { |file| file.write(regdump) }
@@ -597,20 +678,20 @@ private
     @logger.info("learn_encapsulation, this can be skipped by setting skip_learning => true. Or set both head_file and tail_file")
     # From one file, read first block and last block to learn head and tail
     begin
-        blobs = @blob_client.list_blobs(container, { max_results: 3, prefix: @prefix})
+        blobs = @blob_client.list_blobs(@container, { max_results: 3, prefix: @prefix})
         blobs.each do |blob|
             unless blob.name == registry_path
                 begin
-                    blocks = @blob_client.list_blob_blocks(container, blob.name)[:committed]
+                    blocks = @blob_client.list_blob_blocks(@container, blob.name)[:committed]
                     if ['A00000000000000000000000000000000','QTAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAw'].include?(blocks.first.name)
                         @logger.debug("using #{blob.name}/#{blocks.first.name} to learn the json header")
-                        @head = @blob_client.get_blob(container, blob.name, start_range: 0, end_range: blocks.first.size-1)[1]
+                        @head = @blob_client.get_blob(@container, blob.name, start_range: 0, end_range: blocks.first.size-1)[1]
                     end
                     if ['Z00000000000000000000000000000000','WjAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAw'].include?(blocks.last.name)
                         @logger.debug("using #{blob.name}/#{blocks.last.name} to learn the json footer")
                         length = blob.properties[:content_length].to_i
                         offset = length - blocks.last.size
-                        @tail = @blob_client.get_blob(container, blob.name, start_range: offset, end_range: length-1)[1]
+                        @tail = @blob_client.get_blob(@container, blob.name, start_range: offset, end_range: length-1)[1]
                         @logger.debug("learned tail: #{@tail}")
                     end
                 rescue Exception => e
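For context on the two block names compared in learn_encapsulation: they are the raw and Base64-encoded forms of the same block ID (presumably because different client versions report either representation), which is easy to verify:

```ruby
require 'base64'

Base64.strict_encode64('A00000000000000000000000000000000')
# => "QTAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAw"
Base64.strict_encode64('Z00000000000000000000000000000000')
# => "WjAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAw"
```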
@@ -635,7 +716,9 @@ private
 def val(str)
     return str.split('=')[1]
 end
+end # class LogStash::Inputs::AzureBlobStorage
 
+# This is a start towards mapping NSG events to ECS fields ... it's complicated
 =begin
 def ecs(old)
     # https://www.elastic.co/guide/en/ecs/current/ecs-field-reference.html
@@ -681,4 +764,3 @@ private
     return ecs
 end
 =end
-end # class LogStash::Inputs::AzureBlobStorage
data/logstash-input-azure_blob_storage.gemspec
CHANGED
@@ -1,6 +1,6 @@
 Gem::Specification.new do |s|
     s.name = 'logstash-input-azure_blob_storage'
-    s.version = '0.12.7'
+    s.version = '0.12.8'
     s.licenses = ['Apache-2.0']
     s.summary = 'This logstash plugin reads and parses data from Azure Storage Blobs.'
     s.description = <<-EOF
@@ -24,5 +24,5 @@ EOF
     s.add_runtime_dependency 'stud', '~> 0.0.23'
     s.add_runtime_dependency 'azure-storage-blob', '~> 2', '>= 2.0.3'
     s.add_development_dependency 'logstash-devutils', '~> 2.4'
-    s.add_development_dependency 'rubocop', '~> 1.
+    s.add_development_dependency 'rubocop', '~> 1.50'
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: logstash-input-azure_blob_storage
 version: !ruby/object:Gem::Version
-  version: 0.12.7
+  version: 0.12.8
 platform: ruby
 authors:
 - Jan Geertsma
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-
+date: 2023-07-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement
@@ -77,7 +77,7 @@ dependencies:
   requirements:
   - - "~>"
     - !ruby/object:Gem::Version
-      version: '1.
+      version: '1.50'
   name: rubocop
   prerelease: false
   type: :development
@@ -85,7 +85,7 @@ dependencies:
   requirements:
   - - "~>"
     - !ruby/object:Gem::Version
-      version: '1.
+      version: '1.50'
 description: " This gem is a Logstash plugin. It reads and parses data from Azure\
   \ Storage Blobs. The azure_blob_storage is a reimplementation to replace azureblob\
   \ from azure-diagnostics-tools/Logstash. It can deal with larger volumes and partial\