fluent-plugin-cloudwatch-ingest 0.6.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +38 -0
- data/README.md +19 -3
- data/circle.yml +3 -0
- data/lib/fluent/plugin/cloudwatch/ingest/version.rb +1 -1
- data/lib/fluent/plugin/in_cloudwatch_ingest.rb +128 -59
- metadata +3 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 906c62816c5c16ed1fd7bfe31fc9eed873a0d149
+  data.tar.gz: a4d5b892302e21a96ef8c8d3b067a605cb1c49f5
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a5e306f56e216c0742fd8983591481559569f93add886e1c5eca867f44c1b824dfcd912f29ea1a0ede09f0681841380a14cfddd1586557a2428e8e9b8a9ac87a
+  data.tar.gz: 9ea98c8202409bf0a0116dbbcd85204760c7f3e1645f24e4a8d1c69853b61a45b95ce9327a459190bc303359c0b9faee5d713b4597c408aebde4e603cab56987
data/CHANGELOG.md ADDED
@@ -0,0 +1,38 @@
+# Changelog
+
+## 0.1.3
+
+* Initial release
+
+## 0.2.1
+
+* AWS SDK logging
+* Code reorganization
+
+## 0.3.1
+
+* Limit events per API call
+* Parser constructor fix (@snltd)
+
+## 0.4.0
+
+* Optionally fetch oldest logs first (@chaeyk)
+
+## 0.5.4
+
+* Optionally parse the body as JSON into structured fields
+
+## 0.6.0
+
+* Add statsd telemetry
+
+## 1.0.0
+
+* Print a stack trace when rescuing exceptions (@chaeyk)
+* If stored API token is invalid or corrupt, use a stored timestamp (@chaeyk)
+* Truncate statefile before saving (@chaeyk)
+* Amend how `api_interval` is used (see README.md) (@chaeyk)
+* Improve null stream detection (@chaeyk)
+* Remove streams from state file that are no longer present (@chaeyk)
+* Apply `error_interval` when failing to get statefile lock (@chaeyk)
+* `api_interval` deprecated in favour of `error_interval`
data/README.md CHANGED
@@ -1,4 +1,5 @@
-# Fluentd Cloudwatch Plugin
+# Fluentd Cloudwatch Plugin
+[](https://circleci.com/gh/sampointer/fluent-plugin-cloudwatch-ingest) [](https://badge.fury.io/rb/fluent-plugin-cloudwatch-ingest)  [](https://gitter.im/fluent-plugin-cloudwatch-ingest/Lobby?utm_source=share-link&utm_medium=link&utm_campaign=share-link)
 
 ## Introduction
 
@@ -36,10 +37,9 @@ Or install it yourself as:
   aws_logging_enabled true
   log_group_name_prefix /aws/lambda
   log_stream_name_prefix 2017
-  limit_events 10000
   state_file_name /mnt/nfs/cloudwatch.state
   interval 60
-
+  error_interval 5 # Time to wait between error conditions before retry
   limit_events 10000 # Number of events to fetch in any given iteration
   event_start_time 0 # Do not fetch events before this time (UNIX epoch, milliseconds)
   oldest_logs_first false # When true fetch the oldest logs first
@@ -81,6 +81,22 @@ If `fail_on_unparsable_json` is set to `true` a record body consisting of malformed
 
 The `expression` is applied before JSON parsing is attempted. One may therefore extract a JSON fragment from within the event body if it is decorated with additional free-form text.
 
+### Telemetry
+With `telemetry` set to `true` and a valid `statsd_endpoint` the plugin will emit telemetry in statsd format to 8125:UDP. It is up to you to configure your statsd-speaking daemon to add any prefix or tagging that you might want.
+
+The metrics emitted in this version are:
+
+```
+api.calls.describeloggroups.attempted
+api.calls.describeloggroups.failed
+api.calls.describelogstreams.attempted
+api.calls.describelogstreams.failed
+api.calls.getlogevents.attempted
+api.calls.getlogevents.failed
+api.calls.getlogevents.invalid_token
+events.emitted.success
+```
+
 ### Sub-second timestamps
 When using `event_time true` the `@timestamp` field for the record is taken from the time recorded against the event by Cloudwatch. This is the most common mode to run in as it's an easy path to normalization: all of your Lambdas or other AWS service need not have the same, valid, `time_format` nor a regex that matches every case.
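Taken together, the telemetry section above amounts to plain statsd counters on UDP port 8125. A minimal standalone sketch of the same emission, using the statsd-ruby client the plugin itself constructs (the endpoint address here is a hypothetical local daemon):

```ruby
require 'statsd' # statsd-ruby gem, the same client class the plugin uses

# Hypothetical endpoint; the plugin builds its client from statsd_endpoint.
statsd = Statsd.new('127.0.0.1', 8125)

# Counters are bare increments; any prefix or tagging is left to the
# receiving statsd daemon, as the README notes.
statsd.increment('api.calls.getlogevents.attempted')
statsd.increment('events.emitted.success')
```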
data/lib/fluent/plugin/in_cloudwatch_ingest.rb CHANGED
@@ -27,8 +27,9 @@ module Fluent::Plugin
     config_param :state_file_name, :string, default: '/var/spool/td-agent/cloudwatch.state' # rubocop:disable LineLength
     desc 'Fetch logs every interval'
     config_param :interval, :time, default: 60
-    desc 'Time to pause between API calls'
-    config_param :api_interval, :time, default: 5
+    desc 'Time to pause between error conditions'
+    config_param :error_interval, :time, default: 5
+    config_param :api_interval, :time
     desc 'Tag to apply to record'
     config_param :tag, :string, default: 'cloudwatch'
     desc 'Enabled AWS SDK logging'
@@ -88,6 +89,12 @@ module Fluent::Plugin
       # Configure telemetry, if enabled
       @statsd = Statsd.new @statsd_endpoint, 8125 if @telemetry
 
+      # Fixup deprecated options
+      if @api_interval
+        @error_interval = @api_interval
+        log.warn('api_interval is deprecated for error_interval')
+      end
+
       @parser = parser_create(conf: parser_config)
       log.info('Configured fluentd-plugin-cloudwatch-ingest')
     end
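For readers unfamiliar with the fluentd plugin API in the hunk above: `config_param` declares a configuration key, and fluentd assigns the parsed value to a matching instance variable before the plugin starts. A hypothetical minimal input plugin sketching the same declaration pattern for `error_interval`:

```ruby
require 'fluent/plugin/input'

# Hypothetical toy plugin, for illustration only.
module Fluent::Plugin
  class ExampleInput < Fluent::Plugin::Input
    Fluent::Plugin.register_input('example', self)

    desc 'Time to pause between error conditions'
    config_param :error_interval, :time, default: 5

    def configure(conf)
      super
      # After super, @error_interval holds the configured (or default) value.
      log.info("error_interval is #{@error_interval}")
    end
  end
end
```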
@@ -154,10 +161,10 @@ module Fluent::Plugin
         break unless response.next_token
         next_token = response.next_token
       rescue => boom
-        log.error("Unable to retrieve log groups: #{boom}")
+        log.error("Unable to retrieve log groups: #{boom.inspect}")
         metric(:increment, 'api.calls.describeloggroups.failed')
         next_token = nil
-        sleep @api_interval
+        sleep @error_interval
         retry
       end
     end
@@ -189,11 +196,11 @@ module Fluent::Plugin
         break unless response.next_token
         next_token = response.next_token
       rescue => boom
-        log.error("Unable to retrieve log streams for group #{log_group_name} with stream prefix #{log_stream_name_prefix}: #{boom}") # rubocop:disable LineLength
+        log.error("Unable to retrieve log streams for group #{log_group_name} with stream prefix #{log_stream_name_prefix}: #{boom.inspect}") # rubocop:disable LineLength
         metric(:increment, 'api.calls.describelogstreams.failed')
         log_streams = []
         next_token = nil
-        sleep @api_interval
+        sleep @error_interval
         retry
       end
     end
@@ -202,83 +209,135 @@ module Fluent::Plugin
       return log_streams
     end
 
+    def process_stream(group, stream, next_token, start_time, state)
+      event_count = 0
+
+      metric(:increment, 'api.calls.getlogevents.attempted')
+      response = @aws.get_log_events(
+        log_group_name: group,
+        log_stream_name: stream,
+        next_token: next_token,
+        limit: @limit_events,
+        start_time: start_time,
+        start_from_head: @oldest_logs_first
+      )
+
+      response.events.each do |e|
+        begin
+          emit(e, group, stream)
+          event_count += 1
+        rescue => boom
+          log.error("Failed to emit event #{e}: #{boom.inspect}")
+        end
+      end
+
+      has_stream_timestamp = true if state.store[group][stream]['timestamp']
+
+      if !has_stream_timestamp && response.events.count.zero?
+        # This stream has returned no data ever.
+        # In this case, don't save state (token could be an invalid one)
+      else
+        # Once all events for this stream have been processed,
+        # in this iteration, store the forward token
+        state.new_store[group][stream]['token'] = response.next_forward_token
+        if response.events.last
+          state.new_store[group][stream]['timestamp'] =
+            response.events.last.timestamp
+        else
+          state.new_store[group][stream]['timestamp'] =
+            state.store[group][stream]['timestamp']
+        end
+      end
+
+      return event_count
+    end
+
     def run
       until @finished
         begin
           state = State.new(@state_file_name, log)
         rescue => boom
-          log.info("Failed lock state. Sleeping for #{@api_interval}: #{boom}")
-          sleep @api_interval
-          next
+          log.info("Failed lock state. Sleeping for #{@error_interval}: "\
+                   "#{boom.inspect}")
+          sleep @error_interval
+          next
         end
 
+        event_count = 0
+
         # Fetch the streams for each log group
         log_groups(@log_group_name_prefix).each do |group|
           # For each log stream get and emit the events
           log_streams(group, @log_stream_name_prefix).each do |stream|
+            state.store[group][stream] = {} unless state.store[group][stream]
+
+            log.info("processing stream: #{stream}")
+
             # See if we have some stored state for this group and stream.
             # If we have then use the stored forward_token to pick up
            # from that point. Otherwise start from the start.
-            if state.store[group] && state.store[group][stream]
-              stream_token =
-                (state.store[group][stream] if state.store[group][stream])
-            else
-              stream_token = nil
-            end
 
             begin
-              metric(:increment, 'api.calls.getlogevents.attempted')
-
-              response = @aws.get_log_events(
-                log_group_name: group,
-                log_stream_name: stream,
-                next_token: stream_token,
-                limit: @limit_events,
-                start_time: @event_start_time,
-                start_from_head: @oldest_logs_first
-              )
-
-              response.events.each do |e|
-                begin
-                  emit(e, group, stream)
-                rescue => boom
-                  log.error("Failed to emit event #{e}: #{boom}")
-                end
+              event_count += process_stream(group, stream,
+                                            state.store[group][stream]['token'],
+                                            @event_start_time, state)
+            rescue Aws::CloudWatchLogs::Errors::InvalidParameterException
+              metric(:increment, 'api.calls.getlogevents.invalid_token')
+              log.error('cloudwatch token is expired or broken. '\
+                        'trying with timestamp.')
+
+              # try again with timestamp instead of forward token
+              begin
+                timestamp = state.store[group][stream]['timestamp']
+                timestamp = @event_start_time unless timestamp
+
+                event_count += process_stream(group, stream,
+                                              nil, timestamp, state)
+              rescue => boom
+                log.error("Unable to retrieve events for stream #{stream} "\
+                          "in group #{group}: #{boom.inspect}") # rubocop:disable all
+                metric(:increment, 'api.calls.getlogevents.failed')
+                sleep @error_interval
+                next
              end
-
-              # Once all events for this stream have been processed,
-              # in this iteration, store the forward token
-              state.store[group][stream] = response.next_forward_token
            rescue => boom
-              log.error("Unable to retrieve events for stream #{stream} in group #{group}: #{boom}") # rubocop:disable LineLength
+              log.error("Unable to retrieve events for stream #{stream} in group #{group}: #{boom.inspect}") # rubocop:disable LineLength
              metric(:increment, 'api.calls.getlogevents.failed')
-              sleep @api_interval
-
+              sleep @error_interval
+              next
            end
          end
        end
 
-        log.info('
-        state.prune(log_groups(@log_group_name_prefix)) # Remove dead streams
+        log.info('Saving state')
        begin
          state.save
          state.close
-        rescue
-          log.error("Unable to save state file: #{boom}")
+        rescue => boom
+          log.error("Unable to save state file: #{boom.inspect}")
        end
-
-
+
+        if event_count > 0
+          sleep_interval = @interval
+        else
+          sleep_interval = @error_interval # when there is no events, slow down
+        end
+
+        log.info("#{event_count} events processed.")
+        log.info("Pausing for #{sleep_interval}")
+        sleep sleep_interval
      end
    end
 
    class CloudwatchIngestInput::State
      class LockFailed < RuntimeError; end
-      attr_accessor :statefile, :store
+      attr_accessor :statefile, :store, :new_store
 
      def initialize(filepath, log)
        @filepath = filepath
        @log = log
-        @store = Hash.new { |h, k| h[k] = {} }
+        @store = Hash.new { |h, k| h[k] = Hash.new { |x, y| x[y] = {} } }
+        @new_store = Hash.new { |h, k| h[k] = Hash.new { |x, y| x[y] = {} } }
 
        if File.exist?(filepath)
          self.statefile = Pathname.new(@filepath).open('r+')
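The token-versus-timestamp fallback in the hunk above hinges on the two ways `get_log_events` can be paged. A sketch against the AWS SDK the plugin wraps (group, stream, token, timestamp and region values are all hypothetical):

```ruby
require 'aws-sdk'

cloudwatch = Aws::CloudWatchLogs::Client.new(region: 'eu-west-1')

# Normal path: resume from the stored forward token.
cloudwatch.get_log_events(
  log_group_name: '/aws/lambda/example',
  log_stream_name: '2017/06/30/[$LATEST]deadbeef',
  next_token: 'f/33622...',      # hypothetical stored token
  start_from_head: true
)

# Fallback path: the token was rejected, so page by timestamp instead.
cloudwatch.get_log_events(
  log_group_name: '/aws/lambda/example',
  log_stream_name: '2017/06/30/[$LATEST]deadbeef',
  start_time: 1_498_780_800_000, # epoch milliseconds
  start_from_head: true
)
```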
@@ -288,7 +347,8 @@ module Fluent::Plugin
           self.statefile = Pathname.new(@filepath).open('w+')
           save
         rescue => boom
-          @log.error("Unable to create new file #{statefile.path}: #{boom}")
+          @log.error("Unable to create new file #{statefile.path}: "\
+                     "#{boom.inspect}")
         end
       end
 
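A side note on the `State` changes above: the two-level default `Hash` now built in `State#initialize` is what lets the rest of the code write `store[group][stream]['token']` without creating intermediate keys first. A standalone illustration (group and stream names hypothetical):

```ruby
# Same construction as @store/@new_store above: a missing group key yields
# a hash whose missing stream keys yield an empty hash.
store = Hash.new { |h, k| h[k] = Hash.new { |x, y| x[y] = {} } }

# No intermediate setup required:
store['/aws/lambda/example']['2017/06/30/stream']['token'] = 'f/33622...'
store['/aws/lambda/example']['2017/06/30/stream']['timestamp'] = 1_498_780_800_000

p store
# => {"/aws/lambda/example"=>{"2017/06/30/stream"=>
#      {"token"=>"f/33622...", "timestamp"=>1498780800000}}}
```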
@@ -298,13 +358,30 @@ module Fluent::Plugin
        lockstatus = statefile.flock(File::LOCK_EX | File::LOCK_NB)
        raise CloudwatchIngestInput::State::LockFailed if lockstatus == false
 
-        @store.merge!(Psych.safe_load(statefile.read))
-        @log.info("Loaded #{@store.keys.size} groups from #{statefile.path}")
+        begin
+          @store.merge!(Psych.safe_load(statefile.read))
+
+          # Migrate old state file
+          @store.each do |_group, streams|
+            streams.update(streams) do |_name, stream|
+              if stream.is_a? String
+                return { 'token' => stream, 'timestamp' => Time.now.to_i }
+              end
+              return stream
+            end
+          end
+
+          @log.info("Loaded #{@store.keys.size} groups from #{statefile.path}")
+        rescue
+          statefile.close
+          raise
+        end
      end
 
      def save
        statefile.rewind
-        statefile.write(Psych.dump(@store))
+        statefile.truncate(0)
+        statefile.write(Psych.dump(@new_store))
        @log.info("Saved state to #{statefile.path}")
        statefile.rewind
      end
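The `truncate(0)` added to `save` above matters because the state file is rewritten in place: `rewind` only moves the write position, so a YAML document shorter than its predecessor would leave stale trailing bytes behind. A sketch of that failure mode (file name and contents hypothetical):

```ruby
require 'psych'

f = File.open('example.state', 'w+')
f.write(Psych.dump('group' => { 'stream-with-a-long-name' => 'f/123456' }))

# Overwrite in place with a shorter document, without truncating first.
f.rewind
f.write(Psych.dump('group' => {}))

f.rewind
p Psych.safe_load(f.read)
# The tail of the old document survives and resurfaces as garbage keys
# (or a Psych::SyntaxError, depending on where the old bytes were cut).

# The fix mirrors the diff: drop the old contents before writing.
f.rewind
f.truncate(0)
f.write(Psych.dump('group' => {}))
f.close
```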
@@ -312,14 +389,6 @@ module Fluent::Plugin
      def close
        statefile.close
      end
-
-      def prune(log_groups)
-        groups_before = @store.keys.size
-        @store.delete_if { |k, _v| true unless log_groups.include?(k) }
-        @log.info("Pruned #{groups_before - @store.keys.size} keys from store")
-
-        # TODO: also prune streams as these are most likely to be transient
-      end
    end
  end
end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: fluent-plugin-cloudwatch-ingest
 version: !ruby/object:Gem::Version
-  version: 0.6.0
+  version: 1.0.0
 platform: ruby
 authors:
 - Sam Pointer
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-06-
+date: 2017-06-30 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -133,6 +133,7 @@ files:
 - ".rspec"
 - ".rubocop.yml"
 - ".ruby-version"
+- CHANGELOG.md
 - Gemfile
 - LICENSE
 - README.md