sensu-plugins-edgelab 1.2.2 → 1.2.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/check-nomad-jobs.rb +59 -3
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c31c6c21f0a5babe6c6b5dd37c1cee1fb8e50dbf
|
4
|
+
data.tar.gz: 115871cd9dcdcc7a08c699b7c0b603cc0ceca937
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 18f0ff0025f8ef24f349550021492d4d7cb9ec71cf7ed99fc1c3fdb0f31d4aafb6350c291cff405fdc3cf9cf9dbeeaa19f848c3fb3603b4f02443aaf8c582678
|
7
|
+
data.tar.gz: 432ffee07129e3a7f6bbfb44dfb901942b1439459427ebeac15c3666916ab08126a94af27e615b9f2f5bbbd351bf2bf7d26084c7a5a4773c0c59546a670d91b3
|
data/bin/check-nomad-jobs.rb
CHANGED
@@ -38,6 +38,62 @@ class CheckNomadAllocations < Sensu::Plugin::Check::CLI
|
|
38
38
|
end
|
39
39
|
end
|
40
40
|
|
41
|
+
# Build human readable explanations for the placement failures of an
# evaluation.
#
# The evaluation is read with String keys throughout. A metric that is
# missing or null is treated as "nothing to report" instead of raising.
#
# @param failed_eval [Hash] evaluation whose 'FailedTGAllocs' entry maps a
#   task group name to its allocation metrics
# @return [Array<String>] one message per recorded failure cause, in the
#   order: class filtered, constraint filtered, nodes exhausted, class
#   exhausted, dimension exhausted (per task group); empty when none
def placement_failures_reasons(failed_eval)
  reasons = []

  (failed_eval['FailedTGAllocs'] || {}).each_value do |metrics|
    next if metrics.nil?

    # `|| {}` rather than `fetch(key, [])`: a key that is present with a
    # null value must behave like an absent key.
    (metrics['ClassFiltered'] || {}).each do |class_, count|
      reasons << "Class #{class_} filtered #{count} nodes"
    end

    (metrics['ConstraintFiltered'] || {}).each do |constraint, count|
      reasons << "Constraint #{constraint} filtered #{count} nodes"
    end

    # nil.to_i is 0, so a missing counter is simply not reported.
    nodes_exhausted = metrics['NodesExhausted'].to_i
    if nodes_exhausted > 0
      reasons << "Resources exhausted on #{nodes_exhausted} nodes"
    end

    (metrics['ClassExhausted'] || {}).each do |class_, count|
      reasons << "Class #{class_} exhausted on #{count} nodes"
    end

    (metrics['DimensionExhausted'] || {}).each do |dimension, count|
      reasons << "#{dimension} on #{count} nodes"
    end
  end

  reasons
end
|
68
|
+
|
69
|
+
# Check that the job has no blocked evaluation with placement failures.
#
# Fetches the job's evaluations through api_call. When at least one
# evaluation has the 'blocked' status, the evaluation with the highest
# 'CreateIndex' among those carrying 'FailedTGAllocs' is passed to
# placement_failures_reasons, and any reasons found are appended to +failed+
# as a single message.
#
# @param job [Hash] job description; only job['ID'] is read here
# @param failed [Array<String>] accumulator of failure messages (mutated)
def check_evaluations(job, failed)
  evaluations = api_call "/v1/job/#{job['ID']}/evaluations"

  any_blocked = evaluations.any? { |evaluation| evaluation['Status'] == 'blocked' }

  # Only evaluations that recorded failed task group allocations qualify.
  with_failures = evaluations.reject { |evaluation| evaluation['FailedTGAllocs'].nil? }

  # Keep the most recently created one; on equal indexes the earlier
  # element wins, because the comparison is strict.
  latest = with_failures.reduce do |newest, candidate|
    newest['CreateIndex'] < candidate['CreateIndex'] ? candidate : newest
  end

  return unless any_blocked && !latest.nil?

  failure_reasons = placement_failures_reasons latest
  return unless failure_reasons.any?

  failed << "#{job['ID']}: Placemement failure [" + failure_reasons.join(' / ') + ']'
end
|
96
|
+
|
41
97
|
# Check that allocations are in the desired status
|
42
98
|
def check_allocations(job, failed)
|
43
99
|
allocations = api_call "/v1/job/#{job['ID']}/allocations"
|
@@ -45,11 +101,11 @@ class CheckNomadAllocations < Sensu::Plugin::Check::CLI
|
|
45
101
|
allocations.each do |alloc|
|
46
102
|
if alloc['DesiredStatus'] == 'run'
|
47
103
|
# Batch stay in run DesiredStatus even if task completed successfully.
|
48
|
-
next if job['Type'] == 'batch'
|
104
|
+
next if job['Type'] == 'batch' && alloc['ClientStatus'] == 'complete'
|
49
105
|
|
50
106
|
alloc['TaskStates'].each do |_, state|
|
51
107
|
if state['State'] == 'dead'
|
52
|
-
|
108
|
+
failed << "Alloc #{alloc['Name']} is dead but desired status is 'run'"
|
53
109
|
|
54
110
|
# Check that pending alloc are not too old
|
55
111
|
elsif state['State'] == 'pending'
|
@@ -63,7 +119,6 @@ class CheckNomadAllocations < Sensu::Plugin::Check::CLI
|
|
63
119
|
# No need to check other task in the same task group.
|
64
120
|
break
|
65
121
|
end
|
66
|
-
|
67
122
|
end
|
68
123
|
end
|
69
124
|
end
|
@@ -79,6 +134,7 @@ class CheckNomadAllocations < Sensu::Plugin::Check::CLI
|
|
79
134
|
failed = []
|
80
135
|
|
81
136
|
jobs.each do |job|
|
137
|
+
check_evaluations job, failed
|
82
138
|
check_allocations job, failed
|
83
139
|
end
|
84
140
|
|