tobsch-krane 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.buildkite/pipeline.nightly.yml +43 -0
- data/.github/probots.yml +2 -0
- data/.gitignore +20 -0
- data/.rubocop.yml +17 -0
- data/.shopify-build/VERSION +1 -0
- data/.shopify-build/kubernetes-deploy.yml +53 -0
- data/1.0-Upgrade.md +185 -0
- data/CHANGELOG.md +431 -0
- data/CODE_OF_CONDUCT.md +46 -0
- data/CONTRIBUTING.md +164 -0
- data/Gemfile +16 -0
- data/ISSUE_TEMPLATE.md +25 -0
- data/LICENSE.txt +21 -0
- data/README.md +655 -0
- data/Rakefile +36 -0
- data/bin/ci +21 -0
- data/bin/setup +16 -0
- data/bin/test +47 -0
- data/dev.yml +28 -0
- data/dev/flamegraph-from-tests +35 -0
- data/exe/krane +5 -0
- data/krane.gemspec +44 -0
- data/lib/krane.rb +7 -0
- data/lib/krane/bindings_parser.rb +88 -0
- data/lib/krane/cli/deploy_command.rb +75 -0
- data/lib/krane/cli/global_deploy_command.rb +54 -0
- data/lib/krane/cli/krane.rb +91 -0
- data/lib/krane/cli/render_command.rb +41 -0
- data/lib/krane/cli/restart_command.rb +34 -0
- data/lib/krane/cli/run_command.rb +54 -0
- data/lib/krane/cli/version_command.rb +13 -0
- data/lib/krane/cluster_resource_discovery.rb +113 -0
- data/lib/krane/common.rb +23 -0
- data/lib/krane/concerns/template_reporting.rb +29 -0
- data/lib/krane/concurrency.rb +18 -0
- data/lib/krane/container_logs.rb +106 -0
- data/lib/krane/deferred_summary_logging.rb +95 -0
- data/lib/krane/delayed_exceptions.rb +14 -0
- data/lib/krane/deploy_task.rb +363 -0
- data/lib/krane/deploy_task_config_validator.rb +29 -0
- data/lib/krane/duration_parser.rb +27 -0
- data/lib/krane/ejson_secret_provisioner.rb +154 -0
- data/lib/krane/errors.rb +28 -0
- data/lib/krane/formatted_logger.rb +57 -0
- data/lib/krane/global_deploy_task.rb +210 -0
- data/lib/krane/global_deploy_task_config_validator.rb +12 -0
- data/lib/krane/kubeclient_builder.rb +156 -0
- data/lib/krane/kubectl.rb +120 -0
- data/lib/krane/kubernetes_resource.rb +621 -0
- data/lib/krane/kubernetes_resource/cloudsql.rb +43 -0
- data/lib/krane/kubernetes_resource/config_map.rb +22 -0
- data/lib/krane/kubernetes_resource/cron_job.rb +18 -0
- data/lib/krane/kubernetes_resource/custom_resource.rb +87 -0
- data/lib/krane/kubernetes_resource/custom_resource_definition.rb +98 -0
- data/lib/krane/kubernetes_resource/daemon_set.rb +90 -0
- data/lib/krane/kubernetes_resource/deployment.rb +213 -0
- data/lib/krane/kubernetes_resource/horizontal_pod_autoscaler.rb +65 -0
- data/lib/krane/kubernetes_resource/ingress.rb +18 -0
- data/lib/krane/kubernetes_resource/job.rb +60 -0
- data/lib/krane/kubernetes_resource/network_policy.rb +22 -0
- data/lib/krane/kubernetes_resource/persistent_volume_claim.rb +80 -0
- data/lib/krane/kubernetes_resource/pod.rb +269 -0
- data/lib/krane/kubernetes_resource/pod_disruption_budget.rb +23 -0
- data/lib/krane/kubernetes_resource/pod_set_base.rb +71 -0
- data/lib/krane/kubernetes_resource/pod_template.rb +20 -0
- data/lib/krane/kubernetes_resource/replica_set.rb +92 -0
- data/lib/krane/kubernetes_resource/resource_quota.rb +22 -0
- data/lib/krane/kubernetes_resource/role.rb +22 -0
- data/lib/krane/kubernetes_resource/role_binding.rb +22 -0
- data/lib/krane/kubernetes_resource/secret.rb +24 -0
- data/lib/krane/kubernetes_resource/service.rb +104 -0
- data/lib/krane/kubernetes_resource/service_account.rb +22 -0
- data/lib/krane/kubernetes_resource/stateful_set.rb +70 -0
- data/lib/krane/label_selector.rb +42 -0
- data/lib/krane/oj.rb +4 -0
- data/lib/krane/options_helper.rb +39 -0
- data/lib/krane/remote_logs.rb +60 -0
- data/lib/krane/render_task.rb +118 -0
- data/lib/krane/renderer.rb +118 -0
- data/lib/krane/resource_cache.rb +68 -0
- data/lib/krane/resource_deployer.rb +265 -0
- data/lib/krane/resource_watcher.rb +171 -0
- data/lib/krane/restart_task.rb +228 -0
- data/lib/krane/rollout_conditions.rb +103 -0
- data/lib/krane/runner_task.rb +212 -0
- data/lib/krane/runner_task_config_validator.rb +18 -0
- data/lib/krane/statsd.rb +65 -0
- data/lib/krane/task_config.rb +22 -0
- data/lib/krane/task_config_validator.rb +96 -0
- data/lib/krane/template_sets.rb +173 -0
- data/lib/krane/version.rb +4 -0
- data/pull_request_template.md +8 -0
- data/screenshots/deploy-demo.gif +0 -0
- data/screenshots/migrate-logs.png +0 -0
- data/screenshots/missing-secret-fail.png +0 -0
- data/screenshots/success.png +0 -0
- data/screenshots/test-output.png +0 -0
- metadata +375 -0
# frozen_string_literal: true

require 'krane/resource_watcher'
require 'krane/concerns/template_reporting'

module Krane
  # Orchestrates the actual deployment of a set of KubernetesResource objects:
  # batching `kubectl apply`, handling create/replace resources individually,
  # optional pruning, and (optionally) watching the rollout to verify success.
  class ResourceDeployer
    extend Krane::StatsD::MeasureMethods
    include Krane::TemplateReporting

    delegate :logger, to: :@task_config
    attr_reader :statsd_tags

    def initialize(task_config:, prune_whitelist:, global_timeout:, current_sha: nil, selector:, statsd_tags:)
      @task_config = task_config
      @prune_whitelist = prune_whitelist
      @global_timeout = global_timeout
      @current_sha = current_sha
      @selector = selector
      @statsd_tags = statsd_tags
    end

    # Deploys all +resources+. When +verify_result+ is true the rollout is
    # watched and a DeploymentTimeoutError / FatalDeploymentError is raised on
    # failure; otherwise the resources are fired off and a warning is recorded.
    def deploy!(resources, verify_result, prune)
      unless verify_result
        deploy_all_resources(resources, prune: prune, verify: false)
        logger.summary.add_action("deployed #{resources.length} #{'resource'.pluralize(resources.length)}")
        warning = <<~MSG
          Deploy result verification is disabled for this deploy.
          This means the desired changes were communicated to Kubernetes, but the deploy did not make sure they actually succeeded.
        MSG
        logger.summary.add_paragraph(ColorizedString.new(warning).yellow)
        return
      end

      deploy_all_resources(resources, prune: prune, verify: true)
      failed = resources.reject(&:deploy_succeeded?)
      return if failed.empty?

      # Distinguish "everything that failed merely timed out" from hard failures.
      raise DeploymentTimeoutError if failed.all?(&:deploy_timed_out?)
      raise FatalDeploymentError
    end

    # Deploys resources matching +predeploy_sequence+ types, in order, verifying
    # each batch before moving on. Raises FatalDeploymentError on any failure.
    def predeploy_priority_resources(resource_list, predeploy_sequence)
      bare_pods = resource_list.select { |candidate| candidate.is_a?(Pod) }
      # With exactly one bare pod we can stream its logs for better feedback.
      bare_pods.first.stream_logs = true if bare_pods.count == 1

      predeploy_sequence.each do |resource_type|
        batch = resource_list.select { |candidate| candidate.type == resource_type }
        next if batch.empty?
        deploy_resources(batch, verify: true, record_summary: false)

        failed = batch.reject(&:deploy_succeeded?)
        failed_count = failed.length
        if failed_count > 0
          Krane::Concurrency.split_across_threads(failed) do |failed_resource|
            failed_resource.sync_debug_info(kubectl)
          end
          failed.each { |failed_resource| logger.summary.add_paragraph(failed_resource.debug_message) }
          raise FatalDeploymentError, "Failed to deploy #{failed_count} priority #{'resource'.pluralize(failed_count)}"
        end
        logger.blank_line
      end
    end
    measure_method(:predeploy_priority_resources, 'priority_resources.duration')

    private

    # Thin measured wrapper so the "normal resources" phase gets its own metric.
    def deploy_all_resources(resources, prune: false, verify:, record_summary: true)
      deploy_resources(resources, prune: prune, verify: verify, record_summary: record_summary)
    end
    measure_method(:deploy_all_resources, 'normal_resources.duration')

    def deploy_resources(resources, prune: false, verify:, record_summary: true)
      return if resources.empty?
      deploy_started_at = Time.now.utc

      if resources.length > 1
        logger.info("Deploying resources:")
        resources.each { |r| logger.info("- #{r.id} (#{r.pretty_timeout_type})") }
      else
        only = resources.first
        logger.info("Deploying #{only.id} (#{only.pretty_timeout_type})")
      end

      # Apply can be done in one large batch, the rest have to be done individually
      applyables, individuals = resources.partition { |r| r.deploy_method == :apply }
      # Prunable resources should also be applied so that they can be pruned
      pruneable_types = @prune_whitelist.map { |entry| entry.split("/").last }
      applyables += individuals.select { |r| pruneable_types.include?(r.type) }

      individuals.each do |resource|
        resource.deploy_started_at = Time.now.utc

        err, status =
          case resource.deploy_method
          when :create        then create_resource(resource)
          when :replace       then replace_or_create_resource(resource)
          when :replace_force then replace_or_create_resource(resource, force: true)
          else
            # Fail Fast! This is a programmer mistake.
            raise ArgumentError, "Unexpected deploy method! (#{resource.deploy_method.inspect})"
          end

        next if status.success?

        raise FatalDeploymentError, <<~MSG
          Failed to replace or create resource: #{resource.id}
          #{resource.sensitive_template_content? ? '<suppressed sensitive output>' : err}
        MSG
      end

      apply_all(applyables, prune)

      if verify
        watcher = Krane::ResourceWatcher.new(resources: resources, deploy_started_at: deploy_started_at,
          timeout: @global_timeout, task_config: @task_config, sha: @current_sha)
        watcher.run(record_summary: record_summary)
      end
    end

    # Runs a single batched `kubectl apply` over +resources+, optionally with
    # --prune. Raises FatalDeploymentError when kubectl exits non-zero.
    def apply_all(resources, prune)
      return unless resources.present?
      command = %w(apply)

      Dir.mktmpdir do |tmp_dir|
        resources.each do |r|
          FileUtils.symlink(r.file_path, tmp_dir)
          r.deploy_started_at = Time.now.utc
        end
        command.push("-f", tmp_dir)

        if prune && @prune_whitelist.present?
          command.push("--prune")
          if @selector
            command.push("--selector", @selector.to_s)
          else
            command.push("--all")
          end
          @prune_whitelist.each { |type| command.push("--prune-whitelist=#{type}") }
        end

        output_is_sensitive = resources.any?(&:sensitive_template_content?)
        global_mode = resources.all?(&:global?)
        out, err, st = kubectl.run(*command, log_failure: false, output_is_sensitive: output_is_sensitive,
          use_namespace: !global_mode)

        if st.success?
          log_pruning(out) if prune
        else
          record_apply_failure(err, resources: resources)
          raise FatalDeploymentError, "Command failed: #{Shellwords.join(command)}"
        end
      end
    end
    measure_method(:apply_all)

    # Extracts "<resource> pruned" lines from kubectl stdout and summarizes them.
    def log_pruning(kubectl_output)
      pruned = kubectl_output.scan(/^(.*) pruned$/)
      return unless pruned.present?

      logger.info("The following resources were pruned: #{pruned.join(', ')}")
      logger.summary.add_action("pruned #{pruned.length} #{'resource'.pluralize(pruned.length)}")
    end

    # Turns raw kubectl apply stderr into per-template summary entries,
    # suppressing content for templates flagged as sensitive (unless they were
    # already validated via server dry-run, in which case no new info can leak).
    def record_apply_failure(err, resources: [])
      warn_msg = "WARNING: Any resources not mentioned in the error(s) below were likely created/updated. " \
        "You may wish to roll back this deploy."
      logger.summary.add_paragraph(ColorizedString.new(warn_msg).yellow)

      unidentified_errors = []
      filenames_with_sensitive_content = resources
        .select(&:sensitive_template_content?)
        .map { |r| File.basename(r.file_path) }

      server_dry_run_validated_resource = resources
        .select(&:server_dry_run_validated?)
        .map { |r| File.basename(r.file_path) }

      err.each_line do |line|
        bad_files = find_bad_files_from_kubectl_output(line)
        unless bad_files.present?
          unidentified_errors << line
          next
        end

        bad_files.each do |f|
          err_msg = f[:err]
          if filenames_with_sensitive_content.include?(f[:filename])
            # Hide the error and template contents in case it has sensitive information
            # we display full error messages as we assume there's no sensitive info leak after server-dry-run
            err_msg = "SUPPRESSED FOR SECURITY" unless server_dry_run_validated_resource.include?(f[:filename])
            record_invalid_template(logger: logger, err: err_msg, filename: f[:filename], content: nil)
          else
            record_invalid_template(logger: logger, err: err_msg, filename: f[:filename], content: f[:content])
          end
        end
      end
      return unless unidentified_errors.any?

      if (filenames_with_sensitive_content - server_dry_run_validated_resource).present?
        warn_msg = "WARNING: There was an error applying some or all resources. The raw output may be sensitive and " \
          "so cannot be displayed."
        logger.summary.add_paragraph(ColorizedString.new(warn_msg).yellow)
      else
        heading = ColorizedString.new('Unidentified error(s):').red
        msg = FormattedLogger.indent_four(unidentified_errors.join)
        logger.summary.add_paragraph("#{heading}\n#{msg}")
      end
    end

    # `kubectl replace` the resource, falling back to `create` when it does not
    # exist yet. Returns [stderr, status].
    def replace_or_create_resource(resource, force: false)
      args =
        if force
          ["replace", "--force", "--cascade", "-f", resource.file_path]
        else
          ["replace", "-f", resource.file_path]
        end

      _, err, status = kubectl.run(*args, log_failure: false, output_is_sensitive: resource.sensitive_template_content?,
        raise_if_not_found: true, use_namespace: !resource.global?)

      [err, status]
    rescue Krane::Kubectl::ResourceNotFoundError
      # it doesn't exist so we can't replace it, we try to create it
      create_resource(resource)
    end

    # `kubectl create` the resource. Returns [stderr, status].
    def create_resource(resource)
      out, err, status = kubectl.run("create", "-f", resource.file_path, log_failure: false,
        output: 'json', output_is_sensitive: resource.sensitive_template_content?,
        use_namespace: !resource.global?)

      # For resources that rely on a generateName attribute, we get the `name` from the result of the call to `create`
      # We must explicitly set this name value so that the `apply` step for pruning can run successfully
      resource.use_generated_name(JSON.parse(out)) if status.success? && resource.uses_generate_name?

      [err, status]
    end

    # Inspect the file referenced in the kubectl stderr
    # to make it easier for developer to understand what's going on
    def find_bad_files_from_kubectl_output(line)
      # stderr often contains one or more lines like the following, from which we can extract the file path(s):
      # Error from server (TypeOfError): error when creating "/path/to/service-gqq5oh.yml": Service "web" is invalid:

      line.scan(%r{"(/\S+\.ya?ml\S*)"}).each_with_object([]) do |matches, bad_files|
        matches.each do |path|
          content = File.read(path) if File.file?(path)
          bad_files << { filename: File.basename(path), err: line, content: content }
        end
      end
    end

    # Lazily-built kubectl wrapper shared by all commands in this deploy.
    def kubectl
      @kubectl ||= Kubectl.new(task_config: @task_config, log_failure_by_default: true)
    end
  end
end
# frozen_string_literal: true

require 'krane/concurrency'
require 'krane/resource_cache'

module Krane
  # Polls a collection of KubernetesResource objects until each one succeeds,
  # fails, or times out, logging progress along the way and recording a summary.
  class ResourceWatcher
    extend Krane::StatsD::MeasureMethods
    delegate :namespace, :context, :logger, to: :@task_config

    def initialize(resources:, task_config:, deploy_started_at: Time.now.utc,
      operation_name: "deploy", timeout: nil, sha: nil)
      unless resources.is_a?(Enumerable)
        raise ArgumentError, <<~MSG
          ResourceWatcher expects Enumerable collection, got `#{resources.class}` instead
        MSG
      end
      @resources = resources
      @task_config = task_config
      @deploy_started_at = deploy_started_at
      @operation_name = operation_name
      @timeout = timeout
      @sha = sha
    end

    # Main watch loop: sync resource state every +delay_sync+ seconds, report
    # newly finished resources as they settle, and remind about stragglers
    # every +reminder_interval+. Gives up (raising) once the global timeout hits.
    def run(delay_sync: 3.seconds, reminder_interval: 30.seconds, record_summary: true)
      last_message_logged_at = monitoring_started = Time.now.utc
      pending = @resources.dup

      while pending.present?
        report_and_give_up(pending) if global_timeout?(monitoring_started)
        sleep_until_next_sync(delay_sync)

        sync_resources(pending)

        # Peel finished resources off in three passes; what remains is pending.
        just_succeeded, pending = pending.partition(&:deploy_succeeded?)
        just_failed, pending = pending.partition(&:deploy_failed?)
        just_timed_out, pending = pending.partition(&:deploy_timed_out?)

        if just_succeeded.present? || just_failed.present? || just_timed_out.present?
          report_what_just_happened(just_succeeded, just_failed, just_timed_out)
          report_what_is_left(pending, reminder: false)
          last_message_logged_at = Time.now.utc
        elsif due_for_reminder?(last_message_logged_at, reminder_interval)
          report_what_is_left(pending, reminder: true)
          last_message_logged_at = Time.now.utc
        end
      end
      record_statuses_for_summary(@resources) if record_summary
    end

    private

    # Refreshes every resource's observed state from the cluster, in parallel.
    def sync_resources(resources)
      cache = ResourceCache.new(@task_config)
      Krane::Concurrency.split_across_threads(resources) { |r| r.sync(cache) }
      resources.each(&:after_sync)
    end
    measure_method(:sync_resources, "sync.duration")

    def statsd_tags
      {
        namespace: namespace,
        context: context,
        sha: @sha,
      }
    end

    def global_timeout?(started_at)
      @timeout && (Time.now.utc - started_at > @timeout)
    end

    # Sleeps just long enough to keep syncs at least +min_interval+ apart,
    # accounting for how long the previous sync itself took.
    def sleep_until_next_sync(min_interval)
      @next_sync_time ||= Time.now.utc
      remaining = @next_sync_time - Time.now.utc
      sleep(remaining) if remaining > 0
      @next_sync_time = Time.now.utc + min_interval
    end

    def report_what_just_happened(new_successes, new_failures, new_timeouts)
      watch_time = (Time.now.utc - @deploy_started_at).round(1)
      new_failures.each do |resource|
        resource.report_status_to_statsd(watch_time)
        logger.error("#{resource.id} failed to #{@operation_name} after #{watch_time}s")
      end

      new_timeouts.each do |resource|
        resource.report_status_to_statsd(watch_time)
        logger.error("#{resource.id} rollout timed out after #{watch_time}s")
      end

      return unless new_successes.present?
      new_successes.each { |resource| resource.report_status_to_statsd(watch_time) }
      success_string = ColorizedString.new("Successfully #{past_tense_operation} in #{watch_time}s:").green
      logger.info("#{success_string} #{new_successes.map(&:id).join(', ')}")
    end

    def report_what_is_left(resources, reminder:)
      return unless resources.present?
      resource_list = resources.map(&:id).join(', ')
      msg = reminder ? "Still waiting for: #{resource_list}" : "Continuing to wait for: #{resource_list}"
      logger.info(msg)
    end

    # Called on global timeout: summarize everything, then raise. Resources
    # still pending at this point are treated as timed out.
    def report_and_give_up(remaining_resources)
      successful_resources, failed_resources = (@resources - remaining_resources).partition(&:deploy_succeeded?)
      record_success_statuses(successful_resources)
      record_failed_statuses(failed_resources, remaining_resources)

      if failed_resources.present? && !failed_resources.all?(&:deploy_timed_out?)
        raise FatalDeploymentError
      else
        raise DeploymentTimeoutError
      end
    end

    def record_statuses_for_summary(resources)
      successful_resources, failed_resources = resources.partition(&:deploy_succeeded?)
      record_success_statuses(successful_resources)
      record_failed_statuses(failed_resources)
    end

    # +global_timeouts+ are resources that never finished before the watcher's
    # own deadline (as opposed to per-resource timeouts inside +failed_resources+).
    def record_failed_statuses(failed_resources, global_timeouts = [])
      fail_count = failed_resources.length + global_timeouts.length
      return unless fail_count > 0

      timeouts, failures = failed_resources.partition(&:deploy_timed_out?)
      timeouts += global_timeouts
      if timeouts.present?
        logger.summary.add_action(
          "timed out waiting for #{timeouts.length} #{'resource'.pluralize(timeouts.length)} to #{@operation_name}"
        )
      end

      if failures.present?
        logger.summary.add_action(
          "failed to #{@operation_name} #{failures.length} #{'resource'.pluralize(failures.length)}"
        )
      end

      kubectl = Kubectl.new(task_config: @task_config, log_failure_by_default: false)
      Krane::Concurrency.split_across_threads(failed_resources + global_timeouts) do |resource|
        resource.sync_debug_info(kubectl)
      end

      failed_resources.each { |resource| logger.summary.add_paragraph(resource.debug_message) }
      global_timeouts.each { |resource| logger.summary.add_paragraph(resource.debug_message(:gave_up, timeout: @timeout)) }
    end

    def record_success_statuses(successful_resources)
      success_count = successful_resources.length
      return unless success_count > 0

      logger.summary.add_action("successfully #{past_tense_operation} #{success_count} "\
        "#{'resource'.pluralize(success_count)}")
      final_statuses = successful_resources.map(&:pretty_status).join("\n")
      logger.summary.add_paragraph("#{ColorizedString.new('Successful resources').green}\n#{final_statuses}")
    end

    def due_for_reminder?(last_message_logged_at, reminder_interval)
      (last_message_logged_at.to_f + reminder_interval.to_f) <= Time.now.utc.to_f
    end

    # e.g. "deploy" -> "deployed", but "run" -> "ran".
    def past_tense_operation
      @operation_name == "run" ? "ran" : "#{@operation_name}ed"
    end
  end
end
# frozen_string_literal: true

require 'krane/common'
require 'krane/kubernetes_resource'
require 'krane/kubernetes_resource/deployment'
require 'krane/kubeclient_builder'
require 'krane/resource_watcher'
require 'krane/kubectl'

module Krane
  # Restart the pods in one or more deployments
  class RestartTask
    class FatalRestartError < FatalDeploymentError; end

    class RestartAPIError < FatalRestartError
      def initialize(deployment_name, response)
        # BUGFIX: callers pass `e.message` (a String) from Kubeclient::HttpError,
        # which does not respond to #code/#body; calling them unconditionally
        # raised NoMethodError and masked the real API failure. Duck-type so
        # both real HTTP responses and plain strings produce a useful message.
        code = response.respond_to?(:code) ? response.code : "unknown"
        body = response.respond_to?(:body) ? response.body : response.to_s
        super("Failed to restart #{deployment_name}. " \
          "API returned non-200 response code (#{code})\n" \
          "Response:\n#{body}")
      end
    end

    HTTP_OK_RANGE = 200..299
    ANNOTATION = "shipit.shopify.io/restart"

    # Initializes the restart task
    #
    # @param context [String] Kubernetes context / cluster (*required*)
    # @param namespace [String] Kubernetes namespace (*required*)
    # @param logger [Object] Logger object (defaults to an instance of Krane::FormattedLogger)
    # @param global_timeout [Integer] Timeout in seconds
    def initialize(context:, namespace:, logger: nil, global_timeout: nil)
      @logger = logger || Krane::FormattedLogger.build(namespace, context)
      @task_config = Krane::TaskConfig.new(context, namespace, @logger)
      @context = context
      @namespace = namespace
      @global_timeout = global_timeout
    end

    # Runs the task, returning a boolean representing success or failure
    #
    # @return [Boolean]
    def run(*args)
      perform!(*args)
      true
    rescue FatalDeploymentError
      false
    end
    alias_method :perform, :run

    # Runs the task, raising exceptions in case of issues
    #
    # @param deployments [Array<String>] Array of workload names to restart
    # @param selector [Hash] Selector(s) parsed by Krane::LabelSelector
    # @param verify_result [Boolean] Wait for completion and verify success
    #
    # @return [nil]
    def run!(deployments: nil, selector: nil, verify_result: true)
      start = Time.now.utc
      @logger.reset

      @logger.phase_heading("Initializing restart")
      verify_config!
      deployments = identify_target_deployments(deployments, selector: selector)

      @logger.phase_heading("Triggering restart by touching ENV[RESTARTED_AT]")
      patch_kubeclient_deployments(deployments)

      if verify_result
        @logger.phase_heading("Waiting for rollout")
        resources = build_watchables(deployments, start)
        verify_restart(resources)
      else
        warning = "Result verification is disabled for this task"
        @logger.summary.add_paragraph(ColorizedString.new(warning).yellow)
      end
      StatsD.client.distribution('restart.duration', StatsD.duration(start), tags: tags('success', deployments))
      @logger.print_summary(:success)
    rescue DeploymentTimeoutError
      StatsD.client.distribution('restart.duration', StatsD.duration(start), tags: tags('timeout', deployments))
      @logger.print_summary(:timed_out)
      raise
    rescue FatalDeploymentError => error
      StatsD.client.distribution('restart.duration', StatsD.duration(start), tags: tags('failure', deployments))
      @logger.summary.add_action(error.message) if error.message != error.class.to_s
      @logger.print_summary(:failure)
      raise
    end
    alias_method :perform!, :run!

    private

    # StatsD tags for the restart.duration metric.
    # BUGFIX: the deployments tag previously ended with a stray literal "}"
    # ("deployments:3}"), corrupting the emitted tag value.
    def tags(status, deployments)
      %W(namespace:#{@namespace} context:#{@context} status:#{status} deployments:#{deployments.to_a.length})
    end

    # Resolves which deployments to restart: by explicit name list, by selector,
    # or (default) every deployment carrying the restart annotation.
    # Raises FatalRestartError when the request is ambiguous or matches nothing.
    def identify_target_deployments(deployment_names, selector: nil)
      if deployment_names.nil?
        deployments = if selector.nil?
          @logger.info("Configured to restart all deployments with the `#{ANNOTATION}` annotation")
          apps_v1_kubeclient.get_deployments(namespace: @namespace)
        else
          selector_string = selector.to_s
          @logger.info(
            "Configured to restart all deployments with the `#{ANNOTATION}` annotation and #{selector_string} selector"
          )
          apps_v1_kubeclient.get_deployments(namespace: @namespace, label_selector: selector_string)
        end
        deployments.select! { |d| d.metadata.annotations[ANNOTATION] }

        if deployments.none?
          raise FatalRestartError, "no deployments with the `#{ANNOTATION}` annotation found in namespace #{@namespace}"
        end
      elsif deployment_names.empty?
        raise FatalRestartError, "Configured to restart deployments by name, but list of names was blank"
      elsif !selector.nil?
        raise FatalRestartError, "Can't specify deployment names and selector at the same time"
      else
        deployment_names = deployment_names.uniq
        list = deployment_names.join(', ')
        @logger.info("Configured to restart deployments by name: #{list}")

        deployments = fetch_deployments(deployment_names)
        if deployments.none?
          raise FatalRestartError, "no deployments with names #{list} found in namespace #{@namespace}"
        end
      end
      deployments
    end

    # Wraps raw kubeclient deployment records in watchable Krane::Deployment objects.
    def build_watchables(kubeclient_resources, started)
      kubeclient_resources.map do |d|
        definition = d.to_h.deep_stringify_keys
        r = Deployment.new(namespace: @namespace, context: @context, definition: definition, logger: @logger)
        r.deploy_started_at = started # we don't care what happened to the resource before the restart cmd ran
        r
      end
    end

    def patch_deployment_with_restart(record)
      apps_v1_kubeclient.patch_deployment(
        record.metadata.name,
        build_patch_payload(record),
        @namespace
      )
    end

    def patch_kubeclient_deployments(deployments)
      deployments.each do |record|
        begin
          patch_deployment_with_restart(record)
          @logger.info("Triggered `#{record.metadata.name}` restart")
        rescue Kubeclient::HttpError => e
          raise RestartAPIError.new(record.metadata.name, e.message)
        end
      end
    end

    # Fetches each named deployment, failing fast with a clear error on a miss.
    def fetch_deployments(list)
      list.map do |name|
        begin
          apps_v1_kubeclient.get_deployment(name, @namespace)
        rescue Kubeclient::ResourceNotFoundError
          raise FatalRestartError, "Deployment `#{name}` not found in namespace `#{@namespace}`"
        end
      end
    end

    # Patch payload that sets RESTARTED_AT on every container, forcing a new
    # pod template hash and therefore a rolling restart.
    def build_patch_payload(deployment)
      containers = deployment.spec.template.spec.containers
      {
        spec: {
          template: {
            spec: {
              containers: containers.map do |container|
                {
                  name: container.name,
                  env: [{ name: "RESTARTED_AT", value: Time.now.to_i.to_s }],
                }
              end,
            },
          },
        },
      }
    end

    # Watches the patched deployments and raises on timeout or hard failure.
    def verify_restart(resources)
      ResourceWatcher.new(resources: resources, operation_name: "restart",
        timeout: @global_timeout, task_config: @task_config).run
      failed_resources = resources.reject(&:deploy_succeeded?)
      success = failed_resources.empty?
      if !success && failed_resources.all?(&:deploy_timed_out?)
        raise DeploymentTimeoutError
      end
      raise FatalDeploymentError unless success
    end

    def verify_config!
      task_config_validator = TaskConfigValidator.new(@task_config, kubectl, kubeclient_builder)
      unless task_config_validator.valid?
        @logger.summary.add_action("Configuration invalid")
        @logger.summary.add_paragraph(task_config_validator.errors.map { |err| "- #{err}" }.join("\n"))
        raise Krane::TaskConfigurationError
      end
    end

    def apps_v1_kubeclient
      @apps_v1_kubeclient ||= kubeclient_builder.build_apps_v1_kubeclient(@context)
    end

    def kubeclient
      @kubeclient ||= kubeclient_builder.build_v1_kubeclient(@context)
    end

    def kubectl
      @kubectl ||= Kubectl.new(task_config: @task_config, log_failure_by_default: true)
    end

    def v1beta1_kubeclient
      @v1beta1_kubeclient ||= kubeclient_builder.build_v1beta1_kubeclient(@context)
    end

    def kubeclient_builder
      @kubeclient_builder ||= KubeclientBuilder.new
    end
  end
end