tobsch-krane 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.buildkite/pipeline.nightly.yml +43 -0
- data/.github/probots.yml +2 -0
- data/.gitignore +20 -0
- data/.rubocop.yml +17 -0
- data/.shopify-build/VERSION +1 -0
- data/.shopify-build/kubernetes-deploy.yml +53 -0
- data/1.0-Upgrade.md +185 -0
- data/CHANGELOG.md +431 -0
- data/CODE_OF_CONDUCT.md +46 -0
- data/CONTRIBUTING.md +164 -0
- data/Gemfile +16 -0
- data/ISSUE_TEMPLATE.md +25 -0
- data/LICENSE.txt +21 -0
- data/README.md +655 -0
- data/Rakefile +36 -0
- data/bin/ci +21 -0
- data/bin/setup +16 -0
- data/bin/test +47 -0
- data/dev.yml +28 -0
- data/dev/flamegraph-from-tests +35 -0
- data/exe/krane +5 -0
- data/krane.gemspec +44 -0
- data/lib/krane.rb +7 -0
- data/lib/krane/bindings_parser.rb +88 -0
- data/lib/krane/cli/deploy_command.rb +75 -0
- data/lib/krane/cli/global_deploy_command.rb +54 -0
- data/lib/krane/cli/krane.rb +91 -0
- data/lib/krane/cli/render_command.rb +41 -0
- data/lib/krane/cli/restart_command.rb +34 -0
- data/lib/krane/cli/run_command.rb +54 -0
- data/lib/krane/cli/version_command.rb +13 -0
- data/lib/krane/cluster_resource_discovery.rb +113 -0
- data/lib/krane/common.rb +23 -0
- data/lib/krane/concerns/template_reporting.rb +29 -0
- data/lib/krane/concurrency.rb +18 -0
- data/lib/krane/container_logs.rb +106 -0
- data/lib/krane/deferred_summary_logging.rb +95 -0
- data/lib/krane/delayed_exceptions.rb +14 -0
- data/lib/krane/deploy_task.rb +363 -0
- data/lib/krane/deploy_task_config_validator.rb +29 -0
- data/lib/krane/duration_parser.rb +27 -0
- data/lib/krane/ejson_secret_provisioner.rb +154 -0
- data/lib/krane/errors.rb +28 -0
- data/lib/krane/formatted_logger.rb +57 -0
- data/lib/krane/global_deploy_task.rb +210 -0
- data/lib/krane/global_deploy_task_config_validator.rb +12 -0
- data/lib/krane/kubeclient_builder.rb +156 -0
- data/lib/krane/kubectl.rb +120 -0
- data/lib/krane/kubernetes_resource.rb +621 -0
- data/lib/krane/kubernetes_resource/cloudsql.rb +43 -0
- data/lib/krane/kubernetes_resource/config_map.rb +22 -0
- data/lib/krane/kubernetes_resource/cron_job.rb +18 -0
- data/lib/krane/kubernetes_resource/custom_resource.rb +87 -0
- data/lib/krane/kubernetes_resource/custom_resource_definition.rb +98 -0
- data/lib/krane/kubernetes_resource/daemon_set.rb +90 -0
- data/lib/krane/kubernetes_resource/deployment.rb +213 -0
- data/lib/krane/kubernetes_resource/horizontal_pod_autoscaler.rb +65 -0
- data/lib/krane/kubernetes_resource/ingress.rb +18 -0
- data/lib/krane/kubernetes_resource/job.rb +60 -0
- data/lib/krane/kubernetes_resource/network_policy.rb +22 -0
- data/lib/krane/kubernetes_resource/persistent_volume_claim.rb +80 -0
- data/lib/krane/kubernetes_resource/pod.rb +269 -0
- data/lib/krane/kubernetes_resource/pod_disruption_budget.rb +23 -0
- data/lib/krane/kubernetes_resource/pod_set_base.rb +71 -0
- data/lib/krane/kubernetes_resource/pod_template.rb +20 -0
- data/lib/krane/kubernetes_resource/replica_set.rb +92 -0
- data/lib/krane/kubernetes_resource/resource_quota.rb +22 -0
- data/lib/krane/kubernetes_resource/role.rb +22 -0
- data/lib/krane/kubernetes_resource/role_binding.rb +22 -0
- data/lib/krane/kubernetes_resource/secret.rb +24 -0
- data/lib/krane/kubernetes_resource/service.rb +104 -0
- data/lib/krane/kubernetes_resource/service_account.rb +22 -0
- data/lib/krane/kubernetes_resource/stateful_set.rb +70 -0
- data/lib/krane/label_selector.rb +42 -0
- data/lib/krane/oj.rb +4 -0
- data/lib/krane/options_helper.rb +39 -0
- data/lib/krane/remote_logs.rb +60 -0
- data/lib/krane/render_task.rb +118 -0
- data/lib/krane/renderer.rb +118 -0
- data/lib/krane/resource_cache.rb +68 -0
- data/lib/krane/resource_deployer.rb +265 -0
- data/lib/krane/resource_watcher.rb +171 -0
- data/lib/krane/restart_task.rb +228 -0
- data/lib/krane/rollout_conditions.rb +103 -0
- data/lib/krane/runner_task.rb +212 -0
- data/lib/krane/runner_task_config_validator.rb +18 -0
- data/lib/krane/statsd.rb +65 -0
- data/lib/krane/task_config.rb +22 -0
- data/lib/krane/task_config_validator.rb +96 -0
- data/lib/krane/template_sets.rb +173 -0
- data/lib/krane/version.rb +4 -0
- data/pull_request_template.md +8 -0
- data/screenshots/deploy-demo.gif +0 -0
- data/screenshots/migrate-logs.png +0 -0
- data/screenshots/missing-secret-fail.png +0 -0
- data/screenshots/success.png +0 -0
- data/screenshots/test-output.png +0 -0
- metadata +375 -0
data/lib/krane/resource_deployer.rb
@@ -0,0 +1,265 @@
```ruby
# frozen_string_literal: true

require 'krane/resource_watcher'
require 'krane/concerns/template_reporting'

module Krane
  class ResourceDeployer
    extend Krane::StatsD::MeasureMethods
    include Krane::TemplateReporting

    delegate :logger, to: :@task_config
    attr_reader :statsd_tags

    def initialize(task_config:, prune_whitelist:, global_timeout:, current_sha: nil, selector:, statsd_tags:)
      @task_config = task_config
      @prune_whitelist = prune_whitelist
      @global_timeout = global_timeout
      @current_sha = current_sha
      @selector = selector
      @statsd_tags = statsd_tags
    end

    def deploy!(resources, verify_result, prune)
      if verify_result
        deploy_all_resources(resources, prune: prune, verify: true)
        failed_resources = resources.reject(&:deploy_succeeded?)
        success = failed_resources.empty?
        if !success && failed_resources.all?(&:deploy_timed_out?)
          raise DeploymentTimeoutError
        end
        raise FatalDeploymentError unless success
      else
        deploy_all_resources(resources, prune: prune, verify: false)
        logger.summary.add_action("deployed #{resources.length} #{'resource'.pluralize(resources.length)}")
        warning = <<~MSG
          Deploy result verification is disabled for this deploy.
          This means the desired changes were communicated to Kubernetes, but the deploy did not make sure they actually succeeded.
        MSG
        logger.summary.add_paragraph(ColorizedString.new(warning).yellow)
      end
    end

    def predeploy_priority_resources(resource_list, predeploy_sequence)
      bare_pods = resource_list.select { |resource| resource.is_a?(Pod) }
      if bare_pods.count == 1
        bare_pods.first.stream_logs = true
      end

      predeploy_sequence.each do |resource_type|
        matching_resources = resource_list.select { |r| r.type == resource_type }
        next if matching_resources.empty?
        deploy_resources(matching_resources, verify: true, record_summary: false)

        failed_resources = matching_resources.reject(&:deploy_succeeded?)
        fail_count = failed_resources.length
        if fail_count > 0
          Krane::Concurrency.split_across_threads(failed_resources) do |r|
            r.sync_debug_info(kubectl)
          end
          failed_resources.each { |r| logger.summary.add_paragraph(r.debug_message) }
          raise FatalDeploymentError, "Failed to deploy #{fail_count} priority #{'resource'.pluralize(fail_count)}"
        end
        logger.blank_line
      end
    end
    measure_method(:predeploy_priority_resources, 'priority_resources.duration')

    private

    def deploy_all_resources(resources, prune: false, verify:, record_summary: true)
      deploy_resources(resources, prune: prune, verify: verify, record_summary: record_summary)
    end
    measure_method(:deploy_all_resources, 'normal_resources.duration')

    def deploy_resources(resources, prune: false, verify:, record_summary: true)
      return if resources.empty?
      deploy_started_at = Time.now.utc

      if resources.length > 1
        logger.info("Deploying resources:")
        resources.each do |r|
          logger.info("- #{r.id} (#{r.pretty_timeout_type})")
        end
      else
        resource = resources.first
        logger.info("Deploying #{resource.id} (#{resource.pretty_timeout_type})")
      end

      # Apply can be done in one large batch, the rest have to be done individually
      applyables, individuals = resources.partition { |r| r.deploy_method == :apply }
      # Prunable resources should also applied so that they can be pruned
      pruneable_types = @prune_whitelist.map { |t| t.split("/").last }
      applyables += individuals.select { |r| pruneable_types.include?(r.type) }

      individuals.each do |individual_resource|
        individual_resource.deploy_started_at = Time.now.utc

        case individual_resource.deploy_method
        when :create
          err, status = create_resource(individual_resource)
        when :replace
          err, status = replace_or_create_resource(individual_resource)
        when :replace_force
          err, status = replace_or_create_resource(individual_resource, force: true)
        else
          # Fail Fast! This is a programmer mistake.
          raise ArgumentError, "Unexpected deploy method! (#{individual_resource.deploy_method.inspect})"
        end

        next if status.success?

        raise FatalDeploymentError, <<~MSG
          Failed to replace or create resource: #{individual_resource.id}
          #{individual_resource.sensitive_template_content? ? '<suppressed sensitive output>' : err}
        MSG
      end

      apply_all(applyables, prune)

      if verify
        watcher = Krane::ResourceWatcher.new(resources: resources, deploy_started_at: deploy_started_at,
          timeout: @global_timeout, task_config: @task_config, sha: @current_sha)
        watcher.run(record_summary: record_summary)
      end
    end

    def apply_all(resources, prune)
      return unless resources.present?
      command = %w(apply)

      Dir.mktmpdir do |tmp_dir|
        resources.each do |r|
          FileUtils.symlink(r.file_path, tmp_dir)
          r.deploy_started_at = Time.now.utc
        end
        command.push("-f", tmp_dir)

        if prune && @prune_whitelist.present?
          command.push("--prune")
          if @selector
            command.push("--selector", @selector.to_s)
          else
            command.push("--all")
          end
          @prune_whitelist.each { |type| command.push("--prune-whitelist=#{type}") }
        end

        output_is_sensitive = resources.any?(&:sensitive_template_content?)
        global_mode = resources.all?(&:global?)
        out, err, st = kubectl.run(*command, log_failure: false, output_is_sensitive: output_is_sensitive,
          use_namespace: !global_mode)

        if st.success?
          log_pruning(out) if prune
        else
          record_apply_failure(err, resources: resources)
          raise FatalDeploymentError, "Command failed: #{Shellwords.join(command)}"
        end
      end
    end
    measure_method(:apply_all)

    def log_pruning(kubectl_output)
      pruned = kubectl_output.scan(/^(.*) pruned$/)
      return unless pruned.present?

      logger.info("The following resources were pruned: #{pruned.join(', ')}")
      logger.summary.add_action("pruned #{pruned.length} #{'resource'.pluralize(pruned.length)}")
    end

    def record_apply_failure(err, resources: [])
      warn_msg = "WARNING: Any resources not mentioned in the error(s) below were likely created/updated. " \
        "You may wish to roll back this deploy."
      logger.summary.add_paragraph(ColorizedString.new(warn_msg).yellow)

      unidentified_errors = []
      filenames_with_sensitive_content = resources
        .select(&:sensitive_template_content?)
        .map { |r| File.basename(r.file_path) }

      server_dry_run_validated_resource = resources
        .select(&:server_dry_run_validated?)
        .map { |r| File.basename(r.file_path) }

      err.each_line do |line|
        bad_files = find_bad_files_from_kubectl_output(line)
        unless bad_files.present?
          unidentified_errors << line
          next
        end

        bad_files.each do |f|
          err_msg = f[:err]
          if filenames_with_sensitive_content.include?(f[:filename])
            # Hide the error and template contents in case it has sensitive information
            # we display full error messages as we assume there's no sensitive info leak after server-dry-run
            err_msg = "SUPPRESSED FOR SECURITY" unless server_dry_run_validated_resource.include?(f[:filename])
            record_invalid_template(logger: logger, err: err_msg, filename: f[:filename], content: nil)
          else
            record_invalid_template(logger: logger, err: err_msg, filename: f[:filename], content: f[:content])
          end
        end
      end
      return unless unidentified_errors.any?

      if (filenames_with_sensitive_content - server_dry_run_validated_resource).present?
        warn_msg = "WARNING: There was an error applying some or all resources. The raw output may be sensitive and " \
          "so cannot be displayed."
        logger.summary.add_paragraph(ColorizedString.new(warn_msg).yellow)
      else
        heading = ColorizedString.new('Unidentified error(s):').red
        msg = FormattedLogger.indent_four(unidentified_errors.join)
        logger.summary.add_paragraph("#{heading}\n#{msg}")
      end
    end

    def replace_or_create_resource(resource, force: false)
      args = if force
        ["replace", "--force", "--cascade", "-f", resource.file_path]
      else
        ["replace", "-f", resource.file_path]
      end

      _, err, status = kubectl.run(*args, log_failure: false, output_is_sensitive: resource.sensitive_template_content?,
        raise_if_not_found: true, use_namespace: !resource.global?)

      [err, status]
    rescue Krane::Kubectl::ResourceNotFoundError
      # it doesn't exist so we can't replace it, we try to create it
      create_resource(resource)
    end

    def create_resource(resource)
      out, err, status = kubectl.run("create", "-f", resource.file_path, log_failure: false,
        output: 'json', output_is_sensitive: resource.sensitive_template_content?,
        use_namespace: !resource.global?)

      # For resources that rely on a generateName attribute, we get the `name` from the result of the call to `create`
      # We must explicitly set this name value so that the `apply` step for pruning can run successfully
      if status.success? && resource.uses_generate_name?
        resource.use_generated_name(JSON.parse(out))
      end

      [err, status]
    end

    # Inspect the file referenced in the kubectl stderr
    # to make it easier for developer to understand what's going on
    def find_bad_files_from_kubectl_output(line)
      # stderr often contains one or more lines like the following, from which we can extract the file path(s):
      # Error from server (TypeOfError): error when creating "/path/to/service-gqq5oh.yml": Service "web" is invalid:

      line.scan(%r{"(/\S+\.ya?ml\S*)"}).each_with_object([]) do |matches, bad_files|
        matches.each do |path|
          content = File.read(path) if File.file?(path)
          bad_files << { filename: File.basename(path), err: line, content: content }
        end
      end
    end

    def kubectl
      @kubectl ||= Kubectl.new(task_config: @task_config, log_failure_by_default: true)
    end
  end
end
```
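Below is a minimal, hypothetical sketch of driving `Krane::ResourceDeployer` directly, assuming a `Krane::TaskConfig`, a prune whitelist, and a list of parsed resource objects built elsewhere in the gem (normally `Krane::DeployTask` does this wiring). The keyword arguments mirror the initializer above; the concrete values are placeholders.

```ruby
require 'krane'

# All of these inputs are assumptions for illustration; in the gem they are
# produced by DeployTask from templates and cluster discovery.
deployer = Krane::ResourceDeployer.new(
  task_config: task_config,           # a Krane::TaskConfig built elsewhere
  prune_whitelist: prune_whitelist,   # e.g. ["core/v1/ConfigMap", ...]
  global_timeout: 300,                # seconds
  selector: nil,                      # or a Krane::LabelSelector
  statsd_tags: ["namespace:example"],
  current_sha: "abc1234"
)

# With verify_result = true this blocks until every resource reports success,
# raising DeploymentTimeoutError or FatalDeploymentError otherwise; the final
# argument enables `kubectl apply --prune` for whitelisted types.
deployer.deploy!(resources, true, true)
```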
data/lib/krane/resource_watcher.rb
@@ -0,0 +1,171 @@
```ruby
# frozen_string_literal: true

require 'krane/concurrency'
require 'krane/resource_cache'

module Krane
  class ResourceWatcher
    extend Krane::StatsD::MeasureMethods
    delegate :namespace, :context, :logger, to: :@task_config

    def initialize(resources:, task_config:, deploy_started_at: Time.now.utc,
      operation_name: "deploy", timeout: nil, sha: nil)
      unless resources.is_a?(Enumerable)
        raise ArgumentError, <<~MSG
          ResourceWatcher expects Enumerable collection, got `#{resources.class}` instead
        MSG
      end
      @resources = resources
      @task_config = task_config
      @deploy_started_at = deploy_started_at
      @operation_name = operation_name
      @timeout = timeout
      @sha = sha
    end

    def run(delay_sync: 3.seconds, reminder_interval: 30.seconds, record_summary: true)
      last_message_logged_at = monitoring_started = Time.now.utc
      remainder = @resources.dup

      while remainder.present?
        report_and_give_up(remainder) if global_timeout?(monitoring_started)
        sleep_until_next_sync(delay_sync)

        sync_resources(remainder)

        new_successes, remainder = remainder.partition(&:deploy_succeeded?)
        new_failures, remainder = remainder.partition(&:deploy_failed?)
        new_timeouts, remainder = remainder.partition(&:deploy_timed_out?)

        if new_successes.present? || new_failures.present? || new_timeouts.present?
          report_what_just_happened(new_successes, new_failures, new_timeouts)
          report_what_is_left(remainder, reminder: false)
          last_message_logged_at = Time.now.utc
        elsif due_for_reminder?(last_message_logged_at, reminder_interval)
          report_what_is_left(remainder, reminder: true)
          last_message_logged_at = Time.now.utc
        end
      end
      record_statuses_for_summary(@resources) if record_summary
    end

    private

    def sync_resources(resources)
      cache = ResourceCache.new(@task_config)
      Krane::Concurrency.split_across_threads(resources) { |r| r.sync(cache) }
      resources.each(&:after_sync)
    end
    measure_method(:sync_resources, "sync.duration")

    def statsd_tags
      {
        namespace: namespace,
        context: context,
        sha: @sha,
      }
    end

    def global_timeout?(started_at)
      @timeout && (Time.now.utc - started_at > @timeout)
    end

    def sleep_until_next_sync(min_interval)
      @next_sync_time ||= Time.now.utc
      if (sleep_duration = @next_sync_time - Time.now.utc) > 0
        sleep(sleep_duration)
      end
      @next_sync_time = Time.now.utc + min_interval
    end

    def report_what_just_happened(new_successes, new_failures, new_timeouts)
      watch_time = (Time.now.utc - @deploy_started_at).round(1)
      new_failures.each do |resource|
        resource.report_status_to_statsd(watch_time)
        logger.error("#{resource.id} failed to #{@operation_name} after #{watch_time}s")
      end

      new_timeouts.each do |resource|
        resource.report_status_to_statsd(watch_time)
        logger.error("#{resource.id} rollout timed out after #{watch_time}s")
      end

      if new_successes.present?
        new_successes.each { |r| r.report_status_to_statsd(watch_time) }
        success_string = ColorizedString.new("Successfully #{past_tense_operation} in #{watch_time}s:").green
        logger.info("#{success_string} #{new_successes.map(&:id).join(', ')}")
      end
    end

    def report_what_is_left(resources, reminder:)
      return unless resources.present?
      resource_list = resources.map(&:id).join(', ')
      msg = reminder ? "Still waiting for: #{resource_list}" : "Continuing to wait for: #{resource_list}"
      logger.info(msg)
    end

    def report_and_give_up(remaining_resources)
      successful_resources, failed_resources = (@resources - remaining_resources).partition(&:deploy_succeeded?)
      record_success_statuses(successful_resources)
      record_failed_statuses(failed_resources, remaining_resources)

      if failed_resources.present? && !failed_resources.all?(&:deploy_timed_out?)
        raise FatalDeploymentError
      else
        raise DeploymentTimeoutError
      end
    end

    def record_statuses_for_summary(resources)
      successful_resources, failed_resources = resources.partition(&:deploy_succeeded?)
      record_success_statuses(successful_resources)
      record_failed_statuses(failed_resources)
    end

    def record_failed_statuses(failed_resources, global_timeouts = [])
      fail_count = failed_resources.length + global_timeouts.length

      if fail_count > 0
        timeouts, failures = failed_resources.partition(&:deploy_timed_out?)
        timeouts += global_timeouts
        if timeouts.present?
          logger.summary.add_action(
            "timed out waiting for #{timeouts.length} #{'resource'.pluralize(timeouts.length)} to #{@operation_name}"
          )
        end

        if failures.present?
          logger.summary.add_action(
            "failed to #{@operation_name} #{failures.length} #{'resource'.pluralize(failures.length)}"
          )
        end

        kubectl = Kubectl.new(task_config: @task_config, log_failure_by_default: false)
        Krane::Concurrency.split_across_threads(failed_resources + global_timeouts) do |r|
          r.sync_debug_info(kubectl)
        end

        failed_resources.each { |r| logger.summary.add_paragraph(r.debug_message) }
        global_timeouts.each { |r| logger.summary.add_paragraph(r.debug_message(:gave_up, timeout: @timeout)) }
      end
    end

    def record_success_statuses(successful_resources)
      success_count = successful_resources.length
      if success_count > 0
        logger.summary.add_action("successfully #{past_tense_operation} #{success_count} "\
          "#{'resource'.pluralize(success_count)}")
        final_statuses = successful_resources.map(&:pretty_status).join("\n")
        logger.summary.add_paragraph("#{ColorizedString.new('Successful resources').green}\n#{final_statuses}")
      end
    end

    def due_for_reminder?(last_message_logged_at, reminder_interval)
      (last_message_logged_at.to_f + reminder_interval.to_f) <= Time.now.utc.to_f
    end

    def past_tense_operation
      @operation_name == "run" ? "ran" : "#{@operation_name}ed"
    end
  end
end
```
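For context, here is a hedged sketch of using `Krane::ResourceWatcher` on its own; within the gem it is normally instantiated by `ResourceDeployer` and `RestartTask` rather than directly. `resources` must be an Enumerable of `KubernetesResource` objects and `task_config` a `Krane::TaskConfig`; the literal values below are placeholders.

```ruby
watcher = Krane::ResourceWatcher.new(
  resources: resources,        # Enumerable of KubernetesResource (assumed built elsewhere)
  task_config: task_config,    # Krane::TaskConfig (assumed built elsewhere)
  operation_name: "deploy",    # used in log lines and the summary
  timeout: 600,                # global cap in seconds; nil means wait indefinitely
  sha: "abc1234"               # only surfaces as a StatsD tag
)

# Syncs resource state roughly every 3s (delay_sync), logs a reminder every
# 30s, and records per-resource statuses in the summary unless
# record_summary: false.
watcher.run(record_summary: true)
```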
data/lib/krane/restart_task.rb
@@ -0,0 +1,228 @@
```ruby
# frozen_string_literal: true
require 'krane/common'
require 'krane/kubernetes_resource'
require 'krane/kubernetes_resource/deployment'
require 'krane/kubeclient_builder'
require 'krane/resource_watcher'
require 'krane/kubectl'

module Krane
  # Restart the pods in one or more deployments
  class RestartTask
    class FatalRestartError < FatalDeploymentError; end

    class RestartAPIError < FatalRestartError
      def initialize(deployment_name, response)
        super("Failed to restart #{deployment_name}. " \
          "API returned non-200 response code (#{response.code})\n" \
          "Response:\n#{response.body}")
      end
    end

    HTTP_OK_RANGE = 200..299
    ANNOTATION = "shipit.shopify.io/restart"

    # Initializes the restart task
    #
    # @param context [String] Kubernetes context / cluster (*required*)
    # @param namespace [String] Kubernetes namespace (*required*)
    # @param logger [Object] Logger object (defaults to an instance of Krane::FormattedLogger)
    # @param global_timeout [Integer] Timeout in seconds
    def initialize(context:, namespace:, logger: nil, global_timeout: nil)
      @logger = logger || Krane::FormattedLogger.build(namespace, context)
      @task_config = Krane::TaskConfig.new(context, namespace, @logger)
      @context = context
      @namespace = namespace
      @global_timeout = global_timeout
    end

    # Runs the task, returning a boolean representing success or failure
    #
    # @return [Boolean]
    def run(*args)
      perform!(*args)
      true
    rescue FatalDeploymentError
      false
    end
    alias_method :perform, :run

    # Runs the task, raising exceptions in case of issues
    #
    # @param deployments [Array<String>] Array of workload names to restart
    # @param selector [Hash] Selector(s) parsed by Krane::LabelSelector
    # @param verify_result [Boolean] Wait for completion and verify success
    #
    # @return [nil]
    def run!(deployments: nil, selector: nil, verify_result: true)
      start = Time.now.utc
      @logger.reset

      @logger.phase_heading("Initializing restart")
      verify_config!
      deployments = identify_target_deployments(deployments, selector: selector)

      @logger.phase_heading("Triggering restart by touching ENV[RESTARTED_AT]")
      patch_kubeclient_deployments(deployments)

      if verify_result
        @logger.phase_heading("Waiting for rollout")
        resources = build_watchables(deployments, start)
        verify_restart(resources)
      else
        warning = "Result verification is disabled for this task"
        @logger.summary.add_paragraph(ColorizedString.new(warning).yellow)
      end
      StatsD.client.distribution('restart.duration', StatsD.duration(start), tags: tags('success', deployments))
      @logger.print_summary(:success)
    rescue DeploymentTimeoutError
      StatsD.client.distribution('restart.duration', StatsD.duration(start), tags: tags('timeout', deployments))
      @logger.print_summary(:timed_out)
      raise
    rescue FatalDeploymentError => error
      StatsD.client.distribution('restart.duration', StatsD.duration(start), tags: tags('failure', deployments))
      @logger.summary.add_action(error.message) if error.message != error.class.to_s
      @logger.print_summary(:failure)
      raise
    end
    alias_method :perform!, :run!

    private

    def tags(status, deployments)
      %W(namespace:#{@namespace} context:#{@context} status:#{status} deployments:#{deployments.to_a.length}})
    end

    def identify_target_deployments(deployment_names, selector: nil)
      if deployment_names.nil?
        deployments = if selector.nil?
          @logger.info("Configured to restart all deployments with the `#{ANNOTATION}` annotation")
          apps_v1_kubeclient.get_deployments(namespace: @namespace)
        else
          selector_string = selector.to_s
          @logger.info(
            "Configured to restart all deployments with the `#{ANNOTATION}` annotation and #{selector_string} selector"
          )
          apps_v1_kubeclient.get_deployments(namespace: @namespace, label_selector: selector_string)
        end
        deployments.select! { |d| d.metadata.annotations[ANNOTATION] }

        if deployments.none?
          raise FatalRestartError, "no deployments with the `#{ANNOTATION}` annotation found in namespace #{@namespace}"
        end
      elsif deployment_names.empty?
        raise FatalRestartError, "Configured to restart deployments by name, but list of names was blank"
      elsif !selector.nil?
        raise FatalRestartError, "Can't specify deployment names and selector at the same time"
      else
        deployment_names = deployment_names.uniq
        list = deployment_names.join(', ')
        @logger.info("Configured to restart deployments by name: #{list}")

        deployments = fetch_deployments(deployment_names)
        if deployments.none?
          raise FatalRestartError, "no deployments with names #{list} found in namespace #{@namespace}"
        end
      end
      deployments
    end

    def build_watchables(kubeclient_resources, started)
      kubeclient_resources.map do |d|
        definition = d.to_h.deep_stringify_keys
        r = Deployment.new(namespace: @namespace, context: @context, definition: definition, logger: @logger)
        r.deploy_started_at = started # we don't care what happened to the resource before the restart cmd ran
        r
      end
    end

    def patch_deployment_with_restart(record)
      apps_v1_kubeclient.patch_deployment(
        record.metadata.name,
        build_patch_payload(record),
        @namespace
      )
    end

    def patch_kubeclient_deployments(deployments)
      deployments.each do |record|
        begin
          patch_deployment_with_restart(record)
          @logger.info("Triggered `#{record.metadata.name}` restart")
        rescue Kubeclient::HttpError => e
          raise RestartAPIError.new(record.metadata.name, e.message)
        end
      end
    end

    def fetch_deployments(list)
      list.map do |name|
        record = nil
        begin
          record = apps_v1_kubeclient.get_deployment(name, @namespace)
        rescue Kubeclient::ResourceNotFoundError
          raise FatalRestartError, "Deployment `#{name}` not found in namespace `#{@namespace}`"
        end
        record
      end
    end

    def build_patch_payload(deployment)
      containers = deployment.spec.template.spec.containers
      {
        spec: {
          template: {
            spec: {
              containers: containers.map do |container|
                {
                  name: container.name,
                  env: [{ name: "RESTARTED_AT", value: Time.now.to_i.to_s }],
                }
              end,
            },
          },
        },
      }
    end

    def verify_restart(resources)
      ResourceWatcher.new(resources: resources, operation_name: "restart",
        timeout: @global_timeout, task_config: @task_config).run
      failed_resources = resources.reject(&:deploy_succeeded?)
      success = failed_resources.empty?
      if !success && failed_resources.all?(&:deploy_timed_out?)
        raise DeploymentTimeoutError
      end
      raise FatalDeploymentError unless success
    end

    def verify_config!
      task_config_validator = TaskConfigValidator.new(@task_config, kubectl, kubeclient_builder)
      unless task_config_validator.valid?
        @logger.summary.add_action("Configuration invalid")
        @logger.summary.add_paragraph(task_config_validator.errors.map { |err| "- #{err}" }.join("\n"))
        raise Krane::TaskConfigurationError
      end
    end

    def apps_v1_kubeclient
      @apps_v1_kubeclient ||= kubeclient_builder.build_apps_v1_kubeclient(@context)
    end

    def kubeclient
      @kubeclient ||= kubeclient_builder.build_v1_kubeclient(@context)
    end

    def kubectl
      @kubectl ||= Kubectl.new(task_config: @task_config, log_failure_by_default: true)
    end

    def v1beta1_kubeclient
      @v1beta1_kubeclient ||= kubeclient_builder.build_v1beta1_kubeclient(@context)
    end

    def kubeclient_builder
      @kubeclient_builder ||= KubeclientBuilder.new
    end
  end
end
```
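`RestartTask` is part of the gem's public Ruby API, so a short usage sketch fits here; the context and namespace values below are placeholders for whatever exists in your kubeconfig.

```ruby
require 'krane'

restart = Krane::RestartTask.new(
  context: "my-context",   # placeholder kube context
  namespace: "default",    # placeholder namespace
  global_timeout: 300      # seconds
)

# run! raises (DeploymentTimeoutError / FatalRestartError) on failure, while
# run returns true/false instead. Passing both deployments: and selector:
# raises, and omitting both restarts every deployment carrying the
# shipit.shopify.io/restart annotation in the namespace.
restart.run!(
  deployments: %w(web worker),
  verify_result: true
)
```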