tobsch-krane 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (99) hide show
  1. checksums.yaml +7 -0
  2. data/.buildkite/pipeline.nightly.yml +43 -0
  3. data/.github/probots.yml +2 -0
  4. data/.gitignore +20 -0
  5. data/.rubocop.yml +17 -0
  6. data/.shopify-build/VERSION +1 -0
  7. data/.shopify-build/kubernetes-deploy.yml +53 -0
  8. data/1.0-Upgrade.md +185 -0
  9. data/CHANGELOG.md +431 -0
  10. data/CODE_OF_CONDUCT.md +46 -0
  11. data/CONTRIBUTING.md +164 -0
  12. data/Gemfile +16 -0
  13. data/ISSUE_TEMPLATE.md +25 -0
  14. data/LICENSE.txt +21 -0
  15. data/README.md +655 -0
  16. data/Rakefile +36 -0
  17. data/bin/ci +21 -0
  18. data/bin/setup +16 -0
  19. data/bin/test +47 -0
  20. data/dev.yml +28 -0
  21. data/dev/flamegraph-from-tests +35 -0
  22. data/exe/krane +5 -0
  23. data/krane.gemspec +44 -0
  24. data/lib/krane.rb +7 -0
  25. data/lib/krane/bindings_parser.rb +88 -0
  26. data/lib/krane/cli/deploy_command.rb +75 -0
  27. data/lib/krane/cli/global_deploy_command.rb +54 -0
  28. data/lib/krane/cli/krane.rb +91 -0
  29. data/lib/krane/cli/render_command.rb +41 -0
  30. data/lib/krane/cli/restart_command.rb +34 -0
  31. data/lib/krane/cli/run_command.rb +54 -0
  32. data/lib/krane/cli/version_command.rb +13 -0
  33. data/lib/krane/cluster_resource_discovery.rb +113 -0
  34. data/lib/krane/common.rb +23 -0
  35. data/lib/krane/concerns/template_reporting.rb +29 -0
  36. data/lib/krane/concurrency.rb +18 -0
  37. data/lib/krane/container_logs.rb +106 -0
  38. data/lib/krane/deferred_summary_logging.rb +95 -0
  39. data/lib/krane/delayed_exceptions.rb +14 -0
  40. data/lib/krane/deploy_task.rb +363 -0
  41. data/lib/krane/deploy_task_config_validator.rb +29 -0
  42. data/lib/krane/duration_parser.rb +27 -0
  43. data/lib/krane/ejson_secret_provisioner.rb +154 -0
  44. data/lib/krane/errors.rb +28 -0
  45. data/lib/krane/formatted_logger.rb +57 -0
  46. data/lib/krane/global_deploy_task.rb +210 -0
  47. data/lib/krane/global_deploy_task_config_validator.rb +12 -0
  48. data/lib/krane/kubeclient_builder.rb +156 -0
  49. data/lib/krane/kubectl.rb +120 -0
  50. data/lib/krane/kubernetes_resource.rb +621 -0
  51. data/lib/krane/kubernetes_resource/cloudsql.rb +43 -0
  52. data/lib/krane/kubernetes_resource/config_map.rb +22 -0
  53. data/lib/krane/kubernetes_resource/cron_job.rb +18 -0
  54. data/lib/krane/kubernetes_resource/custom_resource.rb +87 -0
  55. data/lib/krane/kubernetes_resource/custom_resource_definition.rb +98 -0
  56. data/lib/krane/kubernetes_resource/daemon_set.rb +90 -0
  57. data/lib/krane/kubernetes_resource/deployment.rb +213 -0
  58. data/lib/krane/kubernetes_resource/horizontal_pod_autoscaler.rb +65 -0
  59. data/lib/krane/kubernetes_resource/ingress.rb +18 -0
  60. data/lib/krane/kubernetes_resource/job.rb +60 -0
  61. data/lib/krane/kubernetes_resource/network_policy.rb +22 -0
  62. data/lib/krane/kubernetes_resource/persistent_volume_claim.rb +80 -0
  63. data/lib/krane/kubernetes_resource/pod.rb +269 -0
  64. data/lib/krane/kubernetes_resource/pod_disruption_budget.rb +23 -0
  65. data/lib/krane/kubernetes_resource/pod_set_base.rb +71 -0
  66. data/lib/krane/kubernetes_resource/pod_template.rb +20 -0
  67. data/lib/krane/kubernetes_resource/replica_set.rb +92 -0
  68. data/lib/krane/kubernetes_resource/resource_quota.rb +22 -0
  69. data/lib/krane/kubernetes_resource/role.rb +22 -0
  70. data/lib/krane/kubernetes_resource/role_binding.rb +22 -0
  71. data/lib/krane/kubernetes_resource/secret.rb +24 -0
  72. data/lib/krane/kubernetes_resource/service.rb +104 -0
  73. data/lib/krane/kubernetes_resource/service_account.rb +22 -0
  74. data/lib/krane/kubernetes_resource/stateful_set.rb +70 -0
  75. data/lib/krane/label_selector.rb +42 -0
  76. data/lib/krane/oj.rb +4 -0
  77. data/lib/krane/options_helper.rb +39 -0
  78. data/lib/krane/remote_logs.rb +60 -0
  79. data/lib/krane/render_task.rb +118 -0
  80. data/lib/krane/renderer.rb +118 -0
  81. data/lib/krane/resource_cache.rb +68 -0
  82. data/lib/krane/resource_deployer.rb +265 -0
  83. data/lib/krane/resource_watcher.rb +171 -0
  84. data/lib/krane/restart_task.rb +228 -0
  85. data/lib/krane/rollout_conditions.rb +103 -0
  86. data/lib/krane/runner_task.rb +212 -0
  87. data/lib/krane/runner_task_config_validator.rb +18 -0
  88. data/lib/krane/statsd.rb +65 -0
  89. data/lib/krane/task_config.rb +22 -0
  90. data/lib/krane/task_config_validator.rb +96 -0
  91. data/lib/krane/template_sets.rb +173 -0
  92. data/lib/krane/version.rb +4 -0
  93. data/pull_request_template.md +8 -0
  94. data/screenshots/deploy-demo.gif +0 -0
  95. data/screenshots/migrate-logs.png +0 -0
  96. data/screenshots/missing-secret-fail.png +0 -0
  97. data/screenshots/success.png +0 -0
  98. data/screenshots/test-output.png +0 -0
  99. metadata +375 -0
@@ -0,0 +1,265 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'krane/resource_watcher'
4
+ require 'krane/concerns/template_reporting'
5
+
6
+ module Krane
7
+ class ResourceDeployer
8
+ extend Krane::StatsD::MeasureMethods
9
+ include Krane::TemplateReporting
10
+
11
+ delegate :logger, to: :@task_config
12
+ attr_reader :statsd_tags
13
+
14
# Stores the settings and collaborators used by the deploy methods of this class.
#
# @param task_config [Object] passed on to Kubectl and ResourceWatcher; `logger` is delegated to it
# @param prune_whitelist [Array<String>] types passed to kubectl as `--prune-whitelist=` flags
# @param global_timeout [Object] timeout handed to ResourceWatcher
# @param current_sha [String, nil] handed to ResourceWatcher as `sha`
# @param selector [Object, nil] when set, its `to_s` is passed to kubectl as `--selector` while pruning
# @param statsd_tags [Object] exposed through the `statsd_tags` reader
def initialize(task_config:, prune_whitelist:, global_timeout:, current_sha: nil, selector:, statsd_tags:)
  @statsd_tags = statsd_tags
  @selector = selector
  @current_sha = current_sha
  @global_timeout = global_timeout
  @prune_whitelist = prune_whitelist
  @task_config = task_config
end
22
+
23
# Deploys all resources and, when requested, checks that every one succeeded.
#
# @param resources [Array] resources to deploy
# @param verify_result [Boolean] when falsy, only a warning is added to the summary after deploying
# @param prune [Boolean] forwarded to deploy_all_resources
# @raise [DeploymentTimeoutError] when verification is on and every unsuccessful resource timed out
# @raise [FatalDeploymentError] when verification is on and any other resource did not succeed
def deploy!(resources, verify_result, prune)
  unless verify_result
    deploy_all_resources(resources, prune: prune, verify: false)
    logger.summary.add_action("deployed #{resources.length} #{'resource'.pluralize(resources.length)}")
    warning = <<~MSG
      Deploy result verification is disabled for this deploy.
      This means the desired changes were communicated to Kubernetes, but the deploy did not make sure they actually succeeded.
    MSG
    return logger.summary.add_paragraph(ColorizedString.new(warning).yellow)
  end

  deploy_all_resources(resources, prune: prune, verify: true)
  unsuccessful = resources.reject(&:deploy_succeeded?)
  return if unsuccessful.empty?

  # Only report a timeout when nothing failed outright.
  raise DeploymentTimeoutError if unsuccessful.all?(&:deploy_timed_out?)
  raise FatalDeploymentError
end
42
+
43
# Deploys and verifies resources type by type, in the order given by predeploy_sequence.
#
# If exactly one element of resource_list is a Pod, log streaming is switched on for it first.
#
# @param resource_list [Array] all candidate resources
# @param predeploy_sequence [Enumerable] resource types, compared against each resource's `type`
# @raise [FatalDeploymentError] as soon as any resource of a type fails to deploy
def predeploy_priority_resources(resource_list, predeploy_sequence)
  pods = resource_list.select { |candidate| candidate.is_a?(Pod) }
  pods.first.stream_logs = true if pods.count == 1

  predeploy_sequence.each do |kind|
    batch = resource_list.select { |candidate| candidate.type == kind }
    next if batch.empty?

    deploy_resources(batch, verify: true, record_summary: false)
    unsuccessful = batch.reject(&:deploy_succeeded?)
    if unsuccessful.empty?
      logger.blank_line
      next
    end

    # Collect debug info for every failure before reporting and aborting.
    Krane::Concurrency.split_across_threads(unsuccessful) do |failed|
      failed.sync_debug_info(kubectl)
    end
    unsuccessful.each { |failed| logger.summary.add_paragraph(failed.debug_message) }
    fail_count = unsuccessful.length
    raise FatalDeploymentError, "Failed to deploy #{fail_count} priority #{'resource'.pluralize(fail_count)}"
  end
end
66
+ measure_method(:predeploy_priority_resources, 'priority_resources.duration')
67
+
68
+ private
69
+
70
# Thin wrapper around deploy_resources; it exists as a separate method so it can be
# measured on its own (see the measure_method call that follows it).
#
# @param resources [Array] resources to deploy
# @param prune [Boolean] forwarded unchanged
# @param verify [Boolean] forwarded unchanged
# @param record_summary [Boolean] forwarded unchanged
def deploy_all_resources(resources, prune: false, verify:, record_summary: true)
  forwarded = { prune: prune, verify: verify, record_summary: record_summary }
  deploy_resources(resources, **forwarded)
end
73
+ measure_method(:deploy_all_resources, 'normal_resources.duration')
74
+
75
# Deploys one batch of resources and optionally watches it until every resource settles.
#
# Resources whose deploy_method is :apply go to kubectl in a single batch via apply_all;
# all others are created or replaced one at a time before that batch runs.
#
# @param resources [Array] resources to deploy; the method returns immediately when empty
# @param prune [Boolean] forwarded to apply_all
# @param verify [Boolean] when true, a ResourceWatcher is run over the whole batch afterwards
# @param record_summary [Boolean] forwarded to ResourceWatcher#run
# @raise [FatalDeploymentError] when an individual create/replace reports failure
# @raise [ArgumentError] when a non-apply resource reports an unknown deploy method
def deploy_resources(resources, prune: false, verify:, record_summary: true)
  return if resources.empty?
  deploy_started_at = Time.now.utc

  if resources.length > 1
    logger.info("Deploying resources:")
    resources.each { |item| logger.info("- #{item.id} (#{item.pretty_timeout_type})") }
  else
    only = resources.first
    logger.info("Deploying #{only.id} (#{only.pretty_timeout_type})")
  end

  # Apply can be done in one large batch, the rest have to be done individually
  batchable, one_by_one = resources.partition { |item| item.deploy_method == :apply }
  # Prunable resources should also be applied so that they can be pruned
  prunable_kinds = @prune_whitelist.map { |entry| entry.split("/").last }
  batchable += one_by_one.select { |item| prunable_kinds.include?(item.type) }

  one_by_one.each do |item|
    item.deploy_started_at = Time.now.utc

    err, status =
      case item.deploy_method
      when :create then create_resource(item)
      when :replace then replace_or_create_resource(item)
      when :replace_force then replace_or_create_resource(item, force: true)
      else
        # Fail Fast! This is a programmer mistake.
        raise ArgumentError, "Unexpected deploy method! (#{item.deploy_method.inspect})"
      end

    next if status.success?

    raise FatalDeploymentError, <<~MSG
      Failed to replace or create resource: #{item.id}
      #{item.sensitive_template_content? ? '<suppressed sensitive output>' : err}
    MSG
  end

  apply_all(batchable, prune)
  return unless verify

  watcher = Krane::ResourceWatcher.new(resources: resources, deploy_started_at: deploy_started_at,
    timeout: @global_timeout, task_config: @task_config, sha: @current_sha)
  watcher.run(record_summary: record_summary)
end
126
+
127
# Runs a single `kubectl apply -f <dir>` over all given resources.
#
# Each resource's file is symlinked into a temporary directory, and that directory is
# what kubectl receives. Prune flags are added only when pruning is requested and the
# prune whitelist is non-empty.
#
# @param resources [Array] resources to apply; nothing happens when blank
# @param prune [Boolean] whether to request pruning
# @raise [FatalDeploymentError] when kubectl reports failure
def apply_all(resources, prune)
  return unless resources.present?

  Dir.mktmpdir do |staging_dir|
    resources.each do |resource|
      FileUtils.symlink(resource.file_path, staging_dir)
      resource.deploy_started_at = Time.now.utc
    end

    command = ["apply", "-f", staging_dir]
    if prune && @prune_whitelist.present?
      command << "--prune"
      command.concat(@selector ? ["--selector", @selector.to_s] : ["--all"])
      command.concat(@prune_whitelist.map { |type| "--prune-whitelist=#{type}" })
    end

    sensitive = resources.any?(&:sensitive_template_content?)
    cluster_scoped = resources.all?(&:global?)
    out, err, st = kubectl.run(*command, log_failure: false, output_is_sensitive: sensitive,
      use_namespace: !cluster_scoped)

    unless st.success?
      record_apply_failure(err, resources: resources)
      raise FatalDeploymentError, "Command failed: #{Shellwords.join(command)}"
    end
    log_pruning(out) if prune
  end
end
161
+ measure_method(:apply_all)
162
+
163
# Logs which resources kubectl reported as pruned and adds the count to the summary.
#
# @param kubectl_output [String] kubectl stdout; lines ending in " pruned" are picked up
def log_pruning(kubectl_output)
  pruned_names = kubectl_output.scan(/^(.*) pruned$/).flatten
  return unless pruned_names.present?

  total = pruned_names.length
  logger.info("The following resources were pruned: #{pruned_names.join(', ')}")
  logger.summary.add_action("pruned #{total} #{'resource'.pluralize(total)}")
end
170
+
171
# Turns kubectl apply stderr into summary entries.
#
# Each stderr line is matched against template file paths. Lines that name a file are
# recorded per template; lines that name none are collected and reported together at the
# end, unless some sensitive template was not validated by server dry run.
#
# @param err [String] stderr from the failed kubectl apply
# @param resources [Array] the resources that were part of the apply
def record_apply_failure(err, resources: [])
  rollback_hint = "WARNING: Any resources not mentioned in the error(s) below were likely created/updated. " \
    "You may wish to roll back this deploy."
  logger.summary.add_paragraph(ColorizedString.new(rollback_hint).yellow)

  sensitive_names = resources
    .select(&:sensitive_template_content?)
    .map { |resource| File.basename(resource.file_path) }
  dry_run_validated_names = resources
    .select(&:server_dry_run_validated?)
    .map { |resource| File.basename(resource.file_path) }

  leftover_lines = []
  err.each_line do |line|
    offenders = find_bad_files_from_kubectl_output(line)
    unless offenders.present?
      leftover_lines << line
      next
    end

    offenders.each do |offender|
      name = offender[:filename]
      if !sensitive_names.include?(name)
        record_invalid_template(logger: logger, err: offender[:err], filename: name, content: offender[:content])
      elsif dry_run_validated_names.include?(name)
        # we display full error messages as we assume there's no sensitive info leak after server-dry-run
        record_invalid_template(logger: logger, err: offender[:err], filename: name, content: nil)
      else
        # Hide the error and template contents in case it has sensitive information
        record_invalid_template(logger: logger, err: "SUPPRESSED FOR SECURITY", filename: name, content: nil)
      end
    end
  end
  return if leftover_lines.empty?

  if (sensitive_names - dry_run_validated_names).present?
    warn_msg = "WARNING: There was an error applying some or all resources. The raw output may be sensitive and " \
      "so cannot be displayed."
    logger.summary.add_paragraph(ColorizedString.new(warn_msg).yellow)
  else
    heading = ColorizedString.new('Unidentified error(s):').red
    msg = FormattedLogger.indent_four(leftover_lines.join)
    logger.summary.add_paragraph("#{heading}\n#{msg}")
  end
end
216
+
217
# Runs `kubectl replace` for the resource, falling back to create when it does not exist.
#
# @param resource [Object] resource providing file_path, sensitive_template_content? and global?
# @param force [Boolean] adds `--force --cascade` to the replace command
# @return [Array] `[err, status]` from kubectl (or from create_resource on fallback)
def replace_or_create_resource(resource, force: false)
  replace_command = ["replace"]
  replace_command.push("--force", "--cascade") if force
  replace_command.push("-f", resource.file_path)

  _out, err, status = kubectl.run(*replace_command, log_failure: false,
    output_is_sensitive: resource.sensitive_template_content?,
    raise_if_not_found: true, use_namespace: !resource.global?)

  [err, status]
rescue Krane::Kubectl::ResourceNotFoundError
  # it doesn't exist so we can't replace it, we try to create it
  create_resource(resource)
end
232
+
233
# Runs `kubectl create -f` for the resource, asking for JSON output.
#
# @param resource [Object] resource providing file_path, sensitive_template_content?, global?,
#   uses_generate_name? and use_generated_name
# @return [Array] `[err, status]` from kubectl
def create_resource(resource)
  run_options = {
    log_failure: false,
    output: 'json',
    output_is_sensitive: resource.sensitive_template_content?,
    use_namespace: !resource.global?,
  }
  out, err, status = kubectl.run("create", "-f", resource.file_path, **run_options)

  # For resources that rely on a generateName attribute, we get the `name` from the result of the call to `create`
  # We must explicitly set this name value so that the `apply` step for pruning can run successfully
  resource.use_generated_name(JSON.parse(out)) if status.success? && resource.uses_generate_name?

  [err, status]
end
246
+
247
# Inspect the file referenced in the kubectl stderr
# to make it easier for developer to understand what's going on
#
# @param line [String] one line of kubectl stderr
# @return [Array<Hash>] one `{ filename:, err:, content: }` hash per quoted absolute
#   .yml/.yaml path in the line; `content` is nil when the path is not a regular file
def find_bad_files_from_kubectl_output(line)
  # stderr often contains one or more lines like the following, from which we can extract the file path(s):
  # Error from server (TypeOfError): error when creating "/path/to/service-gqq5oh.yml": Service "web" is invalid:
  quoted_paths = line.scan(%r{"(/\S+\.ya?ml\S*)"}).flatten

  quoted_paths.map do |path|
    body = File.file?(path) ? File.read(path) : nil
    { filename: File.basename(path), err: line, content: body }
  end
end
260
+
261
# Memoized Kubectl wrapper built from the task config, with failure logging on by default.
def kubectl
  return @kubectl if @kubectl
  @kubectl = Kubectl.new(task_config: @task_config, log_failure_by_default: true)
end
264
+ end
265
+ end
@@ -0,0 +1,171 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'krane/concurrency'
4
+ require 'krane/resource_cache'
5
+
6
+ module Krane
7
+ class ResourceWatcher
8
+ extend Krane::StatsD::MeasureMethods
9
+ delegate :namespace, :context, :logger, to: :@task_config
10
+
11
# Stores the resources to watch together with reporting settings.
#
# @param resources [Enumerable] resources to watch
# @param task_config [Object] `namespace`, `context` and `logger` are delegated to it
# @param deploy_started_at [Time] reference time for the elapsed durations that get logged
# @param operation_name [String] verb used in log and summary messages
# @param timeout [Numeric, nil] global timeout; no global timeout applies when nil
# @param sha [String, nil] included in the statsd tags
# @raise [ArgumentError] when resources is not Enumerable
def initialize(resources:, task_config:, deploy_started_at: Time.now.utc,
  operation_name: "deploy", timeout: nil, sha: nil)
  unless resources.is_a?(Enumerable)
    raise ArgumentError, <<~MSG
      ResourceWatcher expects Enumerable collection, got `#{resources.class}` instead
    MSG
  end

  @sha = sha
  @timeout = timeout
  @operation_name = operation_name
  @deploy_started_at = deploy_started_at
  @task_config = task_config
  @resources = resources
end
25
+
26
# Polls the watched resources until each one has succeeded, failed or timed out.
#
# On every pass the remaining resources are synced and newly settled ones are reported.
# If nothing settled, a reminder is logged once reminder_interval has elapsed since the
# last message. When the global timeout is exceeded, report_and_give_up raises.
#
# @param delay_sync [Numeric] minimum interval between syncs
# @param reminder_interval [Numeric] how long to stay quiet before logging a reminder
# @param record_summary [Boolean] whether to record final statuses in the summary
def run(delay_sync: 3.seconds, reminder_interval: 30.seconds, record_summary: true)
  last_message_logged_at = monitoring_started = Time.now.utc
  pending = @resources.dup

  while pending.present?
    report_and_give_up(pending) if global_timeout?(monitoring_started)
    sleep_until_next_sync(delay_sync)

    sync_resources(pending)

    succeeded, pending = pending.partition(&:deploy_succeeded?)
    failed, pending = pending.partition(&:deploy_failed?)
    timed_out, pending = pending.partition(&:deploy_timed_out?)

    settled = succeeded.present? || failed.present? || timed_out.present?
    if settled
      report_what_just_happened(succeeded, failed, timed_out)
    elsif !due_for_reminder?(last_message_logged_at, reminder_interval)
      next
    end

    report_what_is_left(pending, reminder: !settled)
    last_message_logged_at = Time.now.utc
  end
  record_statuses_for_summary(@resources) if record_summary
end
51
+
52
+ private
53
+
54
# Syncs every resource against one fresh ResourceCache, then calls after_sync on each.
#
# @param resources [Array] resources to refresh
def sync_resources(resources)
  shared_cache = ResourceCache.new(@task_config)
  Krane::Concurrency.split_across_threads(resources) do |resource|
    resource.sync(shared_cache)
  end
  resources.each { |resource| resource.after_sync }
end
59
+ measure_method(:sync_resources, "sync.duration")
60
+
61
# Tags for the measured methods of this class.
#
# @return [Hash] namespace, context and sha, in that order
def statsd_tags
  tags = {}
  tags[:namespace] = namespace
  tags[:context] = context
  tags[:sha] = @sha
  tags
end
68
+
69
# Whether more than @timeout seconds have passed since started_at.
#
# @param started_at [Time] when monitoring began
# @return [Boolean, nil] the falsy @timeout itself when no timeout is configured
def global_timeout?(started_at)
  return @timeout unless @timeout

  elapsed = Time.now.utc - started_at
  elapsed > @timeout
end
72
+
73
# Sleeps until the previously scheduled sync time (if it is still in the future), then
# schedules the next one min_interval from now.
#
# @param min_interval [Numeric] minimum time between two syncs
# @return [Time] the newly scheduled sync time
def sleep_until_next_sync(min_interval)
  @next_sync_time ||= Time.now.utc
  remaining = @next_sync_time - Time.now.utc
  sleep(remaining) if remaining > 0
  @next_sync_time = Time.now.utc + min_interval
end
80
+
81
# Logs the resources that settled in the latest pass and reports each one's status to statsd.
#
# Failures are logged first, then timeouts, then a single line listing all successes.
#
# @param new_successes [Array] resources that just succeeded
# @param new_failures [Array] resources that just failed
# @param new_timeouts [Array] resources that just timed out
def report_what_just_happened(new_successes, new_failures, new_timeouts)
  elapsed = (Time.now.utc - @deploy_started_at).round(1)

  new_failures.each do |failed|
    failed.report_status_to_statsd(elapsed)
    logger.error("#{failed.id} failed to #{@operation_name} after #{elapsed}s")
  end

  new_timeouts.each do |stalled|
    stalled.report_status_to_statsd(elapsed)
    logger.error("#{stalled.id} rollout timed out after #{elapsed}s")
  end

  return unless new_successes.present?

  new_successes.each { |done| done.report_status_to_statsd(elapsed) }
  banner = ColorizedString.new("Successfully #{past_tense_operation} in #{elapsed}s:").green
  logger.info("#{banner} #{new_successes.map(&:id).join(', ')}")
end
99
+
100
# Logs the ids of the resources that have not settled yet.
#
# @param resources [Array] resources still being waited on; nothing is logged when blank
# @param reminder [Boolean] selects the "Still waiting for" wording over "Continuing to wait for"
def report_what_is_left(resources, reminder:)
  return unless resources.present?

  waiting_on = resources.map(&:id).join(', ')
  if reminder
    logger.info("Still waiting for: #{waiting_on}")
  else
    logger.info("Continuing to wait for: #{waiting_on}")
  end
end
106
+
107
+ def report_and_give_up(remaining_resources)
108
+ successful_resources, failed_resources = (@resources - remaining_resources).partition(&:deploy_succeeded?)
109
+ record_success_statuses(successful_resources)
110
+ record_failed_statuses(failed_resources, remaining_resources)
111
+
112
+ if failed_resources.present? && !failed_resources.all?(&:deploy_timed_out?)
113
+ raise FatalDeploymentError
114
+ else
115
+ raise DeploymentTimeoutError
116
+ end
117
+ end
118
+
119
# Splits resources by deploy_succeeded? and records both groups in the summary.
#
# @param resources [Array] every watched resource
def record_statuses_for_summary(resources)
  groups = resources.partition(&:deploy_succeeded?)
  record_success_statuses(groups.first)
  record_failed_statuses(groups.last)
end
124
+
125
# Adds summary actions and debug paragraphs for resources that did not succeed.
#
# @param failed_resources [Array] resources that failed or hit their own timeout
# @param global_timeouts [Array] resources still pending when the global timeout hit;
#   they are counted as timeouts and reported with the :gave_up debug message
def record_failed_statuses(failed_resources, global_timeouts = [])
  return unless failed_resources.length + global_timeouts.length > 0

  timed_out, broken = failed_resources.partition(&:deploy_timed_out?)
  timed_out += global_timeouts

  if timed_out.present?
    logger.summary.add_action(
      "timed out waiting for #{timed_out.length} #{'resource'.pluralize(timed_out.length)} to #{@operation_name}"
    )
  end

  if broken.present?
    logger.summary.add_action(
      "failed to #{@operation_name} #{broken.length} #{'resource'.pluralize(broken.length)}"
    )
  end

  # A separate Kubectl instance with failure logging off by default is used for debug info.
  debug_kubectl = Kubectl.new(task_config: @task_config, log_failure_by_default: false)
  Krane::Concurrency.split_across_threads(failed_resources + global_timeouts) do |resource|
    resource.sync_debug_info(debug_kubectl)
  end

  failed_resources.each { |resource| logger.summary.add_paragraph(resource.debug_message) }
  global_timeouts.each do |resource|
    logger.summary.add_paragraph(resource.debug_message(:gave_up, timeout: @timeout))
  end
end
152
+
153
# Adds a summary action and a paragraph listing the final status of each successful resource.
#
# @param successful_resources [Array] resources that succeeded; nothing is recorded when empty
def record_success_statuses(successful_resources)
  success_count = successful_resources.length
  return unless success_count > 0

  logger.summary.add_action("successfully #{past_tense_operation} #{success_count} "\
    "#{'resource'.pluralize(success_count)}")
  status_lines = successful_resources.map(&:pretty_status).join("\n")
  logger.summary.add_paragraph("#{ColorizedString.new('Successful resources').green}\n#{status_lines}")
end
162
+
163
# Whether reminder_interval has elapsed since the last message was logged.
#
# @param last_message_logged_at [Time] when the last message was logged
# @param reminder_interval [Numeric] quiet period, converted with to_f
# @return [Boolean]
def due_for_reminder?(last_message_logged_at, reminder_interval)
  reminder_due_at = last_message_logged_at.to_f + reminder_interval.to_f
  reminder_due_at <= Time.now.utc.to_f
end
166
+
167
# Past tense of @operation_name for log messages: "ran" for "run", otherwise the name plus "ed".
#
# @return [String]
def past_tense_operation
  return "ran" if @operation_name == "run"

  "#{@operation_name}ed"
end
170
+ end
171
+ end
@@ -0,0 +1,228 @@
1
+ # frozen_string_literal: true
2
+ require 'krane/common'
3
+ require 'krane/kubernetes_resource'
4
+ require 'krane/kubernetes_resource/deployment'
5
+ require 'krane/kubeclient_builder'
6
+ require 'krane/resource_watcher'
7
+ require 'krane/kubectl'
8
+
9
+ module Krane
10
+ # Restart the pods in one or more deployments
11
+ class RestartTask
12
+ class FatalRestartError < FatalDeploymentError; end
13
+
14
# Raised when the API call that triggers a deployment restart fails.
class RestartAPIError < FatalRestartError
  # @param deployment_name [String] name of the deployment that could not be restarted
  # @param response [#code, #body, #to_s] either a response-like object exposing `code`
  #   and `body`, or a plain error message. patch_kubeclient_deployments passes
  #   `e.message` (a String); calling `code`/`body` on it unconditionally would raise
  #   NoMethodError and hide the real API failure.
  def initialize(deployment_name, response)
    details = if response.respond_to?(:code) && response.respond_to?(:body)
      "API returned non-200 response code (#{response.code})\n" \
        "Response:\n#{response.body}"
    else
      "API error: #{response}"
    end
    super("Failed to restart #{deployment_name}. #{details}")
  end
end
21
+
22
+ HTTP_OK_RANGE = 200..299
23
+ ANNOTATION = "shipit.shopify.io/restart"
24
+
25
# Initializes the restart task
#
# @param context [String] Kubernetes context / cluster (*required*)
# @param namespace [String] Kubernetes namespace (*required*)
# @param logger [Object] Logger object (defaults to an instance of Krane::FormattedLogger)
# @param global_timeout [Integer] Timeout in seconds
def initialize(context:, namespace:, logger: nil, global_timeout: nil)
  @context = context
  @namespace = namespace
  @global_timeout = global_timeout
  @logger = logger
  @logger ||= Krane::FormattedLogger.build(namespace, context)
  # The task config is built last because it needs the logger.
  @task_config = Krane::TaskConfig.new(context, namespace, @logger)
end
38
+
39
# Runs the task, returning a boolean representing success or failure
#
# Accepts the same keyword arguments as run!. They are captured and forwarded as
# keywords (`**args`) rather than as a positional splat: a splat would hand run! a
# positional Hash, which raises ArgumentError on Ruby 3 because run! only takes keywords.
#
# @return [Boolean] false when a FatalDeploymentError (or subclass) was raised
def run(**args)
  perform!(**args)
  true
rescue FatalDeploymentError
  false
end
48
+ alias_method :perform, :run
49
+
50
# Runs the task, raising exceptions in case of issues
#
# The restart duration is reported to StatsD with a status of success, timeout or
# failure, and the logger summary is printed in all three cases before returning
# or re-raising.
#
# @param deployments [Array<String>] Array of workload names to restart
# @param selector [Hash] Selector(s) parsed by Krane::LabelSelector
# @param verify_result [Boolean] Wait for completion and verify success
#
# @return [nil]
def run!(deployments: nil, selector: nil, verify_result: true)
  start = Time.now.utc
  # Reads `deployments` at call time, so it sees the resolved list once that has been assigned.
  report_duration = lambda do |status|
    StatsD.client.distribution('restart.duration', StatsD.duration(start), tags: tags(status, deployments))
  end
  @logger.reset

  @logger.phase_heading("Initializing restart")
  verify_config!
  deployments = identify_target_deployments(deployments, selector: selector)

  @logger.phase_heading("Triggering restart by touching ENV[RESTARTED_AT]")
  patch_kubeclient_deployments(deployments)

  if verify_result
    @logger.phase_heading("Waiting for rollout")
    verify_restart(build_watchables(deployments, start))
  else
    warning = "Result verification is disabled for this task"
    @logger.summary.add_paragraph(ColorizedString.new(warning).yellow)
  end
  report_duration.call('success')
  @logger.print_summary(:success)
rescue DeploymentTimeoutError
  report_duration.call('timeout')
  @logger.print_summary(:timed_out)
  raise
rescue FatalDeploymentError => error
  report_duration.call('failure')
  # Skip the message when it is just the default (the class name).
  @logger.summary.add_action(error.message) if error.message != error.class.to_s
  @logger.print_summary(:failure)
  raise
end
88
+ alias_method :perform!, :run!
89
+
90
+ private
91
+
92
# Builds the StatsD tag list for the restart duration metric.
#
# The original literal ended in `}}`, which appended a stray `}` to the last tag
# (`deployments:2}`); the extra brace is removed here.
#
# @param status [String] outcome label, e.g. 'success', 'timeout' or 'failure'
# @param deployments [Array, nil] targeted deployments; nil counts as zero
# @return [Array<String>] tags in `key:value` form
def tags(status, deployments)
  %W(namespace:#{@namespace} context:#{@context} status:#{status} deployments:#{deployments.to_a.length})
end
95
+
96
# Resolves which deployments should be restarted.
#
# With no names given, every deployment in the namespace (optionally narrowed by the
# selector) that carries the ANNOTATION annotation is targeted. With names given, those
# deployments are fetched by name; combining names with a selector is rejected.
#
# @param deployment_names [Array<String>, nil] explicit deployment names, or nil
# @param selector [Object, nil] label selector; its `to_s` is sent as the label_selector
# @return [Array] deployment records from the apps/v1 client
# @raise [FatalRestartError] when no deployment matches, the name list is empty, or
#   names and a selector are both given
def identify_target_deployments(deployment_names, selector: nil)
  if deployment_names.nil?
    deployments = if selector.nil?
      @logger.info("Configured to restart all deployments with the `#{ANNOTATION}` annotation")
      apps_v1_kubeclient.get_deployments(namespace: @namespace)
    else
      selector_string = selector.to_s
      @logger.info(
        "Configured to restart all deployments with the `#{ANNOTATION}` annotation and #{selector_string} selector"
      )
      apps_v1_kubeclient.get_deployments(namespace: @namespace, label_selector: selector_string)
    end
    # A deployment without any annotations has nil here; treat it as not annotated
    # instead of raising NoMethodError.
    deployments.select! do |d|
      annotations = d.metadata.annotations
      annotations && annotations[ANNOTATION]
    end

    if deployments.none?
      raise FatalRestartError, "no deployments with the `#{ANNOTATION}` annotation found in namespace #{@namespace}"
    end
  elsif deployment_names.empty?
    raise FatalRestartError, "Configured to restart deployments by name, but list of names was blank"
  elsif !selector.nil?
    raise FatalRestartError, "Can't specify deployment names and selector at the same time"
  else
    deployment_names = deployment_names.uniq
    list = deployment_names.join(', ')
    @logger.info("Configured to restart deployments by name: #{list}")

    deployments = fetch_deployments(deployment_names)
    if deployments.none?
      raise FatalRestartError, "no deployments with names #{list} found in namespace #{@namespace}"
    end
  end
  deployments
end
129
+
130
# Wraps kubeclient deployment records in Deployment resources that ResourceWatcher can watch.
#
# @param kubeclient_resources [Array] deployment records returned by the apps/v1 client
# @param started [Time] used as each resource's deploy_started_at
# @return [Array<Deployment>]
def build_watchables(kubeclient_resources, started)
  kubeclient_resources.map do |record|
    stringified = record.to_h.deep_stringify_keys
    Deployment.new(namespace: @namespace, context: @context, definition: stringified, logger: @logger).tap do |watchable|
      # we don't care what happened to the resource before the restart cmd ran
      watchable.deploy_started_at = started
    end
  end
end
138
+
139
# Sends the restart patch for a single deployment through the apps/v1 client.
#
# @param record [Object] deployment record; its metadata.name identifies the target
def patch_deployment_with_restart(record)
  target_name = record.metadata.name
  payload = build_patch_payload(record)
  apps_v1_kubeclient.patch_deployment(target_name, payload, @namespace)
end
146
+
147
# Triggers a restart for each deployment in turn, logging every success.
#
# @param deployments [Array] deployment records to patch
# @raise [RestartAPIError] on the first Kubeclient::HttpError; later deployments are not patched
def patch_kubeclient_deployments(deployments)
  deployments.each do |deployment|
    begin
      patch_deployment_with_restart(deployment)
      @logger.info("Triggered `#{deployment.metadata.name}` restart")
    rescue Kubeclient::HttpError => api_error
      raise RestartAPIError.new(deployment.metadata.name, api_error.message)
    end
  end
end
157
+
158
# Fetches each named deployment from the namespace.
#
# @param list [Array<String>] deployment names
# @return [Array] one deployment record per name, in the same order
# @raise [FatalRestartError] when a name does not exist in the namespace
def fetch_deployments(list)
  list.map do |name|
    begin
      apps_v1_kubeclient.get_deployment(name, @namespace)
    rescue Kubeclient::ResourceNotFoundError
      raise FatalRestartError, "Deployment `#{name}` not found in namespace `#{@namespace}`"
    end
  end
end
169
+
170
# Builds the patch body that sets a RESTARTED_AT env var on every container of the
# deployment's pod template.
#
# @param deployment [Object] deployment record exposing spec.template.spec.containers
# @return [Hash] nested patch payload, keyed by symbols
def build_patch_payload(deployment)
  container_patches = deployment.spec.template.spec.containers.map do |container|
    restarted_at = { name: "RESTARTED_AT", value: Time.now.to_i.to_s }
    { name: container.name, env: [restarted_at] }
  end

  { spec: { template: { spec: { containers: container_patches } } } }
end
187
+
188
# Watches the restarted deployments until they settle, then raises unless all succeeded.
#
# @param resources [Array] watchable resources built from the restarted deployments
# @raise [DeploymentTimeoutError] when every unsuccessful resource timed out
# @raise [FatalDeploymentError] when any other resource did not succeed
def verify_restart(resources)
  watcher = ResourceWatcher.new(resources: resources, operation_name: "restart",
    timeout: @global_timeout, task_config: @task_config)
  watcher.run

  unsuccessful = resources.reject(&:deploy_succeeded?)
  return if unsuccessful.empty?

  raise DeploymentTimeoutError if unsuccessful.all?(&:deploy_timed_out?)
  raise FatalDeploymentError
end
198
+
199
# Validates the task configuration before anything is restarted.
#
# @raise [Krane::TaskConfigurationError] when the validator reports the config as invalid;
#   the validator's errors are added to the summary first
def verify_config!
  validator = TaskConfigValidator.new(@task_config, kubectl, kubeclient_builder)
  return if validator.valid?

  @logger.summary.add_action("Configuration invalid")
  bullet_list = validator.errors.map { |err| "- #{err}" }.join("\n")
  @logger.summary.add_paragraph(bullet_list)
  raise Krane::TaskConfigurationError
end
207
+
208
# Memoized apps/v1 client for the task's context.
def apps_v1_kubeclient
  return @apps_v1_kubeclient if @apps_v1_kubeclient
  @apps_v1_kubeclient = kubeclient_builder.build_apps_v1_kubeclient(@context)
end
211
+
212
# Memoized v1 client for the task's context.
def kubeclient
  return @kubeclient if @kubeclient
  @kubeclient = kubeclient_builder.build_v1_kubeclient(@context)
end
215
+
216
# Memoized Kubectl wrapper built from the task config, with failure logging on by default.
def kubectl
  return @kubectl if @kubectl
  @kubectl = Kubectl.new(task_config: @task_config, log_failure_by_default: true)
end
219
+
220
# Memoized v1beta1 client for the task's context.
def v1beta1_kubeclient
  return @v1beta1_kubeclient if @v1beta1_kubeclient
  @v1beta1_kubeclient = kubeclient_builder.build_v1beta1_kubeclient(@context)
end
223
+
224
# Memoized KubeclientBuilder shared by the client accessors of this class.
def kubeclient_builder
  return @kubeclient_builder if @kubeclient_builder
  @kubeclient_builder = KubeclientBuilder.new
end
227
+ end
228
+ end