tobsch-krane 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99) hide show
  1. checksums.yaml +7 -0
  2. data/.buildkite/pipeline.nightly.yml +43 -0
  3. data/.github/probots.yml +2 -0
  4. data/.gitignore +20 -0
  5. data/.rubocop.yml +17 -0
  6. data/.shopify-build/VERSION +1 -0
  7. data/.shopify-build/kubernetes-deploy.yml +53 -0
  8. data/1.0-Upgrade.md +185 -0
  9. data/CHANGELOG.md +431 -0
  10. data/CODE_OF_CONDUCT.md +46 -0
  11. data/CONTRIBUTING.md +164 -0
  12. data/Gemfile +16 -0
  13. data/ISSUE_TEMPLATE.md +25 -0
  14. data/LICENSE.txt +21 -0
  15. data/README.md +655 -0
  16. data/Rakefile +36 -0
  17. data/bin/ci +21 -0
  18. data/bin/setup +16 -0
  19. data/bin/test +47 -0
  20. data/dev.yml +28 -0
  21. data/dev/flamegraph-from-tests +35 -0
  22. data/exe/krane +5 -0
  23. data/krane.gemspec +44 -0
  24. data/lib/krane.rb +7 -0
  25. data/lib/krane/bindings_parser.rb +88 -0
  26. data/lib/krane/cli/deploy_command.rb +75 -0
  27. data/lib/krane/cli/global_deploy_command.rb +54 -0
  28. data/lib/krane/cli/krane.rb +91 -0
  29. data/lib/krane/cli/render_command.rb +41 -0
  30. data/lib/krane/cli/restart_command.rb +34 -0
  31. data/lib/krane/cli/run_command.rb +54 -0
  32. data/lib/krane/cli/version_command.rb +13 -0
  33. data/lib/krane/cluster_resource_discovery.rb +113 -0
  34. data/lib/krane/common.rb +23 -0
  35. data/lib/krane/concerns/template_reporting.rb +29 -0
  36. data/lib/krane/concurrency.rb +18 -0
  37. data/lib/krane/container_logs.rb +106 -0
  38. data/lib/krane/deferred_summary_logging.rb +95 -0
  39. data/lib/krane/delayed_exceptions.rb +14 -0
  40. data/lib/krane/deploy_task.rb +363 -0
  41. data/lib/krane/deploy_task_config_validator.rb +29 -0
  42. data/lib/krane/duration_parser.rb +27 -0
  43. data/lib/krane/ejson_secret_provisioner.rb +154 -0
  44. data/lib/krane/errors.rb +28 -0
  45. data/lib/krane/formatted_logger.rb +57 -0
  46. data/lib/krane/global_deploy_task.rb +210 -0
  47. data/lib/krane/global_deploy_task_config_validator.rb +12 -0
  48. data/lib/krane/kubeclient_builder.rb +156 -0
  49. data/lib/krane/kubectl.rb +120 -0
  50. data/lib/krane/kubernetes_resource.rb +621 -0
  51. data/lib/krane/kubernetes_resource/cloudsql.rb +43 -0
  52. data/lib/krane/kubernetes_resource/config_map.rb +22 -0
  53. data/lib/krane/kubernetes_resource/cron_job.rb +18 -0
  54. data/lib/krane/kubernetes_resource/custom_resource.rb +87 -0
  55. data/lib/krane/kubernetes_resource/custom_resource_definition.rb +98 -0
  56. data/lib/krane/kubernetes_resource/daemon_set.rb +90 -0
  57. data/lib/krane/kubernetes_resource/deployment.rb +213 -0
  58. data/lib/krane/kubernetes_resource/horizontal_pod_autoscaler.rb +65 -0
  59. data/lib/krane/kubernetes_resource/ingress.rb +18 -0
  60. data/lib/krane/kubernetes_resource/job.rb +60 -0
  61. data/lib/krane/kubernetes_resource/network_policy.rb +22 -0
  62. data/lib/krane/kubernetes_resource/persistent_volume_claim.rb +80 -0
  63. data/lib/krane/kubernetes_resource/pod.rb +269 -0
  64. data/lib/krane/kubernetes_resource/pod_disruption_budget.rb +23 -0
  65. data/lib/krane/kubernetes_resource/pod_set_base.rb +71 -0
  66. data/lib/krane/kubernetes_resource/pod_template.rb +20 -0
  67. data/lib/krane/kubernetes_resource/replica_set.rb +92 -0
  68. data/lib/krane/kubernetes_resource/resource_quota.rb +22 -0
  69. data/lib/krane/kubernetes_resource/role.rb +22 -0
  70. data/lib/krane/kubernetes_resource/role_binding.rb +22 -0
  71. data/lib/krane/kubernetes_resource/secret.rb +24 -0
  72. data/lib/krane/kubernetes_resource/service.rb +104 -0
  73. data/lib/krane/kubernetes_resource/service_account.rb +22 -0
  74. data/lib/krane/kubernetes_resource/stateful_set.rb +70 -0
  75. data/lib/krane/label_selector.rb +42 -0
  76. data/lib/krane/oj.rb +4 -0
  77. data/lib/krane/options_helper.rb +39 -0
  78. data/lib/krane/remote_logs.rb +60 -0
  79. data/lib/krane/render_task.rb +118 -0
  80. data/lib/krane/renderer.rb +118 -0
  81. data/lib/krane/resource_cache.rb +68 -0
  82. data/lib/krane/resource_deployer.rb +265 -0
  83. data/lib/krane/resource_watcher.rb +171 -0
  84. data/lib/krane/restart_task.rb +228 -0
  85. data/lib/krane/rollout_conditions.rb +103 -0
  86. data/lib/krane/runner_task.rb +212 -0
  87. data/lib/krane/runner_task_config_validator.rb +18 -0
  88. data/lib/krane/statsd.rb +65 -0
  89. data/lib/krane/task_config.rb +22 -0
  90. data/lib/krane/task_config_validator.rb +96 -0
  91. data/lib/krane/template_sets.rb +173 -0
  92. data/lib/krane/version.rb +4 -0
  93. data/pull_request_template.md +8 -0
  94. data/screenshots/deploy-demo.gif +0 -0
  95. data/screenshots/migrate-logs.png +0 -0
  96. data/screenshots/missing-secret-fail.png +0 -0
  97. data/screenshots/success.png +0 -0
  98. data/screenshots/test-output.png +0 -0
  99. metadata +375 -0
@@ -0,0 +1,265 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'krane/resource_watcher'
4
+ require 'krane/concerns/template_reporting'
5
+
6
+ module Krane
7
+ class ResourceDeployer
8
+ extend Krane::StatsD::MeasureMethods
9
+ include Krane::TemplateReporting
10
+
11
+ delegate :logger, to: :@task_config
12
+ attr_reader :statsd_tags
13
+
14
# Stores the collaborators and settings used by the deploy methods of this object.
#
# @param task_config [Object] kept as @task_config; `logger` is delegated to it and it is
#   handed to Kubectl and ResourceWatcher
# @param prune_whitelist [Enumerable<String>] resource types emitted as --prune-whitelist flags
# @param global_timeout [Object] forwarded to ResourceWatcher as its timeout
# @param current_sha [Object, nil] forwarded to ResourceWatcher as sha
# @param selector [Object, nil] when truthy, its #to_s is passed to kubectl --selector while pruning
# @param statsd_tags [Object] exposed through the statsd_tags reader
def initialize(task_config:, prune_whitelist:, global_timeout:, current_sha: nil, selector:, statsd_tags:)
  @statsd_tags = statsd_tags
  @selector = selector
  @current_sha = current_sha
  @global_timeout = global_timeout
  @prune_whitelist = prune_whitelist
  @task_config = task_config
end
22
+
23
# Deploys every resource and, when asked, verifies the outcome.
#
# @param resources [Array] resources to deploy
# @param verify_result [Boolean] when truthy, wait for the rollout and raise on failure
# @param prune [Boolean] forwarded to deploy_all_resources
# @raise [DeploymentTimeoutError] when every unsuccessful resource timed out
# @raise [FatalDeploymentError] when at least one resource failed for another reason
def deploy!(resources, verify_result, prune)
  unless verify_result
    deploy_all_resources(resources, prune: prune, verify: false)
    logger.summary.add_action("deployed #{resources.length} #{'resource'.pluralize(resources.length)}")
    notice = <<~MSG
      Deploy result verification is disabled for this deploy.
      This means the desired changes were communicated to Kubernetes, but the deploy did not make sure they actually succeeded.
    MSG
    return logger.summary.add_paragraph(ColorizedString.new(notice).yellow)
  end

  deploy_all_resources(resources, prune: prune, verify: true)
  unsuccessful = resources.reject(&:deploy_succeeded?)
  return if unsuccessful.empty?

  # A deploy counts as a timeout only when *all* unsuccessful resources timed out.
  raise DeploymentTimeoutError if unsuccessful.all?(&:deploy_timed_out?)
  raise FatalDeploymentError
end
42
+
43
# Deploys and verifies resources type by type, in the order given by predeploy_sequence.
#
# If exactly one Pod is present in resource_list, log streaming is switched on for it.
#
# @param resource_list [Array] all resources of the deploy
# @param predeploy_sequence [Enumerable<String>] resource types to deploy first, in order
# @return [Enumerable<String>] predeploy_sequence (the result of #each)
# @raise [FatalDeploymentError] as soon as one type has a resource that did not succeed
def predeploy_priority_resources(resource_list, predeploy_sequence)
  bare_pods = resource_list.select { |candidate| candidate.is_a?(Pod) }
  bare_pods.first.stream_logs = true if bare_pods.count == 1

  predeploy_sequence.each do |resource_type|
    batch = resource_list.select { |candidate| candidate.type == resource_type }
    next if batch.empty?
    deploy_resources(batch, verify: true, record_summary: false)

    unsuccessful = batch.reject(&:deploy_succeeded?)
    if unsuccessful.empty?
      logger.blank_line
      next
    end

    # Gather debug details for every failed resource before aborting.
    Krane::Concurrency.split_across_threads(unsuccessful) do |resource|
      resource.sync_debug_info(kubectl)
    end
    unsuccessful.each { |resource| logger.summary.add_paragraph(resource.debug_message) }
    fail_count = unsuccessful.length
    raise FatalDeploymentError, "Failed to deploy #{fail_count} priority #{'resource'.pluralize(fail_count)}"
  end
end
66
+ measure_method(:predeploy_priority_resources, 'priority_resources.duration')
67
+
68
+ private
69
+
70
# Thin pass-through to deploy_resources; it exists as a separate method so it can be
# instrumented on its own via measure_method.
#
# @return whatever deploy_resources returns
def deploy_all_resources(resources, prune: false, verify:, record_summary: true)
  options = { prune: prune, verify: verify, record_summary: record_summary }
  deploy_resources(resources, **options)
end
73
+ measure_method(:deploy_all_resources, 'normal_resources.duration')
74
+
75
# Sends the given resources to the cluster and optionally watches them until they settle.
#
# Resources whose deploy_method is :apply are applied together in one batch; the others are
# created/replaced one at a time first.
#
# @param resources [Array] resources to deploy; nothing happens when empty
# @param prune [Boolean] forwarded to apply_all
# @param verify [Boolean] when truthy, run a ResourceWatcher over the resources afterwards
# @param record_summary [Boolean] forwarded to ResourceWatcher#run
# @return [Object, nil] the watcher's result when verifying, otherwise nil
# @raise [ArgumentError] on an unknown deploy_method
# @raise [FatalDeploymentError] when an individual create/replace is unsuccessful
def deploy_resources(resources, prune: false, verify:, record_summary: true)
  return if resources.empty?
  deploy_started_at = Time.now.utc

  if resources.length <= 1
    only = resources.first
    logger.info("Deploying #{only.id} (#{only.pretty_timeout_type})")
  else
    logger.info("Deploying resources:")
    resources.each { |r| logger.info("- #{r.id} (#{r.pretty_timeout_type})") }
  end

  # Apply can be done in one large batch, the rest have to be done individually
  batch, individuals = resources.partition { |r| r.deploy_method == :apply }
  # Prunable resources should also be applied so that they can be pruned
  pruneable_types = @prune_whitelist.map { |t| t.split("/").last }
  batch += individuals.select { |r| pruneable_types.include?(r.type) }

  individuals.each do |item|
    item.deploy_started_at = Time.now.utc

    err, status = case item.deploy_method
    when :create then create_resource(item)
    when :replace then replace_or_create_resource(item)
    when :replace_force then replace_or_create_resource(item, force: true)
    else
      # Fail Fast! This is a programmer mistake.
      raise ArgumentError, "Unexpected deploy method! (#{item.deploy_method.inspect})"
    end
    next if status.success?

    raise FatalDeploymentError, <<~MSG
      Failed to replace or create resource: #{item.id}
      #{item.sensitive_template_content? ? '<suppressed sensitive output>' : err}
    MSG
  end

  apply_all(batch, prune)
  return unless verify

  Krane::ResourceWatcher.new(
    resources: resources, deploy_started_at: deploy_started_at,
    timeout: @global_timeout, task_config: @task_config, sha: @current_sha
  ).run(record_summary: record_summary)
end
126
+
127
# Runs a single `kubectl apply` over all given resources.
#
# Each resource's file is symlinked into a temporary directory which is then passed to
# kubectl with -f. Prune flags are only added when pruning was requested and the prune
# whitelist is non-empty.
#
# @param resources [Array] resources to apply; nothing happens when blank
# @param prune [Boolean] whether pruning was requested
# @raise [FatalDeploymentError] when kubectl reports failure
def apply_all(resources, prune)
  return unless resources.present?

  Dir.mktmpdir do |tmp_dir|
    resources.each do |r|
      FileUtils.symlink(r.file_path, tmp_dir)
      r.deploy_started_at = Time.now.utc
    end
    command = ["apply", "-f", tmp_dir]

    if prune && @prune_whitelist.present?
      scope = @selector ? ["--selector", @selector.to_s] : ["--all"]
      command.push("--prune", *scope)
      command.concat(@prune_whitelist.map { |type| "--prune-whitelist=#{type}" })
    end

    sensitive = resources.any?(&:sensitive_template_content?)
    cluster_scoped = resources.all?(&:global?)
    out, err, st = kubectl.run(*command, log_failure: false, output_is_sensitive: sensitive,
      use_namespace: !cluster_scoped)

    unless st.success?
      record_apply_failure(err, resources: resources)
      raise FatalDeploymentError, "Command failed: #{Shellwords.join(command)}"
    end
    log_pruning(out) if prune
  end
end
161
+ measure_method(:apply_all)
162
+
163
# Reports which resources kubectl pruned, based on output lines ending in " pruned".
#
# @param kubectl_output [String] stdout of the kubectl apply command
# @return [Object, nil] nil when no line matched
def log_pruning(kubectl_output)
  pruned_names = kubectl_output.scan(/^(.*) pruned$/)
  return unless pruned_names.present?

  count = pruned_names.length
  logger.info("The following resources were pruned: #{pruned_names.join(', ')}")
  logger.summary.add_action("pruned #{count} #{'resource'.pluralize(count)}")
end
170
+
171
# Turns the stderr of a failed `kubectl apply` into summary paragraphs.
#
# Lines that mention a template file are reported per file through record_invalid_template.
# For files with sensitive content the template body is never shown, and the error text is
# only shown if the resource was validated with a server-side dry run. Lines that mention no
# file are collected and shown together, unless some sensitive file was not dry-run validated.
#
# @param err [String] stderr from kubectl
# @param resources [Array] the resources that were part of the apply
def record_apply_failure(err, resources: [])
  rollback_hint = "WARNING: Any resources not mentioned in the error(s) below were likely created/updated. " \
    "You may wish to roll back this deploy."
  logger.summary.add_paragraph(ColorizedString.new(rollback_hint).yellow)

  sensitive_filenames = resources.select(&:sensitive_template_content?).map { |r| File.basename(r.file_path) }
  dry_run_validated_filenames = resources.select(&:server_dry_run_validated?).map { |r| File.basename(r.file_path) }
  unidentified_errors = []

  err.each_line do |line|
    bad_files = find_bad_files_from_kubectl_output(line)
    if bad_files.present?
      bad_files.each do |bad_file|
        filename = bad_file[:filename]
        message = bad_file[:err]
        content = bad_file[:content]
        if sensitive_filenames.include?(filename)
          # Hide the template contents in case it has sensitive information.
          # The full error message is kept only after a server-dry-run, where we assume
          # there's no sensitive info leak.
          content = nil
          message = "SUPPRESSED FOR SECURITY" unless dry_run_validated_filenames.include?(filename)
        end
        record_invalid_template(logger: logger, err: message, filename: filename, content: content)
      end
    else
      unidentified_errors << line
    end
  end
  return unless unidentified_errors.any?

  if (sensitive_filenames - dry_run_validated_filenames).present?
    redacted_notice = "WARNING: There was an error applying some or all resources. The raw output may be sensitive and " \
      "so cannot be displayed."
    logger.summary.add_paragraph(ColorizedString.new(redacted_notice).yellow)
  else
    heading = ColorizedString.new('Unidentified error(s):').red
    details = FormattedLogger.indent_four(unidentified_errors.join)
    logger.summary.add_paragraph("#{heading}\n#{details}")
  end
end
216
+
217
# Runs `kubectl replace` for one resource, falling back to create_resource when kubectl
# signals that the resource does not exist.
#
# @param resource [Object] resource providing file_path, sensitive_template_content? and global?
# @param force [Boolean] adds --force --cascade to the replace command
# @return [Array] [stderr, status]
def replace_or_create_resource(resource, force: false)
  args = ["replace"]
  args.push("--force", "--cascade") if force
  args.push("-f", resource.file_path)

  _out, err, status = kubectl.run(*args, log_failure: false,
    output_is_sensitive: resource.sensitive_template_content?,
    raise_if_not_found: true, use_namespace: !resource.global?)

  [err, status]
rescue Krane::Kubectl::ResourceNotFoundError
  # it doesn't exist so we can't replace it, we try to create it
  create_resource(resource)
end
232
+
233
# Runs `kubectl create` for one resource, requesting JSON output.
#
# @param resource [Object] resource providing file_path, sensitive_template_content?, global?,
#   uses_generate_name? and use_generated_name
# @return [Array] [stderr, status]
def create_resource(resource)
  run_options = {
    log_failure: false,
    output: 'json',
    output_is_sensitive: resource.sensitive_template_content?,
    use_namespace: !resource.global?,
  }
  out, err, status = kubectl.run("create", "-f", resource.file_path, **run_options)

  # For resources that rely on a generateName attribute, the real `name` is only known from
  # the result of `create`. It must be set explicitly so the later `apply` step for pruning
  # can run successfully.
  name_was_generated = status.success? && resource.uses_generate_name?
  resource.use_generated_name(JSON.parse(out)) if name_was_generated

  [err, status]
end
246
+
247
# Extracts the template files referenced in one line of kubectl stderr, to make it easier
# for developers to understand what went wrong.
#
# stderr often contains one or more lines like the following, from which the path(s) are taken:
#   Error from server (TypeOfError): error when creating "/path/to/service-gqq5oh.yml": Service "web" is invalid:
#
# @param line [String] a single line of kubectl stderr
# @return [Array<Hash>] one hash per quoted absolute .yml/.yaml path, with :filename (basename),
#   :err (the whole line) and :content (file contents, or nil if the path is not a file)
def find_bad_files_from_kubectl_output(line)
  quoted_paths = line.scan(%r{"(/\S+\.ya?ml\S*)"}).flatten
  quoted_paths.map do |path|
    content = File.file?(path) ? File.read(path) : nil
    { filename: File.basename(path), err: line, content: content }
  end
end
260
+
261
# Lazily builds and caches the Kubectl wrapper for this task configuration
# (failures are logged by default).
def kubectl
  return @kubectl if @kubectl
  @kubectl = Kubectl.new(task_config: @task_config, log_failure_by_default: true)
end
264
+ end
265
+ end
@@ -0,0 +1,171 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'krane/concurrency'
4
+ require 'krane/resource_cache'
5
+
6
+ module Krane
7
+ class ResourceWatcher
8
+ extend Krane::StatsD::MeasureMethods
9
+ delegate :namespace, :context, :logger, to: :@task_config
10
+
11
# Sets up a watcher for a collection of resources.
#
# @param resources [Enumerable] the resources to monitor
# @param task_config [Object] namespace, context and logger are delegated to it
# @param deploy_started_at [Time] reference point for the durations reported in log messages
# @param operation_name [String] verb used in log and summary messages
# @param timeout [Numeric, nil] overall time limit; nil disables the global timeout check
# @param sha [Object, nil] included in statsd_tags
# @raise [ArgumentError] when resources is not Enumerable
def initialize(resources:, task_config:, deploy_started_at: Time.now.utc,
  operation_name: "deploy", timeout: nil, sha: nil)
  raise ArgumentError, <<~MSG unless resources.is_a?(Enumerable)
    ResourceWatcher expects Enumerable collection, got `#{resources.class}` instead
  MSG

  @sha = sha
  @timeout = timeout
  @operation_name = operation_name
  @deploy_started_at = deploy_started_at
  @task_config = task_config
  @resources = resources
end
25
+
26
# Polls the resources until each one has succeeded, failed or timed out.
#
# On every pass the still-pending resources are synced and then sorted into successes,
# failures and timeouts; whatever is left is polled again. If the global timeout is exceeded,
# report_and_give_up is invoked (which raises).
#
# @param delay_sync [Numeric] minimum time between two syncs
# @param reminder_interval [Numeric] how long to stay silent before logging a reminder
# @param record_summary [Boolean] whether to record final statuses in the summary
def run(delay_sync: 3.seconds, reminder_interval: 30.seconds, record_summary: true)
  monitoring_started = Time.now.utc
  last_message_logged_at = monitoring_started
  pending = @resources.dup

  while pending.present?
    report_and_give_up(pending) if global_timeout?(monitoring_started)
    sleep_until_next_sync(delay_sync)

    sync_resources(pending)

    succeeded, pending = pending.partition(&:deploy_succeeded?)
    failed, pending = pending.partition(&:deploy_failed?)
    timed_out, pending = pending.partition(&:deploy_timed_out?)

    if [succeeded, failed, timed_out].any?(&:present?)
      report_what_just_happened(succeeded, failed, timed_out)
      report_what_is_left(pending, reminder: false)
      last_message_logged_at = Time.now.utc
    elsif due_for_reminder?(last_message_logged_at, reminder_interval)
      report_what_is_left(pending, reminder: true)
      last_message_logged_at = Time.now.utc
    end
  end
  record_statuses_for_summary(@resources) if record_summary
end
51
+
52
+ private
53
+
54
# Syncs every resource against one fresh ResourceCache, then runs each resource's
# after_sync hook.
#
# @param resources [Array] resources to refresh
# @return [Array] the same collection (result of #each)
def sync_resources(resources)
  shared_cache = ResourceCache.new(@task_config)
  Krane::Concurrency.split_across_threads(resources) do |resource|
    resource.sync(shared_cache)
  end
  resources.each(&:after_sync)
end
59
+ measure_method(:sync_resources, "sync.duration")
60
+
61
# Tags for StatsD metrics: the delegated namespace and context plus the sha given at
# construction.
#
# @return [Hash] with keys :namespace, :context and :sha
def statsd_tags
  tags = {}
  tags[:namespace] = namespace
  tags[:context] = context
  tags[:sha] = @sha
  tags
end
68
+
69
# Tells whether more time than @timeout has passed since started_at.
#
# @param started_at [Time] when monitoring began
# @return [Boolean, nil] the falsy @timeout itself when no timeout is configured
def global_timeout?(started_at)
  return @timeout unless @timeout

  elapsed = Time.now.utc - started_at
  elapsed > @timeout
end
72
+
73
# Sleeps until the previously scheduled sync time (if it is still in the future), then
# schedules the next one min_interval from now.
#
# @param min_interval [Numeric] minimum spacing between syncs
# @return [Time] the newly scheduled sync time
def sleep_until_next_sync(min_interval)
  @next_sync_time ||= Time.now.utc
  remaining = @next_sync_time - Time.now.utc
  sleep(remaining) if remaining > 0
  @next_sync_time = Time.now.utc + min_interval
end
80
+
81
# Logs and reports to StatsD the resources that reached a final state in the latest pass.
#
# @param new_successes [Array] resources that just succeeded
# @param new_failures [Array] resources that just failed
# @param new_timeouts [Array] resources that just timed out
def report_what_just_happened(new_successes, new_failures, new_timeouts)
  watch_time = (Time.now.utc - @deploy_started_at).round(1)

  new_failures.each do |failed|
    failed.report_status_to_statsd(watch_time)
    logger.error("#{failed.id} failed to #{@operation_name} after #{watch_time}s")
  end

  new_timeouts.each do |timed_out|
    timed_out.report_status_to_statsd(watch_time)
    logger.error("#{timed_out.id} rollout timed out after #{watch_time}s")
  end

  return unless new_successes.present?

  new_successes.each { |succeeded| succeeded.report_status_to_statsd(watch_time) }
  banner = ColorizedString.new("Successfully #{past_tense_operation} in #{watch_time}s:").green
  logger.info("#{banner} #{new_successes.map(&:id).join(', ')}")
end
99
+
100
# Logs the ids of the resources that are still pending; does nothing when none are left.
#
# @param resources [Array] resources still being watched
# @param reminder [Boolean] selects the "Still waiting" wording over "Continuing to wait"
def report_what_is_left(resources, reminder:)
  return unless resources.present?

  resource_list = resources.map(&:id).join(', ')
  if reminder
    logger.info("Still waiting for: #{resource_list}")
  else
    logger.info("Continuing to wait for: #{resource_list}")
  end
end
106
+
107
# Called when the global timeout is hit: records the statuses gathered so far and aborts.
#
# @param remaining_resources [Array] resources that had not reached a final state
# @raise [FatalDeploymentError] when some finished resource failed for a reason other than timing out
# @raise [DeploymentTimeoutError] in every other case
def report_and_give_up(remaining_resources)
  finished = @resources - remaining_resources
  successful_resources, failed_resources = finished.partition(&:deploy_succeeded?)
  record_success_statuses(successful_resources)
  record_failed_statuses(failed_resources, remaining_resources)

  hard_failure = failed_resources.present? && !failed_resources.all?(&:deploy_timed_out?)
  raise FatalDeploymentError if hard_failure
  raise DeploymentTimeoutError
end
118
+
119
# Splits the resources by deploy_succeeded? and records each group in the summary.
#
# @param resources [Array] all watched resources
def record_statuses_for_summary(resources)
  succeeded, unsuccessful = resources.partition(&:deploy_succeeded?)
  record_success_statuses(succeeded)
  record_failed_statuses(unsuccessful)
end
124
+
125
# Adds summary actions and debug paragraphs for resources that did not succeed.
#
# @param failed_resources [Array] resources that failed or timed out on their own
# @param global_timeouts [Array] resources still pending when the global timeout hit
# @return [Array, nil] nil when both lists are empty
def record_failed_statuses(failed_resources, global_timeouts = [])
  return if (failed_resources.length + global_timeouts.length).zero?

  timeouts, failures = failed_resources.partition(&:deploy_timed_out?)
  timeouts += global_timeouts

  if timeouts.present?
    timeout_action = "timed out waiting for #{timeouts.length} #{'resource'.pluralize(timeouts.length)} " \
      "to #{@operation_name}"
    logger.summary.add_action(timeout_action)
  end

  if failures.present?
    failure_action = "failed to #{@operation_name} #{failures.length} #{'resource'.pluralize(failures.length)}"
    logger.summary.add_action(failure_action)
  end

  # A dedicated Kubectl that does not log failures by default is used for collecting debug info.
  kubectl = Kubectl.new(task_config: @task_config, log_failure_by_default: false)
  Krane::Concurrency.split_across_threads(failed_resources + global_timeouts) do |resource|
    resource.sync_debug_info(kubectl)
  end

  failed_resources.each { |resource| logger.summary.add_paragraph(resource.debug_message) }
  global_timeouts.each do |resource|
    logger.summary.add_paragraph(resource.debug_message(:gave_up, timeout: @timeout))
  end
end
152
+
153
# Adds a summary action and a paragraph listing the pretty_status of each successful resource.
#
# @param successful_resources [Array] resources that succeeded
# @return [Object, nil] nil when the list is empty
def record_success_statuses(successful_resources)
  success_count = successful_resources.length
  return unless success_count > 0

  logger.summary.add_action(
    "successfully #{past_tense_operation} #{success_count} #{'resource'.pluralize(success_count)}"
  )
  status_lines = successful_resources.map(&:pretty_status).join("\n")
  logger.summary.add_paragraph("#{ColorizedString.new('Successful resources').green}\n#{status_lines}")
end
162
+
163
# Tells whether at least reminder_interval has elapsed since the last log message.
#
# @param last_message_logged_at [#to_f] time of the last message
# @param reminder_interval [#to_f] length of the quiet period
# @return [Boolean]
def due_for_reminder?(last_message_logged_at, reminder_interval)
  deadline = last_message_logged_at.to_f + reminder_interval.to_f
  Time.now.utc.to_f >= deadline
end
166
+
167
# Past tense of @operation_name for log messages: "run" becomes "ran", anything else
# simply gets "ed" appended.
#
# @return [String]
def past_tense_operation
  return "ran" if @operation_name == "run"
  "#{@operation_name}ed"
end
170
+ end
171
+ end
@@ -0,0 +1,228 @@
1
+ # frozen_string_literal: true
2
+ require 'krane/common'
3
+ require 'krane/kubernetes_resource'
4
+ require 'krane/kubernetes_resource/deployment'
5
+ require 'krane/kubeclient_builder'
6
+ require 'krane/resource_watcher'
7
+ require 'krane/kubectl'
8
+
9
+ module Krane
10
+ # Restart the pods in one or more deployments
11
+ class RestartTask
12
+ class FatalRestartError < FatalDeploymentError; end
13
+
14
# Raised when the API call that triggers a deployment restart is rejected.
class RestartAPIError < FatalRestartError
  # @param deployment_name [String] name of the deployment that could not be restarted
  # @param response [#code, #body, #to_s] an HTTP-response-like object, or a plain error message
  def initialize(deployment_name, response)
    details = if response.respond_to?(:code) && response.respond_to?(:body)
      "API returned non-200 response code (#{response.code})\n" \
        "Response:\n#{response.body}"
    else
      # patch_kubeclient_deployments passes the rescued error's #message (a String), which has
      # neither #code nor #body; calling them would raise NoMethodError and hide the real
      # failure, so the text is used as-is.
      response.to_s
    end
    super("Failed to restart #{deployment_name}. #{details}")
  end
end
21
+
22
+ HTTP_OK_RANGE = 200..299
23
+ ANNOTATION = "shipit.shopify.io/restart"
24
+
25
# Initializes the restart task
#
# @param context [String] Kubernetes context / cluster (*required*)
# @param namespace [String] Kubernetes namespace (*required*)
# @param logger [Object] Logger object (defaults to an instance built by Krane::FormattedLogger.build)
# @param global_timeout [Integer] Timeout in seconds
def initialize(context:, namespace:, logger: nil, global_timeout: nil)
  @context = context
  @namespace = namespace
  @global_timeout = global_timeout
  @logger = logger || Krane::FormattedLogger.build(namespace, context)
  # Built after @logger so the task config shares the same logger instance.
  @task_config = Krane::TaskConfig.new(context, namespace, @logger)
end
38
+
39
# Runs the task, returning a boolean representing success or failure
#
# Accepts the same arguments as perform! and forwards them unchanged. Keywords are captured
# and re-splatted explicitly: on Ruby 3, forwarding them through a bare *args turns them into
# a positional Hash, which perform! (keyword-only) rejects with ArgumentError.
#
# @return [Boolean] true on success, false when a FatalDeploymentError was raised
def run(*args, **kwargs)
  perform!(*args, **kwargs)
  true
rescue FatalDeploymentError
  false
end
48
+ alias_method :perform, :run
49
+
50
# Runs the task, raising exceptions in case of issues
#
# A 'restart.duration' distribution is sent to StatsD with a status tag of success, timeout
# or failure, and the logger summary is printed in every case.
#
# @param deployments [Array<String>] Array of workload names to restart
# @param selector [Hash] Selector(s) parsed by Krane::LabelSelector
# @param verify_result [Boolean] Wait for completion and verify success
#
# @return [nil]
def run!(deployments: nil, selector: nil, verify_result: true)
  start = Time.now.utc
  # `deployments` is read when the lambda is called, so the tags reflect the resolved
  # deployment records once they have been identified.
  report_duration = lambda do |status|
    StatsD.client.distribution('restart.duration', StatsD.duration(start), tags: tags(status, deployments))
  end

  @logger.reset

  @logger.phase_heading("Initializing restart")
  verify_config!
  deployments = identify_target_deployments(deployments, selector: selector)

  @logger.phase_heading("Triggering restart by touching ENV[RESTARTED_AT]")
  patch_kubeclient_deployments(deployments)

  if verify_result
    @logger.phase_heading("Waiting for rollout")
    verify_restart(build_watchables(deployments, start))
  else
    notice = "Result verification is disabled for this task"
    @logger.summary.add_paragraph(ColorizedString.new(notice).yellow)
  end
  report_duration.call('success')
  @logger.print_summary(:success)
rescue DeploymentTimeoutError
  report_duration.call('timeout')
  @logger.print_summary(:timed_out)
  raise
rescue FatalDeploymentError => error
  report_duration.call('failure')
  # Only surface messages that carry more information than the bare class name.
  @logger.summary.add_action(error.message) if error.message != error.class.to_s
  @logger.print_summary(:failure)
  raise
end
88
+ alias_method :perform!, :run!
89
+
90
+ private
91
+
92
# Builds the StatsD tags for the restart duration metric.
#
# The strings are spelled out one by one: the previous %W(...) literal carried a stray
# closing brace, so the last tag was emitted as "deployments:N}".
#
# @param status [String] outcome label (e.g. 'success', 'timeout', 'failure')
# @param deployments [#to_a, nil] targeted deployments; nil counts as zero
# @return [Array<String>] tags in "key:value" form
def tags(status, deployments)
  [
    "namespace:#{@namespace}",
    "context:#{@context}",
    "status:#{status}",
    "deployments:#{deployments.to_a.length}",
  ]
end
95
+
96
# Resolves which deployment records should be restarted.
#
# With no names given, all deployments in the namespace (optionally narrowed by selector)
# that carry the restart annotation are used. With names given, exactly those deployments
# are fetched; combining names with a selector is rejected.
#
# @param deployment_names [Array<String>, nil] explicit deployment names, or nil
# @param selector [Object, nil] label selector; its #to_s is sent to the API
# @return [Array] deployment records
# @raise [FatalRestartError] on invalid input or when nothing matches
def identify_target_deployments(deployment_names, selector: nil)
  if deployment_names.nil?
    annotated = if selector.nil?
      @logger.info("Configured to restart all deployments with the `#{ANNOTATION}` annotation")
      apps_v1_kubeclient.get_deployments(namespace: @namespace)
    else
      selector_string = selector.to_s
      @logger.info(
        "Configured to restart all deployments with the `#{ANNOTATION}` annotation and #{selector_string} selector"
      )
      apps_v1_kubeclient.get_deployments(namespace: @namespace, label_selector: selector_string)
    end
    annotated.select! { |d| d.metadata.annotations[ANNOTATION] }

    if annotated.none?
      raise FatalRestartError, "no deployments with the `#{ANNOTATION}` annotation found in namespace #{@namespace}"
    end
    return annotated
  end

  if deployment_names.empty?
    raise FatalRestartError, "Configured to restart deployments by name, but list of names was blank"
  end
  raise FatalRestartError, "Can't specify deployment names and selector at the same time" unless selector.nil?

  unique_names = deployment_names.uniq
  list = unique_names.join(', ')
  @logger.info("Configured to restart deployments by name: #{list}")

  found = fetch_deployments(unique_names)
  raise FatalRestartError, "no deployments with names #{list} found in namespace #{@namespace}" if found.none?
  found
end
129
+
130
# Wraps each API record in a Deployment object that a ResourceWatcher can monitor.
#
# @param kubeclient_resources [Array] deployment records; each must respond to #to_h
# @param started [Time] assigned as deploy_started_at on every watchable
# @return [Array<Deployment>]
def build_watchables(kubeclient_resources, started)
  kubeclient_resources.map do |record|
    watchable = Deployment.new(
      namespace: @namespace,
      context: @context,
      definition: record.to_h.deep_stringify_keys,
      logger: @logger
    )
    # we don't care what happened to the resource before the restart cmd ran
    watchable.deploy_started_at = started
    watchable
  end
end
138
+
139
# Sends the restart patch for one deployment through the apps/v1 client.
#
# @param record [Object] deployment record; record.metadata.name identifies the target
# @return whatever patch_deployment returns
def patch_deployment_with_restart(record)
  client = apps_v1_kubeclient
  deployment_name = record.metadata.name
  payload = build_patch_payload(record)
  client.patch_deployment(deployment_name, payload, @namespace)
end
146
+
147
# Triggers a restart for every given deployment, logging each one as it is patched.
#
# @param deployments [Array] deployment records
# @return [Array] the given collection (result of #each)
# @raise [RestartAPIError] when the API client raises Kubeclient::HttpError
def patch_kubeclient_deployments(deployments)
  deployments.each do |deployment|
    begin
      patch_deployment_with_restart(deployment)
      @logger.info("Triggered `#{deployment.metadata.name}` restart")
    rescue Kubeclient::HttpError => http_error
      raise RestartAPIError.new(deployment.metadata.name, http_error.message)
    end
  end
end
157
+
158
# Looks up each named deployment in the task's namespace.
#
# @param list [Array<String>] deployment names
# @return [Array] one record per name, in the same order
# @raise [FatalRestartError] when a deployment does not exist
def fetch_deployments(list)
  list.map do |name|
    begin
      apps_v1_kubeclient.get_deployment(name, @namespace)
    rescue Kubeclient::ResourceNotFoundError
      raise FatalRestartError, "Deployment `#{name}` not found in namespace `#{@namespace}`"
    end
  end
end
169
+
170
# Builds the patch body that triggers a rollout: every container of the pod template gets
# a RESTARTED_AT env entry holding the current Unix time as a string.
#
# @param deployment [Object] record exposing spec.template.spec.containers
# @return [Hash] nested patch hash under spec.template.spec.containers
def build_patch_payload(deployment)
  container_patches = deployment.spec.template.spec.containers.map do |container|
    { name: container.name, env: [{ name: "RESTARTED_AT", value: Time.now.to_i.to_s }] }
  end

  { spec: { template: { spec: { containers: container_patches } } } }
end
187
+
188
# Watches the restarted deployments until they settle and raises unless all succeeded.
#
# @param resources [Array] watchable resources
# @raise [DeploymentTimeoutError] when every unsuccessful resource timed out
# @raise [FatalDeploymentError] when at least one resource failed for another reason
def verify_restart(resources)
  watcher = ResourceWatcher.new(resources: resources, operation_name: "restart",
    timeout: @global_timeout, task_config: @task_config)
  watcher.run

  unsuccessful = resources.reject(&:deploy_succeeded?)
  return if unsuccessful.empty?

  raise DeploymentTimeoutError if unsuccessful.all?(&:deploy_timed_out?)
  raise FatalDeploymentError
end
198
+
199
# Validates the task configuration; on failure the validator's errors are added to the
# summary before raising.
#
# @return [nil] when the configuration is valid
# @raise [Krane::TaskConfigurationError] when the validator reports the config as invalid
def verify_config!
  validator = TaskConfigValidator.new(@task_config, kubectl, kubeclient_builder)
  return if validator.valid?

  @logger.summary.add_action("Configuration invalid")
  bullet_list = validator.errors.map { |err| "- #{err}" }.join("\n")
  @logger.summary.add_paragraph(bullet_list)
  raise Krane::TaskConfigurationError
end
207
+
208
# Lazily builds and caches the apps/v1 API client for the task's context.
def apps_v1_kubeclient
  return @apps_v1_kubeclient if @apps_v1_kubeclient
  @apps_v1_kubeclient = kubeclient_builder.build_apps_v1_kubeclient(@context)
end
211
+
212
# Lazily builds and caches the core v1 API client for the task's context.
def kubeclient
  return @kubeclient if @kubeclient
  @kubeclient = kubeclient_builder.build_v1_kubeclient(@context)
end
215
+
216
# Lazily builds and caches the Kubectl wrapper for this task configuration
# (failures are logged by default).
def kubectl
  return @kubectl if @kubectl
  @kubectl = Kubectl.new(task_config: @task_config, log_failure_by_default: true)
end
219
+
220
# Lazily builds and caches the v1beta1 API client for the task's context.
def v1beta1_kubeclient
  return @v1beta1_kubeclient if @v1beta1_kubeclient
  @v1beta1_kubeclient = kubeclient_builder.build_v1beta1_kubeclient(@context)
end
223
+
224
# Lazily creates and caches the KubeclientBuilder used to construct the API clients.
def kubeclient_builder
  return @kubeclient_builder if @kubeclient_builder
  @kubeclient_builder = KubeclientBuilder.new
end
227
+ end
228
+ end