tobsch-krane 1.0.0

Files changed (99)
  1. checksums.yaml +7 -0
  2. data/.buildkite/pipeline.nightly.yml +43 -0
  3. data/.github/probots.yml +2 -0
  4. data/.gitignore +20 -0
  5. data/.rubocop.yml +17 -0
  6. data/.shopify-build/VERSION +1 -0
  7. data/.shopify-build/kubernetes-deploy.yml +53 -0
  8. data/1.0-Upgrade.md +185 -0
  9. data/CHANGELOG.md +431 -0
  10. data/CODE_OF_CONDUCT.md +46 -0
  11. data/CONTRIBUTING.md +164 -0
  12. data/Gemfile +16 -0
  13. data/ISSUE_TEMPLATE.md +25 -0
  14. data/LICENSE.txt +21 -0
  15. data/README.md +655 -0
  16. data/Rakefile +36 -0
  17. data/bin/ci +21 -0
  18. data/bin/setup +16 -0
  19. data/bin/test +47 -0
  20. data/dev.yml +28 -0
  21. data/dev/flamegraph-from-tests +35 -0
  22. data/exe/krane +5 -0
  23. data/krane.gemspec +44 -0
  24. data/lib/krane.rb +7 -0
  25. data/lib/krane/bindings_parser.rb +88 -0
  26. data/lib/krane/cli/deploy_command.rb +75 -0
  27. data/lib/krane/cli/global_deploy_command.rb +54 -0
  28. data/lib/krane/cli/krane.rb +91 -0
  29. data/lib/krane/cli/render_command.rb +41 -0
  30. data/lib/krane/cli/restart_command.rb +34 -0
  31. data/lib/krane/cli/run_command.rb +54 -0
  32. data/lib/krane/cli/version_command.rb +13 -0
  33. data/lib/krane/cluster_resource_discovery.rb +113 -0
  34. data/lib/krane/common.rb +23 -0
  35. data/lib/krane/concerns/template_reporting.rb +29 -0
  36. data/lib/krane/concurrency.rb +18 -0
  37. data/lib/krane/container_logs.rb +106 -0
  38. data/lib/krane/deferred_summary_logging.rb +95 -0
  39. data/lib/krane/delayed_exceptions.rb +14 -0
  40. data/lib/krane/deploy_task.rb +363 -0
  41. data/lib/krane/deploy_task_config_validator.rb +29 -0
  42. data/lib/krane/duration_parser.rb +27 -0
  43. data/lib/krane/ejson_secret_provisioner.rb +154 -0
  44. data/lib/krane/errors.rb +28 -0
  45. data/lib/krane/formatted_logger.rb +57 -0
  46. data/lib/krane/global_deploy_task.rb +210 -0
  47. data/lib/krane/global_deploy_task_config_validator.rb +12 -0
  48. data/lib/krane/kubeclient_builder.rb +156 -0
  49. data/lib/krane/kubectl.rb +120 -0
  50. data/lib/krane/kubernetes_resource.rb +621 -0
  51. data/lib/krane/kubernetes_resource/cloudsql.rb +43 -0
  52. data/lib/krane/kubernetes_resource/config_map.rb +22 -0
  53. data/lib/krane/kubernetes_resource/cron_job.rb +18 -0
  54. data/lib/krane/kubernetes_resource/custom_resource.rb +87 -0
  55. data/lib/krane/kubernetes_resource/custom_resource_definition.rb +98 -0
  56. data/lib/krane/kubernetes_resource/daemon_set.rb +90 -0
  57. data/lib/krane/kubernetes_resource/deployment.rb +213 -0
  58. data/lib/krane/kubernetes_resource/horizontal_pod_autoscaler.rb +65 -0
  59. data/lib/krane/kubernetes_resource/ingress.rb +18 -0
  60. data/lib/krane/kubernetes_resource/job.rb +60 -0
  61. data/lib/krane/kubernetes_resource/network_policy.rb +22 -0
  62. data/lib/krane/kubernetes_resource/persistent_volume_claim.rb +80 -0
  63. data/lib/krane/kubernetes_resource/pod.rb +269 -0
  64. data/lib/krane/kubernetes_resource/pod_disruption_budget.rb +23 -0
  65. data/lib/krane/kubernetes_resource/pod_set_base.rb +71 -0
  66. data/lib/krane/kubernetes_resource/pod_template.rb +20 -0
  67. data/lib/krane/kubernetes_resource/replica_set.rb +92 -0
  68. data/lib/krane/kubernetes_resource/resource_quota.rb +22 -0
  69. data/lib/krane/kubernetes_resource/role.rb +22 -0
  70. data/lib/krane/kubernetes_resource/role_binding.rb +22 -0
  71. data/lib/krane/kubernetes_resource/secret.rb +24 -0
  72. data/lib/krane/kubernetes_resource/service.rb +104 -0
  73. data/lib/krane/kubernetes_resource/service_account.rb +22 -0
  74. data/lib/krane/kubernetes_resource/stateful_set.rb +70 -0
  75. data/lib/krane/label_selector.rb +42 -0
  76. data/lib/krane/oj.rb +4 -0
  77. data/lib/krane/options_helper.rb +39 -0
  78. data/lib/krane/remote_logs.rb +60 -0
  79. data/lib/krane/render_task.rb +118 -0
  80. data/lib/krane/renderer.rb +118 -0
  81. data/lib/krane/resource_cache.rb +68 -0
  82. data/lib/krane/resource_deployer.rb +265 -0
  83. data/lib/krane/resource_watcher.rb +171 -0
  84. data/lib/krane/restart_task.rb +228 -0
  85. data/lib/krane/rollout_conditions.rb +103 -0
  86. data/lib/krane/runner_task.rb +212 -0
  87. data/lib/krane/runner_task_config_validator.rb +18 -0
  88. data/lib/krane/statsd.rb +65 -0
  89. data/lib/krane/task_config.rb +22 -0
  90. data/lib/krane/task_config_validator.rb +96 -0
  91. data/lib/krane/template_sets.rb +173 -0
  92. data/lib/krane/version.rb +4 -0
  93. data/pull_request_template.md +8 -0
  94. data/screenshots/deploy-demo.gif +0 -0
  95. data/screenshots/migrate-logs.png +0 -0
  96. data/screenshots/missing-secret-fail.png +0 -0
  97. data/screenshots/success.png +0 -0
  98. data/screenshots/test-output.png +0 -0
  99. metadata +375 -0

data/lib/krane/kubernetes_resource/horizontal_pod_autoscaler.rb
@@ -0,0 +1,65 @@
+# frozen_string_literal: true
+module Krane
+  class HorizontalPodAutoscaler < KubernetesResource
+    TIMEOUT = 3.minutes
+    RECOVERABLE_CONDITION_PREFIX = "FailedGet"
+
+    def deploy_succeeded?
+      scaling_active_condition["status"] == "True" || scaling_disabled?
+    end
+
+    def deploy_failed?
+      return false unless exists?
+      return false if scaling_disabled?
+      scaling_active_condition["status"] == "False" &&
+        !scaling_active_condition.fetch("reason", "").start_with?(RECOVERABLE_CONDITION_PREFIX)
+    end
+
+    def kubectl_resource_type
+      'hpa.v2beta1.autoscaling'
+    end
+
+    def status
+      if !exists?
+        super
+      elsif scaling_disabled?
+        "ScalingDisabled"
+      elsif deploy_succeeded?
+        "Configured"
+      elsif scaling_active_condition.present? || able_to_scale_condition.present?
+        condition = scaling_active_condition.presence || able_to_scale_condition
+        condition['reason']
+      else
+        "Unknown"
+      end
+    end
+
+    def failure_message
+      condition = scaling_active_condition.presence || able_to_scale_condition.presence || {}
+      condition['message']
+    end
+
+    def timeout_message
+      failure_message.presence || super
+    end
+
+    private
+
+    def scaling_disabled?
+      scaling_active_condition["status"] == "False" &&
+        scaling_active_condition["reason"] == "ScalingDisabled"
+    end
+
+    def conditions
+      @instance_data.dig("status", "conditions") || []
+    end
+
+    def able_to_scale_condition
+      conditions.detect { |c| c["type"] == "AbleToScale" } || {}
+    end
+
+    def scaling_active_condition
+      conditions.detect { |c| c["type"] == "ScalingActive" } || {}
+    end
+  end
+end
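
For context, here is a minimal sketch (plain Ruby, not part of the gem; the condition data is hypothetical) of the v2beta1 HPA status.conditions shape that scaling_active_condition and able_to_scale_condition read, and how the RECOVERABLE_CONDITION_PREFIX check keeps transient metric-fetch errors from failing a deploy:

    # Hypothetical HPA status fragment, shaped like the fields the class digs into.
    status = {
      "conditions" => [
        { "type" => "AbleToScale",   "status" => "True",  "reason" => "ReadyForNewScale" },
        { "type" => "ScalingActive", "status" => "False", "reason" => "FailedGetResourceMetric" },
      ],
    }
    scaling_active = status["conditions"].detect { |c| c["type"] == "ScalingActive" } || {}
    recoverable = scaling_active.fetch("reason", "").start_with?("FailedGet")
    puts recoverable # => true, so deploy_failed? stays false despite status == "False"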

data/lib/krane/kubernetes_resource/ingress.rb
@@ -0,0 +1,18 @@
+# frozen_string_literal: true
+module Krane
+  class Ingress < KubernetesResource
+    TIMEOUT = 30.seconds
+
+    def status
+      exists? ? "Created" : "Not Found"
+    end
+
+    def deploy_succeeded?
+      exists?
+    end
+
+    def deploy_failed?
+      false
+    end
+  end
+end

data/lib/krane/kubernetes_resource/job.rb
@@ -0,0 +1,60 @@
+# frozen_string_literal: true
+module Krane
+  class Job < KubernetesResource
+    TIMEOUT = 10.minutes
+
+    def deploy_succeeded?
+      # Don't block deploys on long-running jobs;
+      # instead, report success once there is at least one active pod.
+      return false unless deploy_started?
+      done? || running?
+    end
+
+    def deploy_failed?
+      return false unless deploy_started?
+      return true if failed_status_condition
+      return false unless @instance_data.dig("spec", "backoffLimit").present?
+      (@instance_data.dig("status", "failed") || 0) >= @instance_data.dig("spec", "backoffLimit")
+    end
+
+    def status
+      if !exists?
+        super
+      elsif done?
+        "Succeeded"
+      elsif running?
+        "Started"
+      elsif deploy_failed?
+        "Failed"
+      else
+        "Unknown"
+      end
+    end
+
+    def failure_message
+      if (condition = failed_status_condition.presence)
+        "#{condition['reason']} (#{condition['message']})"
+      end
+    end
+
+    private
+
+    def failed_status_condition
+      @instance_data.dig("status", "conditions")&.detect do |condition|
+        condition["type"] == 'Failed' && condition['status'] == "True"
+      end
+    end
+
+    def done?
+      (@instance_data.dig("status", "succeeded") || 0) == @instance_data.dig("spec", "completions")
+    end
+
+    def running?
+      now = Time.now.utc
+      start_time = @instance_data.dig("status", "startTime")
+      # Wait 5 seconds to ensure the job doesn't immediately fail.
+      return false if !start_time.present? || now - Time.parse(start_time) < 5.seconds
+      (@instance_data.dig("status", "active") || 0) >= 1
+    end
+  end
+end
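
A quick sketch of the backoffLimit check in deploy_failed?, using hypothetical spec/status fragments in plain Ruby: when no Failed condition is present, the job only counts as failed once the failure count reaches spec.backoffLimit.

    # Hypothetical Job spec/status fragments.
    spec   = { "backoffLimit" => 3, "completions" => 1 }
    status = { "failed" => 3, "succeeded" => 0 }
    failed = !spec["backoffLimit"].nil? && (status["failed"] || 0) >= spec["backoffLimit"]
    puts failed # => true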

data/lib/krane/kubernetes_resource/network_policy.rb
@@ -0,0 +1,22 @@
+# frozen_string_literal: true
+module Krane
+  class NetworkPolicy < KubernetesResource
+    TIMEOUT = 30.seconds
+
+    def status
+      exists? ? "Created" : "Not Found"
+    end
+
+    def deploy_succeeded?
+      exists?
+    end
+
+    def deploy_failed?
+      false
+    end
+
+    def timeout_message
+      UNUSUAL_FAILURE_MESSAGE
+    end
+  end
+end

data/lib/krane/kubernetes_resource/persistent_volume_claim.rb
@@ -0,0 +1,80 @@
+# frozen_string_literal: true
+module Krane
+  class PersistentVolumeClaim < KubernetesResource
+    TIMEOUT = 5.minutes
+
+    def sync(cache)
+      super
+      @storage_classes = cache.get_all("StorageClass").map { |sc| StorageClass.new(sc) }
+    end
+
+    def status
+      exists? ? @instance_data["status"]["phase"] : "Not Found"
+    end
+
+    def deploy_succeeded?
+      return true if status == "Bound"
+
+      # If the StorageClass has volumeBindingMode: WaitForFirstConsumer,
+      # the PVC won't bind until a Pod mounts it. But it must be pre-deployed,
+      # since the Pod requires it, so 'Pending' must be treated as a success state.
+      if storage_class&.volume_binding_mode == "WaitForFirstConsumer"
+        return status == "Pending" || status == "Bound"
+      end
+      false
+    end
+
+    def deploy_failed?
+      status == "Lost" || failure_message.present?
+    end
+
+    def failure_message
+      if storage_class_name.nil? && @storage_classes.count(&:default?) > 1
+        "PVC has no StorageClass specified and there are multiple StorageClasses " \
+          "annotated as default. This is an invalid cluster configuration."
+      end
+    end
+
+    def timeout_message
+      return STANDARD_TIMEOUT_MESSAGE unless storage_class_name.present? && !storage_class
+      "PVC specified a StorageClass of #{storage_class_name} but the resource does not exist"
+    end
+
+    private
+
+    def storage_class_name
+      @definition.dig("spec", "storageClassName")
+    end
+
+    def storage_class
+      if storage_class_name.present?
+        @storage_classes.detect { |sc| sc.name == storage_class_name }
+      # storage_class_name = "" is an explicit request for no storage class;
+      # storage_class_name = nil is an implicit request for the default storage class
+      elsif storage_class_name != ""
+        @storage_classes.detect(&:default?)
+      end
+    end
+
+    class StorageClass < KubernetesResource
+      DEFAULT_CLASS_ANNOTATION = "storageclass.kubernetes.io/is-default-class"
+      DEFAULT_CLASS_BETA_ANNOTATION = "storageclass.beta.kubernetes.io/is-default-class"
+
+      attr_reader :name
+
+      def initialize(definition)
+        @definition = definition
+        @name = definition.dig("metadata", "name").to_s
+      end
+
+      def volume_binding_mode
+        @definition.dig("volumeBindingMode")
+      end
+
+      def default?
+        @definition.dig("metadata", "annotations", DEFAULT_CLASS_ANNOTATION) == "true" ||
+          @definition.dig("metadata", "annotations", DEFAULT_CLASS_BETA_ANNOTATION) == "true"
+      end
+    end
+  end
+end
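
To illustrate the failure_message check above, a self-contained plain-Ruby sketch (the class names and definitions are hypothetical) of how multiple default StorageClasses are detected; either the GA or the beta annotation marks a class as default:

    # Two hypothetical StorageClasses, each marked default via a different annotation.
    classes = [
      { "metadata" => { "name" => "fast", "annotations" =>
          { "storageclass.kubernetes.io/is-default-class" => "true" } } },
      { "metadata" => { "name" => "slow", "annotations" =>
          { "storageclass.beta.kubernetes.io/is-default-class" => "true" } } },
    ]
    defaults = classes.count do |sc|
      anns = sc.dig("metadata", "annotations") || {}
      anns.values_at("storageclass.kubernetes.io/is-default-class",
        "storageclass.beta.kubernetes.io/is-default-class").include?("true")
    end
    puts defaults > 1 # => true: an invalid cluster configuration per failure_message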

data/lib/krane/kubernetes_resource/pod.rb
@@ -0,0 +1,269 @@
+# frozen_string_literal: true
+module Krane
+  class Pod < KubernetesResource
+    TIMEOUT = 10.minutes
+
+    FAILED_PHASE_NAME = "Failed"
+    TRANSIENT_FAILURE_REASONS = %w(
+      Evicted
+      Preempting
+    )
+
+    attr_accessor :stream_logs
+
+    def initialize(namespace:, context:, definition:, logger:,
+      statsd_tags: nil, parent: nil, deploy_started_at: nil, stream_logs: false)
+      @parent = parent
+      @deploy_started_at = deploy_started_at
+
+      @containers = definition.fetch("spec", {}).fetch("containers", []).map { |c| Container.new(c) }
+      unless @containers.present?
+        logger.summary.add_paragraph("Rendered template content:\n#{definition.to_yaml}")
+        raise FatalDeploymentError, "Template is missing required field spec.containers"
+      end
+      @containers += definition["spec"].fetch("initContainers", []).map { |c| Container.new(c, init_container: true) }
+      @stream_logs = stream_logs
+      super(namespace: namespace, context: context, definition: definition,
+        logger: logger, statsd_tags: statsd_tags)
+    end
+
+    def sync(_cache)
+      super
+      raise_predates_deploy_error if exists? && unmanaged? && !deploy_started?
+
+      if exists?
+        logs.sync if unmanaged?
+        update_container_statuses(@instance_data["status"])
+      else # reset
+        @containers.each(&:reset_status)
+      end
+    end
+
+    def after_sync
+      if @stream_logs
+        logs.print_latest
+      elsif unmanaged? && deploy_succeeded?
+        logs.print_all
+      end
+    end
+
+    def status
+      return phase if reason.blank?
+      "#{phase} (Reason: #{reason})"
+    end
+
+    def deploy_succeeded?
+      if unmanaged?
+        phase == "Succeeded"
+      else
+        phase == "Running" && ready?
+      end
+    end
+
+    def deploy_failed?
+      failure_message.present?
+    end
+
+    def timeout_message
+      if readiness_probe_failure?
+        probe_failure_msgs = @containers.map(&:readiness_fail_reason).compact
+        header = "The following containers have not passed their readiness probes on at least one pod:\n"
+        header + probe_failure_msgs.join("\n")
+      elsif failed_schedule_reason.present?
+        "Pod could not be scheduled because #{failed_schedule_reason}"
+      else
+        STANDARD_TIMEOUT_MESSAGE
+      end
+    end
+
+    def failure_message
+      doomed_containers = @containers.select(&:doomed?)
+      if doomed_containers.present?
+        container_problems = if unmanaged?
+          "The following containers encountered errors:\n"
+        else
+          "The following containers are in a state that is unlikely to be recoverable:\n"
+        end
+        doomed_containers.each do |c|
+          red_name = ColorizedString.new(c.name).red
+          container_problems += "> #{red_name}: #{c.doom_reason}\n"
+        end
+      end
+      "#{phase_failure_message} #{container_problems}".strip.presence
+    end
+
+    def fetch_debug_logs
+      logs.sync
+      logs
+    end
+
+    def print_debug_logs?
+      exists? && !@stream_logs # don't print them a second time
+    end
+
+    def node_name
+      @instance_data.dig('spec', 'nodeName')
+    end
+
+    private

+    def failed_schedule_reason
+      if phase == "Pending"
+        conditions = @instance_data.dig('status', 'conditions') || []
+        unschedulable = conditions.find do |condition|
+          condition["type"] == "PodScheduled" && condition["status"] == "False"
+        end
+        unschedulable&.dig('message')
+      end
+    end
+
+    def failed_phase?
+      phase == FAILED_PHASE_NAME
+    end
+
+    def transient_failure_reason?
+      return false if unmanaged?
+      TRANSIENT_FAILURE_REASONS.include?(reason)
+    end
+
+    def phase_failure_message
+      if failed_phase? && !transient_failure_reason?
+        return "Pod status: #{status}."
+      end
+
+      return unless unmanaged?
+
+      if terminating?
+        "Pod status: Terminating."
+      elsif disappeared?
+        "Pod status: Disappeared."
+      end
+    end
+
+    def logs
+      @logs ||= Krane::RemoteLogs.new(
+        logger: @logger,
+        parent_id: id,
+        container_names: @containers.map(&:name),
+        namespace: @namespace,
+        context: @context
+      )
+    end
+
+    def phase
+      @instance_data.dig("status", "phase") || "Unknown"
+    end
+
+    def reason
+      @instance_data.dig('status', 'reason')
+    end
+
+    def readiness_probe_failure?
+      return false if ready? || unmanaged?
+      return false if phase != "Running"
+      @containers.any?(&:readiness_fail_reason)
+    end
+
+    def ready?
+      return false unless (status_data = @instance_data["status"])
+      ready_condition = status_data.fetch("conditions", []).find { |condition| condition["type"] == "Ready" }
+      ready_condition.present? && (ready_condition["status"] == "True")
+    end
+
+    def update_container_statuses(status_data)
+      @containers.each do |c|
+        key = c.init_container? ? "initContainerStatuses" : "containerStatuses"
+        if status_data.key?(key)
+          data = status_data[key].find { |st| st["name"] == c.name }
+          c.update_status(data)
+        else
+          c.reset_status
+        end
+      end
+    end
+
+    def unmanaged?
+      @parent.blank?
+    end
+
+    def raise_predates_deploy_error
+      example_color = :green
+      msg = <<-STRING.strip_heredoc
+        Unmanaged pods like #{id} must have unique names on every deploy in order to work as intended.
+        The recommended way to achieve this is to include "<%= deployment_id %>" in the pod's name, like this:
+          #{ColorizedString.new('kind: Pod').colorize(example_color)}
+          #{ColorizedString.new('metadata:').colorize(example_color)}
+            #{ColorizedString.new("name: #{@name}-<%= deployment_id %>").colorize(example_color)}
+      STRING
+      @logger.summary.add_paragraph(msg)
+      raise FatalDeploymentError, "#{id} existed before the deploy started"
+    end
+
+    class Container
+      attr_reader :name
+
+      def initialize(definition, init_container: false)
+        @init_container = init_container
+        @name = definition["name"]
+        @image = definition["image"]
+        @http_probe_location = definition.dig("readinessProbe", "httpGet", "path")
+        @exec_probe_command = definition.dig("readinessProbe", "exec", "command")
+        @status = {}
+      end
+
+      def doomed?
+        doom_reason.present?
+      end
+
+      def doom_reason
+        limbo_reason = @status.dig("state", "waiting", "reason")
+        limbo_message = @status.dig("state", "waiting", "message")
+
+        if @status.dig("lastState", "terminated", "reason") == "ContainerCannotRun"
+          # ref: https://github.com/kubernetes/kubernetes/blob/562e721ece8a16e05c7e7d6bdd6334c910733ab2/pkg/kubelet/dockershim/docker_container.go#L353
+          exit_code = @status.dig('lastState', 'terminated', 'exitCode')
+          "Failed to start (exit #{exit_code}): #{@status.dig('lastState', 'terminated', 'message')}"
+        elsif @status.dig("state", "terminated", "reason") == "ContainerCannotRun"
+          exit_code = @status.dig('state', 'terminated', 'exitCode')
+          "Failed to start (exit #{exit_code}): #{@status.dig('state', 'terminated', 'message')}"
+        elsif limbo_reason == "CrashLoopBackOff"
+          exit_code = @status.dig('lastState', 'terminated', 'exitCode')
+          "Crashing repeatedly (exit #{exit_code}). See logs for more information."
+        elsif limbo_reason == "ErrImagePull" && limbo_message.match(/not found/i)
+          "Failed to pull image #{@image}. "\
+          "Did you wait for it to be built and pushed to the registry before deploying?"
+        elsif limbo_reason == "CreateContainerConfigError"
+          "Failed to generate container configuration: #{limbo_message}"
+        end
+      end
+
+      def readiness_fail_reason
+        return if ready? || init_container?
+        return unless (@http_probe_location || @exec_probe_command).present?
+
+        yellow_name = ColorizedString.new(name).yellow
+        if @http_probe_location
+          "> #{yellow_name} must respond with a good status code at '#{@http_probe_location}'"
+        elsif @exec_probe_command
+          "> #{yellow_name} must exit 0 from the following command: '#{@exec_probe_command.join(' ')}'"
+        end
+      end
+
+      def ready?
+        @status['ready'] == true
+      end
+
+      def init_container?
+        @init_container
+      end
+
+      def update_status(data)
+        @status = data || {}
+      end
+
+      def reset_status
+        @status = {}
+      end
+    end
+  end
+end
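
As a worked example of Container#doom_reason, a plain-Ruby sketch with a hypothetical containerStatuses entry: a CrashLoopBackOff reason in state.waiting surfaces the exit code from lastState.terminated, and the resulting message is what makes the pod's deploy_failed? true via failure_message.

    # Hypothetical containerStatuses entry for a crash-looping container.
    status = {
      "state"     => { "waiting"    => { "reason" => "CrashLoopBackOff" } },
      "lastState" => { "terminated" => { "exitCode" => 1 } },
    }
    if status.dig("state", "waiting", "reason") == "CrashLoopBackOff"
      exit_code = status.dig("lastState", "terminated", "exitCode")
      puts "Crashing repeatedly (exit #{exit_code}). See logs for more information."
    end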