tobsch-krane 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. checksums.yaml +7 -0
  2. data/.buildkite/pipeline.nightly.yml +43 -0
  3. data/.github/probots.yml +2 -0
  4. data/.gitignore +20 -0
  5. data/.rubocop.yml +17 -0
  6. data/.shopify-build/VERSION +1 -0
  7. data/.shopify-build/kubernetes-deploy.yml +53 -0
  8. data/1.0-Upgrade.md +185 -0
  9. data/CHANGELOG.md +431 -0
  10. data/CODE_OF_CONDUCT.md +46 -0
  11. data/CONTRIBUTING.md +164 -0
  12. data/Gemfile +16 -0
  13. data/ISSUE_TEMPLATE.md +25 -0
  14. data/LICENSE.txt +21 -0
  15. data/README.md +655 -0
  16. data/Rakefile +36 -0
  17. data/bin/ci +21 -0
  18. data/bin/setup +16 -0
  19. data/bin/test +47 -0
  20. data/dev.yml +28 -0
  21. data/dev/flamegraph-from-tests +35 -0
  22. data/exe/krane +5 -0
  23. data/krane.gemspec +44 -0
  24. data/lib/krane.rb +7 -0
  25. data/lib/krane/bindings_parser.rb +88 -0
  26. data/lib/krane/cli/deploy_command.rb +75 -0
  27. data/lib/krane/cli/global_deploy_command.rb +54 -0
  28. data/lib/krane/cli/krane.rb +91 -0
  29. data/lib/krane/cli/render_command.rb +41 -0
  30. data/lib/krane/cli/restart_command.rb +34 -0
  31. data/lib/krane/cli/run_command.rb +54 -0
  32. data/lib/krane/cli/version_command.rb +13 -0
  33. data/lib/krane/cluster_resource_discovery.rb +113 -0
  34. data/lib/krane/common.rb +23 -0
  35. data/lib/krane/concerns/template_reporting.rb +29 -0
  36. data/lib/krane/concurrency.rb +18 -0
  37. data/lib/krane/container_logs.rb +106 -0
  38. data/lib/krane/deferred_summary_logging.rb +95 -0
  39. data/lib/krane/delayed_exceptions.rb +14 -0
  40. data/lib/krane/deploy_task.rb +363 -0
  41. data/lib/krane/deploy_task_config_validator.rb +29 -0
  42. data/lib/krane/duration_parser.rb +27 -0
  43. data/lib/krane/ejson_secret_provisioner.rb +154 -0
  44. data/lib/krane/errors.rb +28 -0
  45. data/lib/krane/formatted_logger.rb +57 -0
  46. data/lib/krane/global_deploy_task.rb +210 -0
  47. data/lib/krane/global_deploy_task_config_validator.rb +12 -0
  48. data/lib/krane/kubeclient_builder.rb +156 -0
  49. data/lib/krane/kubectl.rb +120 -0
  50. data/lib/krane/kubernetes_resource.rb +621 -0
  51. data/lib/krane/kubernetes_resource/cloudsql.rb +43 -0
  52. data/lib/krane/kubernetes_resource/config_map.rb +22 -0
  53. data/lib/krane/kubernetes_resource/cron_job.rb +18 -0
  54. data/lib/krane/kubernetes_resource/custom_resource.rb +87 -0
  55. data/lib/krane/kubernetes_resource/custom_resource_definition.rb +98 -0
  56. data/lib/krane/kubernetes_resource/daemon_set.rb +90 -0
  57. data/lib/krane/kubernetes_resource/deployment.rb +213 -0
  58. data/lib/krane/kubernetes_resource/horizontal_pod_autoscaler.rb +65 -0
  59. data/lib/krane/kubernetes_resource/ingress.rb +18 -0
  60. data/lib/krane/kubernetes_resource/job.rb +60 -0
  61. data/lib/krane/kubernetes_resource/network_policy.rb +22 -0
  62. data/lib/krane/kubernetes_resource/persistent_volume_claim.rb +80 -0
  63. data/lib/krane/kubernetes_resource/pod.rb +269 -0
  64. data/lib/krane/kubernetes_resource/pod_disruption_budget.rb +23 -0
  65. data/lib/krane/kubernetes_resource/pod_set_base.rb +71 -0
  66. data/lib/krane/kubernetes_resource/pod_template.rb +20 -0
  67. data/lib/krane/kubernetes_resource/replica_set.rb +92 -0
  68. data/lib/krane/kubernetes_resource/resource_quota.rb +22 -0
  69. data/lib/krane/kubernetes_resource/role.rb +22 -0
  70. data/lib/krane/kubernetes_resource/role_binding.rb +22 -0
  71. data/lib/krane/kubernetes_resource/secret.rb +24 -0
  72. data/lib/krane/kubernetes_resource/service.rb +104 -0
  73. data/lib/krane/kubernetes_resource/service_account.rb +22 -0
  74. data/lib/krane/kubernetes_resource/stateful_set.rb +70 -0
  75. data/lib/krane/label_selector.rb +42 -0
  76. data/lib/krane/oj.rb +4 -0
  77. data/lib/krane/options_helper.rb +39 -0
  78. data/lib/krane/remote_logs.rb +60 -0
  79. data/lib/krane/render_task.rb +118 -0
  80. data/lib/krane/renderer.rb +118 -0
  81. data/lib/krane/resource_cache.rb +68 -0
  82. data/lib/krane/resource_deployer.rb +265 -0
  83. data/lib/krane/resource_watcher.rb +171 -0
  84. data/lib/krane/restart_task.rb +228 -0
  85. data/lib/krane/rollout_conditions.rb +103 -0
  86. data/lib/krane/runner_task.rb +212 -0
  87. data/lib/krane/runner_task_config_validator.rb +18 -0
  88. data/lib/krane/statsd.rb +65 -0
  89. data/lib/krane/task_config.rb +22 -0
  90. data/lib/krane/task_config_validator.rb +96 -0
  91. data/lib/krane/template_sets.rb +173 -0
  92. data/lib/krane/version.rb +4 -0
  93. data/pull_request_template.md +8 -0
  94. data/screenshots/deploy-demo.gif +0 -0
  95. data/screenshots/migrate-logs.png +0 -0
  96. data/screenshots/missing-secret-fail.png +0 -0
  97. data/screenshots/success.png +0 -0
  98. data/screenshots/test-output.png +0 -0
  99. metadata +375 -0
data/lib/krane/kubernetes_resource/horizontal_pod_autoscaler.rb
@@ -0,0 +1,65 @@
+ # frozen_string_literal: true
+ module Krane
+   class HorizontalPodAutoscaler < KubernetesResource
+     TIMEOUT = 3.minutes
+     RECOVERABLE_CONDITION_PREFIX = "FailedGet"
+
+     def deploy_succeeded?
+       scaling_active_condition["status"] == "True" || scaling_disabled?
+     end
+
+     def deploy_failed?
+       return false unless exists?
+       return false if scaling_disabled?
+       scaling_active_condition["status"] == "False" &&
+         !scaling_active_condition.fetch("reason", "").start_with?(RECOVERABLE_CONDITION_PREFIX)
+     end
+
+     def kubectl_resource_type
+       'hpa.v2beta1.autoscaling'
+     end
+
+     def status
+       if !exists?
+         super
+       elsif scaling_disabled?
+         "ScalingDisabled"
+       elsif deploy_succeeded?
+         "Configured"
+       elsif scaling_active_condition.present? || able_to_scale_condition.present?
+         condition = scaling_active_condition.presence || able_to_scale_condition
+         condition['reason']
+       else
+         "Unknown"
+       end
+     end
+
+     def failure_message
+       condition = scaling_active_condition.presence || able_to_scale_condition.presence || {}
+       condition['message']
+     end
+
+     def timeout_message
+       failure_message.presence || super
+     end
+
+     private
+
+     def scaling_disabled?
+       scaling_active_condition["status"] == "False" &&
+         scaling_active_condition["reason"] == "ScalingDisabled"
+     end
+
+     def conditions
+       @instance_data.dig("status", "conditions") || []
+     end
+
+     def able_to_scale_condition
+       conditions.detect { |c| c["type"] == "AbleToScale" } || {}
+     end
+
+     def scaling_active_condition
+       conditions.detect { |c| c["type"] == "ScalingActive" } || {}
+     end
+   end
+ end
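The failure check above treats any ScalingActive condition whose reason starts with "FailedGet" as recoverable rather than failed. A minimal sketch of that logic against a hypothetical .status.conditions payload (plain Ruby, no krane dependencies; the condition values are invented for illustration):

    conditions = [
      { "type" => "ScalingActive", "status" => "False",
        "reason" => "FailedGetResourceMetric", "message" => "unable to fetch metrics" },
    ]
    scaling_active = conditions.detect { |c| c["type"] == "ScalingActive" } || {}
    # The reason starts with "FailedGet", so this is treated as recoverable, not failed:
    failed = scaling_active["status"] == "False" &&
      !scaling_active.fetch("reason", "").start_with?("FailedGet")
    puts failed # => false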
data/lib/krane/kubernetes_resource/ingress.rb
@@ -0,0 +1,18 @@
+ # frozen_string_literal: true
+ module Krane
+   class Ingress < KubernetesResource
+     TIMEOUT = 30.seconds
+
+     def status
+       exists? ? "Created" : "Not Found"
+     end
+
+     def deploy_succeeded?
+       exists?
+     end
+
+     def deploy_failed?
+       false
+     end
+   end
+ end
data/lib/krane/kubernetes_resource/job.rb
@@ -0,0 +1,60 @@
+ # frozen_string_literal: true
+ module Krane
+   class Job < KubernetesResource
+     TIMEOUT = 10.minutes
+
+     def deploy_succeeded?
+       # Don't block deploys for long-running jobs;
+       # instead, report success when there is at least 1 active pod.
+       return false unless deploy_started?
+       done? || running?
+     end
+
+     def deploy_failed?
+       return false unless deploy_started?
+       return true if failed_status_condition
+       return false unless @instance_data.dig("spec", "backoffLimit").present?
+       (@instance_data.dig("status", "failed") || 0) >= @instance_data.dig("spec", "backoffLimit")
+     end
+
+     def status
+       if !exists?
+         super
+       elsif done?
+         "Succeeded"
+       elsif running?
+         "Started"
+       elsif deploy_failed?
+         "Failed"
+       else
+         "Unknown"
+       end
+     end
+
+     def failure_message
+       if (condition = failed_status_condition.presence)
+         "#{condition['reason']} (#{condition['message']})"
+       end
+     end
+
+     private
+
+     def failed_status_condition
+       @instance_data.dig("status", "conditions")&.detect do |condition|
+         condition["type"] == 'Failed' && condition['status'] == "True"
+       end
+     end
+
+     def done?
+       (@instance_data.dig("status", "succeeded") || 0) == @instance_data.dig("spec", "completions")
+     end
+
+     def running?
+       now = Time.now.utc
+       start_time = @instance_data.dig("status", "startTime")
+       # Wait 5 seconds to ensure the job doesn't immediately fail.
+       return false if !start_time.present? || now - Time.parse(start_time) < 5.seconds
+       (@instance_data.dig("status", "active") || 0) >= 1
+     end
+   end
+ end
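To illustrate the backoffLimit check in deploy_failed? above, here is a minimal sketch with a hypothetical Job payload (plain Ruby, standard library only; the spec and status values are invented):

    instance_data = {
      "spec"   => { "backoffLimit" => 3, "completions" => 1 },
      "status" => { "failed" => 3, "succeeded" => 0 },
    }
    # Three failed attempts have reached the backoff limit, so the deploy counts as failed:
    failed = (instance_data.dig("status", "failed") || 0) >=
      instance_data.dig("spec", "backoffLimit")
    puts failed # => true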
data/lib/krane/kubernetes_resource/network_policy.rb
@@ -0,0 +1,22 @@
+ # frozen_string_literal: true
+ module Krane
+   class NetworkPolicy < KubernetesResource
+     TIMEOUT = 30.seconds
+
+     def status
+       exists? ? "Created" : "Not Found"
+     end
+
+     def deploy_succeeded?
+       exists?
+     end
+
+     def deploy_failed?
+       false
+     end
+
+     def timeout_message
+       UNUSUAL_FAILURE_MESSAGE
+     end
+   end
+ end
data/lib/krane/kubernetes_resource/persistent_volume_claim.rb
@@ -0,0 +1,80 @@
+ # frozen_string_literal: true
+ module Krane
+   class PersistentVolumeClaim < KubernetesResource
+     TIMEOUT = 5.minutes
+
+     def sync(cache)
+       super
+       @storage_classes = cache.get_all("StorageClass").map { |sc| StorageClass.new(sc) }
+     end
+
+     def status
+       exists? ? @instance_data["status"]["phase"] : "Not Found"
+     end
+
+     def deploy_succeeded?
+       return true if status == "Bound"
+
+       # If the StorageClass has volumeBindingMode: WaitForFirstConsumer,
+       # it won't bind until after a Pod mounts it. But it must be pre-deployed,
+       # as the Pod requires it. So 'Pending' must be treated as a 'Success' state.
+       if storage_class&.volume_binding_mode == "WaitForFirstConsumer"
+         return status == "Pending" || status == "Bound"
+       end
+       false
+     end
+
+     def deploy_failed?
+       status == "Lost" || failure_message.present?
+     end
+
+     def failure_message
+       if storage_class_name.nil? && @storage_classes.count(&:default?) > 1
+         "PVC has no StorageClass specified and there are multiple StorageClasses " \
+           "annotated as default. This is an invalid cluster configuration."
+       end
+     end
+
+     def timeout_message
+       return STANDARD_TIMEOUT_MESSAGE unless storage_class_name.present? && !storage_class
+       "PVC specified a StorageClass of #{storage_class_name} but the resource does not exist"
+     end
+
+     private
+
+     def storage_class_name
+       @definition.dig("spec", "storageClassName")
+     end
+
+     def storage_class
+       if storage_class_name.present?
+         @storage_classes.detect { |sc| sc.name == storage_class_name }
+       # storage_class_name = "" is an explicit request for no storage class
+       # storage_class_name = nil is an implicit request for the default storage class
+       elsif storage_class_name != ""
+         @storage_classes.detect(&:default?)
+       end
+     end
+
+     class StorageClass < KubernetesResource
+       DEFAULT_CLASS_ANNOTATION = "storageclass.kubernetes.io/is-default-class"
+       DEFAULT_CLASS_BETA_ANNOTATION = "storageclass.beta.kubernetes.io/is-default-class"
+
+       attr_reader :name
+
+       def initialize(definition)
+         @definition = definition
+         @name = definition.dig("metadata", "name").to_s
+       end
+
+       def volume_binding_mode
+         @definition.dig("volumeBindingMode")
+       end
+
+       def default?
+         @definition.dig("metadata", "annotations", DEFAULT_CLASS_ANNOTATION) == "true" ||
+           @definition.dig("metadata", "annotations", DEFAULT_CLASS_BETA_ANNOTATION) == "true"
+       end
+     end
+   end
+ end
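The failure_message above fires when a PVC omits spec.storageClassName while more than one StorageClass in the cluster is annotated as default. A minimal sketch of that default? check, using two hypothetical StorageClass definitions (plain Ruby):

    default_annotation = "storageclass.kubernetes.io/is-default-class"
    storage_classes = [
      { "metadata" => { "name" => "ssd", "annotations" => { default_annotation => "true" } } },
      { "metadata" => { "name" => "hdd", "annotations" => { default_annotation => "true" } } },
    ]
    defaults = storage_classes.count do |sc|
      sc.dig("metadata", "annotations", default_annotation) == "true"
    end
    # Two defaults make an unqualified PVC ambiguous: an invalid cluster configuration.
    puts defaults > 1 # => true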
data/lib/krane/kubernetes_resource/pod.rb
@@ -0,0 +1,269 @@
+ # frozen_string_literal: true
+ module Krane
+   class Pod < KubernetesResource
+     TIMEOUT = 10.minutes
+
+     FAILED_PHASE_NAME = "Failed"
+     TRANSIENT_FAILURE_REASONS = %w(
+       Evicted
+       Preempting
+     )
+
+     attr_accessor :stream_logs
+
+     def initialize(namespace:, context:, definition:, logger:,
+       statsd_tags: nil, parent: nil, deploy_started_at: nil, stream_logs: false)
+       @parent = parent
+       @deploy_started_at = deploy_started_at
+
+       @containers = definition.fetch("spec", {}).fetch("containers", []).map { |c| Container.new(c) }
+       unless @containers.present?
+         logger.summary.add_paragraph("Rendered template content:\n#{definition.to_yaml}")
+         raise FatalDeploymentError, "Template is missing required field spec.containers"
+       end
+       @containers += definition["spec"].fetch("initContainers", []).map { |c| Container.new(c, init_container: true) }
+       @stream_logs = stream_logs
+       super(namespace: namespace, context: context, definition: definition,
+             logger: logger, statsd_tags: statsd_tags)
+     end
+
+     def sync(_cache)
+       super
+       raise_predates_deploy_error if exists? && unmanaged? && !deploy_started?
+
+       if exists?
+         logs.sync if unmanaged?
+         update_container_statuses(@instance_data["status"])
+       else # reset
+         @containers.each(&:reset_status)
+       end
+     end
+
+     def after_sync
+       if @stream_logs
+         logs.print_latest
+       elsif unmanaged? && deploy_succeeded?
+         logs.print_all
+       end
+     end
+
+     def status
+       return phase if reason.blank?
+       "#{phase} (Reason: #{reason})"
+     end
+
+     def deploy_succeeded?
+       if unmanaged?
+         phase == "Succeeded"
+       else
+         phase == "Running" && ready?
+       end
+     end
+
+     def deploy_failed?
+       failure_message.present?
+     end
+
+     def timeout_message
+       if readiness_probe_failure?
+         probe_failure_msgs = @containers.map(&:readiness_fail_reason).compact
+         header = "The following containers have not passed their readiness probes on at least one pod:\n"
+         header + probe_failure_msgs.join("\n")
+       elsif failed_schedule_reason.present?
+         "Pod could not be scheduled because #{failed_schedule_reason}"
+       else
+         STANDARD_TIMEOUT_MESSAGE
+       end
+     end
+
+     def failure_message
+       doomed_containers = @containers.select(&:doomed?)
+       if doomed_containers.present?
+         container_problems = if unmanaged?
+           "The following containers encountered errors:\n"
+         else
+           "The following containers are in a state that is unlikely to be recoverable:\n"
+         end
+         doomed_containers.each do |c|
+           red_name = ColorizedString.new(c.name).red
+           container_problems += "> #{red_name}: #{c.doom_reason}\n"
+         end
+       end
+       "#{phase_failure_message} #{container_problems}".strip.presence
+     end
+
+     def fetch_debug_logs
+       logs.sync
+       logs
+     end
+
+     def print_debug_logs?
+       exists? && !@stream_logs # don't print them a second time
+     end
+
+     def node_name
+       @instance_data.dig('spec', 'nodeName')
+     end
+
+     private
+
+     def failed_schedule_reason
+       if phase == "Pending"
+         conditions = @instance_data.dig('status', 'conditions') || []
+         unschedulable = conditions.find do |condition|
+           condition["type"] == "PodScheduled" && condition["status"] == "False"
+         end
+         unschedulable&.dig('message')
+       end
+     end
+
+     def failed_phase?
+       phase == FAILED_PHASE_NAME
+     end
+
+     def transient_failure_reason?
+       return false if unmanaged?
+       TRANSIENT_FAILURE_REASONS.include?(reason)
+     end
+
+     def phase_failure_message
+       if failed_phase? && !transient_failure_reason?
+         return "Pod status: #{status}."
+       end
+
+       return unless unmanaged?
+
+       if terminating?
+         "Pod status: Terminating."
+       elsif disappeared?
+         "Pod status: Disappeared."
+       end
+     end
+
+     def logs
+       @logs ||= Krane::RemoteLogs.new(
+         logger: @logger,
+         parent_id: id,
+         container_names: @containers.map(&:name),
+         namespace: @namespace,
+         context: @context
+       )
+     end
+
+     def phase
+       @instance_data.dig("status", "phase") || "Unknown"
+     end
+
+     def reason
+       @instance_data.dig('status', 'reason')
+     end
+
+     def readiness_probe_failure?
+       return false if ready? || unmanaged?
+       return false if phase != "Running"
+       @containers.any?(&:readiness_fail_reason)
+     end
+
+     def ready?
+       return false unless (status_data = @instance_data["status"])
+       ready_condition = status_data.fetch("conditions", []).find { |condition| condition["type"] == "Ready" }
+       ready_condition.present? && (ready_condition["status"] == "True")
+     end
+
+     def update_container_statuses(status_data)
+       @containers.each do |c|
+         key = c.init_container? ? "initContainerStatuses" : "containerStatuses"
+         if status_data.key?(key)
+           data = status_data[key].find { |st| st["name"] == c.name }
+           c.update_status(data)
+         else
+           c.reset_status
+         end
+       end
+     end
+
+     def unmanaged?
+       @parent.blank?
+     end
+
+     def raise_predates_deploy_error
+       example_color = :green
+       msg = <<-STRING.strip_heredoc
+         Unmanaged pods like #{id} must have unique names on every deploy in order to work as intended.
+         The recommended way to achieve this is to include "<%= deployment_id %>" in the pod's name, like this:
+           #{ColorizedString.new('kind: Pod').colorize(example_color)}
+           #{ColorizedString.new('metadata:').colorize(example_color)}
+             #{ColorizedString.new("name: #{@name}-<%= deployment_id %>").colorize(example_color)}
+       STRING
+       @logger.summary.add_paragraph(msg)
+       raise FatalDeploymentError, "#{id} existed before the deploy started"
+     end
+
+     class Container
+       attr_reader :name
+
+       def initialize(definition, init_container: false)
+         @init_container = init_container
+         @name = definition["name"]
+         @image = definition["image"]
+         @http_probe_location = definition.dig("readinessProbe", "httpGet", "path")
+         @exec_probe_command = definition.dig("readinessProbe", "exec", "command")
+         @status = {}
+       end
+
+       def doomed?
+         doom_reason.present?
+       end
+
+       def doom_reason
+         limbo_reason = @status.dig("state", "waiting", "reason")
+         limbo_message = @status.dig("state", "waiting", "message")
+
+         if @status.dig("lastState", "terminated", "reason") == "ContainerCannotRun"
+           # ref: https://github.com/kubernetes/kubernetes/blob/562e721ece8a16e05c7e7d6bdd6334c910733ab2/pkg/kubelet/dockershim/docker_container.go#L353
+           exit_code = @status.dig('lastState', 'terminated', 'exitCode')
+           "Failed to start (exit #{exit_code}): #{@status.dig('lastState', 'terminated', 'message')}"
+         elsif @status.dig("state", "terminated", "reason") == "ContainerCannotRun"
+           exit_code = @status.dig('state', 'terminated', 'exitCode')
+           "Failed to start (exit #{exit_code}): #{@status.dig('state', 'terminated', 'message')}"
+         elsif limbo_reason == "CrashLoopBackOff"
+           exit_code = @status.dig('lastState', 'terminated', 'exitCode')
+           "Crashing repeatedly (exit #{exit_code}). See logs for more information."
+         elsif limbo_reason == "ErrImagePull" && limbo_message.match(/not found/i)
+           "Failed to pull image #{@image}. "\
+             "Did you wait for it to be built and pushed to the registry before deploying?"
+         elsif limbo_reason == "CreateContainerConfigError"
+           "Failed to generate container configuration: #{limbo_message}"
+         end
+       end
+
+       def readiness_fail_reason
+         return if ready? || init_container?
+         return unless (@http_probe_location || @exec_probe_command).present?
+
+         yellow_name = ColorizedString.new(name).yellow
+         if @http_probe_location
+           "> #{yellow_name} must respond with a good status code at '#{@http_probe_location}'"
+         elsif @exec_probe_command
+           "> #{yellow_name} must exit 0 from the following command: '#{@exec_probe_command.join(' ')}'"
+         end
+       end
+
+       def ready?
+         @status['ready'] == true
+       end
+
+       def init_container?
+         @init_container
+       end
+
+       def update_status(data)
+         @status = data || {}
+       end
+
+       def reset_status
+         @status = {}
+       end
+     end
+   end
+ end
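As a usage sketch of Container#doom_reason, consider a hypothetical containerStatuses entry for a crash-looping container: the waiting reason selects the "Crashing repeatedly" branch, and the exit code is read from lastState (plain Ruby; the status payload is invented for illustration):

    status = {
      "state"     => { "waiting" => { "reason"  => "CrashLoopBackOff",
                                      "message" => "back-off 5m0s restarting failed container" } },
      "lastState" => { "terminated" => { "exitCode" => 1 } },
    }
    limbo_reason = status.dig("state", "waiting", "reason")
    if limbo_reason == "CrashLoopBackOff"
      exit_code = status.dig("lastState", "terminated", "exitCode")
      puts "Crashing repeatedly (exit #{exit_code}). See logs for more information."
    end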