container_broker 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (126) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +98 -0
  3. data/Rakefile +8 -0
  4. data/app/controllers/application_controller.rb +5 -0
  5. data/app/controllers/healthcheck_controller.rb +21 -0
  6. data/app/controllers/nodes_controller.rb +70 -0
  7. data/app/controllers/nodes_healthcheck_controller.rb +28 -0
  8. data/app/controllers/status_controller.rb +48 -0
  9. data/app/controllers/tasks_controller.rb +83 -0
  10. data/app/controllers/tasks_healthcheck_controller.rb +28 -0
  11. data/app/jobs/add_task_tags_job.rb +13 -0
  12. data/app/jobs/adjust_node_slots_job.rb +27 -0
  13. data/app/jobs/application_job.rb +9 -0
  14. data/app/jobs/collect_load_metrics_job.rb +9 -0
  15. data/app/jobs/container_broker_base_job.rb +32 -0
  16. data/app/jobs/migrate_tasks_from_dead_node_job.rb +32 -0
  17. data/app/jobs/monitor_unresponsive_node_job.rb +21 -0
  18. data/app/jobs/monitor_unresponsive_nodes_job.rb +9 -0
  19. data/app/jobs/release_slot_job.rb +47 -0
  20. data/app/jobs/remove_runner_job.rb +11 -0
  21. data/app/jobs/remove_unused_tags_job.rb +25 -0
  22. data/app/jobs/request_id_from_task.rb +7 -0
  23. data/app/jobs/run_task_job.rb +64 -0
  24. data/app/jobs/run_tasks_for_all_execution_types_job.rb +11 -0
  25. data/app/jobs/run_tasks_job.rb +42 -0
  26. data/app/jobs/timeout_failed_tasks_job.rb +31 -0
  27. data/app/jobs/update_all_nodes_status_job.rb +9 -0
  28. data/app/jobs/update_node_status_job.rb +24 -0
  29. data/app/jobs/update_task_status_job.rb +71 -0
  30. data/app/models/mongoid_serializable_model.rb +14 -0
  31. data/app/models/node.rb +101 -0
  32. data/app/models/slot.rb +42 -0
  33. data/app/models/task.rb +148 -0
  34. data/app/models/task_tag.rb +11 -0
  35. data/app/observers/observable.rb +23 -0
  36. data/app/observers/task_observer.rb +11 -0
  37. data/app/serializers/node_healthcheck_serializer.rb +5 -0
  38. data/app/serializers/node_serializer.rb +5 -0
  39. data/app/serializers/status_panel_node_serializer.rb +9 -0
  40. data/app/serializers/status_panel_slot_serializer.rb +5 -0
  41. data/app/serializers/status_panel_task_serializer.rb +16 -0
  42. data/app/serializers/task_healthcheck_serializer.rb +5 -0
  43. data/app/serializers/task_serializer.rb +7 -0
  44. data/app/services/adjust_execution_type_slots.rb +51 -0
  45. data/app/services/check_for_slot_removal.rb +28 -0
  46. data/app/services/collect_load_metrics.rb +40 -0
  47. data/app/services/delete_node.rb +25 -0
  48. data/app/services/friendly_name_nodes.rb +10 -0
  49. data/app/services/friendly_name_slots.rb +15 -0
  50. data/app/services/kill_node_runners.rb +17 -0
  51. data/app/services/kill_task_container.rb +29 -0
  52. data/app/services/kubernetes_client.rb +136 -0
  53. data/app/services/least_used_node.rb +44 -0
  54. data/app/services/lock_manager.rb +74 -0
  55. data/app/services/lock_slot.rb +37 -0
  56. data/app/services/lock_task.rb +45 -0
  57. data/app/services/metrics.rb +43 -0
  58. data/app/services/migrate_runner.rb +26 -0
  59. data/app/services/node_task_acceptance.rb +18 -0
  60. data/app/services/node_usage_percentage_per_execution_type.rb +22 -0
  61. data/app/services/reschedule_tasks_for_missing_runners.rb +70 -0
  62. data/app/services/runners.rb +4 -0
  63. data/app/services/runners/docker/create_connection.rb +18 -0
  64. data/app/services/runners/docker/create_execution_info.rb +87 -0
  65. data/app/services/runners/docker/fetch_execution_info.rb +17 -0
  66. data/app/services/runners/docker/fetch_logs.rb +18 -0
  67. data/app/services/runners/docker/fetch_task_container.rb +15 -0
  68. data/app/services/runners/docker/filer.rb +19 -0
  69. data/app/services/runners/docker/kill_slot_runner.rb +19 -0
  70. data/app/services/runners/docker/node_availability.rb +11 -0
  71. data/app/services/runners/docker/remove_runner.rb +18 -0
  72. data/app/services/runners/docker/run_task.rb +63 -0
  73. data/app/services/runners/docker/update_node_status.rb +62 -0
  74. data/app/services/runners/execution_info.rb +49 -0
  75. data/app/services/runners/invalid_config.rb +5 -0
  76. data/app/services/runners/invalid_runner.rb +5 -0
  77. data/app/services/runners/kubernetes/create_client.rb +29 -0
  78. data/app/services/runners/kubernetes/create_execution_info.rb +103 -0
  79. data/app/services/runners/kubernetes/fetch_execution_info.rb +15 -0
  80. data/app/services/runners/kubernetes/fetch_logs.rb +17 -0
  81. data/app/services/runners/kubernetes/filer.rb +41 -0
  82. data/app/services/runners/kubernetes/kill_slot_runner.rb +11 -0
  83. data/app/services/runners/kubernetes/node_availability.rb +11 -0
  84. data/app/services/runners/kubernetes/remove_runner.rb +19 -0
  85. data/app/services/runners/kubernetes/run_task.rb +54 -0
  86. data/app/services/runners/kubernetes/update_node_status.rb +64 -0
  87. data/app/services/runners/runner_id_not_found_error.rb +5 -0
  88. data/app/services/runners/services_factory.rb +38 -0
  89. data/app/services/runners/update_node_status_helper.rb +43 -0
  90. data/app/services/slots_usage_percentage.rb +18 -0
  91. data/config/application.rb +34 -0
  92. data/config/boot.rb +5 -0
  93. data/config/environment.rb +7 -0
  94. data/config/environments/test.rb +44 -0
  95. data/config/initializers/application_controller_renderer.rb +10 -0
  96. data/config/initializers/backtrace_silencers.rb +9 -0
  97. data/config/initializers/config.rb +51 -0
  98. data/config/initializers/cookies_serializer.rb +7 -0
  99. data/config/initializers/docker_config.rb +3 -0
  100. data/config/initializers/filter_parameter_logging.rb +6 -0
  101. data/config/initializers/idempotent_request.rb +12 -0
  102. data/config/initializers/inflections.rb +18 -0
  103. data/config/initializers/mime_types.rb +6 -0
  104. data/config/initializers/mongoid.rb +3 -0
  105. data/config/initializers/new_framework_defaults_6_0.rb +47 -0
  106. data/config/initializers/raven.rb +10 -0
  107. data/config/initializers/sidekiq.rb +24 -0
  108. data/config/initializers/wrap_parameters.rb +16 -0
  109. data/config/locales/en.yml +33 -0
  110. data/config/mongoid.yml +10 -0
  111. data/config/routes.rb +43 -0
  112. data/config/secrets.yml +35 -0
  113. data/config/settings.yml +34 -0
  114. data/config/settings/test.yml +27 -0
  115. data/config/sidekiq_scheduler.yml +18 -0
  116. data/config/spring.rb +8 -0
  117. data/lib/constants.rb +12 -0
  118. data/lib/container_broker.rb +30 -0
  119. data/lib/container_broker/engine.rb +6 -0
  120. data/lib/container_broker/version.rb +5 -0
  121. data/lib/current_thread_request_id.rb +19 -0
  122. data/lib/idempotent_request/callback.rb +25 -0
  123. data/lib/idempotent_request/policy.rb +15 -0
  124. data/lib/redis_url_parser.rb +25 -0
  125. data/lib/tasks/task.rake +34 -0
  126. metadata +590 -0
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
# Destroys a slot when AdjustExecutionTypeSlots#decrement? answers truthy for
# the slot's node and execution type, then re-runs FriendlyNameSlots on that
# node.
class CheckForSlotRemoval
  attr_reader :slot

  # @param slot the slot that is a candidate for removal; must respond to
  #   #node, #execution_type and #destroy!
  def initialize(slot:)
    @slot = slot
    # Explicit initial state so #removed? always answers with a boolean
    # instead of reading an unset instance variable (nil).
    @removed = false
  end

  # Removes the slot if AdjustExecutionTypeSlots allows a decrement.
  #
  # @return [true, nil] true when the slot was destroyed, nil when nothing
  #   was done
  def perform
    return unless adjust_execution_type_slots_instance.decrement?

    slot.destroy!
    # The node is reloaded before renaming so the destroyed slot is not part
    # of the slot list handed to FriendlyNameSlots.
    FriendlyNameSlots.new(node: slot.node.reload).perform

    @removed = true
  end

  # @return [Boolean] whether #perform destroyed the slot
  def removed?
    @removed
  end

  private

  # Builds the decision object for this slot's node and execution type.
  def adjust_execution_type_slots_instance
    AdjustExecutionTypeSlots.new(node: slot.node, execution_type: slot.execution_type)
  end
end
@@ -0,0 +1,40 @@
1
+ # frozen_string_literal: true
2
+
3
# Publishes load figures through Metrics: task counts, slot counts and the
# slot usage percentage per execution type.
class CollectLoadMetrics
  # Emits the three metrics in order: tasks_count, slots_count,
  # slots_usage_percent.
  def perform
    send_tasks_count
    send_slots_count
    send_slots_usage_percent
  end

  private

  # Counts tasks whose status is one of the listed values.
  def send_tasks_count
    counted_tasks = Task.where(:status.in => %w[waiting starting started retry failed])
    send_metrics("tasks_count", counted_tasks)
  end

  # Counts the slots of every node returned by Node.available.
  def send_slots_count
    send_metrics("slots_count", Node.available.flat_map(&:slots))
  end

  # Emits one slots_usage_percent data point per execution type.
  def send_slots_usage_percent
    slots_by_type = Node.available.flat_map(&:slots).group_by(&:execution_type)

    slots_by_type.each do |execution_type, slots|
      reporter = Metrics.new("slots_usage_percent")
      reporter.count(execution_type: execution_type, percent: SlotsUsagePercentage.new(slots).perform)
    end
  end

  # Groups items by (execution_type, status) and emits one data point per
  # group carrying the group size as :amount.
  def send_metrics(metric, items)
    totals =
      items
      .group_by { |item| { execution_type: item.execution_type, status: item.status } }
      .transform_values(&:count)

    totals.each do |labels, total|
      Metrics.new(metric).count(labels.merge(amount: total))
    end
  end
end
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
# Destroys a node unless any of its slots is still working.
class DeleteNode
  # Raised by #perform when node.slots.working is not empty.
  class NodeWithRunningSlotsError < StandardError; end

  attr_reader :node

  # @param node the node to delete
  def initialize(node:)
    @node = node
  end

  # Stops task acceptance first, then destroys the node. When working slots
  # are found, acceptance is switched back on (only if this call switched it
  # off) and the deletion is refused.
  #
  # @raise [NodeWithRunningSlotsError] when the node has working slots
  # @return the result of node.destroy!
  def perform
    acceptance_was_stopped = stop_accepting_tasks

    return node.destroy! unless node.slots.working.any?

    NodeTaskAcceptance.new(node: node).accept! if acceptance_was_stopped
    raise NodeWithRunningSlotsError
  end

  private

  # Rejects new tasks when the node currently accepts them.
  #
  # @return [Boolean] true when acceptance was switched off by this call
  def stop_accepting_tasks
    return false unless node.accept_new_tasks

    NodeTaskAcceptance.new(node: node).reject!
    true
  end
end
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
# Assigns sequential friendly names to every node and renames each node's
# slots right after the node itself.
class FriendlyNameNodes
  # Names follow "n<two-digit position><runner_provider.first>", with
  # positions starting at 1 in the order returned by Node.order.
  def perform
    ordered_nodes = Node.order(runner_provider: :desc, hostname: :asc, id: :asc)

    ordered_nodes.each_with_index do |node, position|
      friendly_name = format("n%02d%s", position + 1, node.runner_provider.first)
      node.update(name: friendly_name)
      FriendlyNameSlots.new(node: node).perform
    end
  end
end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
# Gives every slot of a node a name derived from the node name, the slot's
# position and its execution type.
class FriendlyNameSlots
  attr_reader :node

  # @param node the node whose slots are renamed
  def initialize(node:)
    @node = node
  end

  # Names follow "<node name>-s<two-digit position>-<execution type>", with
  # positions starting at 1 in the order node.slots yields them.
  def perform
    node.slots.each_with_index do |slot, position|
      slot.update(name: format("%s-s%02d-%s", node.name, position + 1, slot.execution_type))
    end
  end
end
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
# Invokes the node's :kill_slot_runner service for each of its running slots.
class KillNodeRunners
  attr_reader :node

  # @param node the node whose running slots are processed
  def initialize(node:)
    @node = node
  end

  # Requests the runner service once per slot and hands it that slot.
  def perform
    node.slots.running.each do |slot|
      killer = node.runner_service(:kill_slot_runner)
      killer.perform(slot: slot)
    end
  end
end
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
# Kills the runner of a started task through the :kill_slot_runner service of
# the node that owns the task's slot.
class KillTaskContainer
  # Raised by #perform when task.started? is falsy.
  class TaskNotRunningError < StandardError; end

  attr_reader :task

  delegate :node, to: :task

  # @param task the task whose runner should be killed
  def initialize(task:)
    @task = task
  end

  # @raise [TaskNotRunningError] when the task is not started
  # @return the result of the runner service's #perform
  def perform
    validate_task_status

    runner_killer = task.slot.node.runner_service(:kill_slot_runner)
    runner_killer.perform(slot: task.slot)
  end

  private

  # Only started tasks may be killed.
  def validate_task_status
    return if task.started?

    raise TaskNotRunningError, "#{task} is not running"
  end
end
@@ -0,0 +1,136 @@
1
+ # frozen_string_literal: true
2
+
3
# Pod operations against a Kubernetes API server through Kubeclient, scoped to
# one namespace. Kubeclient and connection failures are translated by
# #handle_exception into the error classes declared here.
class KubernetesClient
  class PodNotFoundError < StandardError; end
  class NetworkError < StandardError; end
  class LogsNotFoundError < StandardError; end

  # HTTP error code that #fetch_pod_logs maps to LogsNotFoundError.
  LOG_UNAVAILABLE_HTTP_ERROR = 400

  attr_reader :uri, :bearer_token, :namespace

  # @param uri [String] API server address; only scheme, host and port are used
  # @param bearer_token [String] token sent in the auth options
  # @param namespace [String] namespace used by every pod operation
  def initialize(uri:, bearer_token:, namespace:)
    @uri = uri
    @bearer_token = bearer_token
    @namespace = namespace
  end

  # @return whatever Kubeclient's #api returns
  def api_info
    handle_exception { pod_client.api }
  end

  # rubocop:disable Metrics/ParameterLists

  # Creates a single-container pod with restart policy "Never". The
  # node_selector value is used both as a nodeSelector label (with an empty
  # value) and as the key of a NoSchedule toleration.
  #
  # @return [String] pod_name
  def create_pod(pod_name:, image:, cmd:, internal_mounts: [], external_mounts: [], node_selector:)
    handle_exception do
      pod = pod_resource(
        pod_name: pod_name,
        container: container_options(name: pod_name, image: image, cmd: cmd, internal_mounts: internal_mounts),
        external_mounts: external_mounts,
        node_selector: node_selector
      )

      pod_client.create_pod(pod)

      pod_name
    end
  end
  # rubocop:enable Metrics/ParameterLists

  # @raise [LogsNotFoundError] when the API answers with error code 400
  # @return the body of the pod log response
  def fetch_pod_logs(pod_name:)
    handle_exception(pod_name) { pod_log_body(pod_name) }
  end

  def fetch_pod(pod_name:)
    handle_exception(pod_name) { pod_client.get_pod(pod_name, namespace) }
  end

  # Deletes the pod with a grace period of zero seconds.
  def force_delete_pod(pod_name:)
    handle_exception(pod_name) do
      pod_client.delete_pod(pod_name, namespace, delete_options: delete_options)
    end
  end

  # @return [Hash] pods of the namespace keyed by pod name
  def fetch_pods
    handle_exception do
      pod_client.get_pods(namespace: namespace).to_h { |pod| [pod.metadata.name, pod] }
    end
  end

  # Runs the block and translates low-level failures.
  #
  # @param pod_name [String, nil] only used in the PodNotFoundError message
  # @raise [PodNotFoundError] on Kubeclient::ResourceNotFoundError
  # @raise [NetworkError] on other Kubeclient HTTP errors, socket errors,
  #   refused connections and SSL errors
  def handle_exception(pod_name = nil)
    yield
  rescue Kubeclient::ResourceNotFoundError
    raise PodNotFoundError, "Pod not found #{pod_name}"
  rescue Kubeclient::HttpError, SocketError, Errno::ECONNREFUSED, OpenSSL::SSL::SSLError => e
    raise NetworkError, "#{e.class}: #{e.message}"
  end

  private

  # Builds the pod resource submitted by #create_pod.
  def pod_resource(pod_name:, container:, external_mounts:, node_selector:)
    Kubeclient::Resource.new(
      metadata: { name: pod_name, namespace: namespace },
      spec: {
        containers: [container],
        restartPolicy: "Never",
        nodeSelector: { node_selector => "" },
        tolerations: [{ key: node_selector, effect: "NoSchedule" }],
        volumes: external_mounts
      }
    )
  end

  # Reads the pod log body; error code 400 becomes LogsNotFoundError and any
  # other HTTP error is re-raised untouched for #handle_exception.
  def pod_log_body(pod_name)
    pod_client.get_pod_log(pod_name, namespace).body
  rescue Kubeclient::HttpError => e
    raise unless e.error_code == LOG_UNAVAILABLE_HTTP_ERROR

    raise LogsNotFoundError
  end

  # Container definition; resources and security context come from Settings.
  def container_options(name:, image:, cmd:, internal_mounts:)
    settings = Settings.kubernetes

    {
      name: name,
      image: image,
      command: ["sh", "-c", cmd],
      resources: {
        requests: { cpu: settings.requests.cpu, memory: settings.requests.memory },
        limits: { cpu: settings.limits.cpu, memory: settings.limits.memory }
      },
      volumeMounts: internal_mounts,
      securityContext: {
        runAsUser: Settings.run_container_as.user_id,
        runAsGroup: Settings.run_container_as.group_id
      },
      livenessProbe: liveness_probe_options(internal_mounts: internal_mounts)
    }
  end

  # Probe that runs `ls` on every internal mount path; nil when there are no
  # internal mounts.
  def liveness_probe_options(internal_mounts:)
    return if internal_mounts.empty?

    mount_paths = internal_mounts.map { |mount| mount[:mountPath] }

    {
      exec: { command: ["ls", *mount_paths] },
      periodSeconds: Settings.kubernetes.liveness_probe_seconds_interval
    }
  end

  def delete_options
    Kubeclient::Resource.new(apiVersion: "v1", gracePeriodSeconds: 0, kind: "DeleteOptions")
  end

  # A new Kubeclient client is built on every call.
  # NOTE(review): verify_ssl is false, so the API server certificate is not
  # verified - confirm this is intended.
  def pod_client
    Kubeclient::Client.new(
      build_client_uri(path: "/api"),
      "v1",
      auth_options: { bearer_token: bearer_token },
      ssl_options: { verify_ssl: false }
    )
  end

  # Rebuilds the configured URI from its scheme, host and port plus the given
  # path; every other component of the configured URI is dropped.
  def build_client_uri(path:)
    base = URI.parse(uri)
    URI::Generic.build(host: base.host, port: base.port, scheme: base.scheme, path: path).to_s
  end
end
@@ -0,0 +1,44 @@
1
+ # frozen_string_literal: true
2
+
3
# Picks a node for an execution type: among nodes that accept new tasks, are
# available and have an available slot of that type, one of those with the
# lowest SlotsUsagePercentage.
class LeastUsedNode
  attr_reader :execution_type

  # @param execution_type the execution type whose slots are considered
  def initialize(execution_type:)
    @execution_type = execution_type
  end

  # @return a random node among the least used ones, or nil when no node
  #   qualifies
  def call
    least_used_nodes&.sample
  end

  private

  # Nodes sharing the lowest usage value; nil when no node qualifies.
  def least_used_nodes
    lowest_usage = nodes_by_usage.keys.min
    nodes_by_usage[lowest_usage]
  end

  # Qualifying nodes grouped by their slot usage percentage.
  def nodes_by_usage
    @nodes_by_usage ||= begin
      # #to_a covers nodes without any slot of this execution type (nil).
      candidates = nodes.filter { |node| slots(node.id).to_a.any?(&:available?) }
      candidates.group_by { |node| SlotsUsagePercentage.new(slots(node.id)).perform }
    end
  end

  def nodes
    @nodes ||= Node.accepting_new_tasks.available
  end

  # Slots of this execution type belonging to the given node, or nil.
  # All candidate nodes' slots are loaded in a single query and memoized.
  def slots(node_id)
    @slots ||=
      Slot
      .only(:id, :node_id, :status)
      .where(:node_id.in => nodes.map(&:id))
      .where(execution_type: execution_type)
      .group_by(&:node_id)

    @slots[node_id]
  end
end
@@ -0,0 +1,74 @@
1
+ # frozen_string_literal: true
2
+
3
# Redis-backed lock stored under the key "lockmanager-<type>-<id>" with an
# expiry of `expire` seconds.
#
# NOTE(review): the stored value is always 1, so a lock carries no owner
# token. #unlock! deletes whatever is stored under the key and #keep_locked
# extends it, even if this holder's lock already expired and another holder
# acquired the key in the meantime.
class LockManager
  attr_reader :expire, :wait, :locked, :key
  KEY_PREFIX = "lockmanager"

  # @param type part of the lock key
  # @param id part of the lock key
  # @param expire expiry passed to Redis SET as `ex`
  # @param wait [Boolean] when true, #lock! polls until the lock is acquired
  def initialize(type:, id:, expire:, wait: true)
    @key = "#{KEY_PREFIX}-#{type}-#{id}"
    @expire = expire
    @wait = wait
    # Explicit initial state so #locked is false rather than nil before the
    # first attempt.
    @locked = false
  end

  # Acquires the lock, yields this manager and always releases afterwards.
  #
  # @return the block's value, or false when the lock was not acquired
  def lock
    return false unless lock!

    begin
      yield(self)
    ensure
      unlock!
    end
  end

  # Tries to acquire the lock; with `wait` it retries every 0.1s and has no
  # upper bound on the waiting time.
  #
  # @return the value of #locked after the attempt(s)
  def lock!
    try_lock

    if wait
      until locked
        sleep 0.1
        try_lock
      end
    end

    locked
  end

  # Deletes the lock key and marks this manager as unlocked.
  def unlock!
    redis_client.del(key)
    @locked = false
  end

  # Re-sets the key with a fresh expiry, only if the key still exists.
  #
  # @raise [RuntimeError] when this manager never acquired the lock or the
  #   key no longer exists
  def keep_locked
    raise "Lock not acquired" unless locked

    if redis_set(xx: true)
      puts "[LockManager] lock extended by #{expire}"
    else
      raise "[LockManager] Lock expired"
    end
  end

  # @return [Hash] every key starting with KEY_PREFIX mapped to its TTL
  def self.active_locks
    # One connection for the whole listing instead of one per key.
    # NOTE(review): KEYS walks the whole keyspace; consider SCAN if this runs
    # against a large Redis instance.
    client = redis_client
    client.keys("#{KEY_PREFIX}*").each_with_object({}) do |key, result|
      result[key] = client.ttl(key)
    end
  end

  # Single acquisition attempt (SET with nx).
  def try_lock
    @locked = redis_set(nx: true)
  end

  # SET key 1 with the given condition flags plus the configured expiry.
  def redis_set(options)
    redis_client.set(key, 1, **options.merge(ex: expire))
  end

  # Connection used by this manager. It is created on first use and reused,
  # so a polling #lock! / #unlock! cycle no longer opens a new Redis
  # connection for every command.
  def redis_client
    @redis_client ||= LockManager.redis_client
  end

  # Builds a new Redis connection from Settings.redis_url on every call.
  def self.redis_client
    Redis.new(RedisUrlParser.call(Settings.redis_url))
  end
end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
# Reserves one available slot of an execution type by switching its status to
# "attaching" in a single find-and-update call.
class LockSlot
  attr_reader :execution_type, :node

  # @param execution_type the execution type the slot must have
  # @param node optional node to take the slot from; when nil, the node is
  #   chosen by LeastUsedNode
  def initialize(execution_type:, node: nil)
    @execution_type = execution_type
    @node = node
  end

  # @return the result of find_one_and_update (requested with
  #   return_document: :after), or nil when no node could be selected
  def perform
    return unless selected_node

    free_slots = selected_node.slots.available.where(execution_type: execution_type)
    free_slots.find_one_and_update({ "$set" => { status: "attaching" } }, return_document: :after)
  end

  private

  # The explicitly given node wins; otherwise LeastUsedNode is asked.
  def selected_node
    @selected_node ||= node || LeastUsedNode.new(execution_type: execution_type).call
  end
end