container_broker 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +98 -0
  3. data/Rakefile +8 -0
  4. data/app/controllers/application_controller.rb +5 -0
  5. data/app/controllers/healthcheck_controller.rb +21 -0
  6. data/app/controllers/nodes_controller.rb +70 -0
  7. data/app/controllers/nodes_healthcheck_controller.rb +28 -0
  8. data/app/controllers/status_controller.rb +48 -0
  9. data/app/controllers/tasks_controller.rb +83 -0
  10. data/app/controllers/tasks_healthcheck_controller.rb +28 -0
  11. data/app/jobs/add_task_tags_job.rb +13 -0
  12. data/app/jobs/adjust_node_slots_job.rb +27 -0
  13. data/app/jobs/application_job.rb +9 -0
  14. data/app/jobs/collect_load_metrics_job.rb +9 -0
  15. data/app/jobs/container_broker_base_job.rb +32 -0
  16. data/app/jobs/migrate_tasks_from_dead_node_job.rb +32 -0
  17. data/app/jobs/monitor_unresponsive_node_job.rb +21 -0
  18. data/app/jobs/monitor_unresponsive_nodes_job.rb +9 -0
  19. data/app/jobs/release_slot_job.rb +47 -0
  20. data/app/jobs/remove_runner_job.rb +11 -0
  21. data/app/jobs/remove_unused_tags_job.rb +25 -0
  22. data/app/jobs/request_id_from_task.rb +7 -0
  23. data/app/jobs/run_task_job.rb +64 -0
  24. data/app/jobs/run_tasks_for_all_execution_types_job.rb +11 -0
  25. data/app/jobs/run_tasks_job.rb +42 -0
  26. data/app/jobs/timeout_failed_tasks_job.rb +31 -0
  27. data/app/jobs/update_all_nodes_status_job.rb +9 -0
  28. data/app/jobs/update_node_status_job.rb +24 -0
  29. data/app/jobs/update_task_status_job.rb +71 -0
  30. data/app/models/mongoid_serializable_model.rb +14 -0
  31. data/app/models/node.rb +101 -0
  32. data/app/models/slot.rb +42 -0
  33. data/app/models/task.rb +148 -0
  34. data/app/models/task_tag.rb +11 -0
  35. data/app/observers/observable.rb +23 -0
  36. data/app/observers/task_observer.rb +11 -0
  37. data/app/serializers/node_healthcheck_serializer.rb +5 -0
  38. data/app/serializers/node_serializer.rb +5 -0
  39. data/app/serializers/status_panel_node_serializer.rb +9 -0
  40. data/app/serializers/status_panel_slot_serializer.rb +5 -0
  41. data/app/serializers/status_panel_task_serializer.rb +16 -0
  42. data/app/serializers/task_healthcheck_serializer.rb +5 -0
  43. data/app/serializers/task_serializer.rb +7 -0
  44. data/app/services/adjust_execution_type_slots.rb +51 -0
  45. data/app/services/check_for_slot_removal.rb +28 -0
  46. data/app/services/collect_load_metrics.rb +40 -0
  47. data/app/services/delete_node.rb +25 -0
  48. data/app/services/friendly_name_nodes.rb +10 -0
  49. data/app/services/friendly_name_slots.rb +15 -0
  50. data/app/services/kill_node_runners.rb +17 -0
  51. data/app/services/kill_task_container.rb +29 -0
  52. data/app/services/kubernetes_client.rb +136 -0
  53. data/app/services/least_used_node.rb +44 -0
  54. data/app/services/lock_manager.rb +74 -0
  55. data/app/services/lock_slot.rb +37 -0
  56. data/app/services/lock_task.rb +45 -0
  57. data/app/services/metrics.rb +43 -0
  58. data/app/services/migrate_runner.rb +26 -0
  59. data/app/services/node_task_acceptance.rb +18 -0
  60. data/app/services/node_usage_percentage_per_execution_type.rb +22 -0
  61. data/app/services/reschedule_tasks_for_missing_runners.rb +70 -0
  62. data/app/services/runners.rb +4 -0
  63. data/app/services/runners/docker/create_connection.rb +18 -0
  64. data/app/services/runners/docker/create_execution_info.rb +87 -0
  65. data/app/services/runners/docker/fetch_execution_info.rb +17 -0
  66. data/app/services/runners/docker/fetch_logs.rb +18 -0
  67. data/app/services/runners/docker/fetch_task_container.rb +15 -0
  68. data/app/services/runners/docker/filer.rb +19 -0
  69. data/app/services/runners/docker/kill_slot_runner.rb +19 -0
  70. data/app/services/runners/docker/node_availability.rb +11 -0
  71. data/app/services/runners/docker/remove_runner.rb +18 -0
  72. data/app/services/runners/docker/run_task.rb +63 -0
  73. data/app/services/runners/docker/update_node_status.rb +62 -0
  74. data/app/services/runners/execution_info.rb +49 -0
  75. data/app/services/runners/invalid_config.rb +5 -0
  76. data/app/services/runners/invalid_runner.rb +5 -0
  77. data/app/services/runners/kubernetes/create_client.rb +29 -0
  78. data/app/services/runners/kubernetes/create_execution_info.rb +103 -0
  79. data/app/services/runners/kubernetes/fetch_execution_info.rb +15 -0
  80. data/app/services/runners/kubernetes/fetch_logs.rb +17 -0
  81. data/app/services/runners/kubernetes/filer.rb +41 -0
  82. data/app/services/runners/kubernetes/kill_slot_runner.rb +11 -0
  83. data/app/services/runners/kubernetes/node_availability.rb +11 -0
  84. data/app/services/runners/kubernetes/remove_runner.rb +19 -0
  85. data/app/services/runners/kubernetes/run_task.rb +54 -0
  86. data/app/services/runners/kubernetes/update_node_status.rb +64 -0
  87. data/app/services/runners/runner_id_not_found_error.rb +5 -0
  88. data/app/services/runners/services_factory.rb +38 -0
  89. data/app/services/runners/update_node_status_helper.rb +43 -0
  90. data/app/services/slots_usage_percentage.rb +18 -0
  91. data/config/application.rb +34 -0
  92. data/config/boot.rb +5 -0
  93. data/config/environment.rb +7 -0
  94. data/config/environments/test.rb +44 -0
  95. data/config/initializers/application_controller_renderer.rb +10 -0
  96. data/config/initializers/backtrace_silencers.rb +9 -0
  97. data/config/initializers/config.rb +51 -0
  98. data/config/initializers/cookies_serializer.rb +7 -0
  99. data/config/initializers/docker_config.rb +3 -0
  100. data/config/initializers/filter_parameter_logging.rb +6 -0
  101. data/config/initializers/idempotent_request.rb +12 -0
  102. data/config/initializers/inflections.rb +18 -0
  103. data/config/initializers/mime_types.rb +6 -0
  104. data/config/initializers/mongoid.rb +3 -0
  105. data/config/initializers/new_framework_defaults_6_0.rb +47 -0
  106. data/config/initializers/raven.rb +10 -0
  107. data/config/initializers/sidekiq.rb +24 -0
  108. data/config/initializers/wrap_parameters.rb +16 -0
  109. data/config/locales/en.yml +33 -0
  110. data/config/mongoid.yml +10 -0
  111. data/config/routes.rb +43 -0
  112. data/config/secrets.yml +35 -0
  113. data/config/settings.yml +34 -0
  114. data/config/settings/test.yml +27 -0
  115. data/config/sidekiq_scheduler.yml +18 -0
  116. data/config/spring.rb +8 -0
  117. data/lib/constants.rb +12 -0
  118. data/lib/container_broker.rb +30 -0
  119. data/lib/container_broker/engine.rb +6 -0
  120. data/lib/container_broker/version.rb +5 -0
  121. data/lib/current_thread_request_id.rb +19 -0
  122. data/lib/idempotent_request/callback.rb +25 -0
  123. data/lib/idempotent_request/policy.rb +15 -0
  124. data/lib/redis_url_parser.rb +25 -0
  125. data/lib/tasks/task.rake +34 -0
  126. metadata +590 -0
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
# Destroys a slot when AdjustExecutionTypeSlots#decrement? is truthy for the
# slot's node and execution type, then renames the node's remaining slots.
class CheckForSlotRemoval
  attr_reader :slot

  # @param slot [Object] the slot to evaluate; must respond to #node,
  #   #execution_type and #destroy!
  def initialize(slot:)
    @slot = slot
    # Set up front so #removed? answers false (not nil) when #perform has not
    # run or decided to keep the slot.
    @removed = false
  end

  # Removes the slot if AdjustExecutionTypeSlots asks for a decrement.
  #
  # @return [true, nil] true when the slot was destroyed, nil otherwise
  def perform
    return unless adjust_execution_type_slots_instance.decrement?

    slot.destroy!
    # The node is reloaded before renaming — presumably so the destroyed slot
    # is no longer part of node.slots; confirm against the Node model.
    FriendlyNameSlots.new(node: slot.node.reload).perform

    @removed = true
  end

  # @return [Boolean] whether #perform destroyed the slot
  def removed?
    @removed
  end

  private

  # Builds a new AdjustExecutionTypeSlots for the slot's node and execution type.
  def adjust_execution_type_slots_instance
    AdjustExecutionTypeSlots.new(node: slot.node, execution_type: slot.execution_type)
  end
end
@@ -0,0 +1,40 @@
1
+ # frozen_string_literal: true
2
+
3
# Publishes load metrics through Metrics: task counts, slot counts and the
# slot usage percentage per execution type.
class CollectLoadMetrics
  # Emits the "tasks_count", "slots_count" and "slots_usage_percent" metrics,
  # in that order.
  def perform
    send_tasks_count
    send_slots_count
    send_slots_usage_percent
  end

  private

  # Counts tasks whose status is one of the listed values.
  def send_tasks_count
    tracked_tasks = Task.where(:status.in => %w[waiting starting started retry failed])
    send_metrics("tasks_count", tracked_tasks)
  end

  # Counts the slots of every available node.
  def send_slots_count
    send_metrics("slots_count", available_slots)
  end

  # Sends one usage percentage per execution type, computed by
  # SlotsUsagePercentage over the slots of that type.
  def send_slots_usage_percent
    available_slots.group_by(&:execution_type).each do |type, type_slots|
      reporter = Metrics.new("slots_usage_percent")
      usage = SlotsUsagePercentage.new(type_slots).perform
      reporter.count(execution_type: type, percent: usage)
    end
  end

  # Sends one data point per distinct (execution_type, status) pair found in
  # +items+, carrying the number of items in that pair as :amount.
  def send_metrics(metric, items)
    totals = Hash.new(0)
    items.each do |item|
      totals[{ execution_type: item.execution_type, status: item.status }] += 1
    end

    totals.each do |labels, total|
      Metrics.new(metric).count(labels.merge(amount: total))
    end
  end

  # Slots of all nodes returned by Node.available, flattened into one list.
  def available_slots
    Node.available.flat_map(&:slots)
  end
end
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
# Destroys a node, refusing to do so while any of its slots is working.
class DeleteNode
  # Raised by #perform when the node still has working slots.
  class NodeWithRunningSlotsError < StandardError; end

  attr_reader :node

  # @param node [Object] the node to delete
  def initialize(node:)
    @node = node
  end

  # Stops the node from accepting tasks (if it was accepting them), then
  # destroys it. When working slots are found the previous acceptance state is
  # restored and the deletion is aborted.
  #
  # @raise [NodeWithRunningSlotsError] when node.slots.working.any? is true
  # @return [Object] whatever node.destroy! returns
  def perform
    previously_accepting = node.accept_new_tasks
    NodeTaskAcceptance.new(node: node).reject! if previously_accepting

    return node.destroy! unless node.slots.working.any?

    # Put the node back the way it was found before reporting the failure.
    NodeTaskAcceptance.new(node: node).accept! if previously_accepting
    raise NodeWithRunningSlotsError
  end
end
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
# Assigns sequential friendly names to every node and then renames each
# node's slots.
class FriendlyNameNodes
  # Walks the nodes ordered by runner_provider (desc), hostname (asc) and id
  # (asc), naming each "n<two-digit position><runner_provider.first>".
  def perform
    ordered_nodes = Node.order(runner_provider: :desc, hostname: :asc, id: :asc)

    ordered_nodes.each_with_index do |node, position|
      # NOTE(review): runner_provider.first presumably yields the provider's
      # first character (ActiveSupport String#first) — confirm the field type.
      provider_initial = node.runner_provider.first
      friendly_name = "n#{format("%02d%s", position + 1, provider_initial)}"

      node.update(name: friendly_name)
      FriendlyNameSlots.new(node: node).perform
    end
  end
end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
# Gives every slot of a node a friendly name derived from the node's name,
# the slot's position and its execution type.
class FriendlyNameSlots
  attr_reader :node

  # @param node [Object] node whose slots are renamed
  def initialize(node:)
    @node = node
  end

  # Names each slot "<node name>-s<two-digit position>-<execution type>",
  # with positions starting at 01 in the order node.slots yields them.
  def perform
    node.slots.each_with_index do |slot, position|
      ordinal = format("%02d", position + 1)
      slot.update(name: "#{node.name}-s#{ordinal}-#{slot.execution_type}")
    end
  end
end
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
# Kills the runner of every running slot on a node.
class KillNodeRunners
  attr_reader :node

  # @param node [Object] node whose running slots are targeted
  def initialize(node:)
    @node = node
  end

  # Invokes the node's :kill_slot_runner service once per running slot.
  def perform
    running_slots = node.slots.running
    running_slots.each { |slot| kill_runner_for(slot) }
  end

  private

  # A service is requested from the node for each slot rather than shared.
  def kill_runner_for(slot)
    node.runner_service(:kill_slot_runner).perform(slot: slot)
  end
end
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
# Kills the runner attached to a started task's slot.
class KillTaskContainer
  # Raised by #perform when the task is not in the started state.
  class TaskNotRunningError < StandardError; end

  attr_reader :task

  delegate :node, to: :task

  # @param task [Object] the task whose runner should be killed
  def initialize(task:)
    @task = task
  end

  # Checks the task is started, then runs the :kill_slot_runner service of the
  # node that owns the task's slot.
  #
  # @raise [TaskNotRunningError] when task.started? is falsy
  def perform
    ensure_task_started!

    # The service is looked up through the slot's node, not the delegated #node.
    killer = task.slot.node.runner_service(:kill_slot_runner)
    killer.perform(slot: task.slot)
  end

  private

  def ensure_task_started!
    return if task.started?

    raise TaskNotRunningError, "#{task} is not running"
  end
end
@@ -0,0 +1,136 @@
1
+ # frozen_string_literal: true
2
+
3
# Wrapper around Kubeclient for creating, inspecting and deleting pods in one
# namespace. Kubeclient and low-level network errors are translated into the
# error classes declared here.
class KubernetesClient
  class PodNotFoundError < StandardError; end
  class NetworkError < StandardError; end
  class LogsNotFoundError < StandardError; end

  # HTTP status that #fetch_pod_logs reports as LogsNotFoundError.
  LOG_UNAVAILABLE_HTTP_ERROR = 400

  attr_reader :uri, :bearer_token, :namespace

  # @param uri [String] cluster address; only scheme, host and port are used
  # @param bearer_token [String] token sent as the client's bearer token
  # @param namespace [String] namespace used for every pod operation
  def initialize(uri:, bearer_token:, namespace:)
    @uri = uri
    @bearer_token = bearer_token
    @namespace = namespace
  end

  # @return [Object] whatever Kubeclient::Client#api returns
  # @raise [NetworkError] on HTTP, socket or SSL failures
  def api_info
    handle_exception { pod_client.api }
  end

  # rubocop:disable Metrics/ParameterLists

  # Creates a single-container pod with restartPolicy "Never".
  #
  # @param pod_name [String] used both as pod name and container name
  # @param image [String] container image
  # @param cmd [String] shell command, executed as `sh -c cmd`
  # @param internal_mounts [Array<Hash>] container volumeMounts; when present
  #   their :mountPath values also drive the liveness probe
  # @param external_mounts [Array<Hash>] pod volumes
  # @param node_selector [String] used as the nodeSelector label key (with an
  #   empty value) and as the key of a NoSchedule toleration
  # @return [String] pod_name
  # @raise [NetworkError] on HTTP, socket or SSL failures
  def create_pod(pod_name:, image:, cmd:, internal_mounts: [], external_mounts: [], node_selector:)
    handle_exception do
      pod = Kubeclient::Resource.new(
        metadata: {
          name: pod_name,
          namespace: namespace
        },
        spec: {
          containers: [
            container_options(name: pod_name, image: image, cmd: cmd, internal_mounts: internal_mounts)
          ],
          restartPolicy: "Never",
          nodeSelector: { node_selector => "" },
          tolerations: [
            {
              key: node_selector,
              effect: "NoSchedule"
            }
          ],
          volumes: external_mounts
        }
      )

      pod_client.create_pod(pod)

      pod_name
    end
  end
  # rubocop:enable Metrics/ParameterLists

  # @param pod_name [String]
  # @return [String] body of the pod log response
  # @raise [LogsNotFoundError] when the API answers with HTTP 400
  # @raise [PodNotFoundError] when the pod does not exist
  # @raise [NetworkError] on other HTTP, socket or SSL failures
  def fetch_pod_logs(pod_name:)
    handle_exception(pod_name) do
      pod_client.get_pod_log(pod_name, namespace).body
    rescue Kubeclient::HttpError => e
      raise LogsNotFoundError if e.error_code == LOG_UNAVAILABLE_HTTP_ERROR

      # Anything else is re-raised so handle_exception can translate it.
      raise
    end
  end

  # @param pod_name [String]
  # @return [Object] the pod as returned by Kubeclient
  # @raise [PodNotFoundError, NetworkError]
  def fetch_pod(pod_name:)
    handle_exception(pod_name) { pod_client.get_pod(pod_name, namespace) }
  end

  # Deletes the pod with a zero-second grace period.
  #
  # @param pod_name [String]
  # @raise [PodNotFoundError, NetworkError]
  def force_delete_pod(pod_name:)
    handle_exception(pod_name) { pod_client.delete_pod(pod_name, namespace, delete_options: delete_options) }
  end

  # @return [Hash{String => Object}] pods of the namespace keyed by pod name
  # @raise [NetworkError] on HTTP, socket or SSL failures
  def fetch_pods
    handle_exception do
      pod_client
        .get_pods(namespace: namespace)
        .each_with_object({}) do |pod, result|
          result[pod.metadata.name] = pod
        end
    end
  end

  # Runs the block and converts Kubeclient/network exceptions.
  #
  # @param pod_name [String, nil] only used in the PodNotFoundError message
  # @raise [PodNotFoundError] for Kubeclient::ResourceNotFoundError
  # @raise [NetworkError] for Kubeclient::HttpError, SocketError,
  #   Errno::ECONNREFUSED and OpenSSL::SSL::SSLError
  def handle_exception(pod_name = nil)
    yield
  rescue Kubeclient::ResourceNotFoundError
    raise PodNotFoundError, "Pod not found #{pod_name}"
  rescue Kubeclient::HttpError, SocketError, Errno::ECONNREFUSED, OpenSSL::SSL::SSLError => e
    raise NetworkError, "#{e.class}: #{e.message}"
  end

  private

  # Container spec; resource requests/limits and the run-as user/group come
  # from Settings.
  def container_options(name:, image:, cmd:, internal_mounts:)
    {
      name: name,
      image: image,
      command: ["sh", "-c", cmd],
      resources: {
        requests: { cpu: Settings.kubernetes.requests.cpu, memory: Settings.kubernetes.requests.memory },
        limits: { cpu: Settings.kubernetes.limits.cpu, memory: Settings.kubernetes.limits.memory }
      },
      volumeMounts: internal_mounts,
      securityContext: {
        runAsUser: Settings.run_container_as.user_id,
        runAsGroup: Settings.run_container_as.group_id
      },
      livenessProbe: liveness_probe_options(internal_mounts: internal_mounts)
    }
  end

  # Probe that runs `ls` over every mount path; nil when there are no mounts.
  def liveness_probe_options(internal_mounts:)
    return if internal_mounts.empty?

    {
      exec: {
        command: %w[ls] + internal_mounts.map { |mount| mount[:mountPath] }
      },
      periodSeconds: Settings.kubernetes.liveness_probe_seconds_interval
    }
  end

  def delete_options
    Kubeclient::Resource.new(
      apiVersion: "v1",
      gracePeriodSeconds: 0,
      kind: "DeleteOptions"
    )
  end

  # Memoised so that all operations issued through this instance share one
  # Kubeclient::Client instead of building a new client per call.
  #
  # NOTE(review): verify_ssl: false turns off TLS certificate verification for
  # the API server — confirm this is intentional for the target clusters.
  def pod_client
    @pod_client ||= Kubeclient::Client.new(
      build_client_uri(path: "/api"),
      "v1",
      auth_options: { bearer_token: bearer_token },
      ssl_options: { verify_ssl: false }
    )
  end

  # Rebuilds the configured URI keeping scheme, host and port and replacing
  # the path.
  def build_client_uri(path:)
    parsed_uri = URI.parse(uri)
    URI::Generic.build(host: parsed_uri.host, port: parsed_uri.port, scheme: parsed_uri.scheme, path: path).to_s
  end
end
@@ -0,0 +1,44 @@
1
+ # frozen_string_literal: true
2
+
3
# Picks, for one execution type, a node with the lowest slot usage among the
# nodes that accept new tasks, are available and have an available slot.
class LeastUsedNode
  attr_reader :execution_type

  # @param execution_type [Object] execution type whose slots are considered
  def initialize(execution_type:)
    @execution_type = execution_type
  end

  # @return [Object, nil] a randomly sampled node among those tied at the
  #   lowest usage, or nil when no node qualifies
  def call
    candidates = least_used_nodes
    candidates.sample if candidates
  end

  private

  # Nodes sharing the smallest usage value; nil when nothing qualifies.
  def least_used_nodes
    lowest_usage = nodes_by_usage.keys.min
    nodes_by_usage[lowest_usage]
  end

  # Qualifying nodes grouped by the value SlotsUsagePercentage computes for
  # their slots of this execution type.
  def nodes_by_usage
    @nodes_by_usage ||= begin
      with_free_slot = nodes.filter do |node|
        # slots(...) is nil for nodes without slots of this type, hence to_a.
        slots(node.id).to_a.any?(&:available?)
      end

      with_free_slot.group_by { |node| SlotsUsagePercentage.new(slots(node.id)).perform }
    end
  end

  def nodes
    @nodes ||= Node.accepting_new_tasks.available
  end

  # Slots of this execution type for the given node, served from a single
  # memoised lookup covering all candidate nodes.
  def slots(node_id)
    @slots ||= begin
      candidate_ids = nodes.map(&:id)

      Slot
        .only(:id, :node_id, :status)
        .where(:node_id.in => candidate_ids)
        .where(execution_type: execution_type)
        .group_by(&:node_id)
    end

    @slots[node_id]
  end
end
@@ -0,0 +1,74 @@
1
+ # frozen_string_literal: true
2
+
3
# Redis-backed lock stored under "lockmanager-<type>-<id>" with a TTL.
#
# NOTE(review): the stored value is always 1, so #unlock! and #keep_locked
# cannot tell whether the key still belongs to this instance once the TTL has
# elapsed and another holder has taken the lock — confirm callers keep
# `expire` above the duration of the protected work.
class LockManager
  attr_reader :expire, :wait, :locked, :key
  KEY_PREFIX = "lockmanager"

  # @param type [#to_s] lock category, part of the key
  # @param id [#to_s] identifier within the category, part of the key
  # @param expire [Integer] TTL passed to Redis as the `ex` option
  # @param wait [Boolean] when true, #lock! polls until the lock is acquired
  def initialize(type:, id:, expire:, wait: true)
    @key = "#{KEY_PREFIX}-#{type}-#{id}"
    @expire = expire
    @wait = wait
  end

  # Acquires the lock, yields this manager and always releases the lock
  # afterwards.
  #
  # @yieldparam manager [LockManager] self, e.g. to call #keep_locked
  # @return [Object, false] the block's value, or false when the lock could
  #   not be acquired
  def lock
    if lock!
      begin
        yield(self)
      ensure
        unlock!
      end
    else
      false
    end
  end

  # Tries to acquire the lock; with `wait` it retries every 0.1s until it
  # succeeds.
  #
  # @return [Boolean] whether the lock is held
  def lock!
    try_lock

    if wait
      until locked
        sleep 0.1
        try_lock
      end
    end

    locked
  end

  # Deletes the key and marks the lock as released.
  def unlock!
    redis_client.del(key)
    @locked = false
  end

  # Renews the TTL of a held lock.
  #
  # @raise [RuntimeError] when the lock was never acquired or the key no
  #   longer exists
  def keep_locked
    raise "Lock not acquired" unless locked

    if redis_set(xx: true)
      puts "[LockManager] lock extended by #{expire}"
    else
      raise "[LockManager] Lock expired"
    end
  end

  # @return [Hash{String => Integer}] TTL of every key starting with KEY_PREFIX
  def self.active_locks
    # One connection serves the whole listing instead of one per key.
    client = redis_client

    client.keys("#{KEY_PREFIX}*").each_with_object({}) do |key, result|
      result[key] = client.ttl(key)
    end
  end

  # Single acquisition attempt (SET with nx: only succeeds if the key is absent).
  def try_lock
    @locked = redis_set(nx: true)
  end

  # Writes the key with the given condition (nx/xx) and the configured TTL.
  def redis_set(options)
    redis_client.set(key, 1, **options.merge(ex: expire))
  end

  # Memoised per instance: previously every call opened a new connection, so
  # the polling loop in #lock! created one connection per retry.
  def redis_client
    @redis_client ||= LockManager.redis_client
  end

  # Builds a new Redis connection from Settings.redis_url.
  def self.redis_client
    Redis.new(RedisUrlParser.call(Settings.redis_url))
  end
end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
# Reserves one available slot of an execution type by switching its status to
# "attaching".
class LockSlot
  attr_reader :execution_type, :node

  # @param execution_type [Object] execution type the slot must have
  # @param node [Object, nil] node to take the slot from; when nil the node is
  #   chosen by LeastUsedNode
  def initialize(execution_type:, node: nil)
    @execution_type = execution_type
    @node = node
  end

  # @return [Object, nil] the result of find_one_and_update (requested with
  #   return_document: :after), or nil when no node could be selected
  def perform
    target = selected_node
    return unless target

    candidates = target.slots.available.where(execution_type: execution_type)
    status_change = { "$set" => { status: "attaching" } }

    # Selection and update happen in one find_one_and_update call.
    candidates.find_one_and_update(status_change, return_document: :after)
  end

  private

  # The explicitly given node wins; otherwise fall back to LeastUsedNode.
  def selected_node
    @selected_node ||= node || LeastUsedNode.new(execution_type: execution_type).call
  end
end