container_broker 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (126) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +98 -0
  3. data/Rakefile +8 -0
  4. data/app/controllers/application_controller.rb +5 -0
  5. data/app/controllers/healthcheck_controller.rb +21 -0
  6. data/app/controllers/nodes_controller.rb +70 -0
  7. data/app/controllers/nodes_healthcheck_controller.rb +28 -0
  8. data/app/controllers/status_controller.rb +48 -0
  9. data/app/controllers/tasks_controller.rb +83 -0
  10. data/app/controllers/tasks_healthcheck_controller.rb +28 -0
  11. data/app/jobs/add_task_tags_job.rb +13 -0
  12. data/app/jobs/adjust_node_slots_job.rb +27 -0
  13. data/app/jobs/application_job.rb +9 -0
  14. data/app/jobs/collect_load_metrics_job.rb +9 -0
  15. data/app/jobs/container_broker_base_job.rb +32 -0
  16. data/app/jobs/migrate_tasks_from_dead_node_job.rb +32 -0
  17. data/app/jobs/monitor_unresponsive_node_job.rb +21 -0
  18. data/app/jobs/monitor_unresponsive_nodes_job.rb +9 -0
  19. data/app/jobs/release_slot_job.rb +47 -0
  20. data/app/jobs/remove_runner_job.rb +11 -0
  21. data/app/jobs/remove_unused_tags_job.rb +25 -0
  22. data/app/jobs/request_id_from_task.rb +7 -0
  23. data/app/jobs/run_task_job.rb +64 -0
  24. data/app/jobs/run_tasks_for_all_execution_types_job.rb +11 -0
  25. data/app/jobs/run_tasks_job.rb +42 -0
  26. data/app/jobs/timeout_failed_tasks_job.rb +31 -0
  27. data/app/jobs/update_all_nodes_status_job.rb +9 -0
  28. data/app/jobs/update_node_status_job.rb +24 -0
  29. data/app/jobs/update_task_status_job.rb +71 -0
  30. data/app/models/mongoid_serializable_model.rb +14 -0
  31. data/app/models/node.rb +101 -0
  32. data/app/models/slot.rb +42 -0
  33. data/app/models/task.rb +148 -0
  34. data/app/models/task_tag.rb +11 -0
  35. data/app/observers/observable.rb +23 -0
  36. data/app/observers/task_observer.rb +11 -0
  37. data/app/serializers/node_healthcheck_serializer.rb +5 -0
  38. data/app/serializers/node_serializer.rb +5 -0
  39. data/app/serializers/status_panel_node_serializer.rb +9 -0
  40. data/app/serializers/status_panel_slot_serializer.rb +5 -0
  41. data/app/serializers/status_panel_task_serializer.rb +16 -0
  42. data/app/serializers/task_healthcheck_serializer.rb +5 -0
  43. data/app/serializers/task_serializer.rb +7 -0
  44. data/app/services/adjust_execution_type_slots.rb +51 -0
  45. data/app/services/check_for_slot_removal.rb +28 -0
  46. data/app/services/collect_load_metrics.rb +40 -0
  47. data/app/services/delete_node.rb +25 -0
  48. data/app/services/friendly_name_nodes.rb +10 -0
  49. data/app/services/friendly_name_slots.rb +15 -0
  50. data/app/services/kill_node_runners.rb +17 -0
  51. data/app/services/kill_task_container.rb +29 -0
  52. data/app/services/kubernetes_client.rb +136 -0
  53. data/app/services/least_used_node.rb +44 -0
  54. data/app/services/lock_manager.rb +74 -0
  55. data/app/services/lock_slot.rb +37 -0
  56. data/app/services/lock_task.rb +45 -0
  57. data/app/services/metrics.rb +43 -0
  58. data/app/services/migrate_runner.rb +26 -0
  59. data/app/services/node_task_acceptance.rb +18 -0
  60. data/app/services/node_usage_percentage_per_execution_type.rb +22 -0
  61. data/app/services/reschedule_tasks_for_missing_runners.rb +70 -0
  62. data/app/services/runners.rb +4 -0
  63. data/app/services/runners/docker/create_connection.rb +18 -0
  64. data/app/services/runners/docker/create_execution_info.rb +87 -0
  65. data/app/services/runners/docker/fetch_execution_info.rb +17 -0
  66. data/app/services/runners/docker/fetch_logs.rb +18 -0
  67. data/app/services/runners/docker/fetch_task_container.rb +15 -0
  68. data/app/services/runners/docker/filer.rb +19 -0
  69. data/app/services/runners/docker/kill_slot_runner.rb +19 -0
  70. data/app/services/runners/docker/node_availability.rb +11 -0
  71. data/app/services/runners/docker/remove_runner.rb +18 -0
  72. data/app/services/runners/docker/run_task.rb +63 -0
  73. data/app/services/runners/docker/update_node_status.rb +62 -0
  74. data/app/services/runners/execution_info.rb +49 -0
  75. data/app/services/runners/invalid_config.rb +5 -0
  76. data/app/services/runners/invalid_runner.rb +5 -0
  77. data/app/services/runners/kubernetes/create_client.rb +29 -0
  78. data/app/services/runners/kubernetes/create_execution_info.rb +103 -0
  79. data/app/services/runners/kubernetes/fetch_execution_info.rb +15 -0
  80. data/app/services/runners/kubernetes/fetch_logs.rb +17 -0
  81. data/app/services/runners/kubernetes/filer.rb +41 -0
  82. data/app/services/runners/kubernetes/kill_slot_runner.rb +11 -0
  83. data/app/services/runners/kubernetes/node_availability.rb +11 -0
  84. data/app/services/runners/kubernetes/remove_runner.rb +19 -0
  85. data/app/services/runners/kubernetes/run_task.rb +54 -0
  86. data/app/services/runners/kubernetes/update_node_status.rb +64 -0
  87. data/app/services/runners/runner_id_not_found_error.rb +5 -0
  88. data/app/services/runners/services_factory.rb +38 -0
  89. data/app/services/runners/update_node_status_helper.rb +43 -0
  90. data/app/services/slots_usage_percentage.rb +18 -0
  91. data/config/application.rb +34 -0
  92. data/config/boot.rb +5 -0
  93. data/config/environment.rb +7 -0
  94. data/config/environments/test.rb +44 -0
  95. data/config/initializers/application_controller_renderer.rb +10 -0
  96. data/config/initializers/backtrace_silencers.rb +9 -0
  97. data/config/initializers/config.rb +51 -0
  98. data/config/initializers/cookies_serializer.rb +7 -0
  99. data/config/initializers/docker_config.rb +3 -0
  100. data/config/initializers/filter_parameter_logging.rb +6 -0
  101. data/config/initializers/idempotent_request.rb +12 -0
  102. data/config/initializers/inflections.rb +18 -0
  103. data/config/initializers/mime_types.rb +6 -0
  104. data/config/initializers/mongoid.rb +3 -0
  105. data/config/initializers/new_framework_defaults_6_0.rb +47 -0
  106. data/config/initializers/raven.rb +10 -0
  107. data/config/initializers/sidekiq.rb +24 -0
  108. data/config/initializers/wrap_parameters.rb +16 -0
  109. data/config/locales/en.yml +33 -0
  110. data/config/mongoid.yml +10 -0
  111. data/config/routes.rb +43 -0
  112. data/config/secrets.yml +35 -0
  113. data/config/settings.yml +34 -0
  114. data/config/settings/test.yml +27 -0
  115. data/config/sidekiq_scheduler.yml +18 -0
  116. data/config/spring.rb +8 -0
  117. data/lib/constants.rb +12 -0
  118. data/lib/container_broker.rb +30 -0
  119. data/lib/container_broker/engine.rb +6 -0
  120. data/lib/container_broker/version.rb +5 -0
  121. data/lib/current_thread_request_id.rb +19 -0
  122. data/lib/idempotent_request/callback.rb +25 -0
  123. data/lib/idempotent_request/policy.rb +15 -0
  124. data/lib/redis_url_parser.rb +25 -0
  125. data/lib/tasks/task.rake +34 -0
  126. metadata +590 -0
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Docker
    # Probes a Docker node by requesting the daemon's info over the node's connection.
    class NodeAvailability
      # @param node the node handed to CreateConnection to build the Docker connection
      # @return whatever +::Docker.info+ returns for that connection
      def perform(node:)
        connection = CreateConnection.new.perform(node: node)
        ::Docker.info(connection)
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Docker
    # Removes the container identified by a runner id from a Docker node.
    class RemoveRunner
      # Kills the container when its state is "running" and then deletes it.
      #
      # A container that is not found (or whose removal conflicts) is only logged;
      # Excon and Docker timeout errors are recorded on the node via +register_error+.
      #
      # @param node the node hosting the container; must respond to +register_error+
      # @param runner_id identifier passed to +::Docker::Container.get+
      def perform(node:, runner_id:)
        Rails.logger.info("Removing container #{runner_id} from node #{node}")
        container = ::Docker::Container.get(runner_id, { all: true }, CreateConnection.new.perform(node: node))
        container.kill if container.info["State"]["Status"] == "running"
        container.delete
      rescue ::Docker::Error::NotFoundError, ::Docker::Error::ConflictError => e
        # The exception class must be interpolated; a bare "(e.class)" would be logged literally.
        Rails.logger.info("Container #{runner_id} already removed - #{e.message} (#{e.class})")
      rescue Excon::Error, ::Docker::Error::TimeoutError => e
        node.register_error(e.message)
      end
    end
  end
end
@@ -0,0 +1,63 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Docker
    # Runs a task as a Docker container on the node that owns the given slot:
    # pulls the image when the node does not have it, creates the container and starts it.
    class RunTask
      # Tag requested when the task's image reference does not carry one.
      DEFAULT_IMAGE_TAG = "latest"

      # @param task provides +image+, +cmd+ and +storage_mounts+
      # @param slot provides the target +node+
      # @param runner_id name given to the created container
      # @return the +runner_id+ that was passed in
      # @raise [Node::NodeConnectionError] on Excon errors or Docker timeouts
      # @raise [RuntimeError] when Docker reports a not-found error
      def perform(task:, slot:, runner_id:)
        Rails.logger.debug("Performing Docker::RunTask for #{task} #{slot}")

        pull_image(task: task, slot: slot)
        Rails.logger.debug("Image pulled for #{task} #{slot}")

        container = create_container(task: task, slot: slot, name: runner_id)
        Rails.logger.debug("Container #{container.id} created for #{task} #{slot} with name #{runner_id}")

        container.start
        Rails.logger.debug("Container #{container.id} started")

        runner_id
      rescue Excon::Error, ::Docker::Error::TimeoutError => e
        raise Node::NodeConnectionError, connection_error_message(e)
      rescue ::Docker::Error::NotFoundError => e
        raise "Docker image not found: #{e.message}"
      end

      private

      # Builds the connection error text, appending the response body when the error carries one.
      def connection_error_message(error)
        message = "Docker connection error: #{error.message}"
        # Guard against a nil response so that building the message cannot itself raise.
        body = error.response&.body if error.respond_to?(:response)
        body ? "#{message}\n#{body}" : message
      end

      # Pulls the task image unless it already exists on the slot's node.
      def pull_image(task:, slot:)
        connection = CreateConnection.new.perform(node: slot.node)
        return if ::Docker::Image.exist?(task.image, {}, connection)

        image_name, image_tag = split_image_reference(task.image)

        ::Docker::Image.create({ "fromImage" => image_name, "tag" => image_tag }, nil, connection)
      end

      # Splits an image reference into [name, tag].
      #
      # Only a colon that follows the last "/" separates the tag, so a registry
      # port ("host:5000/app") is kept as part of the name.
      def split_image_reference(image)
        name, separator, tag = image.rpartition(":")
        return [image, DEFAULT_IMAGE_TAG] if separator.empty? || tag.include?("/")
        return [name, DEFAULT_IMAGE_TAG] if tag.empty?

        [name, tag]
      end

      # Creates (without starting) the container that runs +task.cmd+ through "sh -c".
      def create_container(task:, slot:, name:)
        binds = Filer.new.perform(task_storage_mounts: task.storage_mounts)

        user = [
          Settings.run_container_as.user_id,
          Settings.run_container_as.group_id
        ].join(":")

        ::Docker::Container.create(
          {
            "name" => name,
            "Image" => task.image,
            "User" => user,
            "HostConfig" => {
              "Binds" => binds,
              "NetworkMode" => ENV["DOCKER_CONTAINERS_NETWORK"].to_s
            },
            "Entrypoint" => [],
            "Cmd" => ["sh", "-c", task.cmd]
          },
          CreateConnection.new.perform(node: slot.node)
        )
      end
    end
  end
end
@@ -0,0 +1,62 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Docker
    # Reconciles a Docker node's slots and started tasks with the containers listed on the node.
    class UpdateNodeStatus
      include UpdateNodeStatusHelper

      attr_reader :node

      # Walks every container on the node, releasing slots of terminated runners, restarting
      # containers that were created but failed to start, and removing runners no slot refers to.
      # Excon/Docker errors are recorded on the node instead of being raised.
      #
      # @param node the node whose containers are inspected
      def perform(node:)
        @node = node
        Rails.logger.debug("Start updating node status for #{node}")

        # Load the started tasks BEFORE listing containers: tasks may start in the meantime,
        # so the task snapshot has to be the older of the two.
        started_tasks = Task.started.where(:slot.in => node.slots.pluck(:id)).to_a

        Rails.logger.debug("Got #{containers.count} containers")

        execution_infos = containers.map do |container|
          CreateExecutionInfo.new.perform(container: container).tap do |info|
            slot = node.slots.find_by(runner_id: info.id)
            next remove_unknown_runner(node: node, runner_id: info.id) unless slot

            Rails.logger.debug("Slot found for container #{info.id}: #{slot}")

            if info.terminated?
              Rails.logger.debug("Container #{info.id} exited")

              check_slot_release(slot: slot, runner_id: info.id)
            elsif started_with_error?(container: container, docker_connection: CreateConnection.new.perform(node: node))
              container.start
            end
          end
        end

        RescheduleTasksForMissingRunners
          .new(runner_ids: execution_infos.map(&:id), started_tasks: started_tasks)
          .perform

        node.register_success

        send_metrics(node: node, execution_infos: execution_infos)
      rescue Excon::Error, ::Docker::Error::DockerError => e
        node.register_error(e.message)
      end

      private

      # All containers on the node (stopped ones included), listed once per instance.
      def containers
        @containers ||= ::Docker::Container.all({ all: true }, CreateConnection.new.perform(node: node))
      end

      # True when the listed container is still in the "created" state and a fresh
      # lookup of it reports a positive exit code.
      def started_with_error?(container:, docker_connection:)
        return false unless container.info["State"] == "created"

        inspected = ::Docker::Container.get(container.id, { all: true }, docker_connection)
        inspected.info["State"]["ExitCode"].positive?
      end
    end
  end
end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  # Raised when success/error is queried on an info that only carries the bare "exited" status.
  class UnknownCompletionInformation < StandardError; end

  # Value object describing the state of a runner.
  # Built with keyword arguments; every field is optional.
  ExecutionInfo = Struct.new(
    :id,
    :status,
    :exit_code,
    :started_at,
    :finished_at,
    :error,
    :schedule_pending,
    keyword_init: true
  ) do
    # @raise [UnknownCompletionInformation] when the status is just "exited"
    def success?
      require_completion_information
      status == "success"
    end

    # @raise [UnknownCompletionInformation] when the status is just "exited"
    def error?
      require_completion_information
      status == "error"
    end

    def running?
      status == "running"
    end

    def pending?
      status == "pending"
    end

    # True for "exited", "success" and "error"; never raises.
    def terminated?
      %w[exited success error].include?(status)
    end

    def exited_without_completion_information?
      status == "exited"
    end

    def schedule_pending?
      schedule_pending
    end

    private

    # Some execution infos only report "exited", without the complete state.
    # Asking such an info for success or error must fail loudly so the caller
    # fetches the complete status (the one that carries the exit code).
    def require_completion_information
      return unless exited_without_completion_information?

      raise(UnknownCompletionInformation, "Complete status not available")
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  # Error for a runner configuration that cannot be used
  # (e.g. raised by Runners::Kubernetes::CreateClient when the node config fails validation).
  InvalidConfig = Class.new(StandardError)
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  # Error for a node whose runner type does not match the service being used
  # (e.g. raised by Runners::Kubernetes::CreateClient for a non-kubernetes node).
  InvalidRunner = Class.new(StandardError)
end
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Kubernetes
    # Builds a KubernetesClient for a node after validating the node's runner configuration.
    class CreateClient
      # Keys that must be non-blank in +node.runner_config+.
      REQUIRED_CONFIG_FIELDS = %w[bearer_token namespace nfs_server nfs_path node_selector].freeze

      attr_reader :node

      # @param node must respond to +kubernetes?+, +runner_config+ and +hostname+
      # @return [KubernetesClient]
      # @raise [Runners::InvalidRunner] when the node is not a kubernetes runner
      # @raise [Runners::InvalidConfig] when any required config field is blank
      def perform(node:)
        @node = node

        raise(Runners::InvalidRunner, "Node must be a kubernetes runner") unless node.kubernetes?

        # Name only the missing keys: the config holds the bearer token, which must
        # not be copied into exception messages (and from there into logs).
        missing = missing_config_fields
        unless missing.empty?
          raise(Runners::InvalidConfig, "Invalid configuration for kubernetes (missing: #{missing.join(', ')})")
        end

        KubernetesClient.new(
          uri: node.hostname,
          bearer_token: node.runner_config["bearer_token"],
          namespace: node.runner_config["namespace"]
        )
      end

      private

      def valid?
        missing_config_fields.empty?
      end

      # Required keys whose value in the node's runner config is blank.
      def missing_config_fields
        REQUIRED_CONFIG_FIELDS.select { |field| node.runner_config[field].blank? }
      end
    end
  end
end
@@ -0,0 +1,103 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Kubernetes
    # Converts a Kubernetes pod object into a Runners::ExecutionInfo.
    #
    # Only the first entry of the pod's +containerStatuses+ is inspected.
    class CreateExecutionInfo
      attr_reader :pod

      # Waiting reasons that are reported as an "error" status.
      ERROR_REASONS = %w[
        ImagePullBackOff
        ErrImagePull
      ].freeze

      # @param pod pod object exposing +metadata+ and +status+; may be nil, in which
      #   case every derived field is nil (and +schedule_pending+ is false)
      # @return [Runners::ExecutionInfo]
      def perform(pod:)
        @pod = pod

        Runners::ExecutionInfo.new(
          id: pod&.metadata&.name,
          status: status,
          exit_code: container_status&.state&.terminated&.exitCode,
          started_at: started_at,
          finished_at: container_status&.state&.terminated&.finishedAt,
          error: error_message,
          schedule_pending: schedule_pending?
        )
      end

      private

      # First matching status wins; nil when none of the branches applies.
      def status
        if running?
          "running"
        elsif terminated_with_success?
          "success"
        elsif error?
          "error"
        elsif waiting? || schedule_pending?
          "pending"
        end
      end

      def waiting?
        container_status&.state&.waiting.present?
      end

      def running?
        container_status&.state&.running.present?
      end

      def terminated_with_error?
        container_status&.state&.terminated&.exitCode&.positive?
      end

      def terminated_with_success?
        container_status&.state&.terminated&.exitCode&.zero?
      end

      def reason_is_error?
        waiting? && ERROR_REASONS.include?(reason[:reason])
      end

      def error?
        terminated_with_error? || reason_is_error?
      end

      def schedule_pending?
        unschedulable_error_message.present?
      end

      # "Unschedulable: <message>" for a Pending pod carrying an Unschedulable condition, otherwise nil.
      def unschedulable_error_message
        return if pod&.status&.phase != "Pending"

        found = pod&.status&.conditions&.find { |condition| condition.reason == "Unschedulable" }
        "#{found.reason}: #{found.message}" if found
      end

      # Reason/message of the container state on error, else the unschedulable message (or nil).
      def error_message
        return reason.values.compact.join(": ") if error?

        unschedulable_error_message
      end

      def started_at
        (container_status&.state&.terminated || container_status&.state&.running)&.startedAt
      end

      # Reason and message taken from the first entry of the container state hash.
      def reason
        reason_value = container_status&.state&.to_hash&.values&.first
        return {} unless reason_value

        {
          reason: reason_value[:reason],
          message: reason_value[:message]
        }
      end

      # Safe on a nil pod, matching the nil-tolerant access used for +metadata+.
      def container_status
        pod&.status&.containerStatuses&.first
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Kubernetes
    # Looks up the pod that runs a task and converts it into an execution info.
    class FetchExecutionInfo
      # @param task provides +slot.node+ (to build the client) and +runner_id+ (the pod name)
      # @return the result of CreateExecutionInfo for the fetched pod
      # @raise [Runners::RunnerIdNotFoundError] when the client reports the pod as not found
      def perform(task:)
        client = CreateClient.new.perform(node: task.slot.node)
        fetched_pod = client.fetch_pod(pod_name: task.runner_id)

        CreateExecutionInfo.new.perform(pod: fetched_pod)
      rescue KubernetesClient::PodNotFoundError => e
        raise Runners::RunnerIdNotFoundError, e.message
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Kubernetes
    # Fetches the logs of the pod that runs a task.
    class FetchLogs
      # @param task provides +slot.node+ (to build the client) and +runner_id+ (the pod name)
      # @return the pod logs, or the literal "Logs not found" when the client cannot find them
      # @raise [Runners::RunnerIdNotFoundError] when the client reports the pod as not found
      def perform(task:)
        client = CreateClient.new.perform(node: task.slot.node)
        client.fetch_pod_logs(pod_name: task.runner_id)
      rescue KubernetesClient::PodNotFoundError => e
        raise Runners::RunnerIdNotFoundError, e.message
      rescue KubernetesClient::LogsNotFoundError
        # Missing logs are not fatal: log the problem and hand back a placeholder.
        Rails.logger.error("Error on fetching kubernetes pod logs")

        "Logs not found"
      end
    end
  end
end
@@ -0,0 +1,41 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Kubernetes
    # Maps a task's storage mounts to two lists: the in-container mounts
    # (+internal+) and the matching entries configured in the settings (+external+).
    class Filer
      # Raised when a task mount name has no entry under the kubernetes storage mounts settings.
      class InvalidMountName < StandardError; end

      attr_reader :task_storage_mounts

      # @param task_storage_mounts [Hash] mount name => mount path
      # @return [Hash] +internal:+ list of +{ name:, mountPath: }+ hashes and
      #   +external:+ list of the configured mount hashes merged with +name:+
      # @raise [InvalidMountName] when a mount name is not configured
      def perform(task_storage_mounts:)
        @task_storage_mounts = task_storage_mounts

        {
          internal: internal,
          external: external
        }
      end

      private

      def internal
        task_storage_mounts.map do |task_mount_name, task_mount_path|
          {
            name: task_mount_name,
            mountPath: task_mount_path
          }
        end
      end

      def external
        task_storage_mounts.map do |task_mount_name, _task_mount_path|
          # dig tolerates a missing settings section, so that case surfaces as
          # InvalidMountName as well instead of a NoMethodError on nil.
          node_mount_path = Settings.to_hash.dig(:storage_mounts, :kubernetes, task_mount_name.to_sym)

          unless node_mount_path
            raise InvalidMountName, "No kubernetes storage mount configured for #{task_mount_name.inspect}"
          end

          node_mount_path.merge(name: task_mount_name)
        end
      end
    end
  end
end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Kubernetes
    # Stops the runner attached to a slot by delegating to RemoveRunner.
    class KillSlotRunner
      # @param slot provides the +node+ and the +runner_id+ to remove
      # @return whatever RemoveRunner#perform returns
      def perform(slot:)
        remover = RemoveRunner.new
        remover.perform(node: slot.node, runner_id: slot.runner_id)
      end
    end
  end
end