container_broker 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +98 -0
  3. data/Rakefile +8 -0
  4. data/app/controllers/application_controller.rb +5 -0
  5. data/app/controllers/healthcheck_controller.rb +21 -0
  6. data/app/controllers/nodes_controller.rb +70 -0
  7. data/app/controllers/nodes_healthcheck_controller.rb +28 -0
  8. data/app/controllers/status_controller.rb +48 -0
  9. data/app/controllers/tasks_controller.rb +83 -0
  10. data/app/controllers/tasks_healthcheck_controller.rb +28 -0
  11. data/app/jobs/add_task_tags_job.rb +13 -0
  12. data/app/jobs/adjust_node_slots_job.rb +27 -0
  13. data/app/jobs/application_job.rb +9 -0
  14. data/app/jobs/collect_load_metrics_job.rb +9 -0
  15. data/app/jobs/container_broker_base_job.rb +32 -0
  16. data/app/jobs/migrate_tasks_from_dead_node_job.rb +32 -0
  17. data/app/jobs/monitor_unresponsive_node_job.rb +21 -0
  18. data/app/jobs/monitor_unresponsive_nodes_job.rb +9 -0
  19. data/app/jobs/release_slot_job.rb +47 -0
  20. data/app/jobs/remove_runner_job.rb +11 -0
  21. data/app/jobs/remove_unused_tags_job.rb +25 -0
  22. data/app/jobs/request_id_from_task.rb +7 -0
  23. data/app/jobs/run_task_job.rb +64 -0
  24. data/app/jobs/run_tasks_for_all_execution_types_job.rb +11 -0
  25. data/app/jobs/run_tasks_job.rb +42 -0
  26. data/app/jobs/timeout_failed_tasks_job.rb +31 -0
  27. data/app/jobs/update_all_nodes_status_job.rb +9 -0
  28. data/app/jobs/update_node_status_job.rb +24 -0
  29. data/app/jobs/update_task_status_job.rb +71 -0
  30. data/app/models/mongoid_serializable_model.rb +14 -0
  31. data/app/models/node.rb +101 -0
  32. data/app/models/slot.rb +42 -0
  33. data/app/models/task.rb +148 -0
  34. data/app/models/task_tag.rb +11 -0
  35. data/app/observers/observable.rb +23 -0
  36. data/app/observers/task_observer.rb +11 -0
  37. data/app/serializers/node_healthcheck_serializer.rb +5 -0
  38. data/app/serializers/node_serializer.rb +5 -0
  39. data/app/serializers/status_panel_node_serializer.rb +9 -0
  40. data/app/serializers/status_panel_slot_serializer.rb +5 -0
  41. data/app/serializers/status_panel_task_serializer.rb +16 -0
  42. data/app/serializers/task_healthcheck_serializer.rb +5 -0
  43. data/app/serializers/task_serializer.rb +7 -0
  44. data/app/services/adjust_execution_type_slots.rb +51 -0
  45. data/app/services/check_for_slot_removal.rb +28 -0
  46. data/app/services/collect_load_metrics.rb +40 -0
  47. data/app/services/delete_node.rb +25 -0
  48. data/app/services/friendly_name_nodes.rb +10 -0
  49. data/app/services/friendly_name_slots.rb +15 -0
  50. data/app/services/kill_node_runners.rb +17 -0
  51. data/app/services/kill_task_container.rb +29 -0
  52. data/app/services/kubernetes_client.rb +136 -0
  53. data/app/services/least_used_node.rb +44 -0
  54. data/app/services/lock_manager.rb +74 -0
  55. data/app/services/lock_slot.rb +37 -0
  56. data/app/services/lock_task.rb +45 -0
  57. data/app/services/metrics.rb +43 -0
  58. data/app/services/migrate_runner.rb +26 -0
  59. data/app/services/node_task_acceptance.rb +18 -0
  60. data/app/services/node_usage_percentage_per_execution_type.rb +22 -0
  61. data/app/services/reschedule_tasks_for_missing_runners.rb +70 -0
  62. data/app/services/runners.rb +4 -0
  63. data/app/services/runners/docker/create_connection.rb +18 -0
  64. data/app/services/runners/docker/create_execution_info.rb +87 -0
  65. data/app/services/runners/docker/fetch_execution_info.rb +17 -0
  66. data/app/services/runners/docker/fetch_logs.rb +18 -0
  67. data/app/services/runners/docker/fetch_task_container.rb +15 -0
  68. data/app/services/runners/docker/filer.rb +19 -0
  69. data/app/services/runners/docker/kill_slot_runner.rb +19 -0
  70. data/app/services/runners/docker/node_availability.rb +11 -0
  71. data/app/services/runners/docker/remove_runner.rb +18 -0
  72. data/app/services/runners/docker/run_task.rb +63 -0
  73. data/app/services/runners/docker/update_node_status.rb +62 -0
  74. data/app/services/runners/execution_info.rb +49 -0
  75. data/app/services/runners/invalid_config.rb +5 -0
  76. data/app/services/runners/invalid_runner.rb +5 -0
  77. data/app/services/runners/kubernetes/create_client.rb +29 -0
  78. data/app/services/runners/kubernetes/create_execution_info.rb +103 -0
  79. data/app/services/runners/kubernetes/fetch_execution_info.rb +15 -0
  80. data/app/services/runners/kubernetes/fetch_logs.rb +17 -0
  81. data/app/services/runners/kubernetes/filer.rb +41 -0
  82. data/app/services/runners/kubernetes/kill_slot_runner.rb +11 -0
  83. data/app/services/runners/kubernetes/node_availability.rb +11 -0
  84. data/app/services/runners/kubernetes/remove_runner.rb +19 -0
  85. data/app/services/runners/kubernetes/run_task.rb +54 -0
  86. data/app/services/runners/kubernetes/update_node_status.rb +64 -0
  87. data/app/services/runners/runner_id_not_found_error.rb +5 -0
  88. data/app/services/runners/services_factory.rb +38 -0
  89. data/app/services/runners/update_node_status_helper.rb +43 -0
  90. data/app/services/slots_usage_percentage.rb +18 -0
  91. data/config/application.rb +34 -0
  92. data/config/boot.rb +5 -0
  93. data/config/environment.rb +7 -0
  94. data/config/environments/test.rb +44 -0
  95. data/config/initializers/application_controller_renderer.rb +10 -0
  96. data/config/initializers/backtrace_silencers.rb +9 -0
  97. data/config/initializers/config.rb +51 -0
  98. data/config/initializers/cookies_serializer.rb +7 -0
  99. data/config/initializers/docker_config.rb +3 -0
  100. data/config/initializers/filter_parameter_logging.rb +6 -0
  101. data/config/initializers/idempotent_request.rb +12 -0
  102. data/config/initializers/inflections.rb +18 -0
  103. data/config/initializers/mime_types.rb +6 -0
  104. data/config/initializers/mongoid.rb +3 -0
  105. data/config/initializers/new_framework_defaults_6_0.rb +47 -0
  106. data/config/initializers/raven.rb +10 -0
  107. data/config/initializers/sidekiq.rb +24 -0
  108. data/config/initializers/wrap_parameters.rb +16 -0
  109. data/config/locales/en.yml +33 -0
  110. data/config/mongoid.yml +10 -0
  111. data/config/routes.rb +43 -0
  112. data/config/secrets.yml +35 -0
  113. data/config/settings.yml +34 -0
  114. data/config/settings/test.yml +27 -0
  115. data/config/sidekiq_scheduler.yml +18 -0
  116. data/config/spring.rb +8 -0
  117. data/lib/constants.rb +12 -0
  118. data/lib/container_broker.rb +30 -0
  119. data/lib/container_broker/engine.rb +6 -0
  120. data/lib/container_broker/version.rb +5 -0
  121. data/lib/current_thread_request_id.rb +19 -0
  122. data/lib/idempotent_request/callback.rb +25 -0
  123. data/lib/idempotent_request/policy.rb +15 -0
  124. data/lib/redis_url_parser.rb +25 -0
  125. data/lib/tasks/task.rake +34 -0
  126. metadata +590 -0
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Docker
    # Queries a node's Docker daemon for its system information.
    class NodeAvailability
      # Builds a connection for +node+ through CreateConnection and passes it
      # to ::Docker.info.
      #
      # @param node the node to query
      # @return whatever ::Docker.info returns for that connection
      def perform(node:)
        connection = CreateConnection.new.perform(node: node)
        ::Docker.info(connection)
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Docker
    # Removes the container backing a runner from a Docker node.
    class RemoveRunner
      # Kills the container when it is still running and then deletes it.
      #
      # A container that cannot be found (or whose removal conflicts) is only
      # logged. Excon errors and Docker timeouts are recorded on the node via
      # +register_error+ instead of being raised.
      #
      # @param node the node that hosts the container
      # @param runner_id the container id/name to remove
      def perform(node:, runner_id:)
        Rails.logger.info("Removing container #{runner_id} from node #{node}")
        container = ::Docker::Container.get(runner_id, { all: true }, CreateConnection.new.perform(node: node))
        container.kill if container.info["State"]["Status"] == "running"
        container.delete
      rescue ::Docker::Error::NotFoundError, ::Docker::Error::ConflictError => e
        # The exception class is interpolated so the log shows which error occurred.
        Rails.logger.info("Container #{runner_id} already removed - #{e.message} (#{e.class})")
      rescue Excon::Error, ::Docker::Error::TimeoutError => e
        node.register_error(e.message)
      end
    end
  end
end
@@ -0,0 +1,63 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Docker
    # Runs a task on a Docker node: pulls the image when it is missing,
    # creates a container named after the runner id and starts it.
    class RunTask
      DEFAULT_IMAGE_TAG = "latest"

      # @param task the task to run (reads +image+, +cmd+ and +storage_mounts+)
      # @param slot the slot whose node will host the container
      # @param runner_id the name given to the created container
      # @return the +runner_id+ that was passed in
      # @raise [Node::NodeConnectionError] on Excon errors or Docker timeouts
      # @raise [RuntimeError] when Docker answers with a not-found error
      def perform(task:, slot:, runner_id:)
        Rails.logger.debug("Performing Docker::RunTask for #{task} #{slot}")

        pull_image(task: task, slot: slot)
        Rails.logger.debug("Image pulled for #{task} #{slot}")

        container = create_container(task: task, slot: slot, name: runner_id)
        Rails.logger.debug("Container #{container.id} created for #{task} #{slot} with name #{runner_id}")

        container.start
        Rails.logger.debug("Container #{container.id} started")

        runner_id
      rescue Excon::Error, ::Docker::Error::TimeoutError => e
        raise Node::NodeConnectionError, connection_error_message(e)
      rescue ::Docker::Error::NotFoundError => e
        raise "Docker image not found: #{e.message}"
      end

      private

      # Builds the connection error text, appending the HTTP response body
      # only when the error actually carries a response. A nil response must
      # not raise here, or it would hide the original connection error.
      def connection_error_message(error)
        message = "Docker connection error: #{error.message}"
        response = error.response if error.respond_to?(:response)
        return message unless response

        "#{message}\n#{response.body}"
      end

      # Pulls the task image unless the node already has it.
      def pull_image(task:, slot:)
        return if ::Docker::Image.exist?(task.image, {}, CreateConnection.new.perform(node: slot.node))

        image_name, image_tag = image_name_and_tag(task.image)

        ::Docker::Image.create({ "fromImage" => image_name, "tag" => image_tag }, nil, CreateConnection.new.perform(node: slot.node))
      end

      # Splits an image reference into [name, tag].
      #
      # Only the last colon can separate the tag, and only when nothing after
      # it looks like a path: in "host:5000/app" the colon belongs to the
      # registry port, so the whole reference is the name and the tag
      # defaults to "latest".
      def image_name_and_tag(image)
        name, separator, tag = image.rpartition(":")
        return [image, DEFAULT_IMAGE_TAG] if separator.empty? || tag.include?("/")

        [name, tag.empty? ? DEFAULT_IMAGE_TAG : tag]
      end

      # Creates (without starting) the container that will execute the task command.
      def create_container(task:, slot:, name:)
        binds = Filer.new.perform(task_storage_mounts: task.storage_mounts)

        user = [
          Settings.run_container_as.user_id,
          Settings.run_container_as.group_id
        ].join(":")

        ::Docker::Container.create(
          {
            "name" => name,
            "Image" => task.image,
            "User" => user,
            "HostConfig" => {
              "Binds" => binds,
              "NetworkMode" => ENV["DOCKER_CONTAINERS_NETWORK"].to_s
            },
            # The image entrypoint is cleared so the task command runs through the shell.
            "Entrypoint" => [],
            "Cmd" => ["sh", "-c", task.cmd]
          },
          CreateConnection.new.perform(node: slot.node)
        )
      end
    end
  end
end
@@ -0,0 +1,62 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Docker
    # Reconciles the containers found on a Docker node with the node's slots
    # and with the tasks currently marked as started.
    class UpdateNodeStatus
      include UpdateNodeStatusHelper

      attr_reader :node

      # Inspects every container on +node+, releases or cleans up slots as
      # needed, reschedules started tasks whose runner is missing and records
      # the outcome on the node.
      #
      # Excon and Docker errors are registered on the node instead of raised.
      def perform(node:)
        @node = node
        Rails.logger.debug("Start updating node status for #{node}")

        # Other tasks can be started at this time, so the started tasks must be
        # loaded BEFORE the containers are listed.
        slot_ids = node.slots.pluck(:id)
        started_tasks = Task.started.where(:slot.in => slot_ids).to_a

        Rails.logger.debug("Got #{containers.count} containers")

        execution_infos = containers.map { |container| reconcile(container) }
        runner_ids = execution_infos.map(&:id)

        RescheduleTasksForMissingRunners.new(runner_ids: runner_ids, started_tasks: started_tasks).perform

        node.register_success

        send_metrics(node: node, execution_infos: execution_infos)
      rescue Excon::Error, ::Docker::Error::DockerError => e
        node.register_error(e.message)
      end

      private

      # Handles a single container and returns its execution info.
      def reconcile(container)
        info = CreateExecutionInfo.new.perform(container: container)
        slot = node.slots.find_by(runner_id: info.id)

        unless slot
          remove_unknown_runner(node: node, runner_id: info.id)
          return info
        end

        Rails.logger.debug("Slot found for container #{info.id}: #{slot}")

        if info.terminated?
          Rails.logger.debug("Container #{info.id} exited")

          check_slot_release(slot: slot, runner_id: info.id)
        elsif started_with_error?(container: container, docker_connection: CreateConnection.new.perform(node: node))
          # The container was created but its start failed: try to start it again.
          container.start
        end

        info
      end

      # All containers on the node (including stopped ones), listed once per instance.
      def containers
        @containers ||= ::Docker::Container.all({ all: true }, CreateConnection.new.perform(node: node))
      end

      # True when the listed container is still in the "created" state while
      # its freshly fetched details report a positive exit code.
      def started_with_error?(container:, docker_connection:)
        return false unless container.info["State"] == "created"

        details = ::Docker::Container.get(container.id, { all: true }, docker_connection)
        details.info["State"]["ExitCode"].positive?
      end
    end
  end
end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  # Raised when success/error is requested for an execution info that only
  # carries the bare "exited" status.
  class UnknownCompletionInformation < StandardError
  end

  # Value object describing the state of a runner (container or pod).
  ExecutionInfo = Struct.new(
    :id, :status, :exit_code, :started_at, :finished_at, :error, :schedule_pending,
    keyword_init: true
  ) do
    # @raise [UnknownCompletionInformation] when only the "exited" status is known
    def success?
      ensure_completion_information!
      status == "success"
    end

    # @raise [UnknownCompletionInformation] when only the "exited" status is known
    def error?
      ensure_completion_information!
      status == "error"
    end

    def running?
      status == "running"
    end

    def pending?
      status == "pending"
    end

    # True for any finished runner, with or without completion details.
    def terminated?
      return true if exited_without_completion_information?

      success? || error?
    end

    def exited_without_completion_information?
      status == "exited"
    end

    def schedule_pending?
      schedule_pending
    end

    private

    # Some execution infos carry just the "exited" status and not the complete
    # state. Asking for success or error at that point must force the caller
    # to fetch the complete status (which has the exit code).
    def ensure_completion_information!
      return unless exited_without_completion_information?

      raise(UnknownCompletionInformation, "Complete status not available")
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  # Error for a runner configuration that is not valid.
  class InvalidConfig < StandardError
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  # Error for a node whose runner type does not match the requested operation.
  class InvalidRunner < StandardError
  end
end
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Kubernetes
    # Builds a KubernetesClient from a node's hostname and runner configuration.
    class CreateClient
      attr_reader :node

      # @param node the node to connect to
      # @return [KubernetesClient]
      # @raise [Runners::InvalidRunner] when the node is not a kubernetes runner
      # @raise [Runners::InvalidConfig] when a required configuration field is blank
      def perform(node:)
        @node = node

        unless node.kubernetes?
          raise(Runners::InvalidRunner, "Node must be a kubernetes runner")
        end

        unless valid?
          raise(Runners::InvalidConfig, "Invalid configuration (#{node.runner_config}) for kubernetes")
        end

        hostname = node.hostname
        config = node.runner_config

        KubernetesClient.new(uri: hostname, bearer_token: config["bearer_token"], namespace: config["namespace"])
      end

      private

      # Every one of these fields must be filled in the runner configuration.
      def valid?
        required_fields = %w[bearer_token namespace nfs_server nfs_path node_selector]
        required_fields.all? { |field| node.runner_config[field].present? }
      end
    end
  end
end
@@ -0,0 +1,103 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Kubernetes
    # Translates a Kubernetes pod into a Runners::ExecutionInfo.
    #
    # Only the first container status of the pod is considered.
    class CreateExecutionInfo
      attr_reader :pod

      # Waiting reasons that are reported as errors.
      ERROR_REASONS = %w[
        ImagePullBackOff
        ErrImagePull
      ].freeze

      # @param pod the pod to describe; a nil pod (or one without status)
      #   yields an execution info whose status is nil instead of raising
      # @return [Runners::ExecutionInfo]
      def perform(pod:)
        @pod = pod

        Runners::ExecutionInfo.new(
          id: pod&.metadata&.name,
          status: status,
          exit_code: container_status&.state&.terminated&.exitCode,
          started_at: started_at,
          finished_at: container_status&.state&.terminated&.finishedAt,
          error: error_message,
          schedule_pending: schedule_pending?
        )
      end

      private

      # The order matters: an image pull failure is also a "waiting" state, so
      # errors are checked before the generic pending case.
      def status
        if running?
          "running"
        elsif terminated_with_success?
          "success"
        elsif error?
          "error"
        elsif waiting? || schedule_pending?
          "pending"
        end
      end

      def waiting?
        container_status&.state&.waiting.present?
      end

      def running?
        container_status&.state&.running&.present?
      end

      def terminated_with_error?
        container_status&.state&.terminated&.exitCode&.positive?
      end

      def terminated_with_success?
        container_status&.state&.terminated&.exitCode&.zero?
      end

      def reason_is_error?
        waiting? && ERROR_REASONS.include?(reason[:reason])
      end

      def error?
        terminated_with_error? || reason_is_error?
      end

      def schedule_pending?
        unschedulable_error_message.present?
      end

      # "<reason>: <message>" of the Unschedulable condition of a pending pod,
      # nil otherwise.
      def unschedulable_error_message
        return if pod&.status&.phase != "Pending"

        found = pod&.status&.conditions&.find { |condition| condition.reason == "Unschedulable" }
        "#{found.reason}: #{found.message}" if found
      end

      def error_message
        if error?
          reason.values.compact.join(": ")
        elsif unschedulable_error_message.present?
          unschedulable_error_message
        end
      end

      def started_at
        (container_status&.state&.terminated || container_status&.state&.running)&.startedAt
      end

      # Reason and message taken from the first entry of the container state hash.
      def reason
        reason_value = container_status&.state&.to_hash&.values&.first
        return {} unless reason_value

        {
          reason: reason_value[:reason],
          message: reason_value[:message]
        }
      end

      # Safe navigation on +pod+ keeps this consistent with the other pod
      # reads in this class, which already tolerate a nil pod.
      def container_status
        pod&.status&.containerStatuses&.first
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Kubernetes
    # Fetches the pod that runs a task and converts it into an execution info.
    class FetchExecutionInfo
      # @param task the task whose runner pod is looked up (by +runner_id+) on
      #   the node of the task's slot
      # @return the result of CreateExecutionInfo for the fetched pod
      # @raise [Runners::RunnerIdNotFoundError] when the pod does not exist
      def perform(task:)
        client = CreateClient.new.perform(node: task.slot.node)
        pod = client.fetch_pod(pod_name: task.runner_id)

        CreateExecutionInfo.new.perform(pod: pod)
      rescue KubernetesClient::PodNotFoundError => e
        raise Runners::RunnerIdNotFoundError, e.message
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Kubernetes
    # Fetches the logs of the pod that runs a task.
    class FetchLogs
      # @param task the task whose runner pod logs are requested
      # @return the pod logs, or the text "Logs not found" when the client
      #   reports that no logs are available
      # @raise [Runners::RunnerIdNotFoundError] when the pod does not exist
      def perform(task:)
        client = CreateClient.new.perform(node: task.slot.node)
        client.fetch_pod_logs(pod_name: task.runner_id)
      rescue KubernetesClient::PodNotFoundError => e
        raise Runners::RunnerIdNotFoundError, e.message
      rescue KubernetesClient::LogsNotFoundError
        Rails.logger.error("Error on fetching kubernetes pod logs")

        "Logs not found"
      end
    end
  end
end
@@ -0,0 +1,41 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Kubernetes
    # Maps the storage mounts requested by a task to Kubernetes mount data.
    class Filer
      # Raised when a task asks for a mount that is not configured under
      # Settings storage_mounts/kubernetes.
      class InvalidMountName < StandardError; end

      attr_reader :task_storage_mounts

      # @param task_storage_mounts [Hash] mount name => path inside the container
      # @return [Hash] +:internal+ holds one { name:, mountPath: } entry per
      #   mount; +:external+ holds the configured settings of each mount
      #   merged with its name
      # @raise [InvalidMountName] when a mount name has no configured entry
      def perform(task_storage_mounts:)
        @task_storage_mounts = task_storage_mounts

        {
          internal: internal,
          external: external
        }
      end

      private

      def internal
        task_storage_mounts.map do |task_mount_name, task_mount_path|
          {
            name: task_mount_name,
            mountPath: task_mount_path
          }
        end
      end

      def external
        # Looked up once per call; a missing settings section is treated as
        # "no mounts configured" so the caller gets InvalidMountName rather
        # than a NoMethodError on nil.
        node_mounts = Settings.to_hash.dig(:storage_mounts, :kubernetes) || {}

        task_storage_mounts.map do |task_mount_name, _task_mount_path|
          node_mount_path = node_mounts[task_mount_name.to_sym]

          raise InvalidMountName, "Unknown kubernetes storage mount: #{task_mount_name}" unless node_mount_path

          node_mount_path.merge(name: task_mount_name)
        end
      end
    end
  end
end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Kubernetes
    # Stops the runner attached to a slot by delegating to RemoveRunner.
    class KillSlotRunner
      # @param slot the slot whose +runner_id+ is removed from the slot's node
      # @return whatever RemoveRunner#perform returns
      def perform(slot:)
        remover = RemoveRunner.new
        remover.perform(node: slot.node, runner_id: slot.runner_id)
      end
    end
  end
end