container_broker 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +98 -0
- data/Rakefile +8 -0
- data/app/controllers/application_controller.rb +5 -0
- data/app/controllers/healthcheck_controller.rb +21 -0
- data/app/controllers/nodes_controller.rb +70 -0
- data/app/controllers/nodes_healthcheck_controller.rb +28 -0
- data/app/controllers/status_controller.rb +48 -0
- data/app/controllers/tasks_controller.rb +83 -0
- data/app/controllers/tasks_healthcheck_controller.rb +28 -0
- data/app/jobs/add_task_tags_job.rb +13 -0
- data/app/jobs/adjust_node_slots_job.rb +27 -0
- data/app/jobs/application_job.rb +9 -0
- data/app/jobs/collect_load_metrics_job.rb +9 -0
- data/app/jobs/container_broker_base_job.rb +32 -0
- data/app/jobs/migrate_tasks_from_dead_node_job.rb +32 -0
- data/app/jobs/monitor_unresponsive_node_job.rb +21 -0
- data/app/jobs/monitor_unresponsive_nodes_job.rb +9 -0
- data/app/jobs/release_slot_job.rb +47 -0
- data/app/jobs/remove_runner_job.rb +11 -0
- data/app/jobs/remove_unused_tags_job.rb +25 -0
- data/app/jobs/request_id_from_task.rb +7 -0
- data/app/jobs/run_task_job.rb +64 -0
- data/app/jobs/run_tasks_for_all_execution_types_job.rb +11 -0
- data/app/jobs/run_tasks_job.rb +42 -0
- data/app/jobs/timeout_failed_tasks_job.rb +31 -0
- data/app/jobs/update_all_nodes_status_job.rb +9 -0
- data/app/jobs/update_node_status_job.rb +24 -0
- data/app/jobs/update_task_status_job.rb +71 -0
- data/app/models/mongoid_serializable_model.rb +14 -0
- data/app/models/node.rb +101 -0
- data/app/models/slot.rb +42 -0
- data/app/models/task.rb +148 -0
- data/app/models/task_tag.rb +11 -0
- data/app/observers/observable.rb +23 -0
- data/app/observers/task_observer.rb +11 -0
- data/app/serializers/node_healthcheck_serializer.rb +5 -0
- data/app/serializers/node_serializer.rb +5 -0
- data/app/serializers/status_panel_node_serializer.rb +9 -0
- data/app/serializers/status_panel_slot_serializer.rb +5 -0
- data/app/serializers/status_panel_task_serializer.rb +16 -0
- data/app/serializers/task_healthcheck_serializer.rb +5 -0
- data/app/serializers/task_serializer.rb +7 -0
- data/app/services/adjust_execution_type_slots.rb +51 -0
- data/app/services/check_for_slot_removal.rb +28 -0
- data/app/services/collect_load_metrics.rb +40 -0
- data/app/services/delete_node.rb +25 -0
- data/app/services/friendly_name_nodes.rb +10 -0
- data/app/services/friendly_name_slots.rb +15 -0
- data/app/services/kill_node_runners.rb +17 -0
- data/app/services/kill_task_container.rb +29 -0
- data/app/services/kubernetes_client.rb +136 -0
- data/app/services/least_used_node.rb +44 -0
- data/app/services/lock_manager.rb +74 -0
- data/app/services/lock_slot.rb +37 -0
- data/app/services/lock_task.rb +45 -0
- data/app/services/metrics.rb +43 -0
- data/app/services/migrate_runner.rb +26 -0
- data/app/services/node_task_acceptance.rb +18 -0
- data/app/services/node_usage_percentage_per_execution_type.rb +22 -0
- data/app/services/reschedule_tasks_for_missing_runners.rb +70 -0
- data/app/services/runners.rb +4 -0
- data/app/services/runners/docker/create_connection.rb +18 -0
- data/app/services/runners/docker/create_execution_info.rb +87 -0
- data/app/services/runners/docker/fetch_execution_info.rb +17 -0
- data/app/services/runners/docker/fetch_logs.rb +18 -0
- data/app/services/runners/docker/fetch_task_container.rb +15 -0
- data/app/services/runners/docker/filer.rb +19 -0
- data/app/services/runners/docker/kill_slot_runner.rb +19 -0
- data/app/services/runners/docker/node_availability.rb +11 -0
- data/app/services/runners/docker/remove_runner.rb +18 -0
- data/app/services/runners/docker/run_task.rb +63 -0
- data/app/services/runners/docker/update_node_status.rb +62 -0
- data/app/services/runners/execution_info.rb +49 -0
- data/app/services/runners/invalid_config.rb +5 -0
- data/app/services/runners/invalid_runner.rb +5 -0
- data/app/services/runners/kubernetes/create_client.rb +29 -0
- data/app/services/runners/kubernetes/create_execution_info.rb +103 -0
- data/app/services/runners/kubernetes/fetch_execution_info.rb +15 -0
- data/app/services/runners/kubernetes/fetch_logs.rb +17 -0
- data/app/services/runners/kubernetes/filer.rb +41 -0
- data/app/services/runners/kubernetes/kill_slot_runner.rb +11 -0
- data/app/services/runners/kubernetes/node_availability.rb +11 -0
- data/app/services/runners/kubernetes/remove_runner.rb +19 -0
- data/app/services/runners/kubernetes/run_task.rb +54 -0
- data/app/services/runners/kubernetes/update_node_status.rb +64 -0
- data/app/services/runners/runner_id_not_found_error.rb +5 -0
- data/app/services/runners/services_factory.rb +38 -0
- data/app/services/runners/update_node_status_helper.rb +43 -0
- data/app/services/slots_usage_percentage.rb +18 -0
- data/config/application.rb +34 -0
- data/config/boot.rb +5 -0
- data/config/environment.rb +7 -0
- data/config/environments/test.rb +44 -0
- data/config/initializers/application_controller_renderer.rb +10 -0
- data/config/initializers/backtrace_silencers.rb +9 -0
- data/config/initializers/config.rb +51 -0
- data/config/initializers/cookies_serializer.rb +7 -0
- data/config/initializers/docker_config.rb +3 -0
- data/config/initializers/filter_parameter_logging.rb +6 -0
- data/config/initializers/idempotent_request.rb +12 -0
- data/config/initializers/inflections.rb +18 -0
- data/config/initializers/mime_types.rb +6 -0
- data/config/initializers/mongoid.rb +3 -0
- data/config/initializers/new_framework_defaults_6_0.rb +47 -0
- data/config/initializers/raven.rb +10 -0
- data/config/initializers/sidekiq.rb +24 -0
- data/config/initializers/wrap_parameters.rb +16 -0
- data/config/locales/en.yml +33 -0
- data/config/mongoid.yml +10 -0
- data/config/routes.rb +43 -0
- data/config/secrets.yml +35 -0
- data/config/settings.yml +34 -0
- data/config/settings/test.yml +27 -0
- data/config/sidekiq_scheduler.yml +18 -0
- data/config/spring.rb +8 -0
- data/lib/constants.rb +12 -0
- data/lib/container_broker.rb +30 -0
- data/lib/container_broker/engine.rb +6 -0
- data/lib/container_broker/version.rb +5 -0
- data/lib/current_thread_request_id.rb +19 -0
- data/lib/idempotent_request/callback.rb +25 -0
- data/lib/idempotent_request/policy.rb +15 -0
- data/lib/redis_url_parser.rb +25 -0
- data/lib/tasks/task.rake +34 -0
- metadata +590 -0
@@ -0,0 +1,18 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Runners
  module Docker
    # Removes the Docker container that backs a runner from a node.
    class RemoveRunner
      # Kills the container when its state is "running" and then deletes it.
      #
      # Docker "not found" and "conflict" errors are treated as "already
      # removed" and only logged. Excon errors and Docker timeouts are recorded
      # on the node through +register_error+ instead of being raised.
      #
      # @param node the node hosting the container; must respond to +register_error+
      # @param runner_id [String] identifier of the container to remove
      def perform(node:, runner_id:)
        Rails.logger.info("Removing container #{runner_id} from node #{node}")

        connection = CreateConnection.new.perform(node: node)
        container = ::Docker::Container.get(runner_id, { all: true }, connection)

        container.kill if container.info["State"]["Status"] == "running"
        container.delete
      rescue ::Docker::Error::NotFoundError, ::Docker::Error::ConflictError => e
        # The exception class must be interpolated; the previous message
        # logged the literal text "(e.class)".
        Rails.logger.info("Container #{runner_id} already removed - #{e.message} (#{e.class})")
      rescue Excon::Error, ::Docker::Error::TimeoutError => e
        node.register_error(e.message)
      end
    end
  end
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Runners
  module Docker
    # Runs a task on a Docker node: makes sure the image is available, creates
    # a container named after the runner id and starts it.
    class RunTask
      # Tag requested when the image reference does not carry one.
      DEFAULT_IMAGE_TAG = "latest"

      # @param task the task to run; read for +image+, +cmd+ and +storage_mounts+
      # @param slot the slot the task runs in; its +node+ selects the Docker host
      # @param runner_id [String] name given to the created container
      # @return [String] the +runner_id+ that was passed in
      # @raise [Node::NodeConnectionError] on Excon errors or Docker timeouts
      # @raise [RuntimeError] when Docker answers with a not-found error
      def perform(task:, slot:, runner_id:)
        Rails.logger.debug("Performing Docker::RunTask for #{task} #{slot}")

        pull_image(task: task, slot: slot)
        Rails.logger.debug("Image pulled for #{task} #{slot}")

        container = create_container(task: task, slot: slot, name: runner_id)
        Rails.logger.debug("Container #{container.id} created for #{task} #{slot} with name #{runner_id}")

        container.start
        Rails.logger.debug("Container #{container.id} started")

        runner_id
      rescue Excon::Error, ::Docker::Error::TimeoutError => e
        message = "Docker connection error: #{e.message}"
        # Only HTTP-level errors expose a response, and even then it may be nil.
        message += "\n#{e.response.body}" if e.respond_to?(:response) && e.response
        raise Node::NodeConnectionError, message
      rescue ::Docker::Error::NotFoundError => e
        raise "Docker image not found: #{e.message}"
      end

      private

      # Pulls the task image unless the node already has it.
      def pull_image(task:, slot:)
        connection = CreateConnection.new.perform(node: slot.node)
        return if ::Docker::Image.exist?(task.image, {}, connection)

        image_name, image_tag = split_image_reference(task.image)

        ::Docker::Image.create({ "fromImage" => image_name, "tag" => image_tag }, nil, connection)
      end

      # Splits an image reference into [name, tag].
      #
      # The tag separator is the last ":" and only counts when it comes after
      # the last "/"; otherwise the colon belongs to a registry port, as in
      # "registry:5000/app", and the default tag is used.
      def split_image_reference(image)
        name, separator, tag = image.rpartition(":")
        return [image, DEFAULT_IMAGE_TAG] if separator.empty? || tag.include?("/")

        [name, tag.empty? ? DEFAULT_IMAGE_TAG : tag]
      end

      # Creates (without starting) the container that will execute the task command.
      def create_container(task:, slot:, name:)
        binds = Filer.new.perform(task_storage_mounts: task.storage_mounts)

        user = [
          Settings.run_container_as.user_id,
          Settings.run_container_as.group_id
        ].join(":")

        ::Docker::Container.create(
          {
            "name" => name,
            "Image" => task.image,
            "User" => user,
            "HostConfig" => {
              "Binds" => binds,
              "NetworkMode" => ENV["DOCKER_CONTAINERS_NETWORK"].to_s
            },
            "Entrypoint" => [],
            "Cmd" => ["sh", "-c", task.cmd]
          },
          CreateConnection.new.perform(node: slot.node)
        )
      end
    end
  end
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Runners
  module Docker
    # Walks over every container of a Docker node, matches each one with the
    # node slot holding the same runner id and reacts to its state.
    #
    # +check_slot_release+, +remove_unknown_runner+ and +send_metrics+ are not
    # defined here; presumably they come from UpdateNodeStatusHelper.
    class UpdateNodeStatus
      include UpdateNodeStatusHelper

      attr_reader :node

      # @param node the node to refresh; Excon and Docker errors raised while
      #   doing so are recorded on it through +register_error+
      def perform(node:)
        @node = node
        Rails.logger.debug("Start updating node status for #{node}")

        # Other tasks can be started at this time. Because of this it's necessary to load the tasks first and then the containers
        slot_ids = node.slots.pluck(:id)
        started_tasks = Task.started.where(:slot.in => slot_ids).to_a

        Rails.logger.debug("Got #{containers.count} containers")

        execution_infos = containers.map do |container|
          info = CreateExecutionInfo.new.perform(container: container)
          slot = node.slots.find_by(runner_id: info.id)

          # A container with no matching slot is not known to this node.
          unless slot
            remove_unknown_runner(node: node, runner_id: info.id)
            next info
          end

          Rails.logger.debug("Slot found for container #{info.id}: #{slot}")

          if info.terminated?
            Rails.logger.debug("Container #{info.id} exited")

            check_slot_release(slot: slot, runner_id: info.id)
          elsif started_with_error?(container: container, docker_connection: CreateConnection.new.perform(node: node))
            container.start
          end

          info
        end

        runner_ids = execution_infos.map(&:id)
        RescheduleTasksForMissingRunners.new(runner_ids: runner_ids, started_tasks: started_tasks).perform

        node.register_success

        send_metrics(node: node, execution_infos: execution_infos)
      rescue Excon::Error, ::Docker::Error::DockerError => e
        node.register_error(e.message)
      end

      private

      # All containers of the node (stopped ones included), fetched once per instance.
      def containers
        return @containers if @containers

        @containers = ::Docker::Container.all({ all: true }, CreateConnection.new.perform(node: node))
      end

      # True when the listed container is still in the "created" state while
      # a fresh lookup of the same container reports a positive exit code.
      def started_with_error?(container:, docker_connection:)
        return false unless container.info["State"] == "created"

        details = ::Docker::Container.get(container.id, { all: true }, docker_connection)
        details.info["State"]["ExitCode"].positive?
      end
    end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Runners
  # Raised when success/error is asked of an execution that only reported "exited".
  class UnknownCompletionInformation < StandardError; end

  # Runner-agnostic snapshot of a task execution.
  #
  # +status+ is compared against the strings "success", "error", "running",
  # "pending" and "exited".
  ExecutionInfo = Struct.new(
    :id, :status, :exit_code, :started_at, :finished_at, :error, :schedule_pending,
    keyword_init: true
  ) do
    # @raise [UnknownCompletionInformation] when the status is just "exited"
    def success?
      ensure_completion_information!
      status == "success"
    end

    # @raise [UnknownCompletionInformation] when the status is just "exited"
    def error?
      ensure_completion_information!
      status == "error"
    end

    def running?
      status == "running"
    end

    def pending?
      status == "pending"
    end

    # True for "exited", "success" and "error"; never raises.
    def terminated?
      %w[exited success error].include?(status)
    end

    def exited_without_completion_information?
      status == "exited"
    end

    # Returns the raw +schedule_pending+ member.
    def schedule_pending?
      schedule_pending
    end

    private

    # Some execution infos return just the "exited" status and not the complete state.
    # When the caller asks for success or error we force it to fetch the complete
    # status (which has the exit code) by raising.
    def ensure_completion_information!
      return unless exited_without_completion_information?

      raise(UnknownCompletionInformation, "Complete status not available")
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Runners
  module Kubernetes
    # Builds a KubernetesClient for a node after validating its runner configuration.
    class CreateClient
      # Keys that must be present (non-blank) in the node's +runner_config+.
      REQUIRED_CONFIG_FIELDS = %w[bearer_token namespace nfs_server nfs_path node_selector].freeze

      attr_reader :node

      # @param node the node to connect to; read for +kubernetes?+, +hostname+
      #   and +runner_config+ (string keys)
      # @return [KubernetesClient]
      # @raise [Runners::InvalidRunner] when the node is not a kubernetes runner
      # @raise [Runners::InvalidConfig] when a required configuration field is blank
      def perform(node:)
        @node = node

        raise(Runners::InvalidRunner, "Node must be a kubernetes runner") unless node.kubernetes?

        missing = missing_config_fields
        unless missing.empty?
          # Only the missing field names are reported: the configuration holds
          # the bearer token and must not end up in logs or error trackers.
          raise(Runners::InvalidConfig, "Invalid configuration for kubernetes (missing: #{missing.join(', ')})")
        end

        KubernetesClient.new(
          uri: node.hostname,
          bearer_token: runner_config["bearer_token"],
          namespace: runner_config["namespace"]
        )
      end

      private

      # The node configuration, or an empty hash when none is set.
      def runner_config
        node.runner_config || {}
      end

      # Names of the required fields that are blank in the configuration.
      def missing_config_fields
        REQUIRED_CONFIG_FIELDS.select { |field| runner_config[field].blank? }
      end

      def valid?
        missing_config_fields.empty?
      end
    end
  end
end
|
@@ -0,0 +1,103 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Runners
  module Kubernetes
    # Translates a Kubernetes pod into a Runners::ExecutionInfo, looking only
    # at the first container status of the pod.
    class CreateExecutionInfo
      attr_reader :pod

      # Waiting reasons that are reported as an error instead of as pending.
      ERROR_REASONS = %w[
        ImagePullBackOff
        ErrImagePull
      ].freeze

      # @param pod the pod object; read for +metadata.name+ and +status+
      # @return [Runners::ExecutionInfo]
      def perform(pod:)
        @pod = pod

        terminated = container_status&.state&.terminated

        Runners::ExecutionInfo.new(
          id: pod&.metadata&.name,
          status: status,
          exit_code: terminated&.exitCode,
          started_at: started_at,
          finished_at: terminated&.finishedAt,
          error: error_message,
          schedule_pending: schedule_pending?
        )
      end

      private

      # "running", "success", "error", "pending", or nil when nothing matches.
      def status
        return "running" if running?
        return "success" if terminated_with_success?
        return "error" if error?

        "pending" if waiting? || schedule_pending?
      end

      def waiting?
        container_state&.waiting.present?
      end

      def running?
        container_state&.running&.present?
      end

      def terminated_with_error?
        container_state&.terminated&.exitCode&.positive?
      end

      def terminated_with_success?
        container_state&.terminated&.exitCode&.zero?
      end

      def reason_is_error?
        waiting? && ERROR_REASONS.include?(reason[:reason])
      end

      def error?
        terminated_with_error? || reason_is_error?
      end

      def schedule_pending?
        unschedulable_error_message.present?
      end

      # "Unschedulable: <message>" for a Pending pod carrying such a condition, else nil.
      def unschedulable_error_message
        return unless pod.status&.phase == "Pending"

        condition = pod&.status&.conditions&.find { |candidate| candidate.reason == "Unschedulable" }
        return unless condition

        "#{condition.reason}: #{condition.message}"
      end

      def error_message
        return reason.values.compact.join(": ") if error?

        message = unschedulable_error_message
        message if message.present?
      end

      def started_at
        state = container_state
        (state&.terminated || state&.running)&.startedAt
      end

      # Reason and message of the first entry of the container state hash;
      # an empty hash when there is no state.
      def reason
        details = container_state&.to_hash&.values&.first
        return {} unless details

        { reason: details[:reason], message: details[:message] }
      end

      def container_state
        container_status&.state
      end

      def container_status
        pod.status&.containerStatuses&.first
      end
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Runners
  module Kubernetes
    # Looks up the pod of a task and converts it into an execution info.
    class FetchExecutionInfo
      # @param task the task whose pod is fetched; read for +slot.node+ and +runner_id+
      # @return the result of CreateExecutionInfo#perform for the fetched pod
      # @raise [Runners::RunnerIdNotFoundError] when the client reports the pod as not found
      def perform(task:)
        client = CreateClient.new.perform(node: task.slot.node)
        pod = client.fetch_pod(pod_name: task.runner_id)

        CreateExecutionInfo.new.perform(pod: pod)
      rescue KubernetesClient::PodNotFoundError => e
        # Callers deal with the runner-agnostic error, carrying the same message.
        raise Runners::RunnerIdNotFoundError, e.message
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Runners
  module Kubernetes
    # Reads the logs of the pod that runs a task.
    class FetchLogs
      # @param task the task whose pod logs are read; read for +slot.node+ and +runner_id+
      # @return the logs returned by the client, or the string "Logs not found"
      #   when the client raises LogsNotFoundError
      # @raise [Runners::RunnerIdNotFoundError] when the client reports the pod as not found
      def perform(task:)
        client = CreateClient.new.perform(node: task.slot.node)

        client.fetch_pod_logs(pod_name: task.runner_id)
      rescue KubernetesClient::PodNotFoundError => e
        raise Runners::RunnerIdNotFoundError, e.message
      rescue KubernetesClient::LogsNotFoundError
        # Missing logs are not fatal: log the problem and hand back a placeholder.
        Rails.logger.error("Error on fetching kubernetes pod logs")

        "Logs not found"
      end
    end
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Runners
  module Kubernetes
    # Builds the mount definitions of a task: the mounts seen inside the
    # container (+internal+) and the matching node-side definitions taken from
    # the +storage_mounts.kubernetes+ settings (+external+).
    class Filer
      # Raised when a task mount has no entry in the kubernetes storage mounts settings.
      class InvalidMountName < StandardError; end

      attr_reader :task_storage_mounts

      # @param task_storage_mounts [Hash] mount name => path inside the container
      # @return [Hash] +{ internal: [...], external: [...] }+
      # @raise [InvalidMountName] when a mount name is not configured
      def perform(task_storage_mounts:)
        @task_storage_mounts = task_storage_mounts

        {
          internal: internal,
          external: external
        }
      end

      private

      # One +{ name:, mountPath: }+ entry per task mount.
      def internal
        task_storage_mounts.map do |task_mount_name, task_mount_path|
          {
            name: task_mount_name,
            mountPath: task_mount_path
          }
        end
      end

      # The configured node mount of every task mount, tagged with the mount name.
      def external
        # Read the settings once instead of once per mount. +dig+ turns a
        # missing settings section into InvalidMountName rather than NoMethodError.
        node_mounts = Settings.to_hash.dig(:storage_mounts, :kubernetes) || {}

        task_storage_mounts.map do |task_mount_name, _task_mount_path|
          node_mount = node_mounts[task_mount_name.to_sym]

          unless node_mount
            raise InvalidMountName, "No kubernetes storage mount configured for #{task_mount_name.inspect}"
          end

          node_mount.merge(name: task_mount_name)
        end
      end
    end
  end
end
|