container_broker 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +98 -0
- data/Rakefile +8 -0
- data/app/controllers/application_controller.rb +5 -0
- data/app/controllers/healthcheck_controller.rb +21 -0
- data/app/controllers/nodes_controller.rb +70 -0
- data/app/controllers/nodes_healthcheck_controller.rb +28 -0
- data/app/controllers/status_controller.rb +48 -0
- data/app/controllers/tasks_controller.rb +83 -0
- data/app/controllers/tasks_healthcheck_controller.rb +28 -0
- data/app/jobs/add_task_tags_job.rb +13 -0
- data/app/jobs/adjust_node_slots_job.rb +27 -0
- data/app/jobs/application_job.rb +9 -0
- data/app/jobs/collect_load_metrics_job.rb +9 -0
- data/app/jobs/container_broker_base_job.rb +32 -0
- data/app/jobs/migrate_tasks_from_dead_node_job.rb +32 -0
- data/app/jobs/monitor_unresponsive_node_job.rb +21 -0
- data/app/jobs/monitor_unresponsive_nodes_job.rb +9 -0
- data/app/jobs/release_slot_job.rb +47 -0
- data/app/jobs/remove_runner_job.rb +11 -0
- data/app/jobs/remove_unused_tags_job.rb +25 -0
- data/app/jobs/request_id_from_task.rb +7 -0
- data/app/jobs/run_task_job.rb +64 -0
- data/app/jobs/run_tasks_for_all_execution_types_job.rb +11 -0
- data/app/jobs/run_tasks_job.rb +42 -0
- data/app/jobs/timeout_failed_tasks_job.rb +31 -0
- data/app/jobs/update_all_nodes_status_job.rb +9 -0
- data/app/jobs/update_node_status_job.rb +24 -0
- data/app/jobs/update_task_status_job.rb +71 -0
- data/app/models/mongoid_serializable_model.rb +14 -0
- data/app/models/node.rb +101 -0
- data/app/models/slot.rb +42 -0
- data/app/models/task.rb +148 -0
- data/app/models/task_tag.rb +11 -0
- data/app/observers/observable.rb +23 -0
- data/app/observers/task_observer.rb +11 -0
- data/app/serializers/node_healthcheck_serializer.rb +5 -0
- data/app/serializers/node_serializer.rb +5 -0
- data/app/serializers/status_panel_node_serializer.rb +9 -0
- data/app/serializers/status_panel_slot_serializer.rb +5 -0
- data/app/serializers/status_panel_task_serializer.rb +16 -0
- data/app/serializers/task_healthcheck_serializer.rb +5 -0
- data/app/serializers/task_serializer.rb +7 -0
- data/app/services/adjust_execution_type_slots.rb +51 -0
- data/app/services/check_for_slot_removal.rb +28 -0
- data/app/services/collect_load_metrics.rb +40 -0
- data/app/services/delete_node.rb +25 -0
- data/app/services/friendly_name_nodes.rb +10 -0
- data/app/services/friendly_name_slots.rb +15 -0
- data/app/services/kill_node_runners.rb +17 -0
- data/app/services/kill_task_container.rb +29 -0
- data/app/services/kubernetes_client.rb +136 -0
- data/app/services/least_used_node.rb +44 -0
- data/app/services/lock_manager.rb +74 -0
- data/app/services/lock_slot.rb +37 -0
- data/app/services/lock_task.rb +45 -0
- data/app/services/metrics.rb +43 -0
- data/app/services/migrate_runner.rb +26 -0
- data/app/services/node_task_acceptance.rb +18 -0
- data/app/services/node_usage_percentage_per_execution_type.rb +22 -0
- data/app/services/reschedule_tasks_for_missing_runners.rb +70 -0
- data/app/services/runners.rb +4 -0
- data/app/services/runners/docker/create_connection.rb +18 -0
- data/app/services/runners/docker/create_execution_info.rb +87 -0
- data/app/services/runners/docker/fetch_execution_info.rb +17 -0
- data/app/services/runners/docker/fetch_logs.rb +18 -0
- data/app/services/runners/docker/fetch_task_container.rb +15 -0
- data/app/services/runners/docker/filer.rb +19 -0
- data/app/services/runners/docker/kill_slot_runner.rb +19 -0
- data/app/services/runners/docker/node_availability.rb +11 -0
- data/app/services/runners/docker/remove_runner.rb +18 -0
- data/app/services/runners/docker/run_task.rb +63 -0
- data/app/services/runners/docker/update_node_status.rb +62 -0
- data/app/services/runners/execution_info.rb +49 -0
- data/app/services/runners/invalid_config.rb +5 -0
- data/app/services/runners/invalid_runner.rb +5 -0
- data/app/services/runners/kubernetes/create_client.rb +29 -0
- data/app/services/runners/kubernetes/create_execution_info.rb +103 -0
- data/app/services/runners/kubernetes/fetch_execution_info.rb +15 -0
- data/app/services/runners/kubernetes/fetch_logs.rb +17 -0
- data/app/services/runners/kubernetes/filer.rb +41 -0
- data/app/services/runners/kubernetes/kill_slot_runner.rb +11 -0
- data/app/services/runners/kubernetes/node_availability.rb +11 -0
- data/app/services/runners/kubernetes/remove_runner.rb +19 -0
- data/app/services/runners/kubernetes/run_task.rb +54 -0
- data/app/services/runners/kubernetes/update_node_status.rb +64 -0
- data/app/services/runners/runner_id_not_found_error.rb +5 -0
- data/app/services/runners/services_factory.rb +38 -0
- data/app/services/runners/update_node_status_helper.rb +43 -0
- data/app/services/slots_usage_percentage.rb +18 -0
- data/config/application.rb +34 -0
- data/config/boot.rb +5 -0
- data/config/environment.rb +7 -0
- data/config/environments/test.rb +44 -0
- data/config/initializers/application_controller_renderer.rb +10 -0
- data/config/initializers/backtrace_silencers.rb +9 -0
- data/config/initializers/config.rb +51 -0
- data/config/initializers/cookies_serializer.rb +7 -0
- data/config/initializers/docker_config.rb +3 -0
- data/config/initializers/filter_parameter_logging.rb +6 -0
- data/config/initializers/idempotent_request.rb +12 -0
- data/config/initializers/inflections.rb +18 -0
- data/config/initializers/mime_types.rb +6 -0
- data/config/initializers/mongoid.rb +3 -0
- data/config/initializers/new_framework_defaults_6_0.rb +47 -0
- data/config/initializers/raven.rb +10 -0
- data/config/initializers/sidekiq.rb +24 -0
- data/config/initializers/wrap_parameters.rb +16 -0
- data/config/locales/en.yml +33 -0
- data/config/mongoid.yml +10 -0
- data/config/routes.rb +43 -0
- data/config/secrets.yml +35 -0
- data/config/settings.yml +34 -0
- data/config/settings/test.yml +27 -0
- data/config/sidekiq_scheduler.yml +18 -0
- data/config/spring.rb +8 -0
- data/lib/constants.rb +12 -0
- data/lib/container_broker.rb +30 -0
- data/lib/container_broker/engine.rb +6 -0
- data/lib/container_broker/version.rb +5 -0
- data/lib/current_thread_request_id.rb +19 -0
- data/lib/idempotent_request/callback.rb +25 -0
- data/lib/idempotent_request/policy.rb +15 -0
- data/lib/redis_url_parser.rb +25 -0
- data/lib/tasks/task.rake +34 -0
- metadata +590 -0
@@ -0,0 +1,18 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Runners
  module Docker
    # Removes a Docker container (the "runner") from a node.
    class RemoveRunner
      # Kills the container when it is still running and then deletes it.
      #
      # A container that is already gone (or whose removal is already in
      # progress) is only logged. Connection level failures are recorded on
      # the node through +node.register_error+ instead of being raised.
      #
      # @param node the node that hosts the container
      # @param runner_id [String] the container name/id to remove
      def perform(node:, runner_id:)
        Rails.logger.info("Removing container #{runner_id} from node #{node}")
        container = ::Docker::Container.get(runner_id, { all: true }, CreateConnection.new.perform(node: node))
        container.kill if container.info["State"]["Status"] == "running"
        container.delete
      rescue ::Docker::Error::NotFoundError, ::Docker::Error::ConflictError => e
        # The exception class is interpolated so the log tells a missing
        # container apart from a removal conflict.
        Rails.logger.info("Container #{runner_id} already removed - #{e.message} (#{e.class})")
      rescue Excon::Error, ::Docker::Error::TimeoutError => e
        node.register_error(e.message)
      end
    end
  end
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Runners
  module Docker
    # Runs a task as a Docker container on the node that owns the given slot.
    class RunTask
      # Tag used when the image reference does not carry one.
      DEFAULT_IMAGE_TAG = "latest"

      # Pulls the task image when the node does not have it yet, creates a
      # container named after +runner_id+ and starts it.
      #
      # @param task the task to run (reads +image+, +cmd+ and +storage_mounts+)
      # @param slot the slot whose node will host the container
      # @param runner_id [String] name given to the created container
      # @return [String] the given +runner_id+
      # @raise [Node::NodeConnectionError] on Excon or Docker timeout errors
      # @raise [RuntimeError] when Docker reports the image as not found
      def perform(task:, slot:, runner_id:)
        Rails.logger.debug("Performing Docker::RunTask for #{task} #{slot}")

        pull_image(task: task, slot: slot)
        Rails.logger.debug("Image pulled for #{task} #{slot}")

        container = create_container(task: task, slot: slot, name: runner_id)
        Rails.logger.debug("Container #{container.id} created for #{task} #{slot} with name #{runner_id}")

        container.start
        Rails.logger.debug("Container #{container.id} started")

        runner_id
      rescue Excon::Error, ::Docker::Error::TimeoutError => e
        message = "Docker connection error: #{e.message}"
        message += "\n#{e.response.body}" if e.respond_to?(:response)
        raise Node::NodeConnectionError, message
      rescue ::Docker::Error::NotFoundError => e
        raise "Docker image not found: #{e.message}"
      end

      private

      # Pulls the task image unless the node already has it.
      def pull_image(task:, slot:)
        return if ::Docker::Image.exist?(task.image, {}, CreateConnection.new.perform(node: slot.node))

        image_name, image_tag = split_image_reference(task.image)

        ::Docker::Image.create({ "fromImage" => image_name, "tag" => image_tag }, nil, CreateConnection.new.perform(node: slot.node))
      end

      # Splits an image reference into name and tag.
      #
      # Only a colon that comes after the last "/" separates the tag; a colon
      # before it belongs to a registry port ("registry:5000/app"). A missing
      # or empty tag falls back to DEFAULT_IMAGE_TAG.
      #
      # @param image [String]
      # @return [Array(String, String)] name and tag
      def split_image_reference(image)
        name, separator, tag = image.rpartition(":")
        return [image, DEFAULT_IMAGE_TAG] if separator.empty? || tag.include?("/")

        [name, tag.empty? ? DEFAULT_IMAGE_TAG : tag]
      end

      # Creates (without starting) the container that will execute the task.
      def create_container(task:, slot:, name:)
        binds = Filer.new.perform(task_storage_mounts: task.storage_mounts)

        user = [
          Settings.run_container_as.user_id,
          Settings.run_container_as.group_id
        ].join(":")

        ::Docker::Container.create(
          {
            "name" => name,
            "Image" => task.image,
            "User" => user,
            "HostConfig" => {
              "Binds" => binds,
              "NetworkMode" => ENV["DOCKER_CONTAINERS_NETWORK"].to_s
            },
            # The image entrypoint is cleared so the task command runs through the shell.
            "Entrypoint" => [],
            "Cmd" => ["sh", "-c", task.cmd]
          },
          CreateConnection.new.perform(node: slot.node)
        )
      end
    end
  end
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Runners
  module Docker
    # Reconciles the containers found on a Docker node with the node's slots
    # and with the tasks currently marked as started.
    class UpdateNodeStatus
      include UpdateNodeStatusHelper

      attr_reader :node

      # Inspects every container of the node, releases or restarts slots as
      # needed, reschedules tasks whose runner is missing and records the
      # outcome on the node.
      #
      # Excon and Docker errors are recorded with +node.register_error+
      # instead of being raised.
      #
      # @param node the node to update
      def perform(node:)
        @node = node
        Rails.logger.debug("Start updating node status for #{node}")

        # Other tasks can be started at this time. Because of this it's necessary to load the tasks first and then the containers
        started_tasks = Task.started.where(:slot.in => node.slots.pluck(:id)).to_a

        Rails.logger.debug("Got #{containers.count} containers")

        execution_infos = containers.map do |container|
          info = CreateExecutionInfo.new.perform(container: container)
          slot = node.slots.find_by(runner_id: info.id)

          unless slot
            # No slot references this container.
            remove_unknown_runner(node: node, runner_id: info.id)
            next info
          end

          Rails.logger.debug("Slot found for container #{info.id}: #{slot}")

          if info.terminated?
            Rails.logger.debug("Container #{info.id} exited")

            check_slot_release(slot: slot, runner_id: info.id)
          elsif started_with_error?(container: container, docker_connection: CreateConnection.new.perform(node: node))
            container.start
          end

          info
        end

        rescheduler = RescheduleTasksForMissingRunners.new(
          runner_ids: execution_infos.map(&:id),
          started_tasks: started_tasks
        )
        rescheduler.perform

        node.register_success

        send_metrics(node: node, execution_infos: execution_infos)
      rescue Excon::Error, ::Docker::Error::DockerError => e
        node.register_error(e.message)
      end

      private

      # All containers of the node (including stopped ones), fetched once per instance.
      def containers
        return @containers if @containers

        connection = CreateConnection.new.perform(node: node)
        @containers = ::Docker::Container.all({ all: true }, connection)
      end

      # True when the listing reports the container as "created" and its
      # detailed state carries a positive exit code.
      def started_with_error?(container:, docker_connection:)
        return false unless container.info["State"] == "created"

        details = ::Docker::Container.get(container.id, { all: true }, docker_connection)
        details.info["State"]["ExitCode"].positive?
      end
    end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Runners
  # Raised when success or failure is asked of an execution whose only known
  # status is the bare "exited" (no exit code available yet).
  class UnknownCompletionInformation < StandardError; end

  # Runner-agnostic snapshot of a task execution.
  ExecutionInfo = Struct.new(:id, :status, :exit_code, :started_at, :finished_at, :error, :schedule_pending, keyword_init: true) do
    # @return [Boolean] whether the execution finished successfully
    # @raise [UnknownCompletionInformation] when the status is just "exited"
    def success?
      check_completion_information_available
      status_is?("success")
    end

    # @return [Boolean] whether the execution finished with an error
    # @raise [UnknownCompletionInformation] when the status is just "exited"
    def error?
      check_completion_information_available
      status_is?("error")
    end

    # @return [Boolean] whether the execution is running
    def running?
      status_is?("running")
    end

    # @return [Boolean] whether the execution is pending
    def pending?
      status_is?("pending")
    end

    # @return [Boolean] whether the execution has ended, with or without
    #   completion details
    def terminated?
      # Checked first so success?/error? are never reached (and never raise)
      # for an "exited" status.
      return true if exited_without_completion_information?

      success? || error?
    end

    # @return [Boolean] whether only the bare "exited" status is known
    def exited_without_completion_information?
      status_is?("exited")
    end

    # @return the +schedule_pending+ member as given at construction
    def schedule_pending?
      self[:schedule_pending]
    end

    private

    def status_is?(expected)
      status == expected
    end

    # Some execution infos return just the "exited" status and not the complete state.
    # When the caller asks for success or error, it is forced to fetch the
    # complete status (which has the exit code) first.
    def check_completion_information_available
      return unless exited_without_completion_information?

      raise(UnknownCompletionInformation, "Complete status not available")
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Runners
  module Kubernetes
    # Builds a KubernetesClient from a node's runner configuration.
    class CreateClient
      # Keys that must be present (non blank) in +node.runner_config+.
      REQUIRED_CONFIG_FIELDS = %w[bearer_token namespace nfs_server nfs_path node_selector].freeze

      attr_reader :node

      # @param node a node whose +runner_config+ holds the kubernetes settings
      # @return [KubernetesClient] client built from the node hostname, bearer
      #   token and namespace
      # @raise [Runners::InvalidRunner] when the node is not a kubernetes runner
      # @raise [Runners::InvalidConfig] when a required field is missing or blank
      def perform(node:)
        @node = node

        raise(Runners::InvalidRunner, "Node must be a kubernetes runner") unless node.kubernetes?

        missing = missing_fields
        unless missing.empty?
          # Only the field names are reported: runner_config holds the bearer
          # token, which must not end up in logs or error trackers.
          raise(Runners::InvalidConfig, "Invalid configuration for kubernetes (missing or blank: #{missing.join(", ")})")
        end

        KubernetesClient.new(
          uri: node.hostname,
          bearer_token: node.runner_config["bearer_token"],
          namespace: node.runner_config["namespace"]
        )
      end

      private

      # @return [Boolean] whether every required field is filled in
      def valid?
        missing_fields.empty?
      end

      # @return [Array<String>] names of the required fields that are blank
      def missing_fields
        REQUIRED_CONFIG_FIELDS.select { |field| node.runner_config[field].blank? }
      end
    end
  end
end
|
@@ -0,0 +1,103 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Runners
  module Kubernetes
    # Translates a Kubernetes pod into a Runners::ExecutionInfo.
    class CreateExecutionInfo
      attr_reader :pod

      # Waiting reasons that are reported as an error.
      ERROR_REASONS = %w[
        ImagePullBackOff
        ErrImagePull
      ].freeze

      # @param pod the pod object; reads +metadata.name+, +status.phase+,
      #   +status.conditions+ and the first entry of +status.containerStatuses+
      # @return [Runners::ExecutionInfo]
      def perform(pod:)
        @pod = pod

        Runners::ExecutionInfo.new(
          id: pod&.metadata&.name,
          status: status,
          exit_code: terminated_state&.exitCode,
          started_at: started_at,
          finished_at: terminated_state&.finishedAt,
          error: error_message,
          schedule_pending: schedule_pending?
        )
      end

      private

      # First matching status wins; nil when nothing matches.
      def status
        return "running" if running?
        return "success" if terminated_with_success?
        return "error" if error?

        "pending" if waiting? || schedule_pending?
      end

      def waiting?
        container_state&.waiting.present?
      end

      def running?
        container_state&.running&.present?
      end

      def terminated_with_error?
        terminated_state&.exitCode&.positive?
      end

      def terminated_with_success?
        terminated_state&.exitCode&.zero?
      end

      def reason_is_error?
        waiting? && ERROR_REASONS.include?(reason[:reason])
      end

      def error?
        terminated_with_error? || reason_is_error?
      end

      def schedule_pending?
        unschedulable_error_message.present?
      end

      # "reason: message" of the Unschedulable condition of a Pending pod, or nil.
      def unschedulable_error_message
        return unless pod.status&.phase == "Pending"

        condition = pod&.status&.conditions&.find { |candidate| candidate.reason == "Unschedulable" }
        return unless condition

        "#{condition.reason}: #{condition.message}"
      end

      def error_message
        return reason.values.compact.join(": ") if error?

        message = unschedulable_error_message
        message if message.present?
      end

      def started_at
        (terminated_state || container_state&.running)&.startedAt
      end

      # Reason and message taken from the first entry of the container state hash.
      def reason
        details = container_state&.to_hash&.values&.first

        details ? { reason: details[:reason], message: details[:message] } : {}
      end

      def terminated_state
        container_state&.terminated
      end

      def container_state
        container_status&.state
      end

      def container_status
        pod.status&.containerStatuses&.first
      end
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Runners
  module Kubernetes
    # Fetches the pod that runs a task and converts it into an execution info.
    class FetchExecutionInfo
      # @param task the task; reads +slot.node+ and +runner_id+
      # @return the result of CreateExecutionInfo for the fetched pod
      # @raise [Runners::RunnerIdNotFoundError] when the pod does not exist
      def perform(task:)
        client = CreateClient.new.perform(node: task.slot.node)
        fetched_pod = client.fetch_pod(pod_name: task.runner_id)

        CreateExecutionInfo.new.perform(pod: fetched_pod)
      rescue KubernetesClient::PodNotFoundError => not_found
        # Translated into the runner-agnostic error.
        raise Runners::RunnerIdNotFoundError, not_found.message
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Runners
  module Kubernetes
    # Fetches the logs of the pod that runs a task.
    class FetchLogs
      # @param task the task; reads +slot.node+ and +runner_id+
      # @return the pod logs, or the text "Logs not found" when the client
      #   reports that the logs are unavailable
      # @raise [Runners::RunnerIdNotFoundError] when the pod does not exist
      def perform(task:)
        client = CreateClient.new.perform(node: task.slot.node)
        client.fetch_pod_logs(pod_name: task.runner_id)
      rescue KubernetesClient::PodNotFoundError => not_found
        raise Runners::RunnerIdNotFoundError, not_found.message
      rescue KubernetesClient::LogsNotFoundError
        # Missing logs are not fatal: a placeholder text is returned instead.
        Rails.logger.error("Error on fetching kubernetes pod logs")
        "Logs not found"
      end
    end
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Runners
  module Kubernetes
    # Translates the storage mounts of a task into the structures used to
    # build a pod: the container mounts ("internal") and the matching volume
    # definitions taken from the settings ("external").
    class Filer
      # Raised when a task asks for a mount name that has no kubernetes
      # definition in the settings.
      class InvalidMountName < StandardError; end

      attr_reader :task_storage_mounts

      # @param task_storage_mounts [Hash] mount name => path inside the container
      # @return [Hash] +:internal+ (array of +{ name:, mountPath: }+) and
      #   +:external+ (array of the configured definitions merged with +name:+)
      # @raise [InvalidMountName] when a mount name is not configured
      def perform(task_storage_mounts:)
        @task_storage_mounts = task_storage_mounts

        {
          internal: internal,
          external: external
        }
      end

      private

      def internal
        task_storage_mounts.map do |task_mount_name, task_mount_path|
          {
            name: task_mount_name,
            mountPath: task_mount_path
          }
        end
      end

      def external
        # Read once instead of rebuilding the settings hash for every mount.
        configured = configured_mounts

        task_storage_mounts.map do |task_mount_name, _task_mount_path|
          node_mount_path = configured[task_mount_name.to_sym]

          unless node_mount_path
            raise InvalidMountName, "Storage mount #{task_mount_name.inspect} is not configured for kubernetes"
          end

          node_mount_path.merge(name: task_mount_name)
        end
      end

      # Kubernetes mount definitions from the settings; an empty hash when the
      # section is absent, so an unknown mount is reported as InvalidMountName
      # instead of failing on nil.
      def configured_mounts
        Settings.to_hash.dig(:storage_mounts, :kubernetes) || {}
      end
    end
  end
end
|