container_broker 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +98 -0
- data/Rakefile +8 -0
- data/app/controllers/application_controller.rb +5 -0
- data/app/controllers/healthcheck_controller.rb +21 -0
- data/app/controllers/nodes_controller.rb +70 -0
- data/app/controllers/nodes_healthcheck_controller.rb +28 -0
- data/app/controllers/status_controller.rb +48 -0
- data/app/controllers/tasks_controller.rb +83 -0
- data/app/controllers/tasks_healthcheck_controller.rb +28 -0
- data/app/jobs/add_task_tags_job.rb +13 -0
- data/app/jobs/adjust_node_slots_job.rb +27 -0
- data/app/jobs/application_job.rb +9 -0
- data/app/jobs/collect_load_metrics_job.rb +9 -0
- data/app/jobs/container_broker_base_job.rb +32 -0
- data/app/jobs/migrate_tasks_from_dead_node_job.rb +32 -0
- data/app/jobs/monitor_unresponsive_node_job.rb +21 -0
- data/app/jobs/monitor_unresponsive_nodes_job.rb +9 -0
- data/app/jobs/release_slot_job.rb +47 -0
- data/app/jobs/remove_runner_job.rb +11 -0
- data/app/jobs/remove_unused_tags_job.rb +25 -0
- data/app/jobs/request_id_from_task.rb +7 -0
- data/app/jobs/run_task_job.rb +64 -0
- data/app/jobs/run_tasks_for_all_execution_types_job.rb +11 -0
- data/app/jobs/run_tasks_job.rb +42 -0
- data/app/jobs/timeout_failed_tasks_job.rb +31 -0
- data/app/jobs/update_all_nodes_status_job.rb +9 -0
- data/app/jobs/update_node_status_job.rb +24 -0
- data/app/jobs/update_task_status_job.rb +71 -0
- data/app/models/mongoid_serializable_model.rb +14 -0
- data/app/models/node.rb +101 -0
- data/app/models/slot.rb +42 -0
- data/app/models/task.rb +148 -0
- data/app/models/task_tag.rb +11 -0
- data/app/observers/observable.rb +23 -0
- data/app/observers/task_observer.rb +11 -0
- data/app/serializers/node_healthcheck_serializer.rb +5 -0
- data/app/serializers/node_serializer.rb +5 -0
- data/app/serializers/status_panel_node_serializer.rb +9 -0
- data/app/serializers/status_panel_slot_serializer.rb +5 -0
- data/app/serializers/status_panel_task_serializer.rb +16 -0
- data/app/serializers/task_healthcheck_serializer.rb +5 -0
- data/app/serializers/task_serializer.rb +7 -0
- data/app/services/adjust_execution_type_slots.rb +51 -0
- data/app/services/check_for_slot_removal.rb +28 -0
- data/app/services/collect_load_metrics.rb +40 -0
- data/app/services/delete_node.rb +25 -0
- data/app/services/friendly_name_nodes.rb +10 -0
- data/app/services/friendly_name_slots.rb +15 -0
- data/app/services/kill_node_runners.rb +17 -0
- data/app/services/kill_task_container.rb +29 -0
- data/app/services/kubernetes_client.rb +136 -0
- data/app/services/least_used_node.rb +44 -0
- data/app/services/lock_manager.rb +74 -0
- data/app/services/lock_slot.rb +37 -0
- data/app/services/lock_task.rb +45 -0
- data/app/services/metrics.rb +43 -0
- data/app/services/migrate_runner.rb +26 -0
- data/app/services/node_task_acceptance.rb +18 -0
- data/app/services/node_usage_percentage_per_execution_type.rb +22 -0
- data/app/services/reschedule_tasks_for_missing_runners.rb +70 -0
- data/app/services/runners.rb +4 -0
- data/app/services/runners/docker/create_connection.rb +18 -0
- data/app/services/runners/docker/create_execution_info.rb +87 -0
- data/app/services/runners/docker/fetch_execution_info.rb +17 -0
- data/app/services/runners/docker/fetch_logs.rb +18 -0
- data/app/services/runners/docker/fetch_task_container.rb +15 -0
- data/app/services/runners/docker/filer.rb +19 -0
- data/app/services/runners/docker/kill_slot_runner.rb +19 -0
- data/app/services/runners/docker/node_availability.rb +11 -0
- data/app/services/runners/docker/remove_runner.rb +18 -0
- data/app/services/runners/docker/run_task.rb +63 -0
- data/app/services/runners/docker/update_node_status.rb +62 -0
- data/app/services/runners/execution_info.rb +49 -0
- data/app/services/runners/invalid_config.rb +5 -0
- data/app/services/runners/invalid_runner.rb +5 -0
- data/app/services/runners/kubernetes/create_client.rb +29 -0
- data/app/services/runners/kubernetes/create_execution_info.rb +103 -0
- data/app/services/runners/kubernetes/fetch_execution_info.rb +15 -0
- data/app/services/runners/kubernetes/fetch_logs.rb +17 -0
- data/app/services/runners/kubernetes/filer.rb +41 -0
- data/app/services/runners/kubernetes/kill_slot_runner.rb +11 -0
- data/app/services/runners/kubernetes/node_availability.rb +11 -0
- data/app/services/runners/kubernetes/remove_runner.rb +19 -0
- data/app/services/runners/kubernetes/run_task.rb +54 -0
- data/app/services/runners/kubernetes/update_node_status.rb +64 -0
- data/app/services/runners/runner_id_not_found_error.rb +5 -0
- data/app/services/runners/services_factory.rb +38 -0
- data/app/services/runners/update_node_status_helper.rb +43 -0
- data/app/services/slots_usage_percentage.rb +18 -0
- data/config/application.rb +34 -0
- data/config/boot.rb +5 -0
- data/config/environment.rb +7 -0
- data/config/environments/test.rb +44 -0
- data/config/initializers/application_controller_renderer.rb +10 -0
- data/config/initializers/backtrace_silencers.rb +9 -0
- data/config/initializers/config.rb +51 -0
- data/config/initializers/cookies_serializer.rb +7 -0
- data/config/initializers/docker_config.rb +3 -0
- data/config/initializers/filter_parameter_logging.rb +6 -0
- data/config/initializers/idempotent_request.rb +12 -0
- data/config/initializers/inflections.rb +18 -0
- data/config/initializers/mime_types.rb +6 -0
- data/config/initializers/mongoid.rb +3 -0
- data/config/initializers/new_framework_defaults_6_0.rb +47 -0
- data/config/initializers/raven.rb +10 -0
- data/config/initializers/sidekiq.rb +24 -0
- data/config/initializers/wrap_parameters.rb +16 -0
- data/config/locales/en.yml +33 -0
- data/config/mongoid.yml +10 -0
- data/config/routes.rb +43 -0
- data/config/secrets.yml +35 -0
- data/config/settings.yml +34 -0
- data/config/settings/test.yml +27 -0
- data/config/sidekiq_scheduler.yml +18 -0
- data/config/spring.rb +8 -0
- data/lib/constants.rb +12 -0
- data/lib/container_broker.rb +30 -0
- data/lib/container_broker/engine.rb +6 -0
- data/lib/container_broker/version.rb +5 -0
- data/lib/current_thread_request_id.rb +19 -0
- data/lib/idempotent_request/callback.rb +25 -0
- data/lib/idempotent_request/policy.rb +15 -0
- data/lib/redis_url_parser.rb +25 -0
- data/lib/tasks/task.rake +34 -0
- metadata +590 -0
@@ -0,0 +1,45 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Claims one pending task of a given execution type.
#
# A task counts as pending while its status is "waiting" or "retry".
class LockTask
  PENDING_STATUSES = %w[waiting retry].freeze

  attr_reader :execution_type

  # @param execution_type value matched against the task's execution_type field
  def initialize(execution_type:)
    @execution_type = execution_type
  end

  # Switches one pending task to the "starting" status through a single
  # find_one_and_update call, then records a "tasks" metric for it.
  #
  # @return the claimed task, or nil when no pending task was found
  def perform
    task = all_pending.find_one_and_update(
      { "$set" => { status: "starting" } },
      return_document: :after
    )
    return unless task

    task.reload

    persist_metrics(task)

    task
  end

  # @return [Boolean] whether at least one pending task exists for the execution type
  def any_pending?
    all_pending.any?
  end

  private

  # Query scope shared by #perform and #any_pending?.
  def all_pending
    Task
      .where(execution_type: execution_type)
      .where(:status.in => PENDING_STATUSES)
  end

  # The task is never nil here (#perform returns early otherwise), so no safe
  # navigation is needed on any of its attributes.
  def persist_metrics(task)
    Metrics.new("tasks").count(
      task_id: task.id,
      name: task.name,
      status: task.status
    )
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "measures"
|
4
|
+
|
5
|
+
# Sends counters and timings for one named metric through the Measures client,
# configured from Settings.measures. Nothing is sent when
# Settings.measures.enabled is falsy.
class Metrics
  attr_reader :metric

  # @param metric [String] metric name passed to the Measures client
  def initialize(metric)
    @metric = metric
  end

  # Sends a count event tagged with origin "container-broker".
  # Any StandardError raised while sending is logged as a warning and not re-raised.
  #
  # @param data [Hash] extra fields for the event (not mutated)
  def count(data = {})
    return unless enabled?

    payload = data.merge(origin: "container-broker")
    client.count(metric, payload)
  rescue StandardError => e
    Rails.logger.warn("Error sending metrics to measures: #{e}")
  end

  # Runs the given block (if any), passing it +data+. When metrics are enabled
  # the block runs inside the Measures client's +time+ call.
  #
  # @param data [Hash] fields handed to both the client and the block
  def duration(data = {})
    unless enabled?
      return block_given? ? yield(data) : nil
    end

    client.time(metric, data) do
      yield data if block_given?
    end
  end

  private

  def measures_settings
    Settings.measures
  end

  def enabled?
    measures_settings.enabled
  end

  def transport
    Measures::Transports::UDP.new(measures_settings.host, measures_settings.port)
  end

  def client
    Measures::Client.new(transport, measures_settings.index, measures_settings.owner)
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# It's important to persist migrated ids because there may be some tasks in the execution queue for the same runner id
#
# Each migrated runner id is stored in Redis under "migrated_ids_<runner_id>"
# with an expiration of TTL.
class MigrateRunner
  TTL = Rails.env.development? ? 10.hours : 1.hour
  KEY_PREFIX = "migrated_ids"

  attr_reader :runner_id

  # @param runner_id identifier of the runner being migrated
  def initialize(runner_id:)
    @runner_id = runner_id
  end

  # Flags the runner id as migrated for TTL seconds.
  def migrate
    Rails.logger.info("Migrate runner id #{runner_id}")
    # TTL is an ActiveSupport::Duration; send a plain Integer to the Redis client.
    with_redis { |redis| redis.set(redis_key, 1, ex: TTL.to_i) }
  end

  # @return [Boolean] whether the runner id is currently flagged as migrated
  def migrated?
    with_redis { |redis| redis.exists?(redis_key) }
  end

  # Builds a new Redis client from Settings.redis_url on every call.
  def self.redis_client
    Redis.new(RedisUrlParser.call(Settings.redis_url))
  end

  private

  def redis_key
    "#{KEY_PREFIX}_#{runner_id}"
  end

  # Yields a fresh client and always closes it afterwards, so connections do
  # not pile up waiting for garbage collection.
  def with_redis
    redis = self.class.redis_client
    yield redis
  ensure
    redis&.close
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Switches a node's accept_new_tasks flag on or off.
class NodeTaskAcceptance
  attr_reader :node

  # @param node record whose accept_new_tasks attribute is updated
  def initialize(node:)
    @node = node
  end

  # Marks the node as accepting tasks, then enqueues RunTasksForAllExecutionTypesJob.
  def accept!
    toggle_acceptance(true)
    RunTasksForAllExecutionTypesJob.perform_later
  end

  # Marks the node as not accepting new tasks.
  def reject!
    toggle_acceptance(false)
  end

  private

  def toggle_acceptance(accepting)
    node.update!(accept_new_tasks: accepting)
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Reports slot usage of a node, broken down by execution type.
class NodeUsagePercentagePerExecutionType
  # @param node object exposing #slots
  def initialize(node)
    @node = node
  end

  # @return [Array<Hash>] one { execution_type:, usage_percent: } entry per
  #   execution type found among the node's slots; usage_percent is whatever
  #   SlotsUsagePercentage#perform returns for that group of slots
  def perform
    slots_by_execution_type.map do |execution_type, slots|
      {
        execution_type: execution_type,
        usage_percent: SlotsUsagePercentage.new(slots).perform
      }
    end
  end

  private

  def slots_by_execution_type
    @node.slots.group_by { |slot| slot.execution_type }
  end
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Marks started tasks for retry when their runner id is not among the runner
# ids that are known to exist, and releases the slot they were holding.
class RescheduleTasksForMissingRunners
  attr_reader :started_tasks, :runner_ids

  # @param runner_ids [Array] runner ids that currently exist
  # @param started_tasks [Enumerable] tasks to check; each one is reloaded and
  #   only those still started and carrying a runner_id are considered
  def initialize(runner_ids:, started_tasks:)
    @started_tasks = started_tasks
    @runner_ids = runner_ids
  end

  # For every runner id that belongs to a started task but is absent from
  # +runner_ids+: logs the event, optionally reports it to Sentry, marks the
  # task for retry and releases its slot (when it has one).
  def perform
    missing_runner_ids.each do |runner_id|
      task = started_tasks_group_by_runner_id[runner_id]
      # Capture the slot before the task is marked for retry.
      slot = task.slot
      message = "Task retried because runner #{runner_id} is missing (#{task} #{slot})"
      Rails.logger.debug(message)

      report_event(message: message, task: task, runner_id: runner_id)

      task.mark_as_retry(error: message)
      slot&.release
    end
  end

  private

  # Sends the event to Sentry when Settings.sentry.enabled is truthy.
  # The task is always present; its slot and the slot's node may be nil.
  def report_event(message:, task:, runner_id:)
    return unless Settings.sentry.enabled

    slot = task.slot
    node = slot&.node
    Raven.capture_exception(
      message,
      level: :info,
      extra: {
        runner: node&.runner_provider,
        runner_id: runner_id,
        slot: {
          id: slot&.id,
          name: slot&.name,
          status: slot&.status,
          runner_id: slot&.runner_id
        },
        node: {
          id: node&.id,
          name: node&.name,
          status: node&.status
        },
        task: {
          id: task.id,
          name: task.name,
          status: task.status
        }
      }
    )
  end

  # Runner ids referenced by started tasks that are not in +runner_ids+.
  def missing_runner_ids
    started_tasks_group_by_runner_id.keys - runner_ids
  end

  # Maps runner_id => first reloaded task that is still started with that runner id.
  def started_tasks_group_by_runner_id
    @started_tasks_group_by_runner_id ||= started_tasks
      .map(&:reload)
      .select(&:started?)
      .select(&:runner_id)
      .group_by(&:runner_id)
      .transform_values(&:first)
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Runners
  module Docker
    # Builds a Docker API connection for a node.
    class CreateConnection
      # @param node object answering #docker? and #hostname
      # @return [::Docker::Connection] connection to node.hostname with
      #   5s connect, 15s read and 5s write timeouts
      # @raise [Runners::InvalidRunner] when the node is not a docker runner
      def perform(node:)
        unless node.docker?
          raise Runners::InvalidRunner, "Node must be a docker runner"
        end

        timeouts = { connect_timeout: 5, read_timeout: 15, write_timeout: 5 }
        ::Docker::Connection.new(node.hostname, **timeouts)
      end
    end
  end
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Runners
  module Docker
    # Converts a Docker container's info hash into a Runners::ExecutionInfo.
    #
    # container.info["State"] is handled in two shapes: a Hash (with "Status",
    # "ExitCode", "StartedAt", "FinishedAt", "Error") or a bare status String.
    class CreateExecutionInfo
      attr_reader :container

      # @param container object whose #info hash holds "Name" or "Names", and "State"
      # @return [Runners::ExecutionInfo] built from id and status, plus exit code,
      #   timestamps and error when the State is a Hash
      def perform(container:)
        @container = container

        execution_info_data = {
          id: container_name(container: container),
          status: status
        }
        execution_info_data.merge!(detailed_state) if full_state_present?

        Runners::ExecutionInfo.new(execution_info_data)
      end

      private

      # Maps the Docker state to a status string; nil for any other Docker state.
      def status
        if waiting?
          "pending"
        elsif running?
          "running"
        elsif terminated_with_success?
          "success"
        elsif terminated_with_error?
          "error"
        elsif terminated?
          "exited"
        end
      end

      # Container name without its leading "/".
      def container_name(container:)
        name = container.info["Name"] || container.info["Names"].first

        name.delete_prefix("/")
      end

      def detailed_state
        {
          exit_code: exit_code,
          started_at: state["StartedAt"],
          finished_at: state["FinishedAt"],
          error: state["Error"]
        }
      end

      def waiting?
        state_status == "created"
      end

      def running?
        state_status == "running"
      end

      def terminated?
        state_status == "exited"
      end

      def terminated_with_success?
        terminated? && exit_code&.zero?
      end

      def terminated_with_error?
        terminated? && exit_code&.positive?
      end

      # Only a Hash state carries an exit code; a bare status String must not
      # be indexed with "ExitCode".
      def exit_code
        state["ExitCode"] if full_state_present?
      end

      def full_state_present?
        state.is_a?(Hash)
      end

      def state
        container.info["State"]
      end

      def state_status
        if full_state_present?
          state["Status"]
        else
          state
        end
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Runners
  module Docker
    # Looks up a task's container and turns it into execution info.
    class FetchExecutionInfo
      # @param task task handed to FetchTaskContainer
      # @return the result of CreateExecutionInfo#perform for the task's container
      # @raise [Runners::RunnerIdNotFoundError] when Docker reports the container as not found
      def perform(task:)
        task_container = FetchTaskContainer.new.perform(task: task)
        CreateExecutionInfo.new.perform(container: task_container)
      rescue ::Docker::Error::NotFoundError => not_found
        raise Runners::RunnerIdNotFoundError, not_found.message
      end
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Runners
  module Docker
    # Reads the last 1,000 log lines (stdout and stderr) of a task's container.
    class FetchLogs
      # @param task task handed to FetchTaskContainer
      # @return whatever the container's #streaming_logs returns
      # @raise [Runners::RunnerIdNotFoundError] when Docker reports the container as not found
      def perform(task:)
        task_container = FetchTaskContainer.new.perform(task: task)

        # streaming_logs avoids some encoding issues and should be safe since container status = exited
        # (see https://github.com/swipely/docker-api/issues/290 for reference)
        task_container.streaming_logs(stdout: true, stderr: true, tail: 1_000)
      rescue ::Docker::Error::NotFoundError => not_found
        raise Runners::RunnerIdNotFoundError, not_found.message
      end
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Runners
  module Docker
    # Fetches the Docker container identified by a task's runner_id.
    class FetchTaskContainer
      # @param task object answering #runner_id and #slot (whose #node is used for the connection)
      # @return the container returned by ::Docker::Container.get, queried with { all: true }
      def perform(task:)
        container_id = task.runner_id
        connection = CreateConnection.new.perform(node: task.slot.node)

        ::Docker::Container.get(container_id, { all: true }, connection)
      end
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Runners
  module Docker
    # Translates a task's storage mounts into "node_path:task_path" strings,
    # resolving node paths through Settings.storage_mounts.docker.
    class Filer
      class InvalidMountName < StandardError; end

      # @param task_storage_mounts [Hash] mount name => path requested by the task
      # @return [Array<String>] one "node_mount_path:task_mount_path" entry per mount
      # @raise [InvalidMountName] when a mount name has no entry in Settings.storage_mounts.docker
      def perform(task_storage_mounts:)
        node_mounts = Settings.storage_mounts.docker

        task_storage_mounts.map do |task_mount_name, task_mount_path|
          node_mount_path = node_mounts[task_mount_name]

          # Name the offending mount so the failure can be diagnosed from the error alone.
          unless node_mount_path
            raise InvalidMountName, "Storage mount #{task_mount_name.inspect} not found in Settings.storage_mounts.docker"
          end

          [node_mount_path, task_mount_path].join(":")
        end
      end
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Runners
  module Docker
    # Kills the Docker container attached to a slot, if the slot has one.
    class KillSlotRunner
      # Does nothing when the slot has no runner_id. A container that Docker
      # reports as not found, or any Excon error, is logged at info level and
      # not re-raised.
      #
      # @param slot object answering #runner_id and #node
      def perform(slot:)
        return unless slot.runner_id.present?

        ::Docker::Container
          .get(slot.runner_id, {}, CreateConnection.new.perform(node: slot.node))
          .kill!
      rescue ::Docker::Error::NotFoundError => e
        # Interpolate the exception class; it was previously logged as the literal text "e.class".
        Rails.logger.info("Container #{slot.runner_id} already removed - #{e.message} (#{e.class})")
      rescue Excon::Error => e
        Rails.logger.info("Error removing container: #{e}")
      end
    end
  end
end
|