container_broker 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +98 -0
- data/Rakefile +8 -0
- data/app/controllers/application_controller.rb +5 -0
- data/app/controllers/healthcheck_controller.rb +21 -0
- data/app/controllers/nodes_controller.rb +70 -0
- data/app/controllers/nodes_healthcheck_controller.rb +28 -0
- data/app/controllers/status_controller.rb +48 -0
- data/app/controllers/tasks_controller.rb +83 -0
- data/app/controllers/tasks_healthcheck_controller.rb +28 -0
- data/app/jobs/add_task_tags_job.rb +13 -0
- data/app/jobs/adjust_node_slots_job.rb +27 -0
- data/app/jobs/application_job.rb +9 -0
- data/app/jobs/collect_load_metrics_job.rb +9 -0
- data/app/jobs/container_broker_base_job.rb +32 -0
- data/app/jobs/migrate_tasks_from_dead_node_job.rb +32 -0
- data/app/jobs/monitor_unresponsive_node_job.rb +21 -0
- data/app/jobs/monitor_unresponsive_nodes_job.rb +9 -0
- data/app/jobs/release_slot_job.rb +47 -0
- data/app/jobs/remove_runner_job.rb +11 -0
- data/app/jobs/remove_unused_tags_job.rb +25 -0
- data/app/jobs/request_id_from_task.rb +7 -0
- data/app/jobs/run_task_job.rb +64 -0
- data/app/jobs/run_tasks_for_all_execution_types_job.rb +11 -0
- data/app/jobs/run_tasks_job.rb +42 -0
- data/app/jobs/timeout_failed_tasks_job.rb +31 -0
- data/app/jobs/update_all_nodes_status_job.rb +9 -0
- data/app/jobs/update_node_status_job.rb +24 -0
- data/app/jobs/update_task_status_job.rb +71 -0
- data/app/models/mongoid_serializable_model.rb +14 -0
- data/app/models/node.rb +101 -0
- data/app/models/slot.rb +42 -0
- data/app/models/task.rb +148 -0
- data/app/models/task_tag.rb +11 -0
- data/app/observers/observable.rb +23 -0
- data/app/observers/task_observer.rb +11 -0
- data/app/serializers/node_healthcheck_serializer.rb +5 -0
- data/app/serializers/node_serializer.rb +5 -0
- data/app/serializers/status_panel_node_serializer.rb +9 -0
- data/app/serializers/status_panel_slot_serializer.rb +5 -0
- data/app/serializers/status_panel_task_serializer.rb +16 -0
- data/app/serializers/task_healthcheck_serializer.rb +5 -0
- data/app/serializers/task_serializer.rb +7 -0
- data/app/services/adjust_execution_type_slots.rb +51 -0
- data/app/services/check_for_slot_removal.rb +28 -0
- data/app/services/collect_load_metrics.rb +40 -0
- data/app/services/delete_node.rb +25 -0
- data/app/services/friendly_name_nodes.rb +10 -0
- data/app/services/friendly_name_slots.rb +15 -0
- data/app/services/kill_node_runners.rb +17 -0
- data/app/services/kill_task_container.rb +29 -0
- data/app/services/kubernetes_client.rb +136 -0
- data/app/services/least_used_node.rb +44 -0
- data/app/services/lock_manager.rb +74 -0
- data/app/services/lock_slot.rb +37 -0
- data/app/services/lock_task.rb +45 -0
- data/app/services/metrics.rb +43 -0
- data/app/services/migrate_runner.rb +26 -0
- data/app/services/node_task_acceptance.rb +18 -0
- data/app/services/node_usage_percentage_per_execution_type.rb +22 -0
- data/app/services/reschedule_tasks_for_missing_runners.rb +70 -0
- data/app/services/runners.rb +4 -0
- data/app/services/runners/docker/create_connection.rb +18 -0
- data/app/services/runners/docker/create_execution_info.rb +87 -0
- data/app/services/runners/docker/fetch_execution_info.rb +17 -0
- data/app/services/runners/docker/fetch_logs.rb +18 -0
- data/app/services/runners/docker/fetch_task_container.rb +15 -0
- data/app/services/runners/docker/filer.rb +19 -0
- data/app/services/runners/docker/kill_slot_runner.rb +19 -0
- data/app/services/runners/docker/node_availability.rb +11 -0
- data/app/services/runners/docker/remove_runner.rb +18 -0
- data/app/services/runners/docker/run_task.rb +63 -0
- data/app/services/runners/docker/update_node_status.rb +62 -0
- data/app/services/runners/execution_info.rb +49 -0
- data/app/services/runners/invalid_config.rb +5 -0
- data/app/services/runners/invalid_runner.rb +5 -0
- data/app/services/runners/kubernetes/create_client.rb +29 -0
- data/app/services/runners/kubernetes/create_execution_info.rb +103 -0
- data/app/services/runners/kubernetes/fetch_execution_info.rb +15 -0
- data/app/services/runners/kubernetes/fetch_logs.rb +17 -0
- data/app/services/runners/kubernetes/filer.rb +41 -0
- data/app/services/runners/kubernetes/kill_slot_runner.rb +11 -0
- data/app/services/runners/kubernetes/node_availability.rb +11 -0
- data/app/services/runners/kubernetes/remove_runner.rb +19 -0
- data/app/services/runners/kubernetes/run_task.rb +54 -0
- data/app/services/runners/kubernetes/update_node_status.rb +64 -0
- data/app/services/runners/runner_id_not_found_error.rb +5 -0
- data/app/services/runners/services_factory.rb +38 -0
- data/app/services/runners/update_node_status_helper.rb +43 -0
- data/app/services/slots_usage_percentage.rb +18 -0
- data/config/application.rb +34 -0
- data/config/boot.rb +5 -0
- data/config/environment.rb +7 -0
- data/config/environments/test.rb +44 -0
- data/config/initializers/application_controller_renderer.rb +10 -0
- data/config/initializers/backtrace_silencers.rb +9 -0
- data/config/initializers/config.rb +51 -0
- data/config/initializers/cookies_serializer.rb +7 -0
- data/config/initializers/docker_config.rb +3 -0
- data/config/initializers/filter_parameter_logging.rb +6 -0
- data/config/initializers/idempotent_request.rb +12 -0
- data/config/initializers/inflections.rb +18 -0
- data/config/initializers/mime_types.rb +6 -0
- data/config/initializers/mongoid.rb +3 -0
- data/config/initializers/new_framework_defaults_6_0.rb +47 -0
- data/config/initializers/raven.rb +10 -0
- data/config/initializers/sidekiq.rb +24 -0
- data/config/initializers/wrap_parameters.rb +16 -0
- data/config/locales/en.yml +33 -0
- data/config/mongoid.yml +10 -0
- data/config/routes.rb +43 -0
- data/config/secrets.yml +35 -0
- data/config/settings.yml +34 -0
- data/config/settings/test.yml +27 -0
- data/config/sidekiq_scheduler.yml +18 -0
- data/config/spring.rb +8 -0
- data/lib/constants.rb +12 -0
- data/lib/container_broker.rb +30 -0
- data/lib/container_broker/engine.rb +6 -0
- data/lib/container_broker/version.rb +5 -0
- data/lib/current_thread_request_id.rb +19 -0
- data/lib/idempotent_request/callback.rb +25 -0
- data/lib/idempotent_request/policy.rb +15 -0
- data/lib/redis_url_parser.rb +25 -0
- data/lib/tasks/task.rake +34 -0
- metadata +590 -0
@@ -0,0 +1,45 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Claims the next pending task (status "waiting" or "retry") of a single
# execution type by switching it to "starting".
class LockTask
  attr_reader :execution_type

  # @param execution_type [Object] value matched against Task#execution_type
  def initialize(execution_type:)
    @execution_type = execution_type
  end

  # Picks one pending task with a single find_one_and_update call, reloads it
  # and reports it to the "tasks" metric.
  #
  # @return [Task, nil] the claimed task, or nil when nothing is pending
  def perform
    change = { "$set" => { status: "starting" } }
    claimed = all_pending.find_one_and_update(change, return_document: :after)
    return unless claimed

    claimed.reload
    persist_metrics(claimed)
    claimed
  end

  # @return [Boolean] whether any task of this execution type is still pending
  def any_pending?
    all_pending.any?
  end

  private

  # Criteria for tasks of this execution type whose status is waiting or retry.
  def all_pending
    by_type = Task.where(execution_type: execution_type)
    by_type.where(:status.in => %w[waiting retry])
  end

  # Counts the claimed task in the "tasks" metric.
  def persist_metrics(task)
    payload = { task_id: task.id, name: task.name, status: task.status }
    Metrics.new("tasks").count(payload)
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "measures"
|
4
|
+
|
5
|
+
# Thin wrapper that sends one named metric through a Measures client when
# Settings.measures.enabled is truthy.
class Metrics
  attr_reader :metric

  # @param metric [String] metric name passed to the Measures client
  def initialize(metric)
    @metric = metric
  end

  # Sends a count for the metric, tagging the payload with the broker origin.
  # Any StandardError raised on the way is logged as a warning, not re-raised.
  #
  # @param data [Hash] extra fields sent with the count
  def count(data = {})
    return unless enabled?

    payload = data.merge(origin: "container-broker")
    client.count(metric, payload)
  rescue StandardError => e
    Rails.logger.warn("Error sending metrics to measures: #{e}")
  end

  # Runs the given block (if any) with +data+; when metrics are enabled the
  # block is executed inside client.time so the client can time it.
  #
  # @param data [Hash] fields handed to both the client and the block
  def duration(data = {}, &block)
    return block&.call(data) unless enabled?

    client.time(metric, data) { block&.call(data) }
  end

  private

  def enabled?
    Settings.measures.enabled
  end

  # Builds a new UDP transport from the configured host and port.
  def transport
    settings = Settings.measures
    Measures::Transports::UDP.new(settings.host, settings.port)
  end

  # Builds a new Measures client on every call.
  def client
    udp = transport
    Measures::Client.new(udp, Settings.measures.index, Settings.measures.owner)
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Flags a runner id as migrated by writing an expiring key to Redis.
# It's important to persist migrated ids because there may be some tasks in the execution queue for the same runner id
class MigrateRunner
  # Lifetime of a migration flag (longer in development).
  TTL = Rails.env.development? ? 10.hours : 1.hour
  KEY_PREFIX = "migrated_ids"

  attr_reader :runner_id

  # @param runner_id [String] id of the runner being tracked
  def initialize(runner_id:)
    @runner_id = runner_id
  end

  # Stores the migration flag for this runner id; it expires after TTL.
  def migrate
    Rails.logger.info("Migrate runner id #{runner_id}")
    self.class.redis_client.set(redis_key, 1, ex: TTL)
  end

  # @return [Boolean] whether the migration flag is still present in Redis
  def migrated?
    self.class.redis_client.exists?(redis_key)
  end

  # Redis client built from Settings.redis_url. It is created once and reused:
  # previously every migrate/migrated? call instantiated a brand new client
  # (and therefore a new connection) that was never closed.
  def self.redis_client
    @redis_client ||= Redis.new(RedisUrlParser.call(Settings.redis_url))
  end

  private

  # Key under which the flag for this runner id is stored.
  def redis_key
    "#{KEY_PREFIX}_#{runner_id}"
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Switches a node's accept_new_tasks flag on or off.
class NodeTaskAcceptance
  attr_reader :node

  # @param node [Node] node whose flag is toggled
  def initialize(node:)
    @node = node
  end

  # Marks the node as accepting tasks, then enqueues
  # RunTasksForAllExecutionTypesJob.
  def accept!
    persist_acceptance(true)
    RunTasksForAllExecutionTypesJob.perform_later
  end

  # Marks the node as not accepting new tasks.
  def reject!
    persist_acceptance(false)
  end

  private

  # Saves the flag with update!, so a failed save raises.
  def persist_acceptance(flag)
    node.update!(accept_new_tasks: flag)
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Reports slot usage of a node, one entry per execution type.
class NodeUsagePercentagePerExecutionType
  # @param node [Node] node whose slots are inspected
  def initialize(node)
    @node = node
  end

  # @return [Array<Hash>] one { execution_type:, usage_percent: } hash per
  #   execution type found among the node's slots; usage_percent is whatever
  #   SlotsUsagePercentage#perform returns for that group of slots
  def perform
    slots_by_execution_type.map do |execution_type, slots|
      usage = SlotsUsagePercentage.new(slots).perform
      { execution_type: execution_type, usage_percent: usage }
    end
  end

  private

  # Node slots grouped by their execution_type.
  def slots_by_execution_type
    @node.slots.group_by(&:execution_type)
  end
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Marks started tasks for retry when their runner id is not among the runner
# ids that were reported, and releases the slot they were using.
class RescheduleTasksForMissingRunners
  attr_reader :started_tasks, :runner_ids

  # @param runner_ids [Array] runner ids known to exist
  # @param started_tasks [Enumerable] tasks to check (each one is reloaded)
  def initialize(runner_ids:, started_tasks:)
    @started_tasks = started_tasks
    @runner_ids = runner_ids
  end

  # @return [Array] runner ids that had a started task but no runner
  def perform
    tasks_without_runner.each { |missing_runner_id| retry_task_of(missing_runner_id) }
  end

  private

  # Logs, optionally reports, and retries the task bound to the missing runner.
  def retry_task_of(runner_id)
    task = started_tasks_group_by_runner_id[runner_id]
    message = "Task retryied because runner #{runner_id} is missing (#{task} #{task&.slot})"
    Rails.logger.debug(message)

    report_event(message: message, task: task, runner_id: runner_id)

    # The slot is read before mark_as_retry and released afterwards.
    slot = task.slot
    task.mark_as_retry(error: message)
    slot&.release
  end

  # Sends the event to Raven (info level) when Settings.sentry.enabled.
  def report_event(message:, task:, runner_id:)
    return unless Settings.sentry.enabled

    slot = task&.slot
    node = slot&.node
    extra = {
      runner: node&.runner_provider,
      runner_id: runner_id,
      slot: slot_details(slot),
      node: node_details(node),
      task: { id: task.id, name: task.name, status: task.status }
    }

    Raven.capture_exception(message, level: :info, extra: extra)
  end

  # Slot attributes attached to the event; values are nil without a slot.
  def slot_details(slot)
    { id: slot&.id, name: slot&.name, status: slot&.status, runner_id: slot&.runner_id }
  end

  # Node attributes attached to the event; values are nil without a node.
  def node_details(node)
    { id: node&.id, name: node&.name, status: node&.status }
  end

  # Runner ids referenced by started tasks but absent from runner_ids.
  def tasks_without_runner
    started_tasks_group_by_runner_id.keys - runner_ids
  end

  # Reloaded tasks that are started and have a runner id, indexed by runner id.
  # When several tasks share a runner id only the first one is kept.
  def started_tasks_group_by_runner_id
    @started_tasks_group_by_runner_id ||=
      started_tasks.map(&:reload).each_with_object({}) do |task, index|
        next unless task.started? && task.runner_id

        index[task.runner_id] ||= task
      end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Runners
  module Docker
    # Builds a docker-api connection to a node's Docker daemon.
    class CreateConnection
      # @param node [Node] node to connect to; must answer true to docker?
      # @return [::Docker::Connection] connection to node.hostname with fixed
      #   connect/read/write timeouts (5/15/5)
      # @raise [Runners::InvalidRunner] when the node is not a docker runner
      def perform(node:)
        raise(Runners::InvalidRunner, "Node must be a docker runner") unless node.docker?

        timeouts = { connect_timeout: 5, read_timeout: 15, write_timeout: 5 }
        ::Docker::Connection.new(node.hostname, **timeouts)
      end
    end
  end
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Runners
  module Docker
    # Converts a docker-api container into a Runners::ExecutionInfo.
    #
    # container.info["State"] is handled in two shapes: a Hash (status, exit
    # code, timestamps and error are available) or a plain status value.
    class CreateExecutionInfo
      attr_reader :container

      # @param container [#info] container whose info hash is read
      # @return [Runners::ExecutionInfo] built from the id and status, plus
      #   exit_code/started_at/finished_at/error when the state is a Hash
      def perform(container:)
        @container = container

        execution_info_data = {
          id: container_name(container: container),
          status: status
        }

        if full_state_present?
          execution_info_data.merge!(
            exit_code: state["ExitCode"],
            started_at: state["StartedAt"],
            finished_at: state["FinishedAt"],
            error: state["Error"]
          )
        end

        Runners::ExecutionInfo.new(execution_info_data)
      end

      private

      # Maps the docker state to the broker status. Returns nil for any docker
      # state other than created/running/exited.
      def status
        if waiting?
          "pending"
        elsif running?
          "running"
        elsif terminated_with_success?
          "success"
        elsif terminated_with_error?
          "error"
        elsif terminated?
          # Exited, but the exit code is missing or not positive/zero.
          "exited"
        end
      end

      # Container name without the leading "/" docker puts in front of it.
      # Falls back to the first entry of "Names" when "Name" is absent.
      def container_name(container:)
        name = container.info["Name"] || container.info["Names"].first

        # delete_prefix only touches the very start of the string, whereas the
        # previous `^/` regex would also match after any embedded newline.
        name.delete_prefix("/")
      end

      def waiting?
        state_status == "created"
      end

      def running?
        state_status == "running"
      end

      def terminated?
        state_status == "exited"
      end

      def terminated_with_success?
        terminated? && state["ExitCode"]&.zero?
      end

      def terminated_with_error?
        terminated? && state["ExitCode"]&.positive?
      end

      # True when "State" carries the detailed Hash rather than a bare status.
      def full_state_present?
        state.is_a?(Hash)
      end

      def state
        container.info["State"]
      end

      # Status value regardless of which shape "State" has.
      def state_status
        if full_state_present?
          state["Status"]
        else
          state
        end
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Runners
  module Docker
    # Looks up a task's container and turns it into execution info.
    class FetchExecutionInfo
      # @param task [Task] task whose container is fetched
      # @return [Object] result of CreateExecutionInfo#perform for the container
      # @raise [Runners::RunnerIdNotFoundError] when docker reports the
      #   container as not found (the docker error message is kept)
      def perform(task:)
        task_container = FetchTaskContainer.new.perform(task: task)
        CreateExecutionInfo.new.perform(container: task_container)
      rescue ::Docker::Error::NotFoundError => not_found
        raise Runners::RunnerIdNotFoundError, not_found.message
      end
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Runners
  module Docker
    # Reads the last 1000 log lines (stdout and stderr) of a task's container.
    class FetchLogs
      # @param task [Task] task whose container logs are read
      # @return [Object] whatever the container's streaming_logs call returns
      # @raise [Runners::RunnerIdNotFoundError] when docker reports the
      #   container as not found
      def perform(task:)
        task_container = FetchTaskContainer.new.perform(task: task)

        # streaming_logs avoids some encoding issues and should be safe since container status = exited
        # (see https://github.com/swipely/docker-api/issues/290 for reference)
        task_container.streaming_logs(stdout: true, stderr: true, tail: 1_000)
      rescue ::Docker::Error::NotFoundError => not_found
        raise Runners::RunnerIdNotFoundError, not_found.message
      end
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Runners
  module Docker
    # Fetches the docker container identified by a task's runner id.
    class FetchTaskContainer
      # @param task [Task] task providing runner_id and slot.node
      # @return [Object] result of ::Docker::Container.get, queried with
      #   all: true over a connection to the node of the task's slot
      def perform(task:)
        container_id = task.runner_id
        connection = CreateConnection.new.perform(node: task.slot.node)

        ::Docker::Container.get(container_id, { all: true }, connection)
      end
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Runners
  module Docker
    # Translates a task's storage mounts into docker bind strings.
    class Filer
      # Raised when a task asks for a mount name missing from
      # Settings.storage_mounts.docker.
      class InvalidMountName < StandardError; end

      # @param task_storage_mounts [Hash] mount name => path inside the task
      # @return [Array<String>] one "node_path:task_path" entry per mount
      # @raise [InvalidMountName] when a mount name is not configured; the
      #   message names the offending mount
      def perform(task_storage_mounts:)
        task_storage_mounts.map do |task_mount_name, task_mount_path|
          node_mount_path = Settings.storage_mounts.docker[task_mount_name]

          # Include the name so a misconfigured task can be diagnosed from the error.
          raise InvalidMountName, "Invalid storage mount name: #{task_mount_name}" unless node_mount_path

          [node_mount_path, task_mount_path].join(":")
        end
      end
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Runners
  module Docker
    # Kills the docker container currently attached to a slot.
    class KillSlotRunner
      # Does nothing when the slot has no runner id. Docker "not found" and
      # Excon errors are logged at info level and not re-raised.
      #
      # @param slot [Slot] slot providing runner_id and node
      def perform(slot:)
        return unless slot.runner_id.present?

        ::Docker::Container
          .get(slot.runner_id, {}, CreateConnection.new.perform(node: slot.node))
          .kill!
      rescue ::Docker::Error::NotFoundError => e
        # The exception class is interpolated; the message used to contain the
        # literal text "(e.class)".
        Rails.logger.info("Container #{slot.runner_id} already removed - #{e.message} (#{e.class})")
      rescue Excon::Error => e
        Rails.logger.info("Error removing container: #{e}")
      end
    end
  end
end
|