container_broker 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +98 -0
- data/Rakefile +8 -0
- data/app/controllers/application_controller.rb +5 -0
- data/app/controllers/healthcheck_controller.rb +21 -0
- data/app/controllers/nodes_controller.rb +70 -0
- data/app/controllers/nodes_healthcheck_controller.rb +28 -0
- data/app/controllers/status_controller.rb +48 -0
- data/app/controllers/tasks_controller.rb +83 -0
- data/app/controllers/tasks_healthcheck_controller.rb +28 -0
- data/app/jobs/add_task_tags_job.rb +13 -0
- data/app/jobs/adjust_node_slots_job.rb +27 -0
- data/app/jobs/application_job.rb +9 -0
- data/app/jobs/collect_load_metrics_job.rb +9 -0
- data/app/jobs/container_broker_base_job.rb +32 -0
- data/app/jobs/migrate_tasks_from_dead_node_job.rb +32 -0
- data/app/jobs/monitor_unresponsive_node_job.rb +21 -0
- data/app/jobs/monitor_unresponsive_nodes_job.rb +9 -0
- data/app/jobs/release_slot_job.rb +47 -0
- data/app/jobs/remove_runner_job.rb +11 -0
- data/app/jobs/remove_unused_tags_job.rb +25 -0
- data/app/jobs/request_id_from_task.rb +7 -0
- data/app/jobs/run_task_job.rb +64 -0
- data/app/jobs/run_tasks_for_all_execution_types_job.rb +11 -0
- data/app/jobs/run_tasks_job.rb +42 -0
- data/app/jobs/timeout_failed_tasks_job.rb +31 -0
- data/app/jobs/update_all_nodes_status_job.rb +9 -0
- data/app/jobs/update_node_status_job.rb +24 -0
- data/app/jobs/update_task_status_job.rb +71 -0
- data/app/models/mongoid_serializable_model.rb +14 -0
- data/app/models/node.rb +101 -0
- data/app/models/slot.rb +42 -0
- data/app/models/task.rb +148 -0
- data/app/models/task_tag.rb +11 -0
- data/app/observers/observable.rb +23 -0
- data/app/observers/task_observer.rb +11 -0
- data/app/serializers/node_healthcheck_serializer.rb +5 -0
- data/app/serializers/node_serializer.rb +5 -0
- data/app/serializers/status_panel_node_serializer.rb +9 -0
- data/app/serializers/status_panel_slot_serializer.rb +5 -0
- data/app/serializers/status_panel_task_serializer.rb +16 -0
- data/app/serializers/task_healthcheck_serializer.rb +5 -0
- data/app/serializers/task_serializer.rb +7 -0
- data/app/services/adjust_execution_type_slots.rb +51 -0
- data/app/services/check_for_slot_removal.rb +28 -0
- data/app/services/collect_load_metrics.rb +40 -0
- data/app/services/delete_node.rb +25 -0
- data/app/services/friendly_name_nodes.rb +10 -0
- data/app/services/friendly_name_slots.rb +15 -0
- data/app/services/kill_node_runners.rb +17 -0
- data/app/services/kill_task_container.rb +29 -0
- data/app/services/kubernetes_client.rb +136 -0
- data/app/services/least_used_node.rb +44 -0
- data/app/services/lock_manager.rb +74 -0
- data/app/services/lock_slot.rb +37 -0
- data/app/services/lock_task.rb +45 -0
- data/app/services/metrics.rb +43 -0
- data/app/services/migrate_runner.rb +26 -0
- data/app/services/node_task_acceptance.rb +18 -0
- data/app/services/node_usage_percentage_per_execution_type.rb +22 -0
- data/app/services/reschedule_tasks_for_missing_runners.rb +70 -0
- data/app/services/runners.rb +4 -0
- data/app/services/runners/docker/create_connection.rb +18 -0
- data/app/services/runners/docker/create_execution_info.rb +87 -0
- data/app/services/runners/docker/fetch_execution_info.rb +17 -0
- data/app/services/runners/docker/fetch_logs.rb +18 -0
- data/app/services/runners/docker/fetch_task_container.rb +15 -0
- data/app/services/runners/docker/filer.rb +19 -0
- data/app/services/runners/docker/kill_slot_runner.rb +19 -0
- data/app/services/runners/docker/node_availability.rb +11 -0
- data/app/services/runners/docker/remove_runner.rb +18 -0
- data/app/services/runners/docker/run_task.rb +63 -0
- data/app/services/runners/docker/update_node_status.rb +62 -0
- data/app/services/runners/execution_info.rb +49 -0
- data/app/services/runners/invalid_config.rb +5 -0
- data/app/services/runners/invalid_runner.rb +5 -0
- data/app/services/runners/kubernetes/create_client.rb +29 -0
- data/app/services/runners/kubernetes/create_execution_info.rb +103 -0
- data/app/services/runners/kubernetes/fetch_execution_info.rb +15 -0
- data/app/services/runners/kubernetes/fetch_logs.rb +17 -0
- data/app/services/runners/kubernetes/filer.rb +41 -0
- data/app/services/runners/kubernetes/kill_slot_runner.rb +11 -0
- data/app/services/runners/kubernetes/node_availability.rb +11 -0
- data/app/services/runners/kubernetes/remove_runner.rb +19 -0
- data/app/services/runners/kubernetes/run_task.rb +54 -0
- data/app/services/runners/kubernetes/update_node_status.rb +64 -0
- data/app/services/runners/runner_id_not_found_error.rb +5 -0
- data/app/services/runners/services_factory.rb +38 -0
- data/app/services/runners/update_node_status_helper.rb +43 -0
- data/app/services/slots_usage_percentage.rb +18 -0
- data/config/application.rb +34 -0
- data/config/boot.rb +5 -0
- data/config/environment.rb +7 -0
- data/config/environments/test.rb +44 -0
- data/config/initializers/application_controller_renderer.rb +10 -0
- data/config/initializers/backtrace_silencers.rb +9 -0
- data/config/initializers/config.rb +51 -0
- data/config/initializers/cookies_serializer.rb +7 -0
- data/config/initializers/docker_config.rb +3 -0
- data/config/initializers/filter_parameter_logging.rb +6 -0
- data/config/initializers/idempotent_request.rb +12 -0
- data/config/initializers/inflections.rb +18 -0
- data/config/initializers/mime_types.rb +6 -0
- data/config/initializers/mongoid.rb +3 -0
- data/config/initializers/new_framework_defaults_6_0.rb +47 -0
- data/config/initializers/raven.rb +10 -0
- data/config/initializers/sidekiq.rb +24 -0
- data/config/initializers/wrap_parameters.rb +16 -0
- data/config/locales/en.yml +33 -0
- data/config/mongoid.yml +10 -0
- data/config/routes.rb +43 -0
- data/config/secrets.yml +35 -0
- data/config/settings.yml +34 -0
- data/config/settings/test.yml +27 -0
- data/config/sidekiq_scheduler.yml +18 -0
- data/config/spring.rb +8 -0
- data/lib/constants.rb +12 -0
- data/lib/container_broker.rb +30 -0
- data/lib/container_broker/engine.rb +6 -0
- data/lib/container_broker/version.rb +5 -0
- data/lib/current_thread_request_id.rb +19 -0
- data/lib/idempotent_request/callback.rb +25 -0
- data/lib/idempotent_request/policy.rb +15 -0
- data/lib/redis_url_parser.rb +25 -0
- data/lib/tasks/task.rake +34 -0
- metadata +590 -0
data/app/models/node.rb
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# A node that exposes slots on which tasks are executed through a runner
# provider (docker or kubernetes).
#
# Besides its configuration, a node keeps track of its own health: the
# +status+ moves from available to unstable and finally to unavailable as
# errors are registered (see #register_error).
class Node
  # NOTE(review): not raised anywhere in this class; presumably raised by the
  # runner services when a node cannot be reached - confirm against callers.
  class NodeConnectionError < StandardError; end

  include Mongoid::Document
  include Mongoid::Uuid
  include Mongoid::Timestamps
  include GlobalID::Identification
  include MongoidEnumerable

  field :name, type: String
  field :hostname, type: String
  # Text of the last registered error, suffixed with the time it was registered.
  field :last_error, type: String
  field :last_success_at, type: DateTime
  field :accept_new_tasks, type: Boolean, default: true
  field :runner_capacity_reached, type: Boolean, default: false
  # Hash keyed by execution type; the values are read as slot amounts by
  # other services (presumably integers - confirm against AdjustExecutionTypeSlots).
  field :slots_execution_types, type: Hash, default: {}
  field :runner_config, type: Hash, default: {}

  enumerable :status, %w[available unstable unavailable], default: "unavailable"
  enumerable :runner_provider, %w[docker kubernetes], default: :docker

  has_many :slots

  # Nodes flagged to accept tasks whose runner capacity has not been reached
  # (a missing runner_capacity_reached value counts as "not reached").
  scope :accepting_new_tasks, -> { where(accept_new_tasks: true, :runner_capacity_reached.in => [nil, false]) }

  validates :hostname, presence: true
  validates :slots_execution_types, presence: true
  validate :execution_types_format

  # @return the result of NodeUsagePercentagePerExecutionType for this node
  def usage_per_execution_type
    NodeUsagePercentagePerExecutionType.new(self).perform
  end

  # Looks up one available slot of the given execution type.
  #
  # @param execution_type [String]
  # @return [Slot, nil] NOTE(review): whether a miss returns nil or raises
  #   depends on Mongoid's raise_not_found_error setting - confirm.
  def available_slot_with_execution_type(execution_type)
    available_slots.find_by(execution_type: execution_type)
  end

  # @return the slots of this node narrowed by the Slot +available+ scope
  def available_slots
    slots.available
  end

  # Destroys every slot that belongs to this node.
  def destroy_slots
    slots.destroy_all
  end

  # Resolves the service implementation matching this node's runner provider.
  #
  # @param service [Symbol] service identifier understood by Runners::ServicesFactory
  def runner_service(service)
    Runners::ServicesFactory.fabricate(runner: runner_provider, service: service)
  end

  # Records an error and degrades the node status accordingly:
  # * available -> unstable
  # * unstable  -> unavailable, once the unstable period has expired; the
  #   tasks of the node are then handed to MigrateTasksFromDeadNodeJob
  #
  # @param error [#to_s] error to be stored in +last_error+
  def register_error(error)
    Rails.logger.info("Registering error in #{self}: #{error}")

    update!(last_error: "#{error} at #{Time.zone.now}")

    if available?
      unstable!
      Rails.logger.debug("#{self} marked as unstable")
    elsif unstable?
      if unstable_period_expired?
        unavailable!
        Rails.logger.debug("#{self} marked as unavailable because the unstable period has expired (last success was at #{last_success_at}). Migrating all tasks.")
        MigrateTasksFromDeadNodeJob.perform_later(node: self)
      else
        Rails.logger.debug("#{self} still unstable until the limit period be expired (last success was at #{last_success_at})")
      end
    end
  end

  # @return [Boolean, nil] truthy when the last success is older than
  #   Settings.node_unavailable_after_seconds; nil when no success was ever recorded
  def unstable_period_expired?
    last_success_at && last_success_at < Settings.node_unavailable_after_seconds.seconds.ago
  end

  # Stores the current time as the moment of the last successful contact.
  def register_success
    Rails.logger.debug("Registering success in #{self}")
    update!(last_success_at: Time.zone.now)
  end

  # Human readable identification used in log messages. The last success
  # time is only included while the node is not available.
  def to_s
    last_success = ", last success at #{last_success_at}" unless available?

    "Node #{name} #{uuid} #{runner_provider} (#{status}#{last_success})"
  end

  # Runs the given block under a LockManager lock scoped to this node,
  # configured with wait: false and a 5 minute expiration.
  def run_with_lock_no_wait
    LockManager.new(type: self.class.to_s, id: id, wait: false, expire: 5.minutes).lock do
      yield
    end
  end

  private

  # Validates that every key of +slots_execution_types+ matches
  # Constants::ExecutionType::FORMAT.
  def execution_types_format
    # A blank value (nil or empty hash) is already reported by the presence
    # validation; bail out so that nil does not raise NoMethodError on #keys.
    return if slots_execution_types.blank?

    valid = slots_execution_types
            .keys
            .all? { |execution_type| execution_type.match?(Constants::ExecutionType::FORMAT) }

    errors.add(:slots_execution_types, Constants::ExecutionType::INVALID_FORMAT_MESSAGE) unless valid
  end
end
|
data/app/models/slot.rb
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# An execution position of a node. A slot is bound to a single execution
# type and, while in use, references the task and the runner occupying it.
class Slot
  include Mongoid::Document
  include Mongoid::Uuid
  include GlobalID::Identification
  include MongoidEnumerable

  enumerable :status, %w[available attaching running releasing]

  field :name, type: String
  field :execution_type, type: String
  field :runner_id, type: String
  belongs_to :current_task, class_name: "Task", optional: true

  belongs_to :node, optional: true

  index(runner_id: 1)
  index(node_id: 1)
  index(execution_type: 1, status: 1)

  validates :execution_type,
            presence: true,
            format: {
              with: Constants::ExecutionType::FORMAT,
              message: Constants::ExecutionType::INVALID_FORMAT_MESSAGE
            }

  # Slots in any status other than "available".
  scope :working, -> { where(:status.in => %w[attaching running releasing]) }

  # Flags the slot as running the given task on the given runner.
  #
  # @param current_task [Task]
  # @param runner_id [String]
  def mark_as_running(current_task:, runner_id:)
    running_attributes = {
      status: :running,
      current_task: current_task,
      runner_id: runner_id
    }

    update!(running_attributes)
  end

  # Makes the slot available again (clearing runner and task) and enqueues
  # RunTasksJob for the execution type of this slot.
  def release
    cleared_attributes = { status: :available, runner_id: nil, current_task: nil }
    update!(cleared_attributes)

    RunTasksJob.perform_later(execution_type: execution_type)
  end

  # Human readable identification used in log messages.
  def to_s
    "Slot #{name} #{uuid} (#{status} runner_id: #{runner_id})"
  end
end
|
data/app/models/task.rb
ADDED
@@ -0,0 +1,148 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# A unit of work (image + command) to be executed in a slot of a node.
#
# The +status+ goes through: waiting, starting, started, retry, failed,
# completed and error. Every status change is forwarded to the observers
# registered through Observable.
class Task
  include GlobalID::Identification
  include Mongoid::Document
  include Mongoid::Uuid
  include MongoidEnumerable
  extend Observable

  field :name, type: String
  field :runner_id, type: String
  field :image, type: String
  field :execution_type, type: String
  field :cmd, type: String
  # Hash keyed by storage mount identifier; the identifiers are validated
  # against Settings.storage_mounts (see #storage_mount_identifiers_exist).
  field :storage_mounts, type: Hash, default: {}
  field :exit_code, type: Integer
  field :error, type: String
  # Persisted logs; written by #set_logs and read by #get_logs.
  field :logs, type: BSON::Binary
  field :created_at, type: DateTime
  field :started_at, type: DateTime
  field :finished_at, type: DateTime
  field :progress, type: String
  field :try_count, type: Integer, default: 0
  field :persist_logs, type: Boolean, default: false
  field :tags, type: Hash, default: {}

  enumerable :status, %w[waiting starting started retry failed completed error], after_change: :status_changed

  belongs_to :slot, optional: true

  index({ created_at: 1 }, expire_after_seconds: 1.month)
  index(tags: 1)
  index(status: 1)
  index(request_id: 1)
  # NOTE(review): this runs a TaskTag query while the class body is being
  # evaluated, i.e. loading this class needs the database to be reachable.
  TaskTag.distinct(:name).each { |key| index("tags.#{key}" => 1) }

  before_validation :normalize_tags
  before_create { |task| task.created_at = Time.zone.now }
  after_create do
    RunTasksJob.perform_later(execution_type: execution_type)
    AddTaskTagsJob.perform_later(task: self)
  end

  validates :name, :image, :cmd, :execution_type, presence: true
  validates :execution_type, format: {
    with: Constants::ExecutionType::FORMAT,
    message: Constants::ExecutionType::INVALID_FORMAT_MESSAGE
  }
  validate :storage_mount_identifiers_exist

  # Assigns the logs as generic binary data (a copy of the given string is
  # wrapped). The document is not saved by this method.
  #
  # @param logs [String]
  def set_logs(logs)
    self.logs = BSON::Binary.new(logs.dup, :generic)
  end

  # Returns the logs of the task: fetched through the node runner service
  # while the task is started, read from the stored +logs+ otherwise.
  #
  # @return [String, nil]
  def get_logs
    if started?
      slot.node.runner_service(:fetch_logs).perform(task: self)
    else
      logs.try(:data)
    end
  end

  # Stores start time, runner and slot, then moves the task to "started".
  def mark_as_started!(runner_id:, slot:)
    update!(started_at: Time.zone.now, runner_id: runner_id, slot: slot)

    started!
  end

  # Stores the error and either schedules another try (while try_count is
  # below Settings.task_retry_count) or marks the task as failed.
  #
  # @param error [String, nil]
  def mark_as_retry(error: nil)
    update!(error: error)

    if try_count < Settings.task_retry_count
      # Bang variant, consistent with the update! above: a failure to persist
      # must not be ignored, otherwise the task would be retried with a stale
      # try_count and still attached to its old slot/runner.
      update!(try_count: try_count + 1, slot: nil, runner_id: nil)
      retry!
      RunTasksJob.perform_later(execution_type: execution_type)
    else
      failed!
    end
  end

  # Milliseconds between creation and start; for tasks that are not
  # started/completed/failed, milliseconds between creation and now.
  #
  # @return [Integer, nil] nil when one of the timestamps is missing
  def milliseconds_waiting
    if started? || completed? || failed?
      calculate_millisecond_span(created_at, started_at)
    else
      calculate_millisecond_span(created_at, Time.zone.now.to_datetime)
    end
  end

  # Milliseconds between start and finish (completed/failed tasks) or between
  # start and now (started tasks).
  #
  # @return [Integer, nil] nil for any other status or missing timestamps
  def milliseconds_running
    if completed? || failed?
      calculate_millisecond_span(started_at, finished_at)
    elsif started?
      calculate_millisecond_span(started_at, Time.zone.now.to_datetime)
    end
  end

  # @return [Integer, nil] #milliseconds_running converted to whole seconds
  def seconds_running
    milliseconds_running&.div(1000)
  end

  # @param start [DateTime, nil]
  # @param finish [DateTime, nil]
  # @return [Integer, nil] nil unless both values are present.
  #   The difference is multiplied by the milliseconds of one day, i.e. it
  #   assumes operands whose subtraction yields days (DateTime) - TODO confirm
  #   no Time values are passed in.
  def calculate_millisecond_span(start, finish)
    ((finish - start) * 1.day.in_milliseconds).to_i if finish.present? && start.present?
  end

  # Resets the try counter, moves the task to "retry" and enqueues RunTasksJob.
  #
  # NOTE(review): uses the non-raising #update, so a failed persist of
  # try_count is ignored here; kept as is to avoid changing what callers see.
  def force_retry!
    update(try_count: 0)
    retry!
    RunTasksJob.perform_later(execution_type: execution_type)
  end

  # Converts every tag value to a String, in place.
  def normalize_tags
    # tags may be nil (#request_id already guards for it), so guard here too
    # instead of raising NoMethodError during validation.
    tags&.transform_values!(&:to_s)
  end

  # Human readable identification used in log messages.
  def to_s
    "Task #{name} #{uuid} (#{status} runner_id: #{runner_id}) request_id=#{request_id}"
  end

  # Builds a runner identifier from the parameterized task name followed by
  # "-" and 8 random lowercase alphanumeric characters. The name part is cut
  # so that the whole id fits in Constants::Runner::MAX_NAME_SIZE.
  #
  # @return [String]
  def generate_runner_id
    prefix = name.tr("_", "-").parameterize
    random_suffix = SecureRandom.alphanumeric(8).downcase
    max_prefix_size = Constants::Runner::MAX_NAME_SIZE - random_suffix.length - 1

    "#{prefix.truncate(max_prefix_size, omission: "")}-#{random_suffix}"
  end

  # @return [String, nil] the "request_id" tag, when present
  def request_id
    tags&.dig("request_id")
  end

  private

  # Callback of the status enumerable: notifies one fresh instance of every
  # registered observer about the change.
  def status_changed(old_value, new_value)
    self.class.observer_instances_for(self).each do |observer|
      observer.status_change(old_value, new_value)
    end
  end

  # Validates that every requested storage mount identifier is configured in
  # Settings.storage_mounts for each runner provider in use by the nodes.
  def storage_mount_identifiers_exist
    requested = (storage_mounts || {}).keys.map(&:to_s)
    # Nothing requested means nothing can be invalid; also skips the node query.
    return if requested.empty?

    valid = Node.pluck(:runner_provider).uniq.all? do |runner|
      # A runner without configured mounts offers no identifiers at all.
      configured = Settings.storage_mounts[runner]&.keys || []
      (requested - configured.map(&:to_s)).empty?
    end

    return if valid

    errors.add(:storage_mounts, "Storage mounts are invalid")
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Class-level observer registry. A model that extends this module gets its
# own set of observer classes; #observer_instances_for builds one observer
# instance per registered class for a given model instance.
#
# The set is kept in an instance variable of the extending class, so two
# models extending Observable never share (or reset) each other's observers.
# A subclass that does not extend the module itself uses the set of its
# superclass.
module Observable
  # Gives every extending model a fresh, empty set of observers.
  def self.extended(model)
    model.observers = Set.new
  end

  # Replaces the set of observers of the model.
  attr_writer :observers

  # @return [Set, nil] observer classes registered for this model
  def observers
    return @observers if instance_variable_defined?(:@observers)

    # Subclasses of an observable model fall back to the set of their parent.
    superclass.observers if respond_to?(:superclass) && superclass.respond_to?(:observers)
  end

  # @param observer [Class] class instantiated with the observed model
  # @return [Set] the updated set of observers
  def add_observer(observer)
    observers << observer
  end

  # @param observer [Class]
  def remove_observer(observer)
    observers.delete(observer)
  end

  # @param model [Object] the observed instance
  # @return [Array] one new instance of every registered observer class
  def observer_instances_for(model)
    observers.map do |observer|
      observer.new(model)
    end
  end
end
|
@@ -0,0 +1,9 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Serializer exposing a node, its usage and its slots (each slot rendered
# with StatusPanelSlotSerializer).
class StatusPanelNodeSerializer < ActiveModel::Serializer
  attributes :uuid,
             :name,
             :hostname,
             :status,
             :last_error,
             :last_success_at,
             :usage_per_execution_type,
             :slots_execution_types,
             :accept_new_tasks,
             :runner_provider,
             :runner_capacity_reached

  has_many :slots, serializer: StatusPanelSlotSerializer
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Serializer exposing a task together with a short summary of its slot.
class StatusPanelTaskSerializer < ActiveModel::Serializer
  attributes :uuid,
             :name,
             :image,
             :cmd,
             :status,
             :exit_code,
             :error,
             :try_count,
             :created_at,
             :started_at,
             :finished_at,
             :progress,
             :seconds_running,
             :tags,
             :runner_id,
             :storage_mounts,
             :slot,
             :execution_type

  # @return [Hash, nil] uuid and name of the task's slot; nil when the task
  #   has no slot
  def slot
    return unless object.slot

    { uuid: object.slot.uuid, name: object.slot.name }
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Makes the number of slots a node has for one execution type match the
# amount configured in the node's +slots_execution_types+.
class AdjustExecutionTypeSlots
  attr_reader :node, :execution_type

  # @param node [Node] node whose slots are adjusted
  # @param execution_type [String] key looked up in node.slots_execution_types
  def initialize(node:, execution_type:)
    @node, @execution_type = node, execution_type
  end

  # Creates the missing slots, removes the exceeding ones that can be locked
  # and finally runs FriendlyNameSlots for the node.
  def perform
    increment_slots
    decrement_slots
    FriendlyNameSlots.new(node: node).perform
  end

  # @return [Boolean] true while the node has fewer slots than configured
  def increment?
    amount > count_by_execution_type
  end

  # @return [Boolean] true while the node has more slots than configured
  def decrement?
    amount < count_by_execution_type
  end

  private

  # Number of slots the node currently has for the execution type.
  def count_by_execution_type
    slots_of_type = node.slots.where(execution_type: execution_type)
    slots_of_type.count
  end

  # Configured number of slots; a missing entry counts as zero.
  def amount
    configured = node.slots_execution_types[execution_type]
    configured.to_i
  end

  # Creates slots one by one until the configured amount is reached, then
  # enqueues RunTasksJob for the execution type (always, even when no slot
  # had to be created).
  def increment_slots
    while increment?
      node.slots.create!(execution_type: execution_type)
    end

    RunTasksJob.perform_later(execution_type: execution_type)
  end

  # Destroys exceeding slots obtained through LockSlot. Stops as soon as
  # LockSlot returns no slot, leaving the remaining excess untouched.
  def decrement_slots
    while decrement? && (locked_slot = LockSlot.new(execution_type: execution_type, node: node).perform)
      locked_slot.destroy!
    end
  end
end
|