container_broker 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (126) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +98 -0
  3. data/Rakefile +8 -0
  4. data/app/controllers/application_controller.rb +5 -0
  5. data/app/controllers/healthcheck_controller.rb +21 -0
  6. data/app/controllers/nodes_controller.rb +70 -0
  7. data/app/controllers/nodes_healthcheck_controller.rb +28 -0
  8. data/app/controllers/status_controller.rb +48 -0
  9. data/app/controllers/tasks_controller.rb +83 -0
  10. data/app/controllers/tasks_healthcheck_controller.rb +28 -0
  11. data/app/jobs/add_task_tags_job.rb +13 -0
  12. data/app/jobs/adjust_node_slots_job.rb +27 -0
  13. data/app/jobs/application_job.rb +9 -0
  14. data/app/jobs/collect_load_metrics_job.rb +9 -0
  15. data/app/jobs/container_broker_base_job.rb +32 -0
  16. data/app/jobs/migrate_tasks_from_dead_node_job.rb +32 -0
  17. data/app/jobs/monitor_unresponsive_node_job.rb +21 -0
  18. data/app/jobs/monitor_unresponsive_nodes_job.rb +9 -0
  19. data/app/jobs/release_slot_job.rb +47 -0
  20. data/app/jobs/remove_runner_job.rb +11 -0
  21. data/app/jobs/remove_unused_tags_job.rb +25 -0
  22. data/app/jobs/request_id_from_task.rb +7 -0
  23. data/app/jobs/run_task_job.rb +64 -0
  24. data/app/jobs/run_tasks_for_all_execution_types_job.rb +11 -0
  25. data/app/jobs/run_tasks_job.rb +42 -0
  26. data/app/jobs/timeout_failed_tasks_job.rb +31 -0
  27. data/app/jobs/update_all_nodes_status_job.rb +9 -0
  28. data/app/jobs/update_node_status_job.rb +24 -0
  29. data/app/jobs/update_task_status_job.rb +71 -0
  30. data/app/models/mongoid_serializable_model.rb +14 -0
  31. data/app/models/node.rb +101 -0
  32. data/app/models/slot.rb +42 -0
  33. data/app/models/task.rb +148 -0
  34. data/app/models/task_tag.rb +11 -0
  35. data/app/observers/observable.rb +23 -0
  36. data/app/observers/task_observer.rb +11 -0
  37. data/app/serializers/node_healthcheck_serializer.rb +5 -0
  38. data/app/serializers/node_serializer.rb +5 -0
  39. data/app/serializers/status_panel_node_serializer.rb +9 -0
  40. data/app/serializers/status_panel_slot_serializer.rb +5 -0
  41. data/app/serializers/status_panel_task_serializer.rb +16 -0
  42. data/app/serializers/task_healthcheck_serializer.rb +5 -0
  43. data/app/serializers/task_serializer.rb +7 -0
  44. data/app/services/adjust_execution_type_slots.rb +51 -0
  45. data/app/services/check_for_slot_removal.rb +28 -0
  46. data/app/services/collect_load_metrics.rb +40 -0
  47. data/app/services/delete_node.rb +25 -0
  48. data/app/services/friendly_name_nodes.rb +10 -0
  49. data/app/services/friendly_name_slots.rb +15 -0
  50. data/app/services/kill_node_runners.rb +17 -0
  51. data/app/services/kill_task_container.rb +29 -0
  52. data/app/services/kubernetes_client.rb +136 -0
  53. data/app/services/least_used_node.rb +44 -0
  54. data/app/services/lock_manager.rb +74 -0
  55. data/app/services/lock_slot.rb +37 -0
  56. data/app/services/lock_task.rb +45 -0
  57. data/app/services/metrics.rb +43 -0
  58. data/app/services/migrate_runner.rb +26 -0
  59. data/app/services/node_task_acceptance.rb +18 -0
  60. data/app/services/node_usage_percentage_per_execution_type.rb +22 -0
  61. data/app/services/reschedule_tasks_for_missing_runners.rb +70 -0
  62. data/app/services/runners.rb +4 -0
  63. data/app/services/runners/docker/create_connection.rb +18 -0
  64. data/app/services/runners/docker/create_execution_info.rb +87 -0
  65. data/app/services/runners/docker/fetch_execution_info.rb +17 -0
  66. data/app/services/runners/docker/fetch_logs.rb +18 -0
  67. data/app/services/runners/docker/fetch_task_container.rb +15 -0
  68. data/app/services/runners/docker/filer.rb +19 -0
  69. data/app/services/runners/docker/kill_slot_runner.rb +19 -0
  70. data/app/services/runners/docker/node_availability.rb +11 -0
  71. data/app/services/runners/docker/remove_runner.rb +18 -0
  72. data/app/services/runners/docker/run_task.rb +63 -0
  73. data/app/services/runners/docker/update_node_status.rb +62 -0
  74. data/app/services/runners/execution_info.rb +49 -0
  75. data/app/services/runners/invalid_config.rb +5 -0
  76. data/app/services/runners/invalid_runner.rb +5 -0
  77. data/app/services/runners/kubernetes/create_client.rb +29 -0
  78. data/app/services/runners/kubernetes/create_execution_info.rb +103 -0
  79. data/app/services/runners/kubernetes/fetch_execution_info.rb +15 -0
  80. data/app/services/runners/kubernetes/fetch_logs.rb +17 -0
  81. data/app/services/runners/kubernetes/filer.rb +41 -0
  82. data/app/services/runners/kubernetes/kill_slot_runner.rb +11 -0
  83. data/app/services/runners/kubernetes/node_availability.rb +11 -0
  84. data/app/services/runners/kubernetes/remove_runner.rb +19 -0
  85. data/app/services/runners/kubernetes/run_task.rb +54 -0
  86. data/app/services/runners/kubernetes/update_node_status.rb +64 -0
  87. data/app/services/runners/runner_id_not_found_error.rb +5 -0
  88. data/app/services/runners/services_factory.rb +38 -0
  89. data/app/services/runners/update_node_status_helper.rb +43 -0
  90. data/app/services/slots_usage_percentage.rb +18 -0
  91. data/config/application.rb +34 -0
  92. data/config/boot.rb +5 -0
  93. data/config/environment.rb +7 -0
  94. data/config/environments/test.rb +44 -0
  95. data/config/initializers/application_controller_renderer.rb +10 -0
  96. data/config/initializers/backtrace_silencers.rb +9 -0
  97. data/config/initializers/config.rb +51 -0
  98. data/config/initializers/cookies_serializer.rb +7 -0
  99. data/config/initializers/docker_config.rb +3 -0
  100. data/config/initializers/filter_parameter_logging.rb +6 -0
  101. data/config/initializers/idempotent_request.rb +12 -0
  102. data/config/initializers/inflections.rb +18 -0
  103. data/config/initializers/mime_types.rb +6 -0
  104. data/config/initializers/mongoid.rb +3 -0
  105. data/config/initializers/new_framework_defaults_6_0.rb +47 -0
  106. data/config/initializers/raven.rb +10 -0
  107. data/config/initializers/sidekiq.rb +24 -0
  108. data/config/initializers/wrap_parameters.rb +16 -0
  109. data/config/locales/en.yml +33 -0
  110. data/config/mongoid.yml +10 -0
  111. data/config/routes.rb +43 -0
  112. data/config/secrets.yml +35 -0
  113. data/config/settings.yml +34 -0
  114. data/config/settings/test.yml +27 -0
  115. data/config/sidekiq_scheduler.yml +18 -0
  116. data/config/spring.rb +8 -0
  117. data/lib/constants.rb +12 -0
  118. data/lib/container_broker.rb +30 -0
  119. data/lib/container_broker/engine.rb +6 -0
  120. data/lib/container_broker/version.rb +5 -0
  121. data/lib/current_thread_request_id.rb +19 -0
  122. data/lib/idempotent_request/callback.rb +25 -0
  123. data/lib/idempotent_request/policy.rb +15 -0
  124. data/lib/redis_url_parser.rb +25 -0
  125. data/lib/tasks/task.rake +34 -0
  126. metadata +590 -0
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
# Thin wrapper around a model that answers +to_global_id+ by delegating to
# the wrapped model.
# NOTE(review): presumably used to pass Mongoid documents as job arguments —
# confirm against callers.
class MongoidSerializableModel
  include GlobalID::Identification

  # @return [Object] the wrapped model
  attr_reader :model

  # @param model [Object] any object responding to +to_global_id+
  def initialize(model)
    @model = model
  end

  # @return [Object] whatever the wrapped model returns from +to_global_id+
  def to_global_id
    wrapped = model
    wrapped.to_global_id
  end
end
@@ -0,0 +1,101 @@
1
+ # frozen_string_literal: true
2
+
3
# A runner host (provider is either "docker" or "kubernetes") owning a set of
# slots. Health is tracked through +status+, +last_error+ and
# +last_success_at+; see #register_error and #register_success.
class Node
  class NodeConnectionError < StandardError; end

  include Mongoid::Document
  include Mongoid::Uuid
  include Mongoid::Timestamps
  include GlobalID::Identification
  include MongoidEnumerable

  field :name, type: String
  field :hostname, type: String
  field :last_error, type: String
  field :last_success_at, type: DateTime
  field :accept_new_tasks, type: Boolean, default: true
  field :runner_capacity_reached, type: Boolean, default: false
  field :slots_execution_types, type: Hash, default: {}
  field :runner_config, type: Hash, default: {}

  enumerable :status, %w[available unstable unavailable], default: "unavailable"
  enumerable :runner_provider, %w[docker kubernetes], default: :docker

  has_many :slots

  # Nodes flagged to accept tasks whose runner capacity flag is unset or false.
  scope :accepting_new_tasks, -> { where(accept_new_tasks: true, :runner_capacity_reached.in => [nil, false]) }

  validates :hostname, presence: true
  validates :slots_execution_types, presence: true
  validate :execution_types_format

  # @return [Object] result of NodeUsagePercentagePerExecutionType for this node
  def usage_per_execution_type
    NodeUsagePercentagePerExecutionType.new(self).perform
  end

  # @param execution_type [String]
  # @return [Slot] an available slot of the given execution type
  def available_slot_with_execution_type(execution_type)
    available_slots.find_by(execution_type: execution_type)
  end

  # @return [Mongoid::Criteria] this node's slots narrowed by the +available+ scope
  def available_slots
    slots.available
  end

  # Destroys every slot belonging to this node.
  def destroy_slots
    slots.destroy_all
  end

  # Resolves the provider-specific implementation of +service+ for this node.
  #
  # @param service [Symbol] service identifier understood by Runners::ServicesFactory
  def runner_service(service)
    Runners::ServicesFactory.fabricate(runner: runner_provider, service: service)
  end

  # Records +error+ and advances the health state machine:
  # available -> unstable immediately; unstable -> unavailable once the
  # unstable period has expired, at which point task migration is enqueued.
  #
  # @param error [#to_s] the failure to record
  def register_error(error)
    Rails.logger.info("Registering error in #{self}: #{error}")

    update!(last_error: "#{error} at #{Time.zone.now}")

    if available?
      unstable!
      Rails.logger.debug("#{self} marked as unstable")
    elsif unstable?
      if unstable_period_expired?
        unavailable!
        Rails.logger.debug("#{self} marked as unavailable because the unstable period has expired (last success was at #{last_success_at}). Migrating all tasks.")
        MigrateTasksFromDeadNodeJob.perform_later(node: self)
      else
        Rails.logger.debug("#{self} still unstable until the limit period be expired (last success was at #{last_success_at})")
      end
    end
  end

  # @return [Boolean, nil] truthy when the last success is older than
  #   Settings.node_unavailable_after_seconds; nil when no success was recorded
  def unstable_period_expired?
    last_success_at && last_success_at < Settings.node_unavailable_after_seconds.seconds.ago
  end

  # Stamps +last_success_at+ with the current time.
  def register_success
    Rails.logger.debug("Registering success in #{self}")
    update!(last_success_at: Time.zone.now)
  end

  # Human-readable identification; includes the last success time whenever
  # the node is not available.
  def to_s
    last_success = ", last success at #{last_success_at}" unless available?

    "Node #{name} #{uuid} #{runner_provider} (#{status}#{last_success})"
  end

  # Runs the given block under a LockManager lock keyed by this node,
  # configured with wait: false and a 5 minute expiry.
  def run_with_lock_no_wait
    LockManager.new(type: self.class.to_s, id: id, wait: false, expire: 5.minutes).lock do
      yield
    end
  end

  private

  # Validates that every key of +slots_execution_types+ matches
  # Constants::ExecutionType::FORMAT.
  def execution_types_format
    # A nil hash is already reported by the presence validation; treat it as
    # empty here so validation adds an error instead of raising NoMethodError.
    valid = (slots_execution_types || {})
            .keys
            .all? { |execution_type| execution_type.match?(Constants::ExecutionType::FORMAT) }

    errors.add(:slots_execution_types, Constants::ExecutionType::INVALID_FORMAT_MESSAGE) unless valid
  end
end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
# An execution slot on a node. A slot has a status (available, attaching,
# running or releasing), an execution type, and optionally the runner and
# task currently occupying it.
class Slot
  include Mongoid::Document
  include Mongoid::Uuid
  include GlobalID::Identification
  include MongoidEnumerable

  enumerable :status, %w[available attaching running releasing]

  field :name, type: String
  field :execution_type, type: String
  field :runner_id, type: String
  belongs_to :current_task, class_name: "Task", optional: true

  belongs_to :node, optional: true

  index({ runner_id: 1 })
  index({ node_id: 1 })
  index({ execution_type: 1, status: 1 })

  validates :execution_type, presence: true
  validates :execution_type, format: {
    with: Constants::ExecutionType::FORMAT,
    message: Constants::ExecutionType::INVALID_FORMAT_MESSAGE
  }

  # Slots in any non-available status.
  scope :working, -> { where(:status.in => %w[attaching running releasing]) }

  # Flags the slot as running the given task under the given runner.
  #
  # @param current_task [Task]
  # @param runner_id [String]
  def mark_as_running(current_task:, runner_id:)
    running_state = { status: :running, current_task: current_task, runner_id: runner_id }
    update!(running_state)
  end

  # Frees the slot and enqueues a task run for its execution type.
  def release
    freed_state = { status: :available, runner_id: nil, current_task: nil }
    update!(freed_state)

    RunTasksJob.perform_later(execution_type: execution_type)
  end

  # Human-readable identification of the slot.
  def to_s
    format("Slot %s %s (%s runner_id: %s)", name, uuid, status, runner_id)
  end
end
@@ -0,0 +1,148 @@
1
+ # frozen_string_literal: true
2
+
3
# A unit of work (image + command) executed in a slot. Tracks lifecycle
# status, timing, retry count, logs and free-form tags. Status changes are
# broadcast to the observers registered on the class (see #status_changed).
class Task
  include GlobalID::Identification
  include Mongoid::Document
  include Mongoid::Uuid
  include MongoidEnumerable
  extend Observable

  field :name, type: String
  field :runner_id, type: String
  field :image, type: String
  field :execution_type, type: String
  field :cmd, type: String
  field :storage_mounts, type: Hash, default: {}
  field :exit_code, type: Integer
  field :error, type: String
  field :logs, type: BSON::Binary
  field :created_at, type: DateTime
  field :started_at, type: DateTime
  field :finished_at, type: DateTime
  field :progress, type: String
  field :try_count, type: Integer, default: 0
  field :persist_logs, type: Boolean, default: false
  field :tags, type: Hash, default: {}

  enumerable :status, %w[waiting starting started retry failed completed error], after_change: :status_changed

  belongs_to :slot, optional: true

  index({ created_at: 1 }, expire_after_seconds: 1.month)
  index(tags: 1)
  index(status: 1)
  # NOTE(review): request_id is read from tags (see #request_id), not declared
  # as a field — confirm this top-level index is intended.
  index(request_id: 1)
  # Declares one index per known tag name; this queries TaskTag while the
  # class body is being evaluated.
  TaskTag.distinct(:name).each { |key| index("tags.#{key}" => 1) }

  before_validation :normalize_tags
  before_create { |task| task.created_at = Time.zone.now }
  after_create do
    RunTasksJob.perform_later(execution_type: execution_type)
    AddTaskTagsJob.perform_later(task: self)
  end

  validates :name, :image, :cmd, :execution_type, presence: true
  validates :execution_type, format: {
    with: Constants::ExecutionType::FORMAT,
    message: Constants::ExecutionType::INVALID_FORMAT_MESSAGE
  }
  validate :storage_mount_identifiers_exist

  # Stores a copy of +logs+ as generic BSON binary (does not save).
  #
  # @param logs [String]
  def set_logs(logs)
    self.logs = BSON::Binary.new(logs.dup, :generic)
  end

  # @return [Object, nil] logs fetched through the node's :fetch_logs runner
  #   service while the task is started; otherwise the stored binary data
  def get_logs
    if started?
      slot.node.runner_service(:fetch_logs).perform(task: self)
    else
      logs.try(:data)
    end
  end

  # Records start time, runner and slot, then transitions to started.
  def mark_as_started!(runner_id:, slot:)
    update!(started_at: Time.zone.now, runner_id: runner_id, slot: slot)

    started!
  end

  # Records +error+ and either schedules another attempt (while try_count is
  # below Settings.task_retry_count) or marks the task as failed.
  #
  # @param error [String, nil]
  def mark_as_retry(error: nil)
    update!(error: error)

    if try_count < Settings.task_retry_count
      # NOTE(review): non-bang update — a failed save here is silent; confirm intended.
      update(try_count: try_count + 1, slot: nil, runner_id: nil)
      retry!
      RunTasksJob.perform_later(execution_type: execution_type)
    else
      failed!
    end
  end

  # @return [Integer, nil] milliseconds between creation and start, or between
  #   creation and now when the task is not started/completed/failed
  def milliseconds_waiting
    if started? || completed? || failed?
      calculate_millisecond_span(created_at, started_at)
    else
      calculate_millisecond_span(created_at, Time.zone.now.to_datetime)
    end
  end

  # @return [Integer, nil] milliseconds between start and finish (or now while
  #   started); nil for any other status
  def milliseconds_running
    if completed? || failed?
      calculate_millisecond_span(started_at, finished_at)
    elsif started?
      calculate_millisecond_span(started_at, Time.zone.now.to_datetime)
    end
  end

  # @return [Integer, nil] #milliseconds_running in whole seconds
  def seconds_running
    milliseconds_running&.div(1000)
  end

  # @return [Integer, nil] the span scaled by the number of milliseconds in a
  #   day; nil when either bound is missing
  def calculate_millisecond_span(start, finish)
    ((finish - start) * 1.day.in_milliseconds).to_i if finish.present? && start.present?
  end

  # Resets the retry counter and re-queues the task.
  def force_retry!
    update(try_count: 0)
    retry!
    RunTasksJob.perform_later(execution_type: execution_type)
  end

  # Coerces every tag value to a String. Tolerates nil tags, matching the
  # nil guard already present in #request_id.
  def normalize_tags
    tags&.transform_values!(&:to_s)
  end

  # Human-readable identification of the task.
  def to_s
    "Task #{name} #{uuid} (#{status} runner_id: #{runner_id}) request_id=#{request_id}"
  end

  # Builds a runner id from the parameterized task name plus an 8-character
  # random suffix, keeping the total within Constants::Runner::MAX_NAME_SIZE.
  def generate_runner_id
    prefix = name.gsub("_", "-").parameterize
    random_suffix = SecureRandom.alphanumeric(8).downcase
    max_prefix_size = Constants::Runner::MAX_NAME_SIZE - random_suffix.length - 1

    "#{prefix.truncate(max_prefix_size, omission: "")}-#{random_suffix}"
  end

  # @return [String, nil] the "request_id" tag, if any
  def request_id
    tags&.dig("request_id")
  end

  private

  # Notifies a fresh instance of every registered observer class.
  def status_changed(old_value, new_value)
    self.class.observer_instances_for(self).each do |observer|
      observer.status_change(old_value, new_value)
    end
  end

  # Every requested storage mount key must be configured in
  # Settings.storage_mounts for each runner provider present among the nodes.
  def storage_mount_identifiers_exist
    # nil storage_mounts means "no mounts requested" rather than a crash.
    requested = (storage_mounts || {}).keys.map(&:to_s)

    valid = Node.pluck(:runner_provider).uniq.all? do |runner|
      (requested - Settings.storage_mounts[runner].keys.map(&:to_s)).empty?
    end

    return if valid

    errors.add(:storage_mounts, "Storage mounts are invalid")
  end
end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
# A known tag name together with a list of values. Names are unique (enforced
# by index); Task declares one "tags.<name>" index per TaskTag name.
class TaskTag
  include Mongoid::Document
  include Mongoid::Uuid

  field :name, type: String
  field :values, type: Array, default: []

  index({ name: 1 }, { unique: true })
end
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
# Class-level observer registry. `extend Observable` in a class to let it keep
# a set of observer classes and instantiate them for a given model instance.
#
# Each extending class owns its own set. Subclasses that have no set of their
# own read (and add to) their parent's set. Previously the set lived in a
# single module-level variable, so all extending classes shared one registry
# and every new `extend` wiped it.
module Observable
  # Gives the extending class a fresh, empty registry.
  def self.extended(model)
    model.observers = Set.new
  end

  # @return [Set] observer classes registered for this class (or inherited
  #   from the nearest ancestor that has a registry)
  def observers
    return @observers if @observers

    parent = respond_to?(:superclass) ? superclass : nil
    return parent.observers if parent.respond_to?(:observers)

    @observers = Set.new
  end

  # Replaces this class's registry.
  #
  # @param value [Set]
  def observers=(value)
    @observers = value
  end

  # @param observer [Class] a class instantiable with the observed model
  def add_observer(observer)
    observers << observer
  end

  # @param observer [Class]
  def remove_observer(observer)
    observers.delete(observer)
  end

  # @param model [Object] the observed instance
  # @return [Array] one new observer instance per registered class
  def observer_instances_for(model)
    observers.map do |observer|
      observer.new(model)
    end
  end
end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
# Base observer for task status changes. Holds the observed task and provides
# a no-op +status_change+ hook.
class TaskObserver
  # @param task [Object] the task being observed
  def initialize(task)
    @task = task
  end

  # @return [Object] the observed task
  attr_reader :task

  # Hook invoked with the previous and new status; does nothing here.
  #
  # @return [nil]
  def status_change(_old_value, _new_value)
    nil
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
# Serializes a node for healthcheck output: identity, status, last error and
# timestamps.
class NodeHealthcheckSerializer < ActiveModel::Serializer
  attributes(*%i[uuid name hostname status last_error created_at updated_at])
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
# Serializes a node's identity, status, task-acceptance flags, slot
# configuration and runner provider.
class NodeSerializer < ActiveModel::Serializer
  attributes(
    *%i[uuid hostname status accept_new_tasks slots_execution_types runner_provider runner_capacity_reached]
  )
end
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
# Serializes a node for the status panel, including health details, usage per
# execution type and its slots (via StatusPanelSlotSerializer).
class StatusPanelNodeSerializer < ActiveModel::Serializer
  attributes(
    *%i[
      uuid name hostname status last_error last_success_at
      usage_per_execution_type slots_execution_types accept_new_tasks
      runner_provider runner_capacity_reached
    ]
  )

  has_many :slots, serializer: StatusPanelSlotSerializer
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
# Serializes a slot for the status panel.
class StatusPanelSlotSerializer < ActiveModel::Serializer
  attributes(*%i[uuid name runner_id status execution_type])
end
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
# Serializes a task for the status panel, embedding a compact summary of the
# slot it is attached to.
class StatusPanelTaskSerializer < ActiveModel::Serializer
  attributes(
    *%i[
      uuid name image cmd status exit_code error try_count created_at
      started_at finished_at progress seconds_running tags runner_id
      storage_mounts slot execution_type
    ]
  )

  # @return [Hash, nil] uuid and name of the task's slot; nil when the task
  #   has no slot
  def slot
    assigned = object.slot
    return unless assigned

    { uuid: assigned.uuid, name: assigned.name }
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
# Serializes a task for healthcheck output: identity, status, error,
# timestamps and execution type.
class TaskHealthcheckSerializer < ActiveModel::Serializer
  attributes(*%i[uuid status error created_at started_at finished_at execution_type])
end
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
# Serializes a task's status, outcome, timing and execution type.
class TaskSerializer < ActiveModel::Serializer
  attributes(
    *%i[
      uuid status exit_code error try_count
      created_at started_at finished_at progress seconds_running
      execution_type
    ]
  )
end
@@ -0,0 +1,51 @@
1
+ # frozen_string_literal: true
2
+
3
# Brings the number of slots a node has for one execution type in line with
# the amount configured in the node's +slots_execution_types+, then renames
# the node's slots through FriendlyNameSlots.
class AdjustExecutionTypeSlots
  attr_reader :node, :execution_type

  # @param node [Node]
  # @param execution_type [String]
  def initialize(node:, execution_type:)
    @node = node
    @execution_type = execution_type
  end

  # Adds missing slots, removes surplus ones, then refreshes slot names.
  #
  # @return [Object] whatever FriendlyNameSlots#perform returns
  def perform
    increment_slots
    decrement_slots

    FriendlyNameSlots.new(node: node).perform
  end

  # @return [Boolean] true when fewer slots exist than configured
  def increment?
    (amount - count_by_execution_type).positive?
  end

  # @return [Boolean] true when more slots exist than configured
  def decrement?
    (amount - count_by_execution_type).negative?
  end

  private

  # Current number of slots of this execution type on the node.
  def count_by_execution_type
    node.slots.where(execution_type: execution_type).count
  end

  # Configured number of slots; a missing entry counts as zero.
  def amount
    node.slots_execution_types[execution_type].to_i
  end

  # Creates slots until the configured amount is reached, then enqueues a
  # task run for this execution type.
  def increment_slots
    while increment?
      node.slots.create!(execution_type: execution_type)
    end

    RunTasksJob.perform_later(execution_type: execution_type)
  end

  # Destroys surplus slots one at a time; stops early when LockSlot yields
  # no slot.
  def decrement_slots
    while decrement? && (surplus = LockSlot.new(execution_type: execution_type, node: node).perform)
      surplus.destroy!
    end
  end
end