container_broker 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +98 -0
  3. data/Rakefile +8 -0
  4. data/app/controllers/application_controller.rb +5 -0
  5. data/app/controllers/healthcheck_controller.rb +21 -0
  6. data/app/controllers/nodes_controller.rb +70 -0
  7. data/app/controllers/nodes_healthcheck_controller.rb +28 -0
  8. data/app/controllers/status_controller.rb +48 -0
  9. data/app/controllers/tasks_controller.rb +83 -0
  10. data/app/controllers/tasks_healthcheck_controller.rb +28 -0
  11. data/app/jobs/add_task_tags_job.rb +13 -0
  12. data/app/jobs/adjust_node_slots_job.rb +27 -0
  13. data/app/jobs/application_job.rb +9 -0
  14. data/app/jobs/collect_load_metrics_job.rb +9 -0
  15. data/app/jobs/container_broker_base_job.rb +32 -0
  16. data/app/jobs/migrate_tasks_from_dead_node_job.rb +32 -0
  17. data/app/jobs/monitor_unresponsive_node_job.rb +21 -0
  18. data/app/jobs/monitor_unresponsive_nodes_job.rb +9 -0
  19. data/app/jobs/release_slot_job.rb +47 -0
  20. data/app/jobs/remove_runner_job.rb +11 -0
  21. data/app/jobs/remove_unused_tags_job.rb +25 -0
  22. data/app/jobs/request_id_from_task.rb +7 -0
  23. data/app/jobs/run_task_job.rb +64 -0
  24. data/app/jobs/run_tasks_for_all_execution_types_job.rb +11 -0
  25. data/app/jobs/run_tasks_job.rb +42 -0
  26. data/app/jobs/timeout_failed_tasks_job.rb +31 -0
  27. data/app/jobs/update_all_nodes_status_job.rb +9 -0
  28. data/app/jobs/update_node_status_job.rb +24 -0
  29. data/app/jobs/update_task_status_job.rb +71 -0
  30. data/app/models/mongoid_serializable_model.rb +14 -0
  31. data/app/models/node.rb +101 -0
  32. data/app/models/slot.rb +42 -0
  33. data/app/models/task.rb +148 -0
  34. data/app/models/task_tag.rb +11 -0
  35. data/app/observers/observable.rb +23 -0
  36. data/app/observers/task_observer.rb +11 -0
  37. data/app/serializers/node_healthcheck_serializer.rb +5 -0
  38. data/app/serializers/node_serializer.rb +5 -0
  39. data/app/serializers/status_panel_node_serializer.rb +9 -0
  40. data/app/serializers/status_panel_slot_serializer.rb +5 -0
  41. data/app/serializers/status_panel_task_serializer.rb +16 -0
  42. data/app/serializers/task_healthcheck_serializer.rb +5 -0
  43. data/app/serializers/task_serializer.rb +7 -0
  44. data/app/services/adjust_execution_type_slots.rb +51 -0
  45. data/app/services/check_for_slot_removal.rb +28 -0
  46. data/app/services/collect_load_metrics.rb +40 -0
  47. data/app/services/delete_node.rb +25 -0
  48. data/app/services/friendly_name_nodes.rb +10 -0
  49. data/app/services/friendly_name_slots.rb +15 -0
  50. data/app/services/kill_node_runners.rb +17 -0
  51. data/app/services/kill_task_container.rb +29 -0
  52. data/app/services/kubernetes_client.rb +136 -0
  53. data/app/services/least_used_node.rb +44 -0
  54. data/app/services/lock_manager.rb +74 -0
  55. data/app/services/lock_slot.rb +37 -0
  56. data/app/services/lock_task.rb +45 -0
  57. data/app/services/metrics.rb +43 -0
  58. data/app/services/migrate_runner.rb +26 -0
  59. data/app/services/node_task_acceptance.rb +18 -0
  60. data/app/services/node_usage_percentage_per_execution_type.rb +22 -0
  61. data/app/services/reschedule_tasks_for_missing_runners.rb +70 -0
  62. data/app/services/runners.rb +4 -0
  63. data/app/services/runners/docker/create_connection.rb +18 -0
  64. data/app/services/runners/docker/create_execution_info.rb +87 -0
  65. data/app/services/runners/docker/fetch_execution_info.rb +17 -0
  66. data/app/services/runners/docker/fetch_logs.rb +18 -0
  67. data/app/services/runners/docker/fetch_task_container.rb +15 -0
  68. data/app/services/runners/docker/filer.rb +19 -0
  69. data/app/services/runners/docker/kill_slot_runner.rb +19 -0
  70. data/app/services/runners/docker/node_availability.rb +11 -0
  71. data/app/services/runners/docker/remove_runner.rb +18 -0
  72. data/app/services/runners/docker/run_task.rb +63 -0
  73. data/app/services/runners/docker/update_node_status.rb +62 -0
  74. data/app/services/runners/execution_info.rb +49 -0
  75. data/app/services/runners/invalid_config.rb +5 -0
  76. data/app/services/runners/invalid_runner.rb +5 -0
  77. data/app/services/runners/kubernetes/create_client.rb +29 -0
  78. data/app/services/runners/kubernetes/create_execution_info.rb +103 -0
  79. data/app/services/runners/kubernetes/fetch_execution_info.rb +15 -0
  80. data/app/services/runners/kubernetes/fetch_logs.rb +17 -0
  81. data/app/services/runners/kubernetes/filer.rb +41 -0
  82. data/app/services/runners/kubernetes/kill_slot_runner.rb +11 -0
  83. data/app/services/runners/kubernetes/node_availability.rb +11 -0
  84. data/app/services/runners/kubernetes/remove_runner.rb +19 -0
  85. data/app/services/runners/kubernetes/run_task.rb +54 -0
  86. data/app/services/runners/kubernetes/update_node_status.rb +64 -0
  87. data/app/services/runners/runner_id_not_found_error.rb +5 -0
  88. data/app/services/runners/services_factory.rb +38 -0
  89. data/app/services/runners/update_node_status_helper.rb +43 -0
  90. data/app/services/slots_usage_percentage.rb +18 -0
  91. data/config/application.rb +34 -0
  92. data/config/boot.rb +5 -0
  93. data/config/environment.rb +7 -0
  94. data/config/environments/test.rb +44 -0
  95. data/config/initializers/application_controller_renderer.rb +10 -0
  96. data/config/initializers/backtrace_silencers.rb +9 -0
  97. data/config/initializers/config.rb +51 -0
  98. data/config/initializers/cookies_serializer.rb +7 -0
  99. data/config/initializers/docker_config.rb +3 -0
  100. data/config/initializers/filter_parameter_logging.rb +6 -0
  101. data/config/initializers/idempotent_request.rb +12 -0
  102. data/config/initializers/inflections.rb +18 -0
  103. data/config/initializers/mime_types.rb +6 -0
  104. data/config/initializers/mongoid.rb +3 -0
  105. data/config/initializers/new_framework_defaults_6_0.rb +47 -0
  106. data/config/initializers/raven.rb +10 -0
  107. data/config/initializers/sidekiq.rb +24 -0
  108. data/config/initializers/wrap_parameters.rb +16 -0
  109. data/config/locales/en.yml +33 -0
  110. data/config/mongoid.yml +10 -0
  111. data/config/routes.rb +43 -0
  112. data/config/secrets.yml +35 -0
  113. data/config/settings.yml +34 -0
  114. data/config/settings/test.yml +27 -0
  115. data/config/sidekiq_scheduler.yml +18 -0
  116. data/config/spring.rb +8 -0
  117. data/lib/constants.rb +12 -0
  118. data/lib/container_broker.rb +30 -0
  119. data/lib/container_broker/engine.rb +6 -0
  120. data/lib/container_broker/version.rb +5 -0
  121. data/lib/current_thread_request_id.rb +19 -0
  122. data/lib/idempotent_request/callback.rb +25 -0
  123. data/lib/idempotent_request/policy.rb +15 -0
  124. data/lib/redis_url_parser.rb +25 -0
  125. data/lib/tasks/task.rake +34 -0
  126. metadata +590 -0
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
# Wraps a model and answers +to_global_id+ with the wrapped model's own
# GlobalID.
class MongoidSerializableModel
  include GlobalID::Identification

  # @param model [Object] the wrapped object; it must respond to +to_global_id+
  def initialize(model)
    @model = model
  end

  # @return [Object] the object handed to the constructor
  def model
    @model
  end

  # @return [Object] whatever the wrapped model returns from +to_global_id+
  def to_global_id
    model.to_global_id
  end
end
@@ -0,0 +1,101 @@
1
+ # frozen_string_literal: true
2
+
3
# Mongoid document for a node that owns slots and runs tasks through a runner
# provider (docker or kubernetes).
#
# Tracks the node status (available / unstable / unavailable) together with
# the last error and the time of the last successful contact.
class Node
  class NodeConnectionError < StandardError; end

  include Mongoid::Document
  include Mongoid::Uuid
  include Mongoid::Timestamps
  include GlobalID::Identification
  include MongoidEnumerable

  field :name, type: String
  field :hostname, type: String
  field :last_error, type: String
  field :last_success_at, type: DateTime
  field :accept_new_tasks, type: Boolean, default: true
  field :runner_capacity_reached, type: Boolean, default: false
  field :slots_execution_types, type: Hash, default: {}
  field :runner_config, type: Hash, default: {}

  enumerable :status, %w[available unstable unavailable], default: "unavailable"
  enumerable :runner_provider, %w[docker kubernetes], default: :docker

  has_many :slots

  # Nodes flagged to accept tasks whose runner capacity has not been reached
  # (a missing +runner_capacity_reached+ value counts as "not reached").
  scope :accepting_new_tasks, -> { where(accept_new_tasks: true, :runner_capacity_reached.in => [nil, false]) }

  validates :hostname, presence: true
  validates :slots_execution_types, presence: true
  validate :execution_types_format

  # @return [Object] result of NodeUsagePercentagePerExecutionType for this node
  def usage_per_execution_type
    NodeUsagePercentagePerExecutionType.new(self).perform
  end

  # Looks up one available slot of the given execution type on this node.
  #
  # @param execution_type [String]
  def available_slot_with_execution_type(execution_type)
    available_slots.find_by(execution_type: execution_type)
  end

  # @return [Mongoid::Criteria] this node's slots narrowed by the +available+ scope
  def available_slots
    slots.available
  end

  # Destroys every slot belonging to this node.
  def destroy_slots
    slots.destroy_all
  end

  # Resolves a runner service for this node's runner provider.
  #
  # @param service [Symbol] service identifier passed to the factory
  def runner_service(service)
    Runners::ServicesFactory.fabricate(runner: runner_provider, service: service)
  end

  # Records an error and moves the status forward: an available node becomes
  # unstable; an unstable node whose unstable period has expired becomes
  # unavailable and a job is enqueued to migrate its tasks.
  #
  # @param error [#to_s] the error to store in +last_error+
  def register_error(error)
    Rails.logger.info("Registering error in #{self}: #{error}")

    update!(last_error: "#{error} at #{Time.zone.now}")

    if available?
      unstable!
      Rails.logger.debug("#{self} marked as unstable")
    elsif unstable?
      if unstable_period_expired?
        unavailable!
        Rails.logger.debug("#{self} marked as unavailable because the unstable period has expired (last success was at #{last_success_at}). Migrating all tasks.")
        MigrateTasksFromDeadNodeJob.perform_later(node: self)
      else
        Rails.logger.debug("#{self} still unstable until the limit period be expired (last success was at #{last_success_at})")
      end
    end
  end

  # Truthy when the last success is older than
  # +Settings.node_unavailable_after_seconds+; falsy when no success has been
  # recorded yet.
  def unstable_period_expired?
    last_success_at && last_success_at < Settings.node_unavailable_after_seconds.seconds.ago
  end

  # Stores the current time as the moment of the last successful contact.
  def register_success
    Rails.logger.debug("Registering success in #{self}")
    update!(last_success_at: Time.zone.now)
  end

  # Human readable description; the last success time is appended only when
  # the node is not available.
  def to_s
    last_success = ", last success at #{last_success_at}" unless available?

    "Node #{name} #{uuid} #{runner_provider} (#{status}#{last_success})"
  end

  # Runs the given block under a LockManager lock keyed by this node, created
  # with +wait: false+ and a five minute expiration.
  def run_with_lock_no_wait
    LockManager.new(type: self.class.to_s, id: id, wait: false, expire: 5.minutes).lock do
      yield
    end
  end

  private

  # Validation: every key of +slots_execution_types+ must match
  # Constants::ExecutionType::FORMAT.
  def execution_types_format
    # A nil or empty hash has no keys to check; the presence validation
    # reports that case, so bail out instead of raising on +nil.keys+.
    return if slots_execution_types.blank?

    # +to_s+ keeps keys that are not strings (e.g. integers) from raising.
    valid = slots_execution_types.keys.all? do |execution_type|
      execution_type.to_s.match?(Constants::ExecutionType::FORMAT)
    end

    errors.add(:slots_execution_types, Constants::ExecutionType::INVALID_FORMAT_MESSAGE) unless valid
  end
end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
# Mongoid document for a slot of a node. A slot has an execution type, a
# status (available / attaching / running / releasing) and, optionally, the
# task and runner currently associated with it.
class Slot
  include Mongoid::Document
  include Mongoid::Uuid
  include GlobalID::Identification
  include MongoidEnumerable

  enumerable :status, %w[available attaching running releasing]

  field :name, type: String
  field :execution_type, type: String
  field :runner_id, type: String

  belongs_to :current_task, class_name: "Task", optional: true
  belongs_to :node, optional: true

  index({ runner_id: 1 })
  index({ node_id: 1 })
  index({ execution_type: 1, status: 1 })

  validates :execution_type, presence: true
  validates :execution_type, format: {
    with: Constants::ExecutionType::FORMAT,
    message: Constants::ExecutionType::INVALID_FORMAT_MESSAGE
  }

  # Slots in any status other than +available+.
  scope :working, -> { where(:status.in => %w[attaching running releasing]) }

  # Marks the slot as running the given task on the given runner.
  #
  # @param current_task [Task]
  # @param runner_id [String]
  def mark_as_running(current_task:, runner_id:)
    running_attributes = { status: :running, current_task: current_task, runner_id: runner_id }
    update!(running_attributes)
  end

  # Makes the slot available again, clearing its runner and task, then
  # enqueues RunTasksJob for this slot's execution type.
  def release
    released_attributes = { status: :available, runner_id: nil, current_task: nil }
    update!(released_attributes)
    RunTasksJob.perform_later(execution_type: execution_type)
  end

  # Human readable description of the slot.
  def to_s
    format("Slot %s %s (%s runner_id: %s)", name, uuid, status, runner_id)
  end
end
@@ -0,0 +1,148 @@
1
+ # frozen_string_literal: true
2
+
3
# Mongoid document for a unit of work: an image and command to run under a
# given execution type, with status tracking, retry accounting, timing data,
# logs and free-form tags.
class Task
  include GlobalID::Identification
  include Mongoid::Document
  include Mongoid::Uuid
  include MongoidEnumerable
  extend Observable

  field :name, type: String
  field :runner_id, type: String
  field :image, type: String
  field :execution_type, type: String
  field :cmd, type: String
  field :storage_mounts, type: Hash, default: {}
  field :exit_code, type: Integer
  field :error, type: String
  field :logs, type: BSON::Binary
  field :created_at, type: DateTime
  field :started_at, type: DateTime
  field :finished_at, type: DateTime
  field :progress, type: String
  field :try_count, type: Integer, default: 0
  field :persist_logs, type: Boolean, default: false
  field :tags, type: Hash, default: {}

  enumerable :status, %w[waiting starting started retry failed completed error], after_change: :status_changed

  belongs_to :slot, optional: true

  index({ created_at: 1 }, expire_after_seconds: 1.month)
  index(tags: 1)
  index(status: 1)
  # NOTE(review): no +request_id+ field is declared on this class; the
  # +request_id+ method below reads it from +tags+. Confirm this index targets
  # the intended key.
  index(request_id: 1)
  # One index per known tag name; TaskTag is queried when this class is loaded.
  TaskTag.distinct(:name).each { |key| index("tags.#{key}" => 1) }

  before_validation :normalize_tags
  before_create { |task| task.created_at = Time.zone.now }
  after_create do
    RunTasksJob.perform_later(execution_type: execution_type)
    AddTaskTagsJob.perform_later(task: self)
  end

  validates :name, :image, :cmd, :execution_type, presence: true
  validates :execution_type, format: {
    with: Constants::ExecutionType::FORMAT,
    message: Constants::ExecutionType::INVALID_FORMAT_MESSAGE
  }
  validate :storage_mount_identifiers_exist

  # Stores a copy of the given log text as generic BSON binary data.
  #
  # @param logs [String]
  def set_logs(logs)
    self.logs = BSON::Binary.new(logs.dup, :generic)
  end

  # Returns the logs: fetched through the node's runner service while the
  # task is started, otherwise read from the stored binary field.
  def get_logs
    if started?
      slot.node.runner_service(:fetch_logs).perform(task: self)
    else
      logs.try(:data)
    end
  end

  # Records the start time, runner and slot, then switches status to started.
  #
  # @param runner_id [String]
  # @param slot [Slot]
  def mark_as_started!(runner_id:, slot:)
    update!(started_at: Time.zone.now, runner_id: runner_id, slot: slot)

    started!
  end

  # Stores the error and either schedules another attempt (while +try_count+
  # is below +Settings.task_retry_count+) or marks the task as failed.
  #
  # @param error [String, nil]
  def mark_as_retry(error: nil)
    update!(error: error)

    if try_count < Settings.task_retry_count
      # Raising variant on purpose: if this write were silently skipped the
      # attempt would not be counted against the retry limit.
      update!(try_count: try_count + 1, slot: nil, runner_id: nil)
      retry!
      RunTasksJob.perform_later(execution_type: execution_type)
    else
      failed!
    end
  end

  # Milliseconds between creation and start for started, completed or failed
  # tasks; otherwise milliseconds between creation and now.
  #
  # @return [Integer, nil]
  def milliseconds_waiting
    if started? || completed? || failed?
      calculate_millisecond_span(created_at, started_at)
    else
      calculate_millisecond_span(created_at, Time.zone.now.to_datetime)
    end
  end

  # Milliseconds between start and finish for completed or failed tasks,
  # between start and now for started tasks, and nil for any other status.
  #
  # @return [Integer, nil]
  def milliseconds_running
    if completed? || failed?
      calculate_millisecond_span(started_at, finished_at)
    elsif started?
      calculate_millisecond_span(started_at, Time.zone.now.to_datetime)
    end
  end

  # @return [Integer, nil] +milliseconds_running+ in whole seconds
  def seconds_running
    milliseconds_running&.div(1000)
  end

  # Difference between two timestamps in milliseconds; nil when either one is
  # missing. The difference is scaled by the number of milliseconds in a day.
  #
  # @return [Integer, nil]
  def calculate_millisecond_span(start, finish)
    ((finish - start) * 1.day.in_milliseconds).to_i if finish.present? && start.present?
  end

  # Resets the attempt counter, switches status to retry and enqueues
  # RunTasksJob for this task's execution type.
  #
  # NOTE(review): the counter reset uses the non-raising +update+, so a
  # failed write is not reported here.
  def force_retry!
    update(try_count: 0)
    retry!
    RunTasksJob.perform_later(execution_type: execution_type)
  end

  # Converts every tag value to a string, in place. A nil +tags+ is left
  # alone, matching how +request_id+ treats it.
  def normalize_tags
    tags&.transform_values!(&:to_s)
  end

  # Human readable description of the task.
  def to_s
    "Task #{name} #{uuid} (#{status} runner_id: #{runner_id}) request_id=#{request_id}"
  end

  # Builds a runner identifier from the parameterized task name and an eight
  # character random suffix, truncating the name part so that the result does
  # not exceed Constants::Runner::MAX_NAME_SIZE.
  #
  # @return [String]
  def generate_runner_id
    prefix = name.tr("_", "-").parameterize
    random_suffix = SecureRandom.alphanumeric(8).downcase
    # Reserve room for the suffix plus the joining dash.
    max_prefix_size = Constants::Runner::MAX_NAME_SIZE - random_suffix.length - 1

    "#{prefix.truncate(max_prefix_size, omission: "")}-#{random_suffix}"
  end

  # @return [String, nil] the "request_id" tag, if any
  def request_id
    tags&.dig("request_id")
  end

  private

  # Enumerable +after_change+ hook: notifies a fresh instance of every
  # registered observer about the status transition.
  def status_changed(old_value, new_value)
    self.class.observer_instances_for(self).each do |observer|
      observer.status_change(old_value, new_value)
    end
  end

  # Validation: every requested storage mount identifier must be configured
  # in +Settings.storage_mounts+ for each runner provider in use by a node.
  def storage_mount_identifiers_exist
    # Nothing requested means nothing can be invalid; this also skips the
    # node query and tolerates a nil +storage_mounts+.
    return if storage_mounts.blank?

    requested = storage_mounts.keys.map(&:to_s)

    valid = Node.pluck(:runner_provider).uniq.all? do |runner|
      # A runner without configured mounts offers no valid identifiers.
      configured = (Settings.storage_mounts[runner]&.keys || []).map(&:to_s)
      (requested - configured).empty?
    end

    return if valid

    errors.add(:storage_mounts, "Storage mounts are invalid")
  end
end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
# Mongoid document holding a tag name and an array of values for it.
# Tag names are unique (enforced through a unique index).
class TaskTag
  include Mongoid::Document
  include Mongoid::Uuid

  field :name, type: String
  field :values, type: Array, default: []

  index({ name: 1 }, { unique: true })
end
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
# Observer registry meant to be mixed into a class with +extend+.
#
# Each extending class owns its own set of observer classes. The previous
# implementation kept a single module-wide set, so every extending class
# shared (and, on +extend+, reset) the same registry. A subclass that does not
# extend the module itself uses the set of its nearest ancestor that has one.
module Observable
  # Gives the extending class a fresh, empty registry.
  def self.extended(model)
    model.observers = Set.new
  end

  # Replaces the registry of the receiver.
  attr_writer :observers

  # @return [Set] the observer classes registered for the receiver
  def observers
    @observers ||= inherited_observers || Set.new
  end

  # Registers an observer class.
  #
  # @param observer [Class] must respond to +new(model)+
  def add_observer(observer)
    observers << observer
  end

  # Unregisters an observer class.
  def remove_observer(observer)
    observers.delete(observer)
  end

  # Builds one observer instance per registered class for the given model.
  #
  # @param model [Object] passed to each observer's constructor
  # @return [Array]
  def observer_instances_for(model)
    observers.map do |observer|
      observer.new(model)
    end
  end

  private

  # Registry of the superclass, when the receiver has one that is observable.
  def inherited_observers
    parent = superclass if respond_to?(:superclass)
    parent.observers if parent.respond_to?(:observers)
  end
end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
# Observer for a single task. +status_change+ is a no-op here.
class TaskObserver
  # @param task [Object] the observed task
  def initialize(task)
    @task = task
  end

  # @return [Object] the task handed to the constructor
  def task
    @task
  end

  # Hook invoked with the previous and the new status; does nothing and
  # returns nil.
  def status_change(_old_value, _new_value)
    nil
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
# Serializes a node's identity, status, last error and timestamps.
class NodeHealthcheckSerializer < ActiveModel::Serializer
  attributes :uuid,
             :name,
             :hostname,
             :status,
             :last_error,
             :created_at,
             :updated_at
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
# Serializes a node's identity, status, task acceptance flag, slot
# configuration and runner information.
class NodeSerializer < ActiveModel::Serializer
  attributes :uuid,
             :hostname,
             :status,
             :accept_new_tasks,
             :slots_execution_types,
             :runner_provider,
             :runner_capacity_reached
end
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
# Serializes a node together with its slots; each slot is rendered with
# StatusPanelSlotSerializer.
class StatusPanelNodeSerializer < ActiveModel::Serializer
  attributes :uuid,
             :name,
             :hostname,
             :status,
             :last_error,
             :last_success_at,
             :usage_per_execution_type,
             :slots_execution_types,
             :accept_new_tasks,
             :runner_provider,
             :runner_capacity_reached

  has_many :slots, serializer: StatusPanelSlotSerializer
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
# Serializes a slot's identity, runner, status and execution type.
class StatusPanelSlotSerializer < ActiveModel::Serializer
  attributes :uuid,
             :name,
             :runner_id,
             :status,
             :execution_type
end
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
# Serializes a task with its execution details; the slot is reduced to its
# uuid and name.
class StatusPanelTaskSerializer < ActiveModel::Serializer
  attributes :uuid,
             :name,
             :image,
             :cmd,
             :status,
             :exit_code,
             :error,
             :try_count,
             :created_at,
             :started_at,
             :finished_at,
             :progress,
             :seconds_running,
             :tags,
             :runner_id,
             :storage_mounts,
             :slot,
             :execution_type

  # @return [Hash, nil] uuid and name of the task's slot, or nil when the
  #   task has no slot
  def slot
    return unless object.slot

    { uuid: object.slot.uuid, name: object.slot.name }
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
# Serializes a task's identity, status, error, timestamps and execution type.
class TaskHealthcheckSerializer < ActiveModel::Serializer
  attributes :uuid,
             :status,
             :error,
             :created_at,
             :started_at,
             :finished_at,
             :execution_type
end
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
# Serializes a task's identity, status, outcome, retry count, timestamps,
# progress, running time and execution type.
class TaskSerializer < ActiveModel::Serializer
  attributes :uuid,
             :status,
             :exit_code,
             :error,
             :try_count,
             :created_at,
             :started_at,
             :finished_at,
             :progress,
             :seconds_running,
             :execution_type
end
@@ -0,0 +1,51 @@
1
+ # frozen_string_literal: true
2
+
3
# Brings the number of slots a node has for one execution type in line with
# the amount configured in the node's +slots_execution_types+.
class AdjustExecutionTypeSlots
  # @param node [Node] node whose slots are adjusted
  # @param execution_type [String] execution type to adjust
  def initialize(node:, execution_type:)
    @execution_type = execution_type
    @node = node
  end

  attr_reader :node, :execution_type

  # Creates missing slots, removes surplus ones, then runs FriendlyNameSlots
  # for the node.
  #
  # @return [Object] the result of FriendlyNameSlots#perform
  def perform
    increment_slots
    decrement_slots
    FriendlyNameSlots.new(node: node).perform
  end

  # @return [Boolean] true when fewer slots exist than configured
  def increment?
    (amount - count_by_execution_type).positive?
  end

  # @return [Boolean] true when more slots exist than configured
  def decrement?
    (amount - count_by_execution_type).negative?
  end

  private

  # Configured number of slots for the execution type (0 when not set).
  def amount
    node.slots_execution_types[execution_type].to_i
  end

  # Number of slots the node currently has for the execution type.
  def count_by_execution_type
    node.slots.where(execution_type: execution_type).count
  end

  # Creates slots one at a time until the configured amount is reached, then
  # enqueues RunTasksJob for the execution type.
  def increment_slots
    while increment?
      node.slots.create!(execution_type: execution_type)
    end

    RunTasksJob.perform_later(execution_type: execution_type)
  end

  # Destroys slots obtained through LockSlot while there is a surplus; stops
  # as soon as LockSlot yields no slot.
  def decrement_slots
    while decrement? && (locked_slot = LockSlot.new(execution_type: execution_type, node: node).perform)
      locked_slot.destroy!
    end
  end
end