container_broker 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (126) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +98 -0
  3. data/Rakefile +8 -0
  4. data/app/controllers/application_controller.rb +5 -0
  5. data/app/controllers/healthcheck_controller.rb +21 -0
  6. data/app/controllers/nodes_controller.rb +70 -0
  7. data/app/controllers/nodes_healthcheck_controller.rb +28 -0
  8. data/app/controllers/status_controller.rb +48 -0
  9. data/app/controllers/tasks_controller.rb +83 -0
  10. data/app/controllers/tasks_healthcheck_controller.rb +28 -0
  11. data/app/jobs/add_task_tags_job.rb +13 -0
  12. data/app/jobs/adjust_node_slots_job.rb +27 -0
  13. data/app/jobs/application_job.rb +9 -0
  14. data/app/jobs/collect_load_metrics_job.rb +9 -0
  15. data/app/jobs/container_broker_base_job.rb +32 -0
  16. data/app/jobs/migrate_tasks_from_dead_node_job.rb +32 -0
  17. data/app/jobs/monitor_unresponsive_node_job.rb +21 -0
  18. data/app/jobs/monitor_unresponsive_nodes_job.rb +9 -0
  19. data/app/jobs/release_slot_job.rb +47 -0
  20. data/app/jobs/remove_runner_job.rb +11 -0
  21. data/app/jobs/remove_unused_tags_job.rb +25 -0
  22. data/app/jobs/request_id_from_task.rb +7 -0
  23. data/app/jobs/run_task_job.rb +64 -0
  24. data/app/jobs/run_tasks_for_all_execution_types_job.rb +11 -0
  25. data/app/jobs/run_tasks_job.rb +42 -0
  26. data/app/jobs/timeout_failed_tasks_job.rb +31 -0
  27. data/app/jobs/update_all_nodes_status_job.rb +9 -0
  28. data/app/jobs/update_node_status_job.rb +24 -0
  29. data/app/jobs/update_task_status_job.rb +71 -0
  30. data/app/models/mongoid_serializable_model.rb +14 -0
  31. data/app/models/node.rb +101 -0
  32. data/app/models/slot.rb +42 -0
  33. data/app/models/task.rb +148 -0
  34. data/app/models/task_tag.rb +11 -0
  35. data/app/observers/observable.rb +23 -0
  36. data/app/observers/task_observer.rb +11 -0
  37. data/app/serializers/node_healthcheck_serializer.rb +5 -0
  38. data/app/serializers/node_serializer.rb +5 -0
  39. data/app/serializers/status_panel_node_serializer.rb +9 -0
  40. data/app/serializers/status_panel_slot_serializer.rb +5 -0
  41. data/app/serializers/status_panel_task_serializer.rb +16 -0
  42. data/app/serializers/task_healthcheck_serializer.rb +5 -0
  43. data/app/serializers/task_serializer.rb +7 -0
  44. data/app/services/adjust_execution_type_slots.rb +51 -0
  45. data/app/services/check_for_slot_removal.rb +28 -0
  46. data/app/services/collect_load_metrics.rb +40 -0
  47. data/app/services/delete_node.rb +25 -0
  48. data/app/services/friendly_name_nodes.rb +10 -0
  49. data/app/services/friendly_name_slots.rb +15 -0
  50. data/app/services/kill_node_runners.rb +17 -0
  51. data/app/services/kill_task_container.rb +29 -0
  52. data/app/services/kubernetes_client.rb +136 -0
  53. data/app/services/least_used_node.rb +44 -0
  54. data/app/services/lock_manager.rb +74 -0
  55. data/app/services/lock_slot.rb +37 -0
  56. data/app/services/lock_task.rb +45 -0
  57. data/app/services/metrics.rb +43 -0
  58. data/app/services/migrate_runner.rb +26 -0
  59. data/app/services/node_task_acceptance.rb +18 -0
  60. data/app/services/node_usage_percentage_per_execution_type.rb +22 -0
  61. data/app/services/reschedule_tasks_for_missing_runners.rb +70 -0
  62. data/app/services/runners.rb +4 -0
  63. data/app/services/runners/docker/create_connection.rb +18 -0
  64. data/app/services/runners/docker/create_execution_info.rb +87 -0
  65. data/app/services/runners/docker/fetch_execution_info.rb +17 -0
  66. data/app/services/runners/docker/fetch_logs.rb +18 -0
  67. data/app/services/runners/docker/fetch_task_container.rb +15 -0
  68. data/app/services/runners/docker/filer.rb +19 -0
  69. data/app/services/runners/docker/kill_slot_runner.rb +19 -0
  70. data/app/services/runners/docker/node_availability.rb +11 -0
  71. data/app/services/runners/docker/remove_runner.rb +18 -0
  72. data/app/services/runners/docker/run_task.rb +63 -0
  73. data/app/services/runners/docker/update_node_status.rb +62 -0
  74. data/app/services/runners/execution_info.rb +49 -0
  75. data/app/services/runners/invalid_config.rb +5 -0
  76. data/app/services/runners/invalid_runner.rb +5 -0
  77. data/app/services/runners/kubernetes/create_client.rb +29 -0
  78. data/app/services/runners/kubernetes/create_execution_info.rb +103 -0
  79. data/app/services/runners/kubernetes/fetch_execution_info.rb +15 -0
  80. data/app/services/runners/kubernetes/fetch_logs.rb +17 -0
  81. data/app/services/runners/kubernetes/filer.rb +41 -0
  82. data/app/services/runners/kubernetes/kill_slot_runner.rb +11 -0
  83. data/app/services/runners/kubernetes/node_availability.rb +11 -0
  84. data/app/services/runners/kubernetes/remove_runner.rb +19 -0
  85. data/app/services/runners/kubernetes/run_task.rb +54 -0
  86. data/app/services/runners/kubernetes/update_node_status.rb +64 -0
  87. data/app/services/runners/runner_id_not_found_error.rb +5 -0
  88. data/app/services/runners/services_factory.rb +38 -0
  89. data/app/services/runners/update_node_status_helper.rb +43 -0
  90. data/app/services/slots_usage_percentage.rb +18 -0
  91. data/config/application.rb +34 -0
  92. data/config/boot.rb +5 -0
  93. data/config/environment.rb +7 -0
  94. data/config/environments/test.rb +44 -0
  95. data/config/initializers/application_controller_renderer.rb +10 -0
  96. data/config/initializers/backtrace_silencers.rb +9 -0
  97. data/config/initializers/config.rb +51 -0
  98. data/config/initializers/cookies_serializer.rb +7 -0
  99. data/config/initializers/docker_config.rb +3 -0
  100. data/config/initializers/filter_parameter_logging.rb +6 -0
  101. data/config/initializers/idempotent_request.rb +12 -0
  102. data/config/initializers/inflections.rb +18 -0
  103. data/config/initializers/mime_types.rb +6 -0
  104. data/config/initializers/mongoid.rb +3 -0
  105. data/config/initializers/new_framework_defaults_6_0.rb +47 -0
  106. data/config/initializers/raven.rb +10 -0
  107. data/config/initializers/sidekiq.rb +24 -0
  108. data/config/initializers/wrap_parameters.rb +16 -0
  109. data/config/locales/en.yml +33 -0
  110. data/config/mongoid.yml +10 -0
  111. data/config/routes.rb +43 -0
  112. data/config/secrets.yml +35 -0
  113. data/config/settings.yml +34 -0
  114. data/config/settings/test.yml +27 -0
  115. data/config/sidekiq_scheduler.yml +18 -0
  116. data/config/spring.rb +8 -0
  117. data/lib/constants.rb +12 -0
  118. data/lib/container_broker.rb +30 -0
  119. data/lib/container_broker/engine.rb +6 -0
  120. data/lib/container_broker/version.rb +5 -0
  121. data/lib/current_thread_request_id.rb +19 -0
  122. data/lib/idempotent_request/callback.rb +25 -0
  123. data/lib/idempotent_request/policy.rb +15 -0
  124. data/lib/redis_url_parser.rb +25 -0
  125. data/lib/tasks/task.rake +34 -0
  126. metadata +590 -0
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Kubernetes
    # Availability probe for a node that runs tasks on Kubernetes.
    class NodeAvailability
      # Builds a client for +node+ through CreateClient and asks it for its API info.
      #
      # @param node [Object] node passed straight to CreateClient#perform
      # @return [Object] whatever the client's +api_info+ call returns
      #
      # NOTE(review): presumably an unreachable cluster surfaces as an exception raised
      # by the client; no error is handled here - confirm against KubernetesClient.
      def perform(node:)
        client = CreateClient.new.perform(node: node)
        client.api_info
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Kubernetes
    # Deletes the pod that backs a runner on a Kubernetes node.
    class RemoveRunner
      # Force-deletes the pod named +runner_id+ using a client built for +node+.
      #
      # A pod that is already gone is only logged. A network error is not raised to
      # the caller; it is recorded on the node through +register_error+.
      #
      # @param node [Object] node used to build the client and to record network errors
      # @param runner_id [String] name of the pod to delete
      def perform(node:, runner_id:)
        Rails.logger.debug("Deleting pod")

        client = CreateClient.new.perform(node: node)
        client.force_delete_pod(pod_name: runner_id)

        Rails.logger.debug("Pod #{runner_id} removed")
      rescue KubernetesClient::PodNotFoundError
        Rails.logger.debug("Pod #{runner_id} already removed")
      rescue KubernetesClient::NetworkError => e
        node.register_error(e.message)
      end
    end
  end
end
@@ -0,0 +1,54 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Kubernetes
    # Starts a task as a pod on the Kubernetes node that owns a slot.
    class RunTask
      NFS_NAME = "nfs"

      # Creates the pod that runs +task+ on the node of +slot+.
      #
      # @param task [Object] task to run; its image, cmd and storage_mounts are read
      # @param slot [Object] slot whose +node+ receives the pod
      # @param runner_id [String] used as the pod name
      # @return [Object] whatever the client's +create_pod+ returns
      # @raise [Node::NodeConnectionError] when the client reports a
      #   KubernetesClient::NetworkError; the message carries the original class and text
      def perform(task:, slot:, runner_id:)
        create_pod(task: task, node: slot.node, runner_id: runner_id)
      rescue KubernetesClient::NetworkError => e
        raise Node::NodeConnectionError, "#{e.class}: #{e.message}"
      end

      private

      # Builds the client for +node+ and submits the pod specification.
      def create_pod(task:, node:, runner_id:)
        client = CreateClient.new.perform(node: node)

        # Both mount lists come from the same Filer result, so resolve it only once.
        mounts = filer(task: task)

        client.create_pod(
          pod_name: runner_id,
          image: task.image,
          cmd: task.cmd,
          internal_mounts: mounts[:internal],
          external_mounts: mounts[:external],
          node_selector: node.runner_config["node_selector"]
        )
      end

      # Resolves the task's storage mounts; the result is read with the
      # :internal and :external keys.
      def filer(task:)
        Filer.new.perform(task_storage_mounts: task.storage_mounts)
      end

      # Sends a "tasks" metric describing +task+.
      # NOTE(review): nothing inside this class calls this method.
      def add_metric(task)
        Metrics.new("tasks").count(
          task_id: task.id,
          name: task&.name,
          type: task&.execution_type,
          slot: task&.slot&.name,
          node: task&.slot&.node&.name,
          started_at: task.started_at,
          duration: task.milliseconds_waiting,
          error: task.error,
          status: task.status
        )
      end
    end
  end
end
@@ -0,0 +1,64 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Kubernetes
    # Reconciles a Kubernetes node's slots and tasks with the pods fetched for it.
    class UpdateNodeStatus
      include UpdateNodeStatusHelper

      attr_reader :node

      # Refreshes the status of +node+ from its pods.
      #
      # In order: snapshots the started tasks of the node's slots, stores whether any
      # pod is still pending scheduling, handles every pod (see #reconcile_runner),
      # hands the pod names and the task snapshot to RescheduleTasksForMissingRunners,
      # registers success on the node and sends runner metrics.
      #
      # A KubernetesClient::NetworkError is logged and recorded on the node with
      # +register_error+ instead of being raised.
      #
      # @param node [Object] node to refresh
      def perform(node:)
        @node = node

        # New tasks may start while this runs, so the task snapshot must be taken
        # before the pods are listed.
        started_tasks = Task.started.where(:slot.in => node.slots.pluck(:id)).to_a

        node.update!(runner_capacity_reached: pending_schedule_pods?)

        execution_infos.each { |execution_info| reconcile_runner(execution_info) }

        rescheduler = RescheduleTasksForMissingRunners.new(runner_ids: pods.keys, started_tasks: started_tasks)
        rescheduler.perform

        node.register_success

        send_metrics(node: node, execution_infos: execution_infos)
      rescue KubernetesClient::NetworkError => e
        Rails.logger.debug("Error #{e.class}: #{e}")
        node.register_error(e.message)
      end

      private

      # Handles one pod: a pod without a slot goes to remove_unknown_runner, a
      # terminated pod triggers the slot release check, and any other pod only has
      # its error (when present) copied to the slot's current task.
      def reconcile_runner(execution_info)
        runner_id = execution_info.id
        slot = node.slots.find_by(runner_id: runner_id)

        return remove_unknown_runner(node: node, runner_id: runner_id) unless slot

        if execution_info.terminated?
          Rails.logger.debug("Pod #{runner_id} Complete")
          return check_slot_release(slot: slot, runner_id: runner_id)
        end

        slot.current_task&.update!(error: execution_info.error) if execution_info.error
        Rails.logger.debug("Pod is not terminated (it is #{execution_info.status}). Ignoring.")
      end

      # Pods fetched through the client, memoized for this instance.
      # Read with +keys+ (runner ids) and +values+ (pod objects).
      def pods
        @pods ||= begin
          fetched = CreateClient.new.perform(node: node).fetch_pods
          Rails.logger.debug("Fetched #{fetched.count} pods")
          fetched
        end
      end

      # One execution info per fetched pod, memoized for this instance.
      def execution_infos
        @execution_infos ||= pods.values.map do |pod|
          CreateExecutionInfo.new.perform(pod: pod)
        end
      end

      # True when at least one pod reports +schedule_pending?+.
      def pending_schedule_pods?
        execution_infos.any? { |execution_info| execution_info.schedule_pending? }
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  # StandardError subclass named for a runner id that could not be found.
  # NOTE(review): the raise sites are not part of this file - confirm the exact
  # condition with the callers.
  class RunnerIdNotFoundError < StandardError
  end
end
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  # Looks up and instantiates the service class registered for a runner provider.
  class ServicesFactory
    # Raised by .fabricate when no class is registered for the runner/service pair.
    class ServiceNotFoundForRunner < StandardError; end

    # Runner provider => service name => implementing class.
    SERVICES = {
      kubernetes: {
        update_node_status: Runners::Kubernetes::UpdateNodeStatus,
        node_availability: Runners::Kubernetes::NodeAvailability,
        run_task: Runners::Kubernetes::RunTask,
        kill_slot_runner: Runners::Kubernetes::KillSlotRunner,
        remove_runner: Runners::Kubernetes::RemoveRunner,
        fetch_logs: Runners::Kubernetes::FetchLogs,
        fetch_execution_info: Runners::Kubernetes::FetchExecutionInfo,
        filer: Runners::Kubernetes::Filer
      },
      docker: {
        update_node_status: Runners::Docker::UpdateNodeStatus,
        node_availability: Runners::Docker::NodeAvailability,
        run_task: Runners::Docker::RunTask,
        kill_slot_runner: Runners::Docker::KillSlotRunner,
        remove_runner: Runners::Docker::RemoveRunner,
        fetch_logs: Runners::Docker::FetchLogs,
        fetch_execution_info: Runners::Docker::FetchExecutionInfo,
        filer: Runners::Docker::Filer
      }
    }.freeze

    # Returns a new instance of the class registered for +service+ under +runner+.
    #
    # @param runner [#to_sym] runner provider; converted with +to_sym+ before the lookup
    # @param service [Symbol] service name; used as given, so it must match a table key
    # @return [Object] a new instance of the registered class
    # @raise [ServiceNotFoundForRunner] when the runner or the service is not registered
    def self.fabricate(runner:, service:)
      implementation = SERVICES.dig(runner.to_sym, service)
      return implementation.new if implementation

      raise ServiceNotFoundForRunner, "No service #{service} found for #{runner}"
    end
  end
end
@@ -0,0 +1,43 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  # Helper methods mixed into node status updaters
  # (for example Runners::Kubernetes::UpdateNodeStatus).
  module UpdateNodeStatusHelper
    # Starts the release of +slot+ when it is running: marks it as releasing and
    # enqueues ReleaseSlotJob. A slot in any other status is only logged.
    #
    # @param slot [Object] slot whose runner finished
    # @param runner_id [String] id of the finished runner, forwarded to the job
    def check_slot_release(slot:, runner_id:)
      return Rails.logger.debug("Slot was not running (it was #{slot.status}). Ignoring.") unless slot.running?

      slot.releasing!
      Rails.logger.debug("Slot was running. Marked as releasing. Slot: #{slot}. Current task: #{slot.current_task}")
      ReleaseSlotJob.perform_later(slot: MongoidSerializableModel.new(slot), runner_id: runner_id)
    end

    # Enqueues the removal of a runner that has no slot, unless its id starts with
    # one of the prefixes listed in Settings.ignore_containers.
    #
    # @param node [Object] node the runner was found on
    # @param runner_id [String] id of the runner without a slot
    def remove_unknown_runner(node:, runner_id:)
      Rails.logger.debug("Slot not found for container #{runner_id}")

      ignored = Settings.ignore_containers.any? { |prefix| runner_id.start_with?(prefix) }
      return Rails.logger.debug("Container #{runner_id} is ignored for removal") if ignored

      # Only the runner id is handed to the job.
      # NOTE(review): the original note says the container has to be selected using
      # just one of its names - confirm how RemoveRunnerJob resolves it.
      RemoveRunnerJob.perform_later(node: node, runner_id: runner_id)
    end

    # Sends one "runners" metric with node data, the total number of runners, the
    # number pending scheduling, and one "<status>_runners" counter per status.
    #
    # @param node [Object] node the runners belong to
    # @param execution_infos [Array] execution infos of the node's runners
    def send_metrics(node:, execution_infos:)
      per_status = execution_infos
                   .group_by(&:status)
                   .map { |status, infos| [:"#{status}_runners", infos.count] }
                   .to_h

      totals = {
        hostname: node.hostname,
        runner_type: node.runner_provider,
        capacity_reached: node.runner_capacity_reached,
        schedule_pending: execution_infos.count(&:schedule_pending?),
        total_runners: execution_infos.count
      }

      Metrics.new("runners").count(totals.merge(per_status))
    end
  end
end
@@ -0,0 +1,18 @@
1
+ # frozen_string_literal: true
2
+
3
# Computes how much of a collection of slots is in use, as a percentage.
class SlotsUsagePercentage
  # @param slots [#size, #select] slots to measure; each must respond to +available?+
  def initialize(slots)
    @slots = slots
  end

  # Percentage of slots that are not available, rounded to two decimals.
  #
  # An empty collection yields 0.0. Without this guard the division below is
  # 0.0 / 0, which produces NaN instead of a usable percentage.
  #
  # @return [Float] value between 0.0 and 100.0
  def perform
    total = @slots.size
    return 0.0 if total.zero?

    used = total - available_slots.size
    ((used.to_f / total) * 100).round(2)
  end

  private

  # Slots that report themselves as available.
  def available_slots
    @slots.select(&:available?)
  end
end
@@ -0,0 +1,34 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "boot"
4
+
5
+ require "action_controller/railtie"
6
+ # require "action_view/railtie"
7
+ # require "action_mailer/railtie"
8
+ require "active_job/railtie"
9
+ # require "action_cable/engine"
10
+ # require "rails/test_unit/railtie"
11
+ # require "sprockets/railtie"
12
+
13
+ # Require the gems listed in Gemfile, including any gems
14
+ # you've limited to :test, :development, or :production.
15
+ Bundler.require(*Rails.groups)
16
+
17
module ContainerBroker
  # Rails application class for the container broker, configured as API-only.
  class Application < Rails::Application
    # Start from the configuration defaults of the Rails version this
    # application was originally generated with.
    config.load_defaults(5.1)

    config.api_only = true

    # Anything set in config/environments/* overrides the values in this class.
    # Further application configuration belongs in config/initializers, where
    # every .rb file is loaded automatically.

    config.eager_load_paths.push(Rails.root.join("lib"))

    # Prefix log lines with the id of the request being served.
    config.log_tags = [
      lambda { |request| " request_id=#{request.request_id} " }
    ]
  end
end
data/config/boot.rb ADDED
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ ENV["BUNDLE_GEMFILE"] ||= File.expand_path("../Gemfile", __dir__)
4
+
5
+ require "bundler/setup" # Set up gems listed in the Gemfile.
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Load the Rails application.
4
+ require_relative "application"
5
+
6
+ # Initialize the Rails application.
7
+ Rails.application.initialize!
@@ -0,0 +1,44 @@
1
+ # frozen_string_literal: true
2
+
3
Rails.application.configure do
  # Values set in this block override those in config/application.rb.

  # This environment exists only to run the test suite. The test database is
  # scratch space: it is wiped and recreated between runs, so never rely on
  # the data stored in it.
  config.cache_classes = true

  # Skip eager loading on boot so that running a single test does not load the
  # whole application. Set this to true when a tool preloads Rails for tests.
  config.eager_load = false

  # Serve public files during tests, with a Cache-Control header for performance.
  config.public_file_server.enabled = true
  config.public_file_server.headers = { "Cache-Control" => "public, max-age=#{1.hour.seconds.to_i}" }

  # Show full error reports and turn caching off.
  config.consider_all_requests_local = true
  config.action_controller.perform_caching = false

  # Raise exceptions instead of rendering exception templates.
  config.action_dispatch.show_exceptions = false

  # Request forgery protection is disabled in the test environment.
  config.action_controller.allow_forgery_protection = false
  # config.action_mailer.perform_caching = false

  # Tell Action Mailer not to deliver emails to the real world.
  # The :test delivery method accumulates sent emails in the
  # ActionMailer::Base.deliveries array.
  # config.action_mailer.delivery_method = :test

  # Print deprecation notices to stderr.
  config.active_support.deprecation = :stderr

  # Raises error for missing translations
  # config.action_view.raise_on_missing_translations = true
end
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Be sure to restart your server when you modify this file.
4
+
5
+ # ActiveSupport::Reloader.to_prepare do
6
+ # ApplicationController.renderer.defaults.merge!(
7
+ # http_host: 'example.org',
8
+ # https: false
9
+ # )
10
+ # end
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Be sure to restart your server when you modify this file.
4
+
5
+ # You can add backtrace silencers for libraries that you're using but don't wish to see in your backtraces.
6
+ # Rails.backtrace_cleaner.add_silencer { |line| line =~ /my_noisy_library/ }
7
+
8
+ # You can also remove all the silencers if you're trying to debug a problem that might stem from framework code.
9
+ # Rails.backtrace_cleaner.remove_silencers!
@@ -0,0 +1,51 @@
1
+ # frozen_string_literal: true
2
+
3
+ Config.setup do |config|
4
+ # Name of the constant exposing loaded settings
5
+ config.const_name = "Settings"
6
+
7
+ # Ability to remove elements of an array set in an earlier loaded settings file. Example value: '--'.
8
+ #
9
+ # config.knockout_prefix = nil
10
+
11
+ # Overwrite an existing value when merging a `nil` value.
12
+ # When set to `false`, the existing value is retained after merge.
13
+ #
14
+ # config.merge_nil_values = true
15
+
16
+ # Overwrite arrays found in previously loaded settings file. When set to `false`, arrays will be merged.
17
+ #
18
+ # config.overwrite_arrays = true
19
+
20
+ # Load environment variables from the `ENV` object and override any settings defined in files.
21
+ #
22
+ # config.use_env = false
23
+
24
+ # Define ENV variable prefix deciding which variables to load into config.
25
+ #
26
+ # config.env_prefix = 'Settings'
27
+
28
+ # What string to use as level separator for settings loaded from ENV variables. Default value of '.' works well
29
+ # with Heroku, but you might want to change it, for example to '__', to easily override settings from the command line, where
30
+ # using dots in variable names might not be allowed (eg. Bash).
31
+ #
32
+ # config.env_separator = '.'
33
+
34
+ # Ability to process variable names:
35
+ # * nil - no change
36
+ # * :downcase - convert to lower case
37
+ #
38
+ # config.env_converter = :downcase
39
+
40
+ # Parse numeric values as integers instead of strings.
41
+ #
42
+ # config.env_parse_values = true
43
+
44
+ # Validate presence and type of specific config values. Check https://github.com/dry-rb/dry-validation for details.
45
+ #
46
+ # config.schema do
47
+ # required(:name).filled
48
+ # required(:age).maybe(:int?)
49
+ # required(:email).filled(format?: EMAIL_REGEX)
50
+ # end
51
+ end
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Be sure to restart your server when you modify this file.
4
+
5
+ # Specify a serializer for the signed and encrypted cookie jars.
6
+ # Valid options are :json, :marshal, and :hybrid.
7
+ Rails.application.config.action_dispatch.cookies_serializer = :json