container_broker 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (126) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +98 -0
  3. data/Rakefile +8 -0
  4. data/app/controllers/application_controller.rb +5 -0
  5. data/app/controllers/healthcheck_controller.rb +21 -0
  6. data/app/controllers/nodes_controller.rb +70 -0
  7. data/app/controllers/nodes_healthcheck_controller.rb +28 -0
  8. data/app/controllers/status_controller.rb +48 -0
  9. data/app/controllers/tasks_controller.rb +83 -0
  10. data/app/controllers/tasks_healthcheck_controller.rb +28 -0
  11. data/app/jobs/add_task_tags_job.rb +13 -0
  12. data/app/jobs/adjust_node_slots_job.rb +27 -0
  13. data/app/jobs/application_job.rb +9 -0
  14. data/app/jobs/collect_load_metrics_job.rb +9 -0
  15. data/app/jobs/container_broker_base_job.rb +32 -0
  16. data/app/jobs/migrate_tasks_from_dead_node_job.rb +32 -0
  17. data/app/jobs/monitor_unresponsive_node_job.rb +21 -0
  18. data/app/jobs/monitor_unresponsive_nodes_job.rb +9 -0
  19. data/app/jobs/release_slot_job.rb +47 -0
  20. data/app/jobs/remove_runner_job.rb +11 -0
  21. data/app/jobs/remove_unused_tags_job.rb +25 -0
  22. data/app/jobs/request_id_from_task.rb +7 -0
  23. data/app/jobs/run_task_job.rb +64 -0
  24. data/app/jobs/run_tasks_for_all_execution_types_job.rb +11 -0
  25. data/app/jobs/run_tasks_job.rb +42 -0
  26. data/app/jobs/timeout_failed_tasks_job.rb +31 -0
  27. data/app/jobs/update_all_nodes_status_job.rb +9 -0
  28. data/app/jobs/update_node_status_job.rb +24 -0
  29. data/app/jobs/update_task_status_job.rb +71 -0
  30. data/app/models/mongoid_serializable_model.rb +14 -0
  31. data/app/models/node.rb +101 -0
  32. data/app/models/slot.rb +42 -0
  33. data/app/models/task.rb +148 -0
  34. data/app/models/task_tag.rb +11 -0
  35. data/app/observers/observable.rb +23 -0
  36. data/app/observers/task_observer.rb +11 -0
  37. data/app/serializers/node_healthcheck_serializer.rb +5 -0
  38. data/app/serializers/node_serializer.rb +5 -0
  39. data/app/serializers/status_panel_node_serializer.rb +9 -0
  40. data/app/serializers/status_panel_slot_serializer.rb +5 -0
  41. data/app/serializers/status_panel_task_serializer.rb +16 -0
  42. data/app/serializers/task_healthcheck_serializer.rb +5 -0
  43. data/app/serializers/task_serializer.rb +7 -0
  44. data/app/services/adjust_execution_type_slots.rb +51 -0
  45. data/app/services/check_for_slot_removal.rb +28 -0
  46. data/app/services/collect_load_metrics.rb +40 -0
  47. data/app/services/delete_node.rb +25 -0
  48. data/app/services/friendly_name_nodes.rb +10 -0
  49. data/app/services/friendly_name_slots.rb +15 -0
  50. data/app/services/kill_node_runners.rb +17 -0
  51. data/app/services/kill_task_container.rb +29 -0
  52. data/app/services/kubernetes_client.rb +136 -0
  53. data/app/services/least_used_node.rb +44 -0
  54. data/app/services/lock_manager.rb +74 -0
  55. data/app/services/lock_slot.rb +37 -0
  56. data/app/services/lock_task.rb +45 -0
  57. data/app/services/metrics.rb +43 -0
  58. data/app/services/migrate_runner.rb +26 -0
  59. data/app/services/node_task_acceptance.rb +18 -0
  60. data/app/services/node_usage_percentage_per_execution_type.rb +22 -0
  61. data/app/services/reschedule_tasks_for_missing_runners.rb +70 -0
  62. data/app/services/runners.rb +4 -0
  63. data/app/services/runners/docker/create_connection.rb +18 -0
  64. data/app/services/runners/docker/create_execution_info.rb +87 -0
  65. data/app/services/runners/docker/fetch_execution_info.rb +17 -0
  66. data/app/services/runners/docker/fetch_logs.rb +18 -0
  67. data/app/services/runners/docker/fetch_task_container.rb +15 -0
  68. data/app/services/runners/docker/filer.rb +19 -0
  69. data/app/services/runners/docker/kill_slot_runner.rb +19 -0
  70. data/app/services/runners/docker/node_availability.rb +11 -0
  71. data/app/services/runners/docker/remove_runner.rb +18 -0
  72. data/app/services/runners/docker/run_task.rb +63 -0
  73. data/app/services/runners/docker/update_node_status.rb +62 -0
  74. data/app/services/runners/execution_info.rb +49 -0
  75. data/app/services/runners/invalid_config.rb +5 -0
  76. data/app/services/runners/invalid_runner.rb +5 -0
  77. data/app/services/runners/kubernetes/create_client.rb +29 -0
  78. data/app/services/runners/kubernetes/create_execution_info.rb +103 -0
  79. data/app/services/runners/kubernetes/fetch_execution_info.rb +15 -0
  80. data/app/services/runners/kubernetes/fetch_logs.rb +17 -0
  81. data/app/services/runners/kubernetes/filer.rb +41 -0
  82. data/app/services/runners/kubernetes/kill_slot_runner.rb +11 -0
  83. data/app/services/runners/kubernetes/node_availability.rb +11 -0
  84. data/app/services/runners/kubernetes/remove_runner.rb +19 -0
  85. data/app/services/runners/kubernetes/run_task.rb +54 -0
  86. data/app/services/runners/kubernetes/update_node_status.rb +64 -0
  87. data/app/services/runners/runner_id_not_found_error.rb +5 -0
  88. data/app/services/runners/services_factory.rb +38 -0
  89. data/app/services/runners/update_node_status_helper.rb +43 -0
  90. data/app/services/slots_usage_percentage.rb +18 -0
  91. data/config/application.rb +34 -0
  92. data/config/boot.rb +5 -0
  93. data/config/environment.rb +7 -0
  94. data/config/environments/test.rb +44 -0
  95. data/config/initializers/application_controller_renderer.rb +10 -0
  96. data/config/initializers/backtrace_silencers.rb +9 -0
  97. data/config/initializers/config.rb +51 -0
  98. data/config/initializers/cookies_serializer.rb +7 -0
  99. data/config/initializers/docker_config.rb +3 -0
  100. data/config/initializers/filter_parameter_logging.rb +6 -0
  101. data/config/initializers/idempotent_request.rb +12 -0
  102. data/config/initializers/inflections.rb +18 -0
  103. data/config/initializers/mime_types.rb +6 -0
  104. data/config/initializers/mongoid.rb +3 -0
  105. data/config/initializers/new_framework_defaults_6_0.rb +47 -0
  106. data/config/initializers/raven.rb +10 -0
  107. data/config/initializers/sidekiq.rb +24 -0
  108. data/config/initializers/wrap_parameters.rb +16 -0
  109. data/config/locales/en.yml +33 -0
  110. data/config/mongoid.yml +10 -0
  111. data/config/routes.rb +43 -0
  112. data/config/secrets.yml +35 -0
  113. data/config/settings.yml +34 -0
  114. data/config/settings/test.yml +27 -0
  115. data/config/sidekiq_scheduler.yml +18 -0
  116. data/config/spring.rb +8 -0
  117. data/lib/constants.rb +12 -0
  118. data/lib/container_broker.rb +30 -0
  119. data/lib/container_broker/engine.rb +6 -0
  120. data/lib/container_broker/version.rb +5 -0
  121. data/lib/current_thread_request_id.rb +19 -0
  122. data/lib/idempotent_request/callback.rb +25 -0
  123. data/lib/idempotent_request/policy.rb +15 -0
  124. data/lib/redis_url_parser.rb +25 -0
  125. data/lib/tasks/task.rake +34 -0
  126. metadata +590 -0
@@ -0,0 +1,45 @@
1
+ # frozen_string_literal: true
2
+
3
# Claims the next pending task of a given execution type by flipping its
# status to "starting".
class LockTask
  attr_reader :execution_type

  # @param execution_type the execution type whose pending tasks are considered
  def initialize(execution_type:)
    @execution_type = execution_type
  end

  # Picks one task in "waiting" or "retry" status, sets it to "starting" with a
  # single find_one_and_update call, and records a "tasks" metric for it.
  #
  # @return the claimed task, or nil when no pending task exists
  def perform
    status_change = { "$set" => { status: "starting" } }
    claimed = pending_tasks.find_one_and_update(status_change, return_document: :after)
    return unless claimed

    claimed.reload
    persist_metrics(claimed)
    claimed
  end

  # @return [Boolean] whether at least one task is still waiting or up for retry
  def any_pending?
    pending_tasks.any?
  end

  private

  # Criteria matching the tasks of this execution type that can still be claimed.
  def pending_tasks
    Task.where(execution_type: execution_type).where(:status.in => %w[waiting retry])
  end

  # Counts the claimed task in the "tasks" metric, tagged with id, name and status.
  def persist_metrics(task)
    Metrics.new("tasks").count(task_id: task.id, name: task.name, status: task.status)
  end
end
@@ -0,0 +1,43 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "measures"
4
+
5
# Sends counters and timings for one named metric through the Measures client.
# Every operation is a no-op (apart from yielding) when Settings.measures.enabled is falsy.
class Metrics
  attr_reader :metric

  # @param metric name of the metric every call on this instance reports to
  def initialize(metric)
    @metric = metric
  end

  # Reports a count event, tagging the payload with origin "container-broker".
  # Any StandardError raised while reporting is logged as a warning instead of
  # being propagated.
  #
  # @param data [Hash] extra fields sent along with the event
  def count(data = {})
    return unless enabled?

    payload = data.merge(origin: "container-broker")
    client.count(metric, payload)
  rescue StandardError => e
    Rails.logger.warn("Error sending metrics to measures: #{e}")
  end

  # Runs the given block (if any), passing it +data+. When metrics are enabled
  # the block runs inside the client's `time` call; otherwise it is simply yielded to.
  #
  # @param data [Hash] fields handed to both the client and the block
  def duration(data = {})
    return client.time(metric, data) { yield data if block_given? } if enabled?

    yield data if block_given?
  end

  private

  def enabled?
    Settings.measures.enabled
  end

  # A fresh UDP transport pointing at the configured host and port.
  def transport
    config = Settings.measures
    Measures::Transports::UDP.new(config.host, config.port)
  end

  # A fresh Measures client; nothing is memoized between calls.
  def client
    udp = transport
    Measures::Client.new(udp, Settings.measures.index, Settings.measures.owner)
  end
end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
# Remembers, in Redis, which runner ids have been migrated.
# Persisting the migrated ids matters because tasks for the same runner id may
# still be sitting in the execution queue.
class MigrateRunner
  # Lifetime of a migration marker (longer in the development environment).
  TTL = Rails.env.development? ? 10.hours : 1.hour
  KEY_PREFIX = "migrated_ids"

  attr_reader :runner_id

  # Builds a new Redis client from Settings.redis_url on every call.
  def self.redis_client
    Redis.new(RedisUrlParser.call(Settings.redis_url))
  end

  # @param runner_id id of the runner being tracked
  def initialize(runner_id:)
    @runner_id = runner_id
  end

  # Logs the migration and stores a marker for the runner id that expires after TTL.
  def migrate
    Rails.logger.info("Migrate runner id #{runner_id}")
    redis.set(marker_key, 1, ex: TTL)
  end

  # @return whether a (non-expired) migration marker exists for the runner id
  def migrated?
    redis.exists?(marker_key)
  end

  private

  def redis
    self.class.redis_client
  end

  # Redis key under which this runner's migration marker is stored.
  def marker_key
    "#{KEY_PREFIX}_#{runner_id}"
  end
end
@@ -0,0 +1,18 @@
1
+ # frozen_string_literal: true
2
+
3
# Switches a node between accepting and rejecting new tasks.
class NodeTaskAcceptance
  attr_reader :node

  # @param node the node whose `accept_new_tasks` flag is toggled
  def initialize(node:)
    @node = node
  end

  # Marks the node as accepting tasks, then enqueues
  # RunTasksForAllExecutionTypesJob.
  def accept!
    change_acceptance(true)
    RunTasksForAllExecutionTypesJob.perform_later
  end

  # Marks the node as no longer accepting tasks.
  def reject!
    change_acceptance(false)
  end

  private

  # Persists the flag with update!, so a failed update raises.
  def change_acceptance(accepting)
    node.update!(accept_new_tasks: accepting)
  end
end
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
# Reports, for one node, the slot usage percentage of each execution type.
class NodeUsagePercentagePerExecutionType
  # @param node object exposing `slots`, each slot responding to `execution_type`
  def initialize(node)
    @node = node
  end

  # @return [Array<Hash>] one `{ execution_type:, usage_percent: }` entry per
  #   execution type found among the node's slots; the percentage comes from
  #   SlotsUsagePercentage
  def perform
    slots_by_execution_type.map do |execution_type, slots|
      {
        execution_type: execution_type,
        usage_percent: SlotsUsagePercentage.new(slots).perform
      }
    end
  end

  private

  def slots_by_execution_type
    @node.slots.group_by(&:execution_type)
  end
end
@@ -0,0 +1,70 @@
1
+ # frozen_string_literal: true
2
+
3
# Puts started tasks back into retry when the runner they point to is no longer
# among the known runner ids, and releases the slot they were holding.
class RescheduleTasksForMissingRunners
  attr_reader :started_tasks, :runner_ids

  # @param runner_ids [Array] ids of the runners that currently exist
  # @param started_tasks [Enumerable] tasks to check; each is reloaded and only
  #   those still started and carrying a runner_id are considered
  def initialize(runner_ids:, started_tasks:)
    @started_tasks = started_tasks
    @runner_ids = runner_ids
  end

  # For every runner id referenced by a started task but absent from
  # +runner_ids+: logs and reports the event, marks the task for retry with an
  # explanatory error, and releases its slot (when it has one).
  def perform
    missing_runner_ids.each do |runner_id|
      task = started_tasks_group_by_runner_id[runner_id]
      # Read the slot before mark_as_retry so it can still be released afterwards.
      slot = task.slot
      message = "Task retried because runner #{runner_id} is missing (#{task} #{slot})"
      Rails.logger.debug(message)

      report_event(message: message, task: task, runner_id: runner_id)

      task.mark_as_retry(error: message)
      slot&.release
    end
  end

  private

  # Sends an info-level event to Sentry (only when Settings.sentry.enabled)
  # describing the task, its slot and the slot's node. Slot and node may be nil.
  def report_event(message:, task:, runner_id:)
    return unless Settings.sentry.enabled

    slot = task.slot
    node = slot&.node
    Raven.capture_exception(
      message,
      level: :info,
      extra: {
        runner: node&.runner_provider,
        runner_id: runner_id,
        slot: {
          id: slot&.id,
          name: slot&.name,
          status: slot&.status,
          runner_id: slot&.runner_id
        },
        node: {
          id: node&.id,
          name: node&.name,
          status: node&.status
        },
        task: {
          id: task.id,
          name: task.name,
          status: task.status
        }
      }
    )
  end

  # Runner ids referenced by started tasks that are not in the known runner ids.
  def missing_runner_ids
    started_tasks_group_by_runner_id.keys - runner_ids
  end

  # Maps runner_id => task for reloaded tasks that are still started and have a
  # runner_id. When several tasks share a runner id only the first one is kept.
  def started_tasks_group_by_runner_id
    @started_tasks_group_by_runner_id ||= started_tasks
      .map(&:reload)
      .select(&:started?)
      .select(&:runner_id)
      .group_by(&:runner_id)
      .transform_values(&:first)
  end
end
@@ -0,0 +1,4 @@
1
+ # frozen_string_literal: true
2
+
3
# Namespace for the runner-specific services (Docker and Kubernetes).
module Runners; end
@@ -0,0 +1,18 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Docker
    # Builds the docker-api connection used to reach a node.
    class CreateConnection
      # Timeouts handed to every connection (values are passed through as-is
      # to ::Docker::Connection).
      TIMEOUTS = { connect_timeout: 5, read_timeout: 15, write_timeout: 5 }.freeze

      # @param node node to connect to; must answer true to `docker?`
      # @return [::Docker::Connection] connection pointing at `node.hostname`
      # @raise [Runners::InvalidRunner] when the node is not a docker runner
      def perform(node:)
        ensure_docker_runner(node)

        ::Docker::Connection.new(node.hostname, **TIMEOUTS)
      end

      private

      def ensure_docker_runner(node)
        return if node.docker?

        raise Runners::InvalidRunner, "Node must be a docker runner"
      end
    end
  end
end
@@ -0,0 +1,87 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Docker
    # Translates a Docker container's info into a Runners::ExecutionInfo.
    #
    # The container's "State" entry is handled in two shapes: a Hash (carrying
    # "Status", "ExitCode", timestamps and "Error") or any other value, which is
    # then used directly as the status.
    class CreateExecutionInfo
      attr_reader :container

      # @param container object whose `info` hash holds "Name"/"Names" and "State"
      # @return [Runners::ExecutionInfo] always built with :id and :status; with
      #   :exit_code, :started_at, :finished_at and :error too when the state is a Hash
      def perform(container:)
        @container = container

        attributes = { id: container_name(container: container), status: status }
        attributes.merge!(state_details) if full_state_present?

        Runners::ExecutionInfo.new(attributes)
      end

      private

      # Maps the Docker status onto the broker's vocabulary; nil for any status
      # other than "created", "running" or "exited".
      def status
        case state_status
        when "created" then "pending"
        when "running" then "running"
        when "exited" then exited_status
        end
      end

      # Refines an exited container by its exit code: zero is a success, a
      # positive code an error, anything else (e.g. no exit code) plain "exited".
      def exited_status
        exit_code = state["ExitCode"]

        if exit_code&.zero?
          "success"
        elsif exit_code&.positive?
          "error"
        else
          "exited"
        end
      end

      # Container name without the leading slash, taken from "Name" or, failing
      # that, the first entry of "Names".
      def container_name(container:)
        info = container.info
        raw_name = info["Name"] || info["Names"].first

        raw_name.remove(%r{^/})
      end

      def state_details
        {
          exit_code: state["ExitCode"],
          started_at: state["StartedAt"],
          finished_at: state["FinishedAt"],
          error: state["Error"]
        }
      end

      def full_state_present?
        state.is_a?(Hash)
      end

      def state
        container.info["State"]
      end

      def state_status
        full_state_present? ? state["Status"] : state
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Docker
    # Looks up a task's container and converts it into execution info.
    class FetchExecutionInfo
      # @param task task whose container is fetched via FetchTaskContainer
      # @return the result of CreateExecutionInfo for that container
      # @raise [Runners::RunnerIdNotFoundError] when Docker reports the
      #   container as not found (the original message is kept)
      def perform(task:)
        task_container = Runners::Docker::FetchTaskContainer.new.perform(task: task)

        CreateExecutionInfo.new.perform(container: task_container)
      rescue ::Docker::Error::NotFoundError => e
        raise Runners::RunnerIdNotFoundError, e.message
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Docker
    # Reads the last lines of stdout/stderr from a task's container.
    class FetchLogs
      # @param task task whose container is fetched via FetchTaskContainer
      # @return the container's logs (stdout and stderr, last 1000 lines)
      # @raise [Runners::RunnerIdNotFoundError] when Docker reports the
      #   container as not found (the original message is kept)
      def perform(task:)
        task_container = Runners::Docker::FetchTaskContainer.new.perform(task: task)

        # streaming_logs avoids some encoding issues and should be safe since container status = exited
        # (see https://github.com/swipely/docker-api/issues/290 for reference)
        task_container.streaming_logs(stdout: true, stderr: true, tail: 1_000)
      rescue ::Docker::Error::NotFoundError => e
        raise Runners::RunnerIdNotFoundError, e.message
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Docker
    # Fetches the Docker container a task is running in.
    class FetchTaskContainer
      # @param task task exposing `runner_id` (the container id) and `slot.node`
      # @return the container returned by ::Docker::Container.get, looked up
      #   with `all: true` over a connection to the task's node
      def perform(task:)
        container_id = task.runner_id
        node_connection = CreateConnection.new.perform(node: task.slot.node)

        ::Docker::Container.get(container_id, { all: true }, node_connection)
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Docker
    # Resolves a task's named storage mounts into "node_path:task_path" strings
    # using the mount table in Settings.storage_mounts.docker.
    class Filer
      # Raised when a task references a mount name missing from the settings.
      class InvalidMountName < StandardError; end

      # @param task_storage_mounts [Hash] mount name => path for the task
      # @return [Array<String>] one "node_mount_path:task_mount_path" entry per mount
      # @raise [InvalidMountName] when a mount name has no configured node path;
      #   the message names the offending mount
      def perform(task_storage_mounts:)
        task_storage_mounts.map do |task_mount_name, task_mount_path|
          node_mount_path = Settings.storage_mounts.docker[task_mount_name]

          unless node_mount_path
            raise InvalidMountName, "Unknown storage mount name: #{task_mount_name.inspect}"
          end

          [node_mount_path, task_mount_path].join(":")
        end
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Docker
    # Kills the container currently attached to a slot, on a best-effort basis.
    class KillSlotRunner
      # Does nothing when the slot has no runner id. A container that Docker no
      # longer knows, or any Excon error, is logged at info level instead of
      # being raised.
      #
      # @param slot slot exposing `runner_id` (the container id) and `node`
      def perform(slot:)
        return unless slot.runner_id.present?

        ::Docker::Container
          .get(slot.runner_id, {}, CreateConnection.new.perform(node: slot.node))
          .kill!
      rescue ::Docker::Error::NotFoundError => e
        # Interpolate the exception class so the log shows the actual error type.
        Rails.logger.info("Container #{slot.runner_id} already removed - #{e.message} (#{e.class})")
      rescue Excon::Error => e
        Rails.logger.info("Error removing container: #{e}")
      end
    end
  end
end