container_broker 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +98 -0
  3. data/Rakefile +8 -0
  4. data/app/controllers/application_controller.rb +5 -0
  5. data/app/controllers/healthcheck_controller.rb +21 -0
  6. data/app/controllers/nodes_controller.rb +70 -0
  7. data/app/controllers/nodes_healthcheck_controller.rb +28 -0
  8. data/app/controllers/status_controller.rb +48 -0
  9. data/app/controllers/tasks_controller.rb +83 -0
  10. data/app/controllers/tasks_healthcheck_controller.rb +28 -0
  11. data/app/jobs/add_task_tags_job.rb +13 -0
  12. data/app/jobs/adjust_node_slots_job.rb +27 -0
  13. data/app/jobs/application_job.rb +9 -0
  14. data/app/jobs/collect_load_metrics_job.rb +9 -0
  15. data/app/jobs/container_broker_base_job.rb +32 -0
  16. data/app/jobs/migrate_tasks_from_dead_node_job.rb +32 -0
  17. data/app/jobs/monitor_unresponsive_node_job.rb +21 -0
  18. data/app/jobs/monitor_unresponsive_nodes_job.rb +9 -0
  19. data/app/jobs/release_slot_job.rb +47 -0
  20. data/app/jobs/remove_runner_job.rb +11 -0
  21. data/app/jobs/remove_unused_tags_job.rb +25 -0
  22. data/app/jobs/request_id_from_task.rb +7 -0
  23. data/app/jobs/run_task_job.rb +64 -0
  24. data/app/jobs/run_tasks_for_all_execution_types_job.rb +11 -0
  25. data/app/jobs/run_tasks_job.rb +42 -0
  26. data/app/jobs/timeout_failed_tasks_job.rb +31 -0
  27. data/app/jobs/update_all_nodes_status_job.rb +9 -0
  28. data/app/jobs/update_node_status_job.rb +24 -0
  29. data/app/jobs/update_task_status_job.rb +71 -0
  30. data/app/models/mongoid_serializable_model.rb +14 -0
  31. data/app/models/node.rb +101 -0
  32. data/app/models/slot.rb +42 -0
  33. data/app/models/task.rb +148 -0
  34. data/app/models/task_tag.rb +11 -0
  35. data/app/observers/observable.rb +23 -0
  36. data/app/observers/task_observer.rb +11 -0
  37. data/app/serializers/node_healthcheck_serializer.rb +5 -0
  38. data/app/serializers/node_serializer.rb +5 -0
  39. data/app/serializers/status_panel_node_serializer.rb +9 -0
  40. data/app/serializers/status_panel_slot_serializer.rb +5 -0
  41. data/app/serializers/status_panel_task_serializer.rb +16 -0
  42. data/app/serializers/task_healthcheck_serializer.rb +5 -0
  43. data/app/serializers/task_serializer.rb +7 -0
  44. data/app/services/adjust_execution_type_slots.rb +51 -0
  45. data/app/services/check_for_slot_removal.rb +28 -0
  46. data/app/services/collect_load_metrics.rb +40 -0
  47. data/app/services/delete_node.rb +25 -0
  48. data/app/services/friendly_name_nodes.rb +10 -0
  49. data/app/services/friendly_name_slots.rb +15 -0
  50. data/app/services/kill_node_runners.rb +17 -0
  51. data/app/services/kill_task_container.rb +29 -0
  52. data/app/services/kubernetes_client.rb +136 -0
  53. data/app/services/least_used_node.rb +44 -0
  54. data/app/services/lock_manager.rb +74 -0
  55. data/app/services/lock_slot.rb +37 -0
  56. data/app/services/lock_task.rb +45 -0
  57. data/app/services/metrics.rb +43 -0
  58. data/app/services/migrate_runner.rb +26 -0
  59. data/app/services/node_task_acceptance.rb +18 -0
  60. data/app/services/node_usage_percentage_per_execution_type.rb +22 -0
  61. data/app/services/reschedule_tasks_for_missing_runners.rb +70 -0
  62. data/app/services/runners.rb +4 -0
  63. data/app/services/runners/docker/create_connection.rb +18 -0
  64. data/app/services/runners/docker/create_execution_info.rb +87 -0
  65. data/app/services/runners/docker/fetch_execution_info.rb +17 -0
  66. data/app/services/runners/docker/fetch_logs.rb +18 -0
  67. data/app/services/runners/docker/fetch_task_container.rb +15 -0
  68. data/app/services/runners/docker/filer.rb +19 -0
  69. data/app/services/runners/docker/kill_slot_runner.rb +19 -0
  70. data/app/services/runners/docker/node_availability.rb +11 -0
  71. data/app/services/runners/docker/remove_runner.rb +18 -0
  72. data/app/services/runners/docker/run_task.rb +63 -0
  73. data/app/services/runners/docker/update_node_status.rb +62 -0
  74. data/app/services/runners/execution_info.rb +49 -0
  75. data/app/services/runners/invalid_config.rb +5 -0
  76. data/app/services/runners/invalid_runner.rb +5 -0
  77. data/app/services/runners/kubernetes/create_client.rb +29 -0
  78. data/app/services/runners/kubernetes/create_execution_info.rb +103 -0
  79. data/app/services/runners/kubernetes/fetch_execution_info.rb +15 -0
  80. data/app/services/runners/kubernetes/fetch_logs.rb +17 -0
  81. data/app/services/runners/kubernetes/filer.rb +41 -0
  82. data/app/services/runners/kubernetes/kill_slot_runner.rb +11 -0
  83. data/app/services/runners/kubernetes/node_availability.rb +11 -0
  84. data/app/services/runners/kubernetes/remove_runner.rb +19 -0
  85. data/app/services/runners/kubernetes/run_task.rb +54 -0
  86. data/app/services/runners/kubernetes/update_node_status.rb +64 -0
  87. data/app/services/runners/runner_id_not_found_error.rb +5 -0
  88. data/app/services/runners/services_factory.rb +38 -0
  89. data/app/services/runners/update_node_status_helper.rb +43 -0
  90. data/app/services/slots_usage_percentage.rb +18 -0
  91. data/config/application.rb +34 -0
  92. data/config/boot.rb +5 -0
  93. data/config/environment.rb +7 -0
  94. data/config/environments/test.rb +44 -0
  95. data/config/initializers/application_controller_renderer.rb +10 -0
  96. data/config/initializers/backtrace_silencers.rb +9 -0
  97. data/config/initializers/config.rb +51 -0
  98. data/config/initializers/cookies_serializer.rb +7 -0
  99. data/config/initializers/docker_config.rb +3 -0
  100. data/config/initializers/filter_parameter_logging.rb +6 -0
  101. data/config/initializers/idempotent_request.rb +12 -0
  102. data/config/initializers/inflections.rb +18 -0
  103. data/config/initializers/mime_types.rb +6 -0
  104. data/config/initializers/mongoid.rb +3 -0
  105. data/config/initializers/new_framework_defaults_6_0.rb +47 -0
  106. data/config/initializers/raven.rb +10 -0
  107. data/config/initializers/sidekiq.rb +24 -0
  108. data/config/initializers/wrap_parameters.rb +16 -0
  109. data/config/locales/en.yml +33 -0
  110. data/config/mongoid.yml +10 -0
  111. data/config/routes.rb +43 -0
  112. data/config/secrets.yml +35 -0
  113. data/config/settings.yml +34 -0
  114. data/config/settings/test.yml +27 -0
  115. data/config/sidekiq_scheduler.yml +18 -0
  116. data/config/spring.rb +8 -0
  117. data/lib/constants.rb +12 -0
  118. data/lib/container_broker.rb +30 -0
  119. data/lib/container_broker/engine.rb +6 -0
  120. data/lib/container_broker/version.rb +5 -0
  121. data/lib/current_thread_request_id.rb +19 -0
  122. data/lib/idempotent_request/callback.rb +25 -0
  123. data/lib/idempotent_request/policy.rb +15 -0
  124. data/lib/redis_url_parser.rb +25 -0
  125. data/lib/tasks/task.rake +34 -0
  126. metadata +590 -0
@@ -0,0 +1,45 @@
1
+ # frozen_string_literal: true
2
+
3
# Claims one pending task of a given execution type by switching its status
# to "starting" with a single find_one_and_update call.
class LockTask
  attr_reader :execution_type

  def initialize(execution_type:)
    @execution_type = execution_type
  end

  # Returns the claimed task (reloaded, after metrics were recorded), or nil
  # when no task of this execution type is waiting or scheduled for retry.
  def perform
    locked_task = lock_next_pending
    return unless locked_task

    locked_task.reload
    persist_metrics(locked_task)
    locked_task
  end

  # True when at least one task of this execution type is still pending.
  def any_pending?
    pending_scope.any?
  end

  private

  # Updates one matching document and returns it as it looks after the update.
  def lock_next_pending
    pending_scope.find_one_and_update(
      { "$set" => { status: "starting" } },
      return_document: :after
    )
  end

  # Tasks of this execution type whose status is "waiting" or "retry".
  def pending_scope
    Task.where(execution_type: execution_type, :status.in => %w[waiting retry])
  end

  # Counts the status change in the "tasks" metric.
  def persist_metrics(task)
    Metrics.new("tasks").count(task_id: task.id, name: task.name, status: task.status)
  end
end
@@ -0,0 +1,43 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "measures"
4
+
5
# Thin wrapper around the Measures client, configured from Settings.measures.
class Metrics
  attr_reader :metric

  def initialize(metric)
    @metric = metric
  end

  # Sends a counter for the metric, tagging the payload with the
  # "container-broker" origin. Does nothing when measures are disabled.
  # Any StandardError raised while sending is logged as a warning instead
  # of being propagated.
  def count(data = {})
    return unless enabled?

    payload = data.merge(origin: "container-broker")
    client.count(metric, payload)
  rescue StandardError => e
    Rails.logger.warn("Error sending metrics to measures: #{e}")
  end

  # Runs the optional block (yielding +data+ to it). When measures are
  # enabled the block runs inside the client's timer; otherwise it is
  # simply called.
  def duration(data = {})
    unless enabled?
      return block_given? ? yield(data) : nil
    end

    client.time(metric, data) { block_given? ? yield(data) : nil }
  end

  private

  def settings
    Settings.measures
  end

  def enabled?
    settings.enabled
  end

  # A new UDP transport pointing at the configured host and port.
  def transport
    Measures::Transports::UDP.new(settings.host, settings.port)
  end

  # A new client is built on every call; nothing is cached.
  def client
    udp = transport
    Measures::Client.new(udp, settings.index, settings.owner)
  end
end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
# It's important to persist migrated ids because there may be some tasks in the execution queue for the same runner id
#
# Each migrated runner id is stored in Redis as an expiring marker key
# ("migrated_ids_<runner_id>") so it can be looked up later with #migrated?.
class MigrateRunner
  TTL = Rails.env.development? ? 10.hours : 1.hour
  KEY_PREFIX = "migrated_ids"

  attr_reader :runner_id

  def initialize(runner_id:)
    @runner_id = runner_id
  end

  # Marks the runner id as migrated for TTL seconds.
  def migrate
    Rails.logger.info("Migrate runner id #{runner_id}")
    # TTL is an ActiveSupport::Duration; Redis expects a plain number of
    # seconds, so convert explicitly instead of relying on implicit coercion.
    self.class.redis_client.set(key, 1, ex: TTL.to_i)
  end

  # True while the marker key written by #migrate has not expired.
  def migrated?
    self.class.redis_client.exists?(key)
  end

  # Shared Redis connection. It is memoized so that every #migrate/#migrated?
  # call reuses one connection instead of opening a new one that is never
  # closed.
  def self.redis_client
    @redis_client ||= Redis.new(RedisUrlParser.call(Settings.redis_url))
  end

  private

  # Redis key holding the migration marker for this runner id.
  def key
    "#{KEY_PREFIX}_#{runner_id}"
  end
end
@@ -0,0 +1,18 @@
1
+ # frozen_string_literal: true
2
+
3
# Switches the accept_new_tasks flag of a node on or off.
class NodeTaskAcceptance
  attr_reader :node

  def initialize(node:)
    @node = node
  end

  # Lets the node accept new tasks again and enqueues
  # RunTasksForAllExecutionTypesJob.
  def accept!
    change_acceptance(true)
    RunTasksForAllExecutionTypesJob.perform_later
  end

  # Stops the node from accepting new tasks.
  def reject!
    change_acceptance(false)
  end

  private

  # Persists the flag with update!, so a failed update raises.
  def change_acceptance(accepting)
    node.update!(accept_new_tasks: accepting)
  end
end
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
# Computes the slot usage of a node, broken down by execution type.
class NodeUsagePercentagePerExecutionType
  def initialize(node)
    @node = node
  end

  # Returns one hash per execution type found among the node's slots:
  #   { execution_type: <type>, usage_percent: <SlotsUsagePercentage result> }
  def perform
    slots_by_execution_type.map do |execution_type, slots|
      usage = SlotsUsagePercentage.new(slots).perform

      { execution_type: execution_type, usage_percent: usage }
    end
  end

  private

  # The node's slots keyed by their execution type.
  def slots_by_execution_type
    @node.slots.group_by { |slot| slot.execution_type }
  end
end
@@ -0,0 +1,70 @@
1
+ # frozen_string_literal: true
2
+
3
# Marks started tasks for retry when their runner id is not among the
# runner ids that were reported as existing.
class RescheduleTasksForMissingRunners
  attr_reader :started_tasks, :runner_ids

  def initialize(runner_ids:, started_tasks:)
    @runner_ids = runner_ids
    @started_tasks = started_tasks
  end

  # For every missing runner id: log and report the event, mark the task
  # for retry, then release the slot the task was using (if any).
  def perform
    missing_runner_ids.each { |runner_id| retry_task_for(runner_id) }
  end

  private

  def retry_task_for(runner_id)
    task = task_by_runner_id[runner_id]
    message = "Task retryied because runner #{runner_id} is missing (#{task} #{task&.slot})"
    Rails.logger.debug(message)

    report_event(message: message, task: task, runner_id: runner_id)

    # Keep a reference to the slot before the task is marked for retry.
    slot = task.slot
    task.mark_as_retry(error: message)
    slot&.release
  end

  # Sends an info-level event to Sentry (only when it is enabled) with the
  # runner, slot, node and task details attached.
  def report_event(message:, task:, runner_id:)
    return unless Settings.sentry.enabled

    slot = task&.slot
    node = slot&.node
    extra = {
      runner: node&.runner_provider,
      runner_id: runner_id,
      slot: slot_details(slot),
      node: node_details(node),
      task: task_details(task)
    }

    Raven.capture_exception(message, level: :info, extra: extra)
  end

  def slot_details(slot)
    { id: slot&.id, name: slot&.name, status: slot&.status, runner_id: slot&.runner_id }
  end

  def node_details(node)
    { id: node&.id, name: node&.name, status: node&.status }
  end

  def task_details(task)
    { id: task.id, name: task.name, status: task.status }
  end

  # Runner ids that have a started task but are absent from runner_ids.
  def missing_runner_ids
    task_by_runner_id.keys - runner_ids
  end

  # Reloads every given task, keeps those still started that have a runner
  # id, and indexes them by runner id (first task wins on duplicates).
  def task_by_runner_id
    @task_by_runner_id ||= started_tasks
                           .map(&:reload)
                           .select { |task| task.started? && task.runner_id }
                           .each_with_object({}) { |task, index| index[task.runner_id] ||= task }
  end
end
@@ -0,0 +1,4 @@
1
+ # frozen_string_literal: true
2
+
3
# Root namespace for the runner service objects (e.g. Runners::Docker).
module Runners; end
@@ -0,0 +1,18 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Docker
    # Builds a docker-api connection to a node's docker endpoint.
    class CreateConnection
      # Returns a ::Docker::Connection for node.hostname.
      # Raises Runners::InvalidRunner when the node is not a docker runner.
      def perform(node:)
        unless node.docker?
          raise(Runners::InvalidRunner, "Node must be a docker runner")
        end

        ::Docker::Connection.new(node.hostname, timeout_options)
      end

      private

      # Timeout values handed to the connection as options.
      def timeout_options
        { connect_timeout: 5, read_timeout: 15, write_timeout: 5 }
      end
    end
  end
end
@@ -0,0 +1,87 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Docker
    # Converts a docker container object into a Runners::ExecutionInfo.
    #
    # container.info["State"] is handled in two shapes: a Hash (with
    # "Status", "ExitCode", "StartedAt", "FinishedAt" and "Error") or a
    # plain status String. Exit code, timestamps and error are only copied
    # when the Hash form is present.
    class CreateExecutionInfo
      attr_reader :container

      # Returns a Runners::ExecutionInfo built from the container's name
      # and state.
      def perform(container:)
        @container = container

        execution_info_data = {
          id: container_name(container: container),
          status: status
        }
        execution_info_data.merge!(state_details) if full_state_present?

        Runners::ExecutionInfo.new(execution_info_data)
      end

      private

      # Maps the docker state to a status string. Any state other than
      # "created", "running" or "exited" yields nil.
      def status
        if waiting?
          "pending"
        elsif running?
          "running"
        elsif terminated_with_success?
          "success"
        elsif terminated_with_error?
          "error"
        elsif terminated?
          "exited"
        end
      end

      # Container name without the leading "/". Uses the stdlib
      # String#delete_prefix, which only touches the very start of the
      # string (the former "^/" regex was line-anchored and relied on
      # ActiveSupport's String#remove).
      def container_name(container:)
        name = container.info["Name"] || container.info["Names"].first

        name.delete_prefix("/")
      end

      # Extra attributes that only exist when the state is a Hash.
      def state_details
        {
          exit_code: state["ExitCode"],
          started_at: state["StartedAt"],
          finished_at: state["FinishedAt"],
          error: state["Error"]
        }
      end

      def waiting?
        state_status == "created"
      end

      def running?
        state_status == "running"
      end

      def terminated?
        state_status == "exited"
      end

      def terminated_with_success?
        terminated? && exit_code&.zero?
      end

      def terminated_with_error?
        terminated? && exit_code&.positive?
      end

      # Exit code from the Hash-shaped state; nil when the state is only a
      # status String, instead of indexing into that String.
      def exit_code
        state["ExitCode"] if full_state_present?
      end

      def full_state_present?
        state.is_a?(Hash)
      end

      def state
        container.info["State"]
      end

      def state_status
        full_state_present? ? state["Status"] : state
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Docker
    # Looks up the container of a task and describes it as execution info.
    class FetchExecutionInfo
      # Returns the result of CreateExecutionInfo for the task's container.
      # A docker "not found" error is re-raised as
      # Runners::RunnerIdNotFoundError with the same message.
      def perform(task:)
        task_container = FetchTaskContainer.new.perform(task: task)

        CreateExecutionInfo.new.perform(container: task_container)
      rescue ::Docker::Error::NotFoundError => e
        raise Runners::RunnerIdNotFoundError, e.message
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Docker
    # Reads the last log lines (stdout and stderr) of a task's container.
    class FetchLogs
      # A docker "not found" error is re-raised as
      # Runners::RunnerIdNotFoundError with the same message.
      def perform(task:)
        task_container = Runners::Docker::FetchTaskContainer.new.perform(task: task)

        # streaming_logs avoids some encoding issues and should be safe since container status = exited
        # (see https://github.com/swipely/docker-api/issues/290 for reference)
        task_container.streaming_logs(stdout: true, stderr: true, tail: 1_000)
      rescue ::Docker::Error::NotFoundError => e
        raise Runners::RunnerIdNotFoundError, e.message
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Docker
    # Fetches the docker container identified by a task's runner id.
    class FetchTaskContainer
      # Queries the node that owns the task's slot, passing the
      # { all: true } option to the lookup.
      def perform(task:)
        runner_id = task.runner_id
        connection = CreateConnection.new.perform(node: task.slot.node)

        ::Docker::Container.get(runner_id, { all: true }, connection)
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Docker
    # Translates the storage mounts requested by a task into
    # "<node path>:<task path>" strings, using the node-side paths
    # configured under Settings.storage_mounts.docker.
    class Filer
      # Raised when a task asks for a mount name that is not configured.
      class InvalidMountName < StandardError; end

      # task_storage_mounts maps a mount name to the path requested by the
      # task; nil is treated as "no mounts".
      #
      # Returns an array of "<node path>:<task path>" strings.
      # Raises InvalidMountName, naming the offending mount, when a mount
      # name has no configured node path.
      def perform(task_storage_mounts:)
        (task_storage_mounts || {}).map do |task_mount_name, task_mount_path|
          node_mount_path = Settings.storage_mounts.docker[task_mount_name]

          unless node_mount_path
            raise InvalidMountName, "Invalid storage mount name: #{task_mount_name}"
          end

          [node_mount_path, task_mount_path].join(":")
        end
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
module Runners
  module Docker
    # Kills the docker container referenced by a slot's runner id.
    class KillSlotRunner
      # Does nothing when the slot has no runner id. Otherwise the
      # container is fetched from the slot's node and killed.
      #
      # Failures are deliberately best-effort: a missing container and any
      # Excon error are logged at info level and not re-raised.
      def perform(slot:)
        return unless slot.runner_id.present?

        ::Docker::Container
          .get(slot.runner_id, {}, CreateConnection.new.perform(node: slot.node))
          .kill!
      rescue ::Docker::Error::NotFoundError => e
        # The error class is interpolated; it used to be logged as the
        # literal text "(e.class)".
        Rails.logger.info("Container #{slot.runner_id} already removed - #{e.message} (#{e.class})")
      rescue Excon::Error => e
        Rails.logger.info("Error removing container: #{e}")
      end
    end
  end
end