container_broker 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (126)
  1. checksums.yaml +7 -0
  2. data/README.md +98 -0
  3. data/Rakefile +8 -0
  4. data/app/controllers/application_controller.rb +5 -0
  5. data/app/controllers/healthcheck_controller.rb +21 -0
  6. data/app/controllers/nodes_controller.rb +70 -0
  7. data/app/controllers/nodes_healthcheck_controller.rb +28 -0
  8. data/app/controllers/status_controller.rb +48 -0
  9. data/app/controllers/tasks_controller.rb +83 -0
  10. data/app/controllers/tasks_healthcheck_controller.rb +28 -0
  11. data/app/jobs/add_task_tags_job.rb +13 -0
  12. data/app/jobs/adjust_node_slots_job.rb +27 -0
  13. data/app/jobs/application_job.rb +9 -0
  14. data/app/jobs/collect_load_metrics_job.rb +9 -0
  15. data/app/jobs/container_broker_base_job.rb +32 -0
  16. data/app/jobs/migrate_tasks_from_dead_node_job.rb +32 -0
  17. data/app/jobs/monitor_unresponsive_node_job.rb +21 -0
  18. data/app/jobs/monitor_unresponsive_nodes_job.rb +9 -0
  19. data/app/jobs/release_slot_job.rb +47 -0
  20. data/app/jobs/remove_runner_job.rb +11 -0
  21. data/app/jobs/remove_unused_tags_job.rb +25 -0
  22. data/app/jobs/request_id_from_task.rb +7 -0
  23. data/app/jobs/run_task_job.rb +64 -0
  24. data/app/jobs/run_tasks_for_all_execution_types_job.rb +11 -0
  25. data/app/jobs/run_tasks_job.rb +42 -0
  26. data/app/jobs/timeout_failed_tasks_job.rb +31 -0
  27. data/app/jobs/update_all_nodes_status_job.rb +9 -0
  28. data/app/jobs/update_node_status_job.rb +24 -0
  29. data/app/jobs/update_task_status_job.rb +71 -0
  30. data/app/models/mongoid_serializable_model.rb +14 -0
  31. data/app/models/node.rb +101 -0
  32. data/app/models/slot.rb +42 -0
  33. data/app/models/task.rb +148 -0
  34. data/app/models/task_tag.rb +11 -0
  35. data/app/observers/observable.rb +23 -0
  36. data/app/observers/task_observer.rb +11 -0
  37. data/app/serializers/node_healthcheck_serializer.rb +5 -0
  38. data/app/serializers/node_serializer.rb +5 -0
  39. data/app/serializers/status_panel_node_serializer.rb +9 -0
  40. data/app/serializers/status_panel_slot_serializer.rb +5 -0
  41. data/app/serializers/status_panel_task_serializer.rb +16 -0
  42. data/app/serializers/task_healthcheck_serializer.rb +5 -0
  43. data/app/serializers/task_serializer.rb +7 -0
  44. data/app/services/adjust_execution_type_slots.rb +51 -0
  45. data/app/services/check_for_slot_removal.rb +28 -0
  46. data/app/services/collect_load_metrics.rb +40 -0
  47. data/app/services/delete_node.rb +25 -0
  48. data/app/services/friendly_name_nodes.rb +10 -0
  49. data/app/services/friendly_name_slots.rb +15 -0
  50. data/app/services/kill_node_runners.rb +17 -0
  51. data/app/services/kill_task_container.rb +29 -0
  52. data/app/services/kubernetes_client.rb +136 -0
  53. data/app/services/least_used_node.rb +44 -0
  54. data/app/services/lock_manager.rb +74 -0
  55. data/app/services/lock_slot.rb +37 -0
  56. data/app/services/lock_task.rb +45 -0
  57. data/app/services/metrics.rb +43 -0
  58. data/app/services/migrate_runner.rb +26 -0
  59. data/app/services/node_task_acceptance.rb +18 -0
  60. data/app/services/node_usage_percentage_per_execution_type.rb +22 -0
  61. data/app/services/reschedule_tasks_for_missing_runners.rb +70 -0
  62. data/app/services/runners.rb +4 -0
  63. data/app/services/runners/docker/create_connection.rb +18 -0
  64. data/app/services/runners/docker/create_execution_info.rb +87 -0
  65. data/app/services/runners/docker/fetch_execution_info.rb +17 -0
  66. data/app/services/runners/docker/fetch_logs.rb +18 -0
  67. data/app/services/runners/docker/fetch_task_container.rb +15 -0
  68. data/app/services/runners/docker/filer.rb +19 -0
  69. data/app/services/runners/docker/kill_slot_runner.rb +19 -0
  70. data/app/services/runners/docker/node_availability.rb +11 -0
  71. data/app/services/runners/docker/remove_runner.rb +18 -0
  72. data/app/services/runners/docker/run_task.rb +63 -0
  73. data/app/services/runners/docker/update_node_status.rb +62 -0
  74. data/app/services/runners/execution_info.rb +49 -0
  75. data/app/services/runners/invalid_config.rb +5 -0
  76. data/app/services/runners/invalid_runner.rb +5 -0
  77. data/app/services/runners/kubernetes/create_client.rb +29 -0
  78. data/app/services/runners/kubernetes/create_execution_info.rb +103 -0
  79. data/app/services/runners/kubernetes/fetch_execution_info.rb +15 -0
  80. data/app/services/runners/kubernetes/fetch_logs.rb +17 -0
  81. data/app/services/runners/kubernetes/filer.rb +41 -0
  82. data/app/services/runners/kubernetes/kill_slot_runner.rb +11 -0
  83. data/app/services/runners/kubernetes/node_availability.rb +11 -0
  84. data/app/services/runners/kubernetes/remove_runner.rb +19 -0
  85. data/app/services/runners/kubernetes/run_task.rb +54 -0
  86. data/app/services/runners/kubernetes/update_node_status.rb +64 -0
  87. data/app/services/runners/runner_id_not_found_error.rb +5 -0
  88. data/app/services/runners/services_factory.rb +38 -0
  89. data/app/services/runners/update_node_status_helper.rb +43 -0
  90. data/app/services/slots_usage_percentage.rb +18 -0
  91. data/config/application.rb +34 -0
  92. data/config/boot.rb +5 -0
  93. data/config/environment.rb +7 -0
  94. data/config/environments/test.rb +44 -0
  95. data/config/initializers/application_controller_renderer.rb +10 -0
  96. data/config/initializers/backtrace_silencers.rb +9 -0
  97. data/config/initializers/config.rb +51 -0
  98. data/config/initializers/cookies_serializer.rb +7 -0
  99. data/config/initializers/docker_config.rb +3 -0
  100. data/config/initializers/filter_parameter_logging.rb +6 -0
  101. data/config/initializers/idempotent_request.rb +12 -0
  102. data/config/initializers/inflections.rb +18 -0
  103. data/config/initializers/mime_types.rb +6 -0
  104. data/config/initializers/mongoid.rb +3 -0
  105. data/config/initializers/new_framework_defaults_6_0.rb +47 -0
  106. data/config/initializers/raven.rb +10 -0
  107. data/config/initializers/sidekiq.rb +24 -0
  108. data/config/initializers/wrap_parameters.rb +16 -0
  109. data/config/locales/en.yml +33 -0
  110. data/config/mongoid.yml +10 -0
  111. data/config/routes.rb +43 -0
  112. data/config/secrets.yml +35 -0
  113. data/config/settings.yml +34 -0
  114. data/config/settings/test.yml +27 -0
  115. data/config/sidekiq_scheduler.yml +18 -0
  116. data/config/spring.rb +8 -0
  117. data/lib/constants.rb +12 -0
  118. data/lib/container_broker.rb +30 -0
  119. data/lib/container_broker/engine.rb +6 -0
  120. data/lib/container_broker/version.rb +5 -0
  121. data/lib/current_thread_request_id.rb +19 -0
  122. data/lib/idempotent_request/callback.rb +25 -0
  123. data/lib/idempotent_request/policy.rb +15 -0
  124. data/lib/redis_url_parser.rb +25 -0
  125. data/lib/tasks/task.rake +34 -0
  126. metadata +590 -0
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
# Application-wide parent class for jobs. It adds nothing on top of
# ActiveJob::Base at the moment: both hooks below are commented out.
class ApplicationJob < ActiveJob::Base
  # Retrying automatically after a deadlock is available but disabled:
  # retry_on ActiveRecord::Deadlocked

  # Discarding jobs whose records can no longer be deserialized is
  # available but disabled:
  # discard_on ActiveJob::DeserializationError
end
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
# Job wrapper around the CollectLoadMetrics service, enqueued on :default.
class CollectLoadMetricsJob < ContainerBrokerBaseJob
  queue_as :default

  # Instantiates the service and runs it; returns whatever the service returns.
  def perform
    collector = CollectLoadMetrics.new
    collector.perform
  end
end
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
# Shared parent for the broker's jobs. Registers two around_perform
# callbacks: one reports how long the execution took, the other tags log
# output with a request id when the job class is able to provide one.
class ContainerBrokerBaseJob < ApplicationJob
  JOB_METRIC = "jobs"

  # Measures the wrapped execution and reports it under JOB_METRIC.
  # The metric is sent only after the wrapped block returns; if it raises,
  # the exception propagates before anything is reported.
  around_perform do |job, block|
    elapsed = Benchmark.realtime { block.call }

    reporter = Metrics.new(JOB_METRIC)
    reporter.count(
      job_id: job.job_id,
      job_class: job.class.to_s,
      executions: job.executions,
      queue_name: job.queue_name,
      hostname: Socket.gethostname,
      time: elapsed
    )
  end

  # Asks the job class for a request id derived from the first argument.
  # Without one the job simply runs; with one it runs inside a tagged logger
  # and a CurrentThreadRequestId scope.
  around_perform do |job, block|
    request_id = job.class.request_id_from_args(job.arguments.first)
    next block.call unless request_id

    Rails.logger.tagged(" request_id=#{request_id} ") do
      CurrentThreadRequestId.set(request_id) { block.call }
    end
  end

  class << self
    # Default hook: no request id. Job classes that can derive one provide
    # their own implementation of this class-level method.
    def request_id_from_args(_args)
      nil
    end
  end
end
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
# Moves work away from a node that is no longer available: every slot that
# is not available has its current task marked for retry (when it was
# starting or started), its runner migrated, and is then released.
class MigrateTasksFromDeadNodeJob < ContainerBrokerBaseJob
  queue_as :default

  # @param node the node whose busy slots should be migrated
  # Does nothing (besides logging) when the node reports itself available.
  def perform(node:)
    if node.available?
      Rails.logger.debug("Not migrating tasks because #{node} returned to available status")
      return
    end

    node.run_with_lock_no_wait do
      Rails.logger.debug("Migrating tasks from #{node}")
      busy_slots = node.slots.reject(&:available?)
      busy_slots.each { |slot| migrate_slot(slot) }
    end
  end

  private

  # Handles a single non-available slot: retry its task, migrate its
  # runner, then release the slot.
  def migrate_slot(slot)
    Rails.logger.debug("Migrating task for #{slot}")
    retry_current_task(slot.current_task)

    MigrateRunner.new(runner_id: slot.runner_id).migrate

    Rails.logger.debug("Releasing #{slot}")
    slot.release
    Rails.logger.debug("#{slot} released")
  end

  # Marks the task for retry only while it is starting or started; the
  # "Retrying" log line is emitted for any present task, as before.
  def retry_current_task(current_task)
    unless current_task
      Rails.logger.debug("Slot does not have current task")
      return
    end

    Rails.logger.debug("Retrying slot current task #{current_task}")
    current_task.mark_as_retry if current_task.starting? || current_task.started?
  end
end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
# Probes a single node through its :node_availability runner service and,
# when the probe does not raise, marks the node available again.
class MonitorUnresponsiveNodeJob < ContainerBrokerBaseJob
  queue_as :default

  # @param node the node to probe
  # Any StandardError raised while probing or restoring is recorded on the
  # node and swallowed; the node is then logged as still unresponsive.
  def perform(node:)
    node.run_with_lock_no_wait { restore_availability(node) }
  rescue StandardError => error
    node.register_error("#{error.class}: #{error.message}")

    Rails.logger.info("#{node} still unresponsive")
  end

  private

  # Runs the availability probe, flags the node as available and triggers
  # task scheduling for all execution types.
  def restore_availability(node)
    node.runner_service(:node_availability).perform(node: node)

    Rails.logger.debug("Marking #{node} as available again")
    node.register_success
    node.available!

    RunTasksForAllExecutionTypesJob.perform_later
  end
end
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
# Fans out one MonitorUnresponsiveNodeJob per node whose status is
# "unstable" or "unavailable".
class MonitorUnresponsiveNodesJob < ContainerBrokerBaseJob
  def perform
    unresponsive_nodes = Node.where(:status.in => %w[unstable unavailable])
    unresponsive_nodes.each { |node| MonitorUnresponsiveNodeJob.perform_later(node: node) }
  end
end
@@ -0,0 +1,47 @@
1
+ # frozen_string_literal: true
2
+
3
# Finishes the life cycle of a slot after its runner ended: updates the
# task status, optionally enqueues removal of the runner, and releases the
# slot unless CheckForSlotRemoval reports that it was removed.
class ReleaseSlotJob < ContainerBrokerBaseJob
  class InvalidSlotContainerId < StandardError; end
  queue_as :default

  # @param slot the slot to release
  # @param runner_id id of the runner that is expected to occupy the slot
  # @raise [InvalidSlotContainerId] when the slot holds a different runner id
  #
  # Runners::RunnerIdNotFoundError is logged and swallowed; any other
  # StandardError is registered on the slot's node and re-raised.
  def perform(slot:, runner_id:)
    Rails.logger.debug("ReleaseSlotJob for #{slot} and container #{runner_id}")

    if MigrateRunner.new(runner_id: runner_id).migrated?
      Rails.logger.debug("Ignores release slot for #{slot} because it's migrated")
      return
    end

    check_same_runner_id(slot: slot, runner_id: runner_id)

    # Runs synchronously so the task is updated before the slot is freed.
    UpdateTaskStatusJob.perform_now(slot.current_task.reload)

    enqueue_runner_removal(slot)
    release_unless_removed(slot)
  rescue Runners::RunnerIdNotFoundError => error
    Rails.logger.debug("Runner #{runner_id} not found (#{error.message}). Task will be rescheduled in UpdateNodeStatus.")
  rescue StandardError => error
    Rails.logger.debug("Error in ReleaseSlotJob for #{slot}: #{error}")
    slot.node.register_error(error.message)
    raise
  end

  # Verifies that the slot still holds the runner this job was enqueued for.
  # @raise [InvalidSlotContainerId] on mismatch (also logged at error level)
  def check_same_runner_id(slot:, runner_id:)
    return if runner_id == slot.runner_id

    mismatch = "Current container id (#{slot.runner_id}) in #{slot} is different than the provided (#{runner_id})"

    Rails.logger.error(mismatch)

    raise InvalidSlotContainerId, mismatch
  end

  private

  # Logs unconditionally; the removal job itself is only enqueued when
  # Settings.delete_container_after_run is truthy.
  def enqueue_runner_removal(slot)
    Rails.logger.debug("Enqueueing container removal")
    return unless Settings.delete_container_after_run

    RemoveRunnerJob.perform_later(node: MongoidSerializableModel.new(slot.node), runner_id: slot.runner_id)
  end

  # Runs the slot-removal check and releases the slot only when the check
  # reports that the slot was not removed.
  def release_unless_removed(slot)
    removal_check = CheckForSlotRemoval.new(slot: slot)
    removal_check.perform

    if removal_check.removed?
      Rails.logger.debug("Slot removed and wont be released")
    else
      slot.release
      Rails.logger.debug("Slot released (#{slot.status})")
    end
  end
end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
# Delegates removal of a runner to the node's :remove_runner service.
class RemoveRunnerJob < ContainerBrokerBaseJob
  queue_as :default

  # @param node the node that hosts the runner
  # @param runner_id id of the runner to remove
  def perform(node:, runner_id:)
    remover = node.runner_service(:remove_runner)
    remover.perform(node: node, runner_id: runner_id)
  end
end
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
# Destroys every TaskTag whose name is not present as a key under `tags`
# in any Task.
class RemoveUnusedTagsJob < ContainerBrokerBaseJob
  queue_as :default

  # Accepts and ignores any arguments.
  def perform(*_args)
    remove_unreferenced_tags
  end

  # Loads all tags, keeps the ones no task references, then destroys them.
  def remove_unreferenced_tags
    unused_tags = TaskTag.all.to_a.reject { |task_tag| any_task_referencing_tag?(task_tag) }
    unused_tags.each(&:destroy!)
  end

  # Builds the field path symbol for a tag, e.g. :"tags.<name>".
  def tag_expression(task_tag)
    "tags.#{task_tag.name}".to_sym
  end

  # True when at least one Task has the tag's field path set.
  def any_task_referencing_tag?(task_tag)
    field = tag_expression(task_tag)
    Task.where(field.exists => true).exists?
  end
end
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
# Class-level hook for jobs that work on a task: extracts the request id
# from the job's first argument so log output can be tagged with it.
module RequestIdFromTask
  # @param args the first argument the job was enqueued with. Either a
  #   keyword hash containing :task (e.g. `perform(task:, slot:)`) or the
  #   task itself (e.g. `perform(task)`).
  # @return the task's request id, or nil when no task can be found or the
  #   object found does not respond to `request_id`.
  def request_id_from_args(args)
    # A positional task is not a Hash, so indexing it with :task would not
    # reach the task; use the object itself in that case. This also keeps a
    # nil first argument from raising.
    task = args.is_a?(Hash) ? args[:task] : args

    task.request_id if task.respond_to?(:request_id)
  end
end
@@ -0,0 +1,64 @@
1
+ # frozen_string_literal: true
2
+
3
# Starts a task on a slot: assigns a runner id to both, asks the node's
# :run_task service to launch it and marks task and slot as running. On any
# StandardError the slot is released and the task is marked for retry.
class RunTaskJob < ContainerBrokerBaseJob
  extend RequestIdFromTask

  queue_as :default

  # @param task the task to run (must be in `starting` status)
  # @param slot the slot to run it on (must be in `attaching` status)
  # @return the task on success; errors are rescued, not propagated
  def perform(task:, slot:)
    backfill_storage_mounts(task)

    Rails.logger.debug("Performing RunTaskJob for #{task} #{slot}")

    ensure_startable!(task, slot)

    runner_id = task.generate_runner_id

    # Persisted on both records before the runner is launched.
    task.update!(runner_id: runner_id)
    slot.update!(runner_id: runner_id)

    launcher = slot.node.runner_service(:run_task)
    launcher.perform(task: task, slot: slot, runner_id: runner_id)

    task.mark_as_started!(runner_id: runner_id, slot: slot)
    Rails.logger.debug("#{task} marked as started")

    slot.mark_as_running(current_task: task, runner_id: runner_id)
    Rails.logger.debug("#{slot} marked as running")

    add_metric(task)
    task
  rescue StandardError => error
    recover_from_failure(error, task: task, slot: slot)
  end

  # Reports the task's current state under the "tasks" metric.
  def add_metric(task)
    reporter = Metrics.new("tasks")
    reporter.count(
      task_id: task.id,
      name: task.name,
      type: task.execution_type,
      slot: task.slot&.name,
      node: task.slot&.node&.name,
      started_at: task.started_at,
      duration: task.milliseconds_waiting,
      error: task.error,
      status: task.status
    )
  end

  private

  # TODO: remove after successful deploy
  # Copies the "ingest_storage_mount" attribute into storage_mounts under
  # the "ingest-nfs" key when storage_mounts is blank.
  def backfill_storage_mounts(task)
    return unless task.attributes["ingest_storage_mount"] && task.storage_mounts.blank?

    task.update!(storage_mounts: { "ingest-nfs" => task.attributes["ingest_storage_mount"] })
  end

  # Raises a RuntimeError unless both records are in the expected status.
  def ensure_startable!(task, slot)
    raise "Invalid task status - #{task}" unless task.starting?
    raise "Invalid slot status - #{slot}" unless slot.attaching?
  end

  # Failure path: registers the error on the node only for connection
  # errors, frees the slot, schedules the task for retry and reports it.
  def recover_from_failure(error, task:, slot:)
    Rails.logger.debug("Error in RunTaskJob: #{error}")

    slot.node.register_error(error.message) if error.is_a?(Node::NodeConnectionError)

    slot.release
    Rails.logger.debug("#{slot} released")

    task.mark_as_retry(error: error.message)
    Rails.logger.debug("#{task} marked as retry")

    add_metric(task)

    Rails.logger.debug("Performed RunTaskJob for #{task} #{slot}")
  end
end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
# Enqueues one RunTasksJob for each distinct execution type found on slots.
class RunTasksForAllExecutionTypesJob < ContainerBrokerBaseJob
  queue_as :default

  def perform
    execution_types = Slot.pluck(:execution_type).uniq
    execution_types.each { |execution_type| RunTasksJob.perform_later(execution_type: execution_type) }
  end
end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
# Pairs pending tasks with lockable slots of one execution type and
# enqueues a RunTaskJob for every pair, stopping as soon as either side
# runs out.
class RunTasksJob < ContainerBrokerBaseJob
  attr_reader :execution_type

  # @param execution_type the execution type to schedule tasks for
  def perform(execution_type:)
    @execution_type = execution_type

    enqueue_tasks
  end

  private

  # Keeps going while tasks are pending. Stops when no slot can be locked,
  # or when a slot was locked but no task could be; in that case the slot is
  # made available again before leaving.
  def enqueue_tasks
    while have_pending_tasks?
      slot = lock_slot
      break unless slot

      task = lock_task
      unless task
        slot.available!
        break
      end

      Rails.logger.debug "Perform_later RunTaskJob for #{slot} #{task}"
      RunTaskJob.perform_later(slot: slot, task: task)
    end
  end

  # A single LockTask service instance is reused for the whole run.
  def lock_task_service
    @lock_task_service ||= LockTask.new(execution_type: execution_type)
  end

  def have_pending_tasks?
    lock_task_service.any_pending?
  end

  def lock_task
    lock_task_service.perform
  end

  # A new LockSlot service is built for every attempt.
  def lock_slot
    LockSlot.new(execution_type: execution_type).perform
  end
end
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
# Moves failed tasks that finished before the configured cutoff to the
# error state and appends an explanatory line to their logs.
class TimeoutFailedTasksJob < ContainerBrokerBaseJob
  queue_as :default

  # Accepts and ignores any arguments.
  def perform(*_args)
    timeout_failed_tasks
  end

  # Expires each selected task; returns the per-task results as an array.
  def timeout_failed_tasks
    failed_tasks_to_timeout.map { |task| expire(task) }
  end

  # Failed tasks whose finished_at is earlier than the cutoff.
  # NOTE(review): the setting's name mentions hours but its value is
  # subtracted from a Time directly — confirm it is a duration, not a bare
  # number of hours.
  def failed_tasks_to_timeout
    cutoff = Time.current - Settings.timeout_tasks_after_hours
    Task.failed.where(:finished_at.lt => cutoff)
  end

  private

  # Marks one task as error and persists the timeout note in its logs.
  def expire(task)
    Rails.logger.debug("Marking task as error due to timeout: #{task.uuid}")

    task.error!

    persist_logs(task)
  end

  # Appends the timeout note to the existing logs and saves the task.
  # The string is dup'ed before being handed to set_logs, as before.
  def persist_logs(task)
    annotated_logs = "#{task.get_logs}\nThis task was automatically marked as error due to timeout.\n".dup
    task.set_logs(annotated_logs)

    task.save!
  end
end
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
# Enqueues an UpdateNodeStatusJob for every node returned by Node.available.
class UpdateAllNodesStatusJob < ContainerBrokerBaseJob
  def perform
    Node.available.each { |node| UpdateNodeStatusJob.perform_later(node: node) }
  end
end
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
# Refreshes one node's status through its :update_node_status service,
# guarded by a non-waiting lock keyed by this job class and the node id.
class UpdateNodeStatusJob < ContainerBrokerBaseJob
  queue_as :default

  # @param node the node to refresh
  def perform(node:)
    Rails.logger.debug("Waiting for lock to update status of #{node}")

    node_lock = LockManager.new(type: self.class.to_s, id: node.id, expire: 1.minute, wait: false)

    # The block ends with `true` so a completed update can be told apart
    # from a falsy result of `lock`.
    updated = node_lock.lock do
      Rails.logger.debug("Lock acquired for update status of #{node}")

      node.runner_service(:update_node_status).perform(node: node)

      Rails.logger.debug("Releasing lock for update status of #{node}")
      true
    end

    outcome =
      if updated
        "Lock released for update status of #{node}"
      else
        "Node updating is locked by another job and will be ignored now"
      end
    Rails.logger.debug(outcome)
  end
end
@@ -0,0 +1,71 @@
1
+ # frozen_string_literal: true
2
+
3
# Reads the final execution info of a task's runner and stores the outcome
# on the task: exit code, timestamps, optionally logs, and either the
# completed state or a retry with the runner's error.
class UpdateTaskStatusJob < ContainerBrokerBaseJob
  extend RequestIdFromTask

  class InvalidContainerStatusError < StandardError; end

  queue_as :default

  # @param task the task whose runner has (supposedly) finished
  # @raise [InvalidContainerStatusError] when the runner is not terminated
  def perform(task)
    Rails.logger.debug("Updating status for task #{task}")
    Rails.logger.debug("Task #{task} is running in slot #{task.slot}")

    execution_info = task.slot.node.runner_service(:fetch_execution_info).perform(task: task)

    Rails.logger.debug("Got runner #{execution_info.id} with state #{execution_info.status}")

    ensure_terminated!(execution_info)

    Rails.logger.debug("Container is in status #{execution_info.status} and exit code #{execution_info.exit_code}")

    task.exit_code = execution_info.exit_code
    task.started_at = execution_info.started_at
    task.finished_at = execution_info.finished_at

    persist_logs(task)
    apply_outcome(task, execution_info)

    task.save!

    add_metric(task)
  end

  # Fetches the runner's logs and sets them on the task, but only when the
  # task's persist_logs flag is truthy.
  def persist_logs(task)
    return unless task.persist_logs

    Rails.logger.debug("Persisting logs for #{task}")
    runner_logs = task.slot.node.runner_service(:fetch_logs).perform(task: task)
    task.set_logs(runner_logs)
  end

  # Reports the task's final state under the "tasks" metric. A missing
  # "api_id" tag is reported as 0 because of the `to_i` conversion.
  def add_metric(task)
    reporter = Metrics.new("tasks")
    reporter.count(
      task_id: task.id,
      event_id: task.tags&.dig("event_id"),
      api_id: task.tags&.dig("api_id").to_i,
      name: task.name,
      type: task.execution_type,
      slot: task.slot&.name,
      node: task.slot&.node&.name,
      started_at: task.started_at,
      finished_at: task.finished_at,
      duration: task.milliseconds_running,
      processing_time: task.seconds_running.to_i,
      error: task.error,
      status: task.status
    )
  end

  private

  # Logs and raises when the runner has not reached a terminated state.
  def ensure_terminated!(execution_info)
    return if execution_info.terminated?

    Rails.logger.debug("Runner should be terminated but it is #{execution_info.status}. Execution info is #{execution_info.to_h}")
    raise InvalidContainerStatusError,
          "Runner should be terminated (current status: #{execution_info.status})"
  end

  # Success clears the error and completes the task; anything else marks
  # the task for retry with the runner's error.
  def apply_outcome(task, execution_info)
    if execution_info.success?
      Rails.logger.debug("Marking task as completed and no errors")
      task.error = nil
      task.completed!
    else
      Rails.logger.debug("Marked task for retry and set error as #{execution_info.error}")
      task.mark_as_retry(error: execution_info.error)
    end
  end
end