container_broker 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (126) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +98 -0
  3. data/Rakefile +8 -0
  4. data/app/controllers/application_controller.rb +5 -0
  5. data/app/controllers/healthcheck_controller.rb +21 -0
  6. data/app/controllers/nodes_controller.rb +70 -0
  7. data/app/controllers/nodes_healthcheck_controller.rb +28 -0
  8. data/app/controllers/status_controller.rb +48 -0
  9. data/app/controllers/tasks_controller.rb +83 -0
  10. data/app/controllers/tasks_healthcheck_controller.rb +28 -0
  11. data/app/jobs/add_task_tags_job.rb +13 -0
  12. data/app/jobs/adjust_node_slots_job.rb +27 -0
  13. data/app/jobs/application_job.rb +9 -0
  14. data/app/jobs/collect_load_metrics_job.rb +9 -0
  15. data/app/jobs/container_broker_base_job.rb +32 -0
  16. data/app/jobs/migrate_tasks_from_dead_node_job.rb +32 -0
  17. data/app/jobs/monitor_unresponsive_node_job.rb +21 -0
  18. data/app/jobs/monitor_unresponsive_nodes_job.rb +9 -0
  19. data/app/jobs/release_slot_job.rb +47 -0
  20. data/app/jobs/remove_runner_job.rb +11 -0
  21. data/app/jobs/remove_unused_tags_job.rb +25 -0
  22. data/app/jobs/request_id_from_task.rb +7 -0
  23. data/app/jobs/run_task_job.rb +64 -0
  24. data/app/jobs/run_tasks_for_all_execution_types_job.rb +11 -0
  25. data/app/jobs/run_tasks_job.rb +42 -0
  26. data/app/jobs/timeout_failed_tasks_job.rb +31 -0
  27. data/app/jobs/update_all_nodes_status_job.rb +9 -0
  28. data/app/jobs/update_node_status_job.rb +24 -0
  29. data/app/jobs/update_task_status_job.rb +71 -0
  30. data/app/models/mongoid_serializable_model.rb +14 -0
  31. data/app/models/node.rb +101 -0
  32. data/app/models/slot.rb +42 -0
  33. data/app/models/task.rb +148 -0
  34. data/app/models/task_tag.rb +11 -0
  35. data/app/observers/observable.rb +23 -0
  36. data/app/observers/task_observer.rb +11 -0
  37. data/app/serializers/node_healthcheck_serializer.rb +5 -0
  38. data/app/serializers/node_serializer.rb +5 -0
  39. data/app/serializers/status_panel_node_serializer.rb +9 -0
  40. data/app/serializers/status_panel_slot_serializer.rb +5 -0
  41. data/app/serializers/status_panel_task_serializer.rb +16 -0
  42. data/app/serializers/task_healthcheck_serializer.rb +5 -0
  43. data/app/serializers/task_serializer.rb +7 -0
  44. data/app/services/adjust_execution_type_slots.rb +51 -0
  45. data/app/services/check_for_slot_removal.rb +28 -0
  46. data/app/services/collect_load_metrics.rb +40 -0
  47. data/app/services/delete_node.rb +25 -0
  48. data/app/services/friendly_name_nodes.rb +10 -0
  49. data/app/services/friendly_name_slots.rb +15 -0
  50. data/app/services/kill_node_runners.rb +17 -0
  51. data/app/services/kill_task_container.rb +29 -0
  52. data/app/services/kubernetes_client.rb +136 -0
  53. data/app/services/least_used_node.rb +44 -0
  54. data/app/services/lock_manager.rb +74 -0
  55. data/app/services/lock_slot.rb +37 -0
  56. data/app/services/lock_task.rb +45 -0
  57. data/app/services/metrics.rb +43 -0
  58. data/app/services/migrate_runner.rb +26 -0
  59. data/app/services/node_task_acceptance.rb +18 -0
  60. data/app/services/node_usage_percentage_per_execution_type.rb +22 -0
  61. data/app/services/reschedule_tasks_for_missing_runners.rb +70 -0
  62. data/app/services/runners.rb +4 -0
  63. data/app/services/runners/docker/create_connection.rb +18 -0
  64. data/app/services/runners/docker/create_execution_info.rb +87 -0
  65. data/app/services/runners/docker/fetch_execution_info.rb +17 -0
  66. data/app/services/runners/docker/fetch_logs.rb +18 -0
  67. data/app/services/runners/docker/fetch_task_container.rb +15 -0
  68. data/app/services/runners/docker/filer.rb +19 -0
  69. data/app/services/runners/docker/kill_slot_runner.rb +19 -0
  70. data/app/services/runners/docker/node_availability.rb +11 -0
  71. data/app/services/runners/docker/remove_runner.rb +18 -0
  72. data/app/services/runners/docker/run_task.rb +63 -0
  73. data/app/services/runners/docker/update_node_status.rb +62 -0
  74. data/app/services/runners/execution_info.rb +49 -0
  75. data/app/services/runners/invalid_config.rb +5 -0
  76. data/app/services/runners/invalid_runner.rb +5 -0
  77. data/app/services/runners/kubernetes/create_client.rb +29 -0
  78. data/app/services/runners/kubernetes/create_execution_info.rb +103 -0
  79. data/app/services/runners/kubernetes/fetch_execution_info.rb +15 -0
  80. data/app/services/runners/kubernetes/fetch_logs.rb +17 -0
  81. data/app/services/runners/kubernetes/filer.rb +41 -0
  82. data/app/services/runners/kubernetes/kill_slot_runner.rb +11 -0
  83. data/app/services/runners/kubernetes/node_availability.rb +11 -0
  84. data/app/services/runners/kubernetes/remove_runner.rb +19 -0
  85. data/app/services/runners/kubernetes/run_task.rb +54 -0
  86. data/app/services/runners/kubernetes/update_node_status.rb +64 -0
  87. data/app/services/runners/runner_id_not_found_error.rb +5 -0
  88. data/app/services/runners/services_factory.rb +38 -0
  89. data/app/services/runners/update_node_status_helper.rb +43 -0
  90. data/app/services/slots_usage_percentage.rb +18 -0
  91. data/config/application.rb +34 -0
  92. data/config/boot.rb +5 -0
  93. data/config/environment.rb +7 -0
  94. data/config/environments/test.rb +44 -0
  95. data/config/initializers/application_controller_renderer.rb +10 -0
  96. data/config/initializers/backtrace_silencers.rb +9 -0
  97. data/config/initializers/config.rb +51 -0
  98. data/config/initializers/cookies_serializer.rb +7 -0
  99. data/config/initializers/docker_config.rb +3 -0
  100. data/config/initializers/filter_parameter_logging.rb +6 -0
  101. data/config/initializers/idempotent_request.rb +12 -0
  102. data/config/initializers/inflections.rb +18 -0
  103. data/config/initializers/mime_types.rb +6 -0
  104. data/config/initializers/mongoid.rb +3 -0
  105. data/config/initializers/new_framework_defaults_6_0.rb +47 -0
  106. data/config/initializers/raven.rb +10 -0
  107. data/config/initializers/sidekiq.rb +24 -0
  108. data/config/initializers/wrap_parameters.rb +16 -0
  109. data/config/locales/en.yml +33 -0
  110. data/config/mongoid.yml +10 -0
  111. data/config/routes.rb +43 -0
  112. data/config/secrets.yml +35 -0
  113. data/config/settings.yml +34 -0
  114. data/config/settings/test.yml +27 -0
  115. data/config/sidekiq_scheduler.yml +18 -0
  116. data/config/spring.rb +8 -0
  117. data/lib/constants.rb +12 -0
  118. data/lib/container_broker.rb +30 -0
  119. data/lib/container_broker/engine.rb +6 -0
  120. data/lib/container_broker/version.rb +5 -0
  121. data/lib/current_thread_request_id.rb +19 -0
  122. data/lib/idempotent_request/callback.rb +25 -0
  123. data/lib/idempotent_request/policy.rb +15 -0
  124. data/lib/redis_url_parser.rb +25 -0
  125. data/lib/tasks/task.rake +34 -0
  126. metadata +590 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 021bb9a1ca3d028ae71c5640840335415923572f5427f4ae2ab0905cadd35fe4
4
+ data.tar.gz: e8af975eb58b008668e654fa0e1d0ca4b7ea70e9832ca788ed4842d021fe6f8d
5
+ SHA512:
6
+ metadata.gz: 963ddbcc97ebc45676eab092818f8f3e23d20be0e24f45aec26fb3c5f49db3ef7fe9342f6bc0488b7827b6d99bdb8bbba586f446a7cf25ba6b5613bc0f9fa35d
7
+ data.tar.gz: c59b5198d1b4fdc532ac1ea93cafd947ea04b35573f752b9c8ce6277c9a40a76fa1903ac774d98afc13693e846558056ef8d2ac0a771c9b3c26edb9a36675d36
data/README.md ADDED
@@ -0,0 +1,98 @@
1
+ # Container Broker
2
+
3
+ ## Installation
4
+
5
+ Add this line to your application's Gemfile:
6
+
7
+ ```ruby
8
+ gem 'container_broker'
9
+ ```
10
+
11
+ And then execute:
12
+
13
+ $ bundle install
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install container_broker
18
+
19
+ ## Usage
20
+
21
+ ### Routes
22
+
23
+ #### Nodes
24
+
25
+ ##### List
26
+ - `GET /nodes`
27
+
28
+ ##### Add
29
+ - `POST /nodes`
30
+ - Parameters: `{ "hostname": "", "slots_execution_types": {"cpu": 2, "network": 10} }`
31
+
32
+ ##### Update
33
+ - `PATCH /nodes/:uuid`
34
+ - Parameters: `{ "slots_execution_types": {"cpu": 2, "network": 10} }`
35
+
36
+ ##### Remove
37
+ - `DELETE /nodes/:uuid`
38
+ - Parameters: none (the node is identified by `:uuid` in the path)
39
+
40
+ #### Tasks
41
+
42
+ ##### Create
43
+ - `POST /tasks`
44
+ - Parameters: `{ "name": "", "image": "", "cmd": "", "storage_mounts": {}, "tags": {"type": "video"} }`
45
+ - Response:
46
+ ```json
47
+ {
48
+ "status": "ok|error",
49
+ "uuid": "2d272b5c-953c-44e9-ad15-6c31187903c9"
50
+ }
51
+ ```
52
+
53
+ ##### Task Details
54
+ - Shows information about a task
55
+ - Parameters (query string): `id`
56
+ - Response:
57
+ ```json
58
+ {
59
+ "uuid": "2d272b5c-953c-44e9-ad15-6c31187903c9",
60
+ "status": "waiting|running|completed|error"
61
+ }
62
+ ```
63
+
64
+ #### GET /status
65
+ - Shows pool information
66
+ - Response:
67
+ ```json
68
+ {
69
+ "nodes": [
70
+ {
71
+ "hostname": "",
72
+ "cores": 10,
73
+ "memory": 4096,
74
+ "jobs": [
75
+ {
76
+ "uuid": "2d272b5c-953c-44e9-ad15-6c31187903c9",
77
+ "status": "waiting|running|completed|error"
78
+ }
79
+ ]
80
+ }
81
+ ]
82
+ }
83
+ ```
84
+
85
+ ## Development
86
+
87
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
88
+
89
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
90
+
91
+ ### Expose Docker HTTP API on MacOSX:
92
+ ```shell
93
+ socat TCP-LISTEN:2376,reuseaddr,fork UNIX-CONNECT:/var/run/docker.sock
94
+ ```
95
+
96
+ ## Contributing
97
+
98
+ Bug reports and pull requests are welcome on GitHub at https://github.com/globocom/container-broker.
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Add your own tasks in files placed in lib/tasks ending in .rake,
4
+ # for example lib/tasks/capistrano.rake, and they will automatically be available to Rake.
5
+
6
+ require_relative 'config/application'
7
+
8
+ Rails.application.load_tasks
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
# Root controller of the broker's HTTP API. It builds on ActionController::API
# and adds no behaviour of its own; controllers such as HealthcheckController
# and NodesController inherit from it.
class ApplicationController < ActionController::API
  # CSRF protection is currently commented out:
  # protect_from_forgery with: :exception
end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
# Reports the health of the services the broker depends on as a JSON document.
class HealthcheckController < ApplicationController
  # Renders an "OK" document with the Redis versions reported through Sidekiq
  # and through the lock manager, plus a few MongoDB counters. Any
  # StandardError raised while gathering that data is rendered as an "ERROR"
  # document carrying the exception class and message.
  def index
    render json: healthy_report
  rescue StandardError => error
    render json: failure_report(error)
  end

  private

  # Builds the success payload; each lookup talks to a backing service and may raise.
  def healthy_report
    {
      status: "OK",
      sidekiq_redis: Sidekiq.redis_info["redis_version"],
      lock_manager_redis: LockManager.redis_client.info["redis_version"],
      mongodb: mongodb_counters
    }
  end

  # Document counts for nodes, slots and tasks whose status is "waiting".
  def mongodb_counters
    {
      nodes: Node.count,
      slots: Slot.count,
      pending_tasks: Task.where(status: "waiting").count
    }
  end

  # Builds the failure payload describing the rescued exception.
  def failure_report(error)
    {
      status: "ERROR",
      message: "#{error.class}: #{error.message}"
    }
  end
end
@@ -0,0 +1,70 @@
1
+ # frozen_string_literal: true
2
+
3
# JSON endpoints for listing, registering, updating and removing nodes, and
# for toggling task acceptance or killing the runners of a single node.
class NodesController < ApplicationController
  before_action :load_node, only: %i[show update destroy accept_new_tasks reject_new_tasks kill_containers]

  # Validation failures raised by `create!` / `update!` are answered with the
  # record's error messages and a 422.
  rescue_from Mongoid::Errors::Validations do |invalid|
    render json: invalid.record.errors.messages, status: :unprocessable_entity
  end

  # Renders every node.
  def index
    render json: Node.all
  end

  # Creates a node from the permitted params, refreshes the friendly node
  # names and enqueues a slot adjustment for the new node.
  def create
    @node = Node.create!(node_params)

    FriendlyNameNodes.new.perform
    enqueue_slot_adjustment

    render json: @node, status: :created
  end

  # Renders the node loaded by `load_node`.
  def show
    render json: @node
  end

  # Only `slots_execution_types` may be changed on an existing node; a slot
  # adjustment is enqueued afterwards.
  def update
    @node.update!(node_params.slice(:slots_execution_types))
    enqueue_slot_adjustment

    render json: @node
  end

  # Deletes the node and refreshes the friendly node names. Answers 406 when
  # DeleteNode raises NodeWithRunningSlotsError.
  def destroy
    DeleteNode.new(node: @node).perform
    FriendlyNameNodes.new.perform

    head :ok
  rescue DeleteNode::NodeWithRunningSlotsError
    head :not_acceptable
  end

  # Marks the node as rejecting new tasks.
  def reject_new_tasks
    task_acceptance.reject!

    head :ok
  end

  # Marks the node as accepting new tasks.
  def accept_new_tasks
    task_acceptance.accept!

    head :ok
  end

  # Asks KillNodeRunners to kill the runners of this node.
  def kill_containers
    KillNodeRunners.new(node: @node).perform

    head :ok
  end

  private

  # Looks the node up by the `uuid` request parameter; `find_by!` raises when
  # no node matches.
  def load_node
    @node = Node.find_by!(uuid: params[:uuid])
  end

  # Strong parameters: `hostname` plus an arbitrary `slots_execution_types` hash.
  def node_params
    params.require(:node).permit(:hostname, slots_execution_types: {})
  end

  # Enqueues the background job that adjusts the slots of the current node.
  def enqueue_slot_adjustment
    AdjustNodeSlotsJob.perform_later(node: @node)
  end

  # Service object used to toggle whether the current node accepts new tasks.
  def task_acceptance
    NodeTaskAcceptance.new(node: @node)
  end
end
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
# Reports whether any node is currently unavailable.
class NodesHealthcheckController < ApplicationController
  # Renders the overall status ("FAILING" when at least one node is
  # unavailable, "WORKING" otherwise) together with the serialized failed nodes.
  def index
    render json: {
      status: health_status,
      failed_nodes: failed_nodes.map { |node| NodeHealthcheckSerializer.new(node) }
    }
  end

  private

  # Unavailable nodes, loaded once per request. Materializing the result with
  # `to_a` means the status flag and the rendered list are computed from the
  # same set of documents instead of enumerating the scope twice.
  def failed_nodes
    @failed_nodes ||= Node.unavailable.to_a
  end

  def nodes_failed?
    failed_nodes.any?
  end

  # Named `health_status` rather than `status` so it does not shadow the
  # `status` reader that ActionController::Metal delegates to the response.
  def health_status
    nodes_failed? ? "FAILING" : "WORKING"
  end
end
@@ -0,0 +1,48 @@
1
+ # frozen_string_literal: true
2
+
3
# Read-mostly endpoints backing the status panel: nodes, recent tasks, known
# tags and their values, plus a manual task retry.
class StatusController < ApplicationController
  # Maximum number of tasks returned by `tasks` (also used as the batch size).
  LIMIT_TASKS = 200

  # Renders all nodes with their slots, ordered by name.
  def nodes
    nodes_by_name = Node.includes(:slots).order(name: :asc)

    render json: nodes_by_name, each_serializer: StatusPanelNodeSerializer
  end

  # Renders the most recently created tasks, optionally narrowed by the
  # `status` and `tags` request parameters.
  def tasks
    @tasks = recent_tasks
    @tasks = @tasks.where(status: params[:status]) if params[:status].present?
    @tasks = narrow_by_tags(@tasks) if params[:tags]

    render json: @tasks, each_serializer: StatusPanelTaskSerializer
  end

  # Renders the names of all known task tags.
  def tags
    @tags = TaskTag.pluck(:name)

    render json: @tags
  end

  # Renders the list returned by `Task.all_status`.
  def task_statuses
    render json: Task.all_status
  end

  # Renders up to 50 values of the tag named by the `tag_name` parameter.
  def tag_values
    @tag = TaskTag.find_by!(name: params[:tag_name])

    render json: @tag.values.take(50)
  end

  # Forces a retry of the task identified by the `uuid` parameter.
  def retry_task
    @task = Task.find_by!(uuid: params[:uuid])
    @task.force_retry!

    head :ok
  end

  private

  # Base query: every attribute except `logs`, slot included, newest first,
  # capped at LIMIT_TASKS.
  def recent_tasks
    Task
      .only(Task.attribute_names - %w[logs])
      .includes(:slot)
      .order_by("created_at" => "desc")
      .batch_size(LIMIT_TASKS)
      .limit(LIMIT_TASKS)
  end

  # Adds one `tags.<name> == value` condition per entry of the `tags`
  # parameter; values are compared as strings.
  def narrow_by_tags(scope)
    params.require(:tags).each do |tag, value|
      scope = scope.where("tags.#{tag}" => value.to_s)
    end

    scope
  end
end
@@ -0,0 +1,83 @@
1
+ # frozen_string_literal: true
2
+
3
# JSON endpoints for creating tasks, inspecting them and their logs, and for
# operator actions (clearing errors, marking as error, killing the container).
class TasksController < ApplicationController
  before_action :set_task, only: %i[show logs mark_as_error kill_container]
  before_action :set_request_id, only: %i[create]

  # Creates a task from the permitted params. On success a "tasks" metric is
  # counted and the task is rendered; otherwise the validation errors are
  # rendered with a 422.
  def create
    @task = Task.new(task_params)

    if @task.save
      Metrics.new("tasks").count(
        task_id: @task.id,
        name: @task.name,
        status: @task.status
      )

      render json: @task
    else
      render json: @task.errors, status: :unprocessable_entity
    end
  end

  # Renders the task loaded by `set_task`.
  def show
    render json: @task
  end

  # Renders the task logs re-encoded as UTF-8; characters that cannot be
  # represented are replaced with "?". `logs` is null when the task has none.
  def logs
    render json: { logs: @task.get_logs&.encode("utf-8", undef: :replace, replace: "?") }
  end

  # Destroys the tasks returned by the `Task.error` scope.
  def clear_errors
    Task.error.destroy
    head :ok
  end

  # Moves a failed task to the error state; any other task is rejected with a 422.
  def mark_as_error
    if @task.failed?
      @task.error!

      head :ok
    else
      render json: { message: "Task must have failed status to be marked as error" }, status: :unprocessable_entity
    end
  end

  # Kills the container of the task. Answers 400 with the error message when
  # KillTaskContainer reports that the task is not running.
  def kill_container
    KillTaskContainer.new(task: @task).perform

    head :ok
  rescue KillTaskContainer::TaskNotRunningError => e
    render json: { message: e.message }, status: :bad_request
  end

  private

  # Looks the task up by the `uuid` request parameter; `find_by!` raises when
  # no task matches.
  def set_task
    @task = Task.find_by!(uuid: params[:uuid])
  end

  # Strong parameters for task creation.
  def task_params
    params.require(:task).permit(
      :name,
      :image,
      :cmd,
      :persist_logs,
      :execution_type,
      storage_mounts: {},
      tags: {}
    ).tap do |permitted_params|
      # TODO: Remove after migrate encoder
      # Legacy clients send `ingest_storage_mount` (top-level or inside
      # `task`); it is mapped onto the "ingest-nfs" storage mount.
      if params.key?(:ingest_storage_mount) || params[:task].key?(:ingest_storage_mount)
        permitted_params[:storage_mounts] = {
          "ingest-nfs" => params[:ingest_storage_mount] || params.dig(:task, :ingest_storage_mount)
        }
      end
    end
  end

  # Stores the current request id in the task's tags before `create` runs.
  #
  # This callback runs before `task_params`, so it must validate the `task`
  # key itself: without the guard a request lacking `task` would fail with a
  # NoMethodError on nil instead of a parameter-missing error.
  def set_request_id
    task = params[:task]
    raise ActionController::ParameterMissing, :task unless task.is_a?(ActionController::Parameters)

    task[:tags] ||= {}
    task[:tags][:request_id] = request.request_id
  end
end
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
# Reports whether any task is currently in the failed state.
class TasksHealthcheckController < ApplicationController
  # Renders the overall status ("FAILING" when at least one task has failed,
  # "WORKING" otherwise) together with the number of failed tasks.
  def index
    render json: {
      status: health_status,
      failed_tasks_count: failed_tasks_count
    }
  end

  private

  # Number of failed tasks, counted once per request.
  def failed_tasks_count
    @failed_tasks_count ||= Task.failed.count
  end

  def tasks_failed?
    failed_tasks_count.positive?
  end

  # Named `health_status` rather than `status` so it does not shadow the
  # `status` reader that ActionController::Metal delegates to the response.
  def health_status
    tasks_failed? ? "FAILING" : "WORKING"
  end
end
@@ -0,0 +1,13 @@
1
+ # frozen_string_literal: true
2
+
3
# Background job that makes sure a TaskTag record exists for every tag name
# used by a task.
class AddTaskTagsJob < ContainerBrokerBaseJob
  extend RequestIdFromTask

  queue_as :default

  # @param task the task whose tag names should be registered
  def perform(task:)
    tag_names = task.tags.keys
    tag_names.each { |tag_name| register_tag(tag_name) }
  end

  private

  # Finds the TaskTag with this name or creates it; names are stored as strings.
  def register_tag(tag_name)
    TaskTag.find_or_create_by(name: tag_name.to_s)
  end
end
@@ -0,0 +1,27 @@
1
+ # frozen_string_literal: true
2
+
3
# Background job that runs AdjustExecutionTypeSlots for every execution type
# of a node while holding a lock keyed on that node.
class AdjustNodeSlotsJob < ContainerBrokerBaseJob
  queue_as :default

  # @param node the node whose slots should be adjusted; it is reloaded once
  #   the lock is held so the adjustment works on fresh data
  def perform(node:)
    lock_manager_instance(node).lock do
      node.reload
      all_execution_types(node).each { |execution_type| adjust_slots(node, execution_type) }
    end
  end

  private

  # Runs the slot adjustment for a single execution type of the node.
  def adjust_slots(node, execution_type)
    AdjustExecutionTypeSlots.new(node: node, execution_type: execution_type).perform
  end

  # Execution types configured on the node plus those of its existing slots,
  # without duplicates.
  def all_execution_types(node)
    configured_types = node.slots_execution_types.keys
    existing_slot_types = node.slots.map(&:execution_type)

    (configured_types + existing_slot_types).uniq
  end

  # Lock keyed on this job class and the node id, created with `wait: true`
  # and a one-minute `expire`.
  def lock_manager_instance(node)
    LockManager.new(
      type: self.class.to_s,
      id: node.id,
      wait: true,
      expire: 1.minute
    )
  end
end