container_broker 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +98 -0
  3. data/Rakefile +8 -0
  4. data/app/controllers/application_controller.rb +5 -0
  5. data/app/controllers/healthcheck_controller.rb +21 -0
  6. data/app/controllers/nodes_controller.rb +70 -0
  7. data/app/controllers/nodes_healthcheck_controller.rb +28 -0
  8. data/app/controllers/status_controller.rb +48 -0
  9. data/app/controllers/tasks_controller.rb +83 -0
  10. data/app/controllers/tasks_healthcheck_controller.rb +28 -0
  11. data/app/jobs/add_task_tags_job.rb +13 -0
  12. data/app/jobs/adjust_node_slots_job.rb +27 -0
  13. data/app/jobs/application_job.rb +9 -0
  14. data/app/jobs/collect_load_metrics_job.rb +9 -0
  15. data/app/jobs/container_broker_base_job.rb +32 -0
  16. data/app/jobs/migrate_tasks_from_dead_node_job.rb +32 -0
  17. data/app/jobs/monitor_unresponsive_node_job.rb +21 -0
  18. data/app/jobs/monitor_unresponsive_nodes_job.rb +9 -0
  19. data/app/jobs/release_slot_job.rb +47 -0
  20. data/app/jobs/remove_runner_job.rb +11 -0
  21. data/app/jobs/remove_unused_tags_job.rb +25 -0
  22. data/app/jobs/request_id_from_task.rb +7 -0
  23. data/app/jobs/run_task_job.rb +64 -0
  24. data/app/jobs/run_tasks_for_all_execution_types_job.rb +11 -0
  25. data/app/jobs/run_tasks_job.rb +42 -0
  26. data/app/jobs/timeout_failed_tasks_job.rb +31 -0
  27. data/app/jobs/update_all_nodes_status_job.rb +9 -0
  28. data/app/jobs/update_node_status_job.rb +24 -0
  29. data/app/jobs/update_task_status_job.rb +71 -0
  30. data/app/models/mongoid_serializable_model.rb +14 -0
  31. data/app/models/node.rb +101 -0
  32. data/app/models/slot.rb +42 -0
  33. data/app/models/task.rb +148 -0
  34. data/app/models/task_tag.rb +11 -0
  35. data/app/observers/observable.rb +23 -0
  36. data/app/observers/task_observer.rb +11 -0
  37. data/app/serializers/node_healthcheck_serializer.rb +5 -0
  38. data/app/serializers/node_serializer.rb +5 -0
  39. data/app/serializers/status_panel_node_serializer.rb +9 -0
  40. data/app/serializers/status_panel_slot_serializer.rb +5 -0
  41. data/app/serializers/status_panel_task_serializer.rb +16 -0
  42. data/app/serializers/task_healthcheck_serializer.rb +5 -0
  43. data/app/serializers/task_serializer.rb +7 -0
  44. data/app/services/adjust_execution_type_slots.rb +51 -0
  45. data/app/services/check_for_slot_removal.rb +28 -0
  46. data/app/services/collect_load_metrics.rb +40 -0
  47. data/app/services/delete_node.rb +25 -0
  48. data/app/services/friendly_name_nodes.rb +10 -0
  49. data/app/services/friendly_name_slots.rb +15 -0
  50. data/app/services/kill_node_runners.rb +17 -0
  51. data/app/services/kill_task_container.rb +29 -0
  52. data/app/services/kubernetes_client.rb +136 -0
  53. data/app/services/least_used_node.rb +44 -0
  54. data/app/services/lock_manager.rb +74 -0
  55. data/app/services/lock_slot.rb +37 -0
  56. data/app/services/lock_task.rb +45 -0
  57. data/app/services/metrics.rb +43 -0
  58. data/app/services/migrate_runner.rb +26 -0
  59. data/app/services/node_task_acceptance.rb +18 -0
  60. data/app/services/node_usage_percentage_per_execution_type.rb +22 -0
  61. data/app/services/reschedule_tasks_for_missing_runners.rb +70 -0
  62. data/app/services/runners.rb +4 -0
  63. data/app/services/runners/docker/create_connection.rb +18 -0
  64. data/app/services/runners/docker/create_execution_info.rb +87 -0
  65. data/app/services/runners/docker/fetch_execution_info.rb +17 -0
  66. data/app/services/runners/docker/fetch_logs.rb +18 -0
  67. data/app/services/runners/docker/fetch_task_container.rb +15 -0
  68. data/app/services/runners/docker/filer.rb +19 -0
  69. data/app/services/runners/docker/kill_slot_runner.rb +19 -0
  70. data/app/services/runners/docker/node_availability.rb +11 -0
  71. data/app/services/runners/docker/remove_runner.rb +18 -0
  72. data/app/services/runners/docker/run_task.rb +63 -0
  73. data/app/services/runners/docker/update_node_status.rb +62 -0
  74. data/app/services/runners/execution_info.rb +49 -0
  75. data/app/services/runners/invalid_config.rb +5 -0
  76. data/app/services/runners/invalid_runner.rb +5 -0
  77. data/app/services/runners/kubernetes/create_client.rb +29 -0
  78. data/app/services/runners/kubernetes/create_execution_info.rb +103 -0
  79. data/app/services/runners/kubernetes/fetch_execution_info.rb +15 -0
  80. data/app/services/runners/kubernetes/fetch_logs.rb +17 -0
  81. data/app/services/runners/kubernetes/filer.rb +41 -0
  82. data/app/services/runners/kubernetes/kill_slot_runner.rb +11 -0
  83. data/app/services/runners/kubernetes/node_availability.rb +11 -0
  84. data/app/services/runners/kubernetes/remove_runner.rb +19 -0
  85. data/app/services/runners/kubernetes/run_task.rb +54 -0
  86. data/app/services/runners/kubernetes/update_node_status.rb +64 -0
  87. data/app/services/runners/runner_id_not_found_error.rb +5 -0
  88. data/app/services/runners/services_factory.rb +38 -0
  89. data/app/services/runners/update_node_status_helper.rb +43 -0
  90. data/app/services/slots_usage_percentage.rb +18 -0
  91. data/config/application.rb +34 -0
  92. data/config/boot.rb +5 -0
  93. data/config/environment.rb +7 -0
  94. data/config/environments/test.rb +44 -0
  95. data/config/initializers/application_controller_renderer.rb +10 -0
  96. data/config/initializers/backtrace_silencers.rb +9 -0
  97. data/config/initializers/config.rb +51 -0
  98. data/config/initializers/cookies_serializer.rb +7 -0
  99. data/config/initializers/docker_config.rb +3 -0
  100. data/config/initializers/filter_parameter_logging.rb +6 -0
  101. data/config/initializers/idempotent_request.rb +12 -0
  102. data/config/initializers/inflections.rb +18 -0
  103. data/config/initializers/mime_types.rb +6 -0
  104. data/config/initializers/mongoid.rb +3 -0
  105. data/config/initializers/new_framework_defaults_6_0.rb +47 -0
  106. data/config/initializers/raven.rb +10 -0
  107. data/config/initializers/sidekiq.rb +24 -0
  108. data/config/initializers/wrap_parameters.rb +16 -0
  109. data/config/locales/en.yml +33 -0
  110. data/config/mongoid.yml +10 -0
  111. data/config/routes.rb +43 -0
  112. data/config/secrets.yml +35 -0
  113. data/config/settings.yml +34 -0
  114. data/config/settings/test.yml +27 -0
  115. data/config/sidekiq_scheduler.yml +18 -0
  116. data/config/spring.rb +8 -0
  117. data/lib/constants.rb +12 -0
  118. data/lib/container_broker.rb +30 -0
  119. data/lib/container_broker/engine.rb +6 -0
  120. data/lib/container_broker/version.rb +5 -0
  121. data/lib/current_thread_request_id.rb +19 -0
  122. data/lib/idempotent_request/callback.rb +25 -0
  123. data/lib/idempotent_request/policy.rb +15 -0
  124. data/lib/redis_url_parser.rb +25 -0
  125. data/lib/tasks/task.rake +34 -0
  126. metadata +590 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 021bb9a1ca3d028ae71c5640840335415923572f5427f4ae2ab0905cadd35fe4
4
+ data.tar.gz: e8af975eb58b008668e654fa0e1d0ca4b7ea70e9832ca788ed4842d021fe6f8d
5
+ SHA512:
6
+ metadata.gz: 963ddbcc97ebc45676eab092818f8f3e23d20be0e24f45aec26fb3c5f49db3ef7fe9342f6bc0488b7827b6d99bdb8bbba586f446a7cf25ba6b5613bc0f9fa35d
7
+ data.tar.gz: c59b5198d1b4fdc532ac1ea93cafd947ea04b35573f752b9c8ce6277c9a40a76fa1903ac774d98afc13693e846558056ef8d2ac0a771c9b3c26edb9a36675d36
data/README.md ADDED
@@ -0,0 +1,98 @@
1
+ # Container Broker
2
+
3
+ ## Installation
4
+
5
+ Add this line to your application's Gemfile:
6
+
7
+ ```ruby
8
+ gem 'container_broker'
9
+ ```
10
+
11
+ And then execute:
12
+
13
+ $ bundle install
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install container_broker
18
+
19
+ ## Usage
20
+
21
+ ### Routes
22
+
23
+ #### Nodes
24
+
25
+ ##### List
26
+ - `GET /nodes`
27
+
28
+ ##### Add
29
+ - `POST /nodes`
30
+ - Parameters: `{ "hostname": "", "slots_execution_types": {"cpu": 2, "network": 10} }`
31
+
32
+ ##### Update
33
+ - `PATCH /nodes/:uuid`
34
+ - Parameters: `{ "slots_execution_types": {"cpu": 2, "network": 10} }`
35
+
36
+ ##### Remove
37
+ - `DELETE /nodes/:uuid`
38
+ - Parameters: none (the node is identified by `:uuid` in the path)
39
+
40
+ #### Tasks
41
+
42
+ ##### Create
43
+ - `POST /tasks`
44
+ - Parameters: `{ "name": "", "image": "", "cmd": "", "storage_mounts": "{}", "tags": {"type": "video"} }`
45
+ - Response:
46
+ ```json
47
+ {
48
+ "status": "ok|error",
49
+ "uuid": "2d272b5c-953c-44e9-ad15-6c31187903c9"
50
+ }
51
+ ```
52
+
53
+ ##### Task Details
54
+ - Shows information about a single task
55
+ - Parameters (query string): `id`
56
+ - Response:
57
+ ```json
58
+ {
59
+ "uuid": "2d272b5c-953c-44e9-ad15-6c31187903c9",
60
+ "status": "waiting|running|completed|error"
61
+ }
62
+ ```
63
+
64
+ #### GET /status
65
+ - Shows pool information
66
+ - Response:
67
+ ```json
68
+ {
69
+ "nodes": [
70
+ {
71
+ "hostname": "",
72
+ "cores": 10,
73
+ "memory": 4096,
74
+ "jobs": [
75
+ {
76
+ "uuid": "2d272b5c-953c-44e9-ad15-6c31187903c9",
77
+ "status": "waiting|running|completed|error"
78
+ }
79
+ ]
80
+ }
81
+ ]
82
+ }
83
+ ```
84
+
85
+ ## Development
86
+
87
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
88
+
89
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
90
+
91
+ ### Expose Docker HTTP API on MacOSX:
92
+ ```shell
93
+ socat TCP-LISTEN:2376,reuseaddr,fork UNIX-CONNECT:/var/run/docker.sock
94
+ ```
95
+
96
+ ## Contributing
97
+
98
+ Bug reports and pull requests are welcome on GitHub at https://github.com/globocom/container-broker.
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Add your own tasks in files placed in lib/tasks ending in .rake,
4
+ # for example lib/tasks/capistrano.rake, and they will automatically be available to Rake.
5
+
6
+ require_relative 'config/application'
7
+
8
+ Rails.application.load_tasks
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
# Common ancestor of the controllers in this engine.
# It inherits from ActionController::API and adds no behavior of its own.
class ApplicationController < ActionController::API
  # protect_from_forgery with: :exception
end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
# Overall health endpoint: probes the Sidekiq Redis, the LockManager Redis and
# MongoDB, and reports the outcome as JSON.
class HealthcheckController < ApplicationController
  # Renders the probe results, or an "ERROR" payload carrying the exception
  # class and message when any probe (or the render itself) raises.
  # Neither branch passes a status option to `render`.
  def index
    render json: probe_results
  rescue StandardError => e
    render json: { status: "ERROR", message: "#{e.class}: #{e.message}" }
  end

  private

  # Builds the success payload. Each dependency is queried in turn, so the
  # first unreachable one raises and the payload is never returned.
  def probe_results
    sidekiq_version = Sidekiq.redis_info["redis_version"]
    lock_manager_version = LockManager.redis_client.info["redis_version"]
    mongodb_counts = {
      nodes: Node.count,
      slots: Slot.count,
      pending_tasks: Task.where(status: "waiting").count
    }

    {
      status: "OK",
      sidekiq_redis: sidekiq_version,
      lock_manager_redis: lock_manager_version,
      mongodb: mongodb_counts
    }
  end
end
@@ -0,0 +1,70 @@
1
+ # frozen_string_literal: true
2
+
3
# JSON endpoints for registering, inspecting, updating and removing nodes, plus
# operational actions (accept/reject new tasks, kill runners) on a single node.
class NodesController < ApplicationController
  before_action :load_node, only: %i[update show destroy accept_new_tasks reject_new_tasks kill_containers]

  # Validation failures raised by the bang persistence calls below are
  # answered with the record's error messages and a 422.
  rescue_from Mongoid::Errors::Validations do |error|
    render json: error.record.errors.messages, status: :unprocessable_entity
  end

  # Lists every node.
  def index
    render json: Node.all
  end

  # Creates a node, refreshes the friendly names and enqueues the slot
  # adjustment job for the new node.
  def create
    @node = Node.create!(node_params)

    FriendlyNameNodes.new.perform
    enqueue_slot_adjustment

    render json: @node, status: :created
  end

  # Renders the node loaded by `load_node`.
  def show
    render json: @node
  end

  # Only `slots_execution_types` may be changed on an existing node; any other
  # permitted attribute is sliced away before the update.
  def update
    slot_settings = node_params.slice(:slots_execution_types)
    @node.update!(slot_settings)
    enqueue_slot_adjustment

    render json: @node
  end

  # Deletes the node and refreshes friendly names. Responds with
  # `:not_acceptable` when DeleteNode raises NodeWithRunningSlotsError.
  def destroy
    DeleteNode.new(node: @node).perform
    FriendlyNameNodes.new.perform

    head :ok
  rescue DeleteNode::NodeWithRunningSlotsError
    head :not_acceptable
  end

  # Calls `reject!` on the node's NodeTaskAcceptance.
  def reject_new_tasks
    task_acceptance.reject!

    head :ok
  end

  # Calls `accept!` on the node's NodeTaskAcceptance.
  def accept_new_tasks
    task_acceptance.accept!

    head :ok
  end

  # Runs KillNodeRunners for the node.
  def kill_containers
    KillNodeRunners.new(node: @node).perform

    head :ok
  end

  private

  # Looks the node up by the `uuid` request parameter; `find_by!` raises when
  # there is no match.
  def load_node
    @node = Node.find_by!(uuid: params[:uuid])
  end

  # Strong parameters: `hostname` plus an arbitrary `slots_execution_types` hash,
  # both nested under the required `node` key.
  def node_params
    params.require(:node).permit(:hostname, slots_execution_types: {})
  end

  # Enqueues AdjustNodeSlotsJob for the current node.
  def enqueue_slot_adjustment
    AdjustNodeSlotsJob.perform_later(node: @node)
  end

  # Service object used by the accept/reject actions.
  def task_acceptance
    NodeTaskAcceptance.new(node: @node)
  end
end
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
# Health endpoint for nodes: reports "FAILING" together with the serialized
# unavailable nodes when any exist, otherwise "WORKING" with an empty list.
class NodesHealthcheckController < ApplicationController
  # Renders the overall status and the nodes returned by `Node.unavailable`.
  def index
    render json: {
      status: health_status,
      failed_nodes: failed_nodes.map { |node| NodeHealthcheckSerializer.new(node) }
    }
  end

  private

  # Memoized `Node.unavailable` result, shared by the status check and the
  # serialized list.
  def failed_nodes
    @failed_nodes ||= Node.unavailable
  end

  # True when at least one unavailable node was found.
  def nodes_failed?
    failed_nodes.to_a.any?
  end

  # Status label for the response body.
  # Deliberately not named `status`: ActionController::Metal delegates `status`
  # to the response object, and a helper with that name would shadow it.
  def health_status
    nodes_failed? ? "FAILING" : "WORKING"
  end
end
@@ -0,0 +1,48 @@
1
+ # frozen_string_literal: true
2
+
3
# Read-mostly endpoints backing the status panel: nodes with their slots,
# recent tasks with optional filters, tag metadata, and a manual task retry.
class StatusController < ApplicationController
  # Upper bound on the number of tasks returned by `tasks`; also used as the
  # query batch size.
  LIMIT_TASKS = 200

  # Lists nodes (slots eager-loaded) ordered by name.
  def nodes
    render json: Node.includes(:slots).order(name: :asc), each_serializer: StatusPanelNodeSerializer
  end

  # Lists the newest tasks, optionally narrowed by the `status` parameter and
  # by each key/value pair given under `tags`.
  def tasks
    @tasks = filter_by_tags(filter_by_status(recent_tasks))

    render json: @tasks, each_serializer: StatusPanelTaskSerializer
  end

  # Renders the names of all known task tags.
  def tags
    @tags = TaskTag.pluck(:name)
    render json: @tags
  end

  # Renders `Task.all_status`.
  def task_statuses
    render json: Task.all_status
  end

  # Renders at most 50 values of the tag named by `tag_name`.
  def tag_values
    @tag = TaskTag.find_by!(name: params[:tag_name])
    render json: @tag.values.take(50)
  end

  # Calls `force_retry!` on the task identified by `uuid`.
  def retry_task
    @task = Task.find_by!(uuid: params[:uuid])
    @task.force_retry!

    head :ok
  end

  private

  # Base criteria: every attribute except `logs`, slot eager-loaded, newest
  # first, capped at LIMIT_TASKS.
  def recent_tasks
    listed_attributes = Task.attribute_names - %w[logs]

    Task
      .only(listed_attributes)
      .includes(:slot)
      .order_by("created_at" => "desc")
      .batch_size(LIMIT_TASKS)
      .limit(LIMIT_TASKS)
  end

  # Applies the status filter only when the parameter is present.
  def filter_by_status(criteria)
    return criteria unless params[:status].present?

    criteria.where(status: params[:status])
  end

  # Adds one `tags.<name>` condition per requested tag; values are compared as
  # strings.
  def filter_by_tags(criteria)
    return criteria unless params[:tags]

    params.require(:tags).each do |tag, value|
      criteria = criteria.where("tags.#{tag}" => value.to_s)
    end

    criteria
  end
end
@@ -0,0 +1,83 @@
1
+ # frozen_string_literal: true
2
+
3
# JSON endpoints for creating tasks, reading their details and logs, and
# operational actions (clear errored tasks, mark as error, kill the container).
class TasksController < ApplicationController
  before_action :set_task, only: %i[show logs mark_as_error kill_container]
  before_action :set_request_id, only: %i[create]

  # Persists a new task. On success a "tasks" metric is counted and the task is
  # rendered; otherwise the validation errors are rendered with a 422.
  def create
    @task = Task.new(task_params)

    if @task.save
      Metrics.new("tasks").count(
        task_id: @task.id,
        name: @task.name,
        status: @task.status
      )

      render json: @task
    else
      render json: @task.errors, status: :unprocessable_entity
    end
  end

  # Renders the task loaded by `set_task`.
  def show
    render json: @task
  end

  # Renders the task logs re-encoded as UTF-8; characters that cannot be
  # converted are replaced with "?". `logs` is nil when `get_logs` returns nil.
  def logs
    render json: { logs: @task.get_logs&.encode("utf-8", undef: :replace, replace: "?") }
  end

  # Destroys the tasks selected by `Task.error`.
  def clear_errors
    Task.error.destroy
    head :ok
  end

  # Moves a failed task to the error state; any other state is rejected with a 422.
  def mark_as_error
    if @task.failed?
      @task.error!

      head :ok
    else
      render json: { message: "Task must have failed status to be marked as error" }, status: :unprocessable_entity
    end
  end

  # Runs KillTaskContainer for the task; responds with a 400 and the error
  # message when it raises TaskNotRunningError.
  def kill_container
    KillTaskContainer.new(task: @task).perform

    head :ok
  rescue KillTaskContainer::TaskNotRunningError => e
    render json: { message: e.message }, status: :bad_request
  end

  private

  # Looks the task up by the `uuid` request parameter; `find_by!` raises when
  # there is no match.
  def set_task
    @task = Task.find_by!(uuid: params[:uuid])
  end

  # Strong parameters for task creation, nested under the required `task` key.
  # Raises ActionController::ParameterMissing when `task` is absent.
  def task_params
    params.require(:task).permit(
      :name,
      :image,
      :cmd,
      :persist_logs,
      :execution_type,
      storage_mounts: {},
      tags: {}
    ).tap do |permitted_params|
      # TODO: Remove after migrate encoder
      # Legacy clients send `ingest_storage_mount` (top-level or inside `task`);
      # it replaces `storage_mounts` with a single "ingest-nfs" entry.
      if params.key?(:ingest_storage_mount) || params[:task].key?(:ingest_storage_mount)
        permitted_params[:storage_mounts] = {
          "ingest-nfs" => params[:ingest_storage_mount] || params.dig(:task, :ingest_storage_mount)
        }
      end
    end
  end

  # Stores the current request id under the task's `request_id` tag, creating
  # the `tags` hash when the client sent none.
  def set_request_id
    task_payload = params[:task]
    # Without a nested `task` payload there is nothing to tag. Returning here
    # lets `task_params` report the problem through `params.require(:task)`
    # instead of this callback failing with NoMethodError on nil.
    return unless task_payload.is_a?(ActionController::Parameters)

    task_payload[:tags] ||= {}
    task_payload[:tags][:request_id] = request.request_id
  end
end
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
# Health endpoint for tasks: reports "FAILING" when `Task.failed` is non-empty,
# otherwise "WORKING", together with the number of failed tasks.
class TasksHealthcheckController < ApplicationController
  # Renders the overall status and the failed-task count.
  def index
    render json: {
      status: health_status,
      failed_tasks_count: failed_tasks_count
    }
  end

  private

  # Memoized `Task.failed.count`, so the count is queried once per request.
  def failed_tasks_count
    @failed_tasks_count ||= Task.failed.count
  end

  # True when at least one failed task exists.
  def tasks_failed?
    failed_tasks_count.positive?
  end

  # Status label for the response body.
  # Deliberately not named `status`: ActionController::Metal delegates `status`
  # to the response object, and a helper with that name would shadow it.
  def health_status
    tasks_failed? ? "FAILING" : "WORKING"
  end
end
@@ -0,0 +1,13 @@
1
+ # frozen_string_literal: true
2
+
3
# Registers the tag names used by a task, so that a TaskTag document exists for
# each of them.
class AddTaskTagsJob < ContainerBrokerBaseJob
  extend RequestIdFromTask

  queue_as :default

  # @param task the task whose `tags` keys should be registered
  def perform(task:)
    tag_names = task.tags.keys

    # find_or_create_by leaves already-registered names untouched.
    tag_names.each { |tag_name| TaskTag.find_or_create_by(name: tag_name.to_s) }
  end
end
@@ -0,0 +1,27 @@
1
+ # frozen_string_literal: true
2
+
3
# Runs AdjustExecutionTypeSlots for every execution type relevant to a node,
# while holding a per-node lock.
class AdjustNodeSlotsJob < ContainerBrokerBaseJob
  queue_as :default

  # @param node the node whose slots should be adjusted
  def perform(node:)
    node_lock(node).lock do
      # Reload inside the lock so the adjustment works on the current state.
      node.reload

      execution_types_for(node).each do |type|
        AdjustExecutionTypeSlots.new(node: node, execution_type: type).perform
      end
    end
  end

  private

  # Execution types configured on the node plus those of its existing slots,
  # without duplicates. Types that only appear on slots are included too.
  def execution_types_for(node)
    configured_types = node.slots_execution_types.keys
    slot_types = node.slots.map(&:execution_type)

    (configured_types + slot_types).uniq
  end

  # Lock keyed by this job class and the node id; created with `wait: true`
  # and a one-minute expiry.
  def node_lock(node)
    LockManager.new(type: self.class.to_s, id: node.id, wait: true, expire: 1.minute)
  end
end