inst-jobs-statsd 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. checksums.yaml +7 -0
  2. data/lib/inst-jobs-statsd.rb +26 -0
  3. data/lib/inst_jobs_statsd/default_tracking.rb +15 -0
  4. data/lib/inst_jobs_statsd/jobs_tracker.rb +25 -0
  5. data/lib/inst_jobs_statsd/naming.rb +49 -0
  6. data/lib/inst_jobs_statsd/stats/counters/failed.rb +28 -0
  7. data/lib/inst_jobs_statsd/stats/counters/orphaned.rb +34 -0
  8. data/lib/inst_jobs_statsd/stats/counters/run.rb +21 -0
  9. data/lib/inst_jobs_statsd/stats/counters.rb +10 -0
  10. data/lib/inst_jobs_statsd/stats/periodic/failed.rb +21 -0
  11. data/lib/inst_jobs_statsd/stats/periodic/queue.rb +49 -0
  12. data/lib/inst_jobs_statsd/stats/periodic/run.rb +38 -0
  13. data/lib/inst_jobs_statsd/stats/periodic.rb +91 -0
  14. data/lib/inst_jobs_statsd/stats/timing/failed.rb +17 -0
  15. data/lib/inst_jobs_statsd/stats/timing/perform.rb +29 -0
  16. data/lib/inst_jobs_statsd/stats/timing/pop.rb +28 -0
  17. data/lib/inst_jobs_statsd/stats/timing.rb +27 -0
  18. data/lib/inst_jobs_statsd/version.rb +3 -0
  19. data/spec/factories/jobs.rb +9 -0
  20. data/spec/factories/workers.rb +9 -0
  21. data/spec/gemfiles/42.gemfile +7 -0
  22. data/spec/gemfiles/42.gemfile.lock +201 -0
  23. data/spec/gemfiles/50.gemfile +7 -0
  24. data/spec/gemfiles/50.gemfile.lock +224 -0
  25. data/spec/gemfiles/51.gemfile +7 -0
  26. data/spec/gemfiles/51.gemfile.lock +224 -0
  27. data/spec/inst_jobs_statsd/jobs_tracker_spec.rb +30 -0
  28. data/spec/inst_jobs_statsd/naming_spec.rb +26 -0
  29. data/spec/inst_jobs_statsd/stats/counters/failed_spec.rb +20 -0
  30. data/spec/inst_jobs_statsd/stats/counters/orphaned_spec.rb +27 -0
  31. data/spec/inst_jobs_statsd/stats/counters/run_spec.rb +27 -0
  32. data/spec/inst_jobs_statsd/stats/periodic/failed_spec.rb +31 -0
  33. data/spec/inst_jobs_statsd/stats/periodic/queue_spec.rb +95 -0
  34. data/spec/inst_jobs_statsd/stats/periodic/run_spec.rb +53 -0
  35. data/spec/inst_jobs_statsd/stats/periodic_spec.rb +63 -0
  36. data/spec/inst_jobs_statsd/stats/timing/failed_spec.rb +25 -0
  37. data/spec/inst_jobs_statsd/stats/timing/perform_spec.rb +35 -0
  38. data/spec/inst_jobs_statsd/stats/timing/pop_spec.rb +34 -0
  39. data/spec/inst_jobs_statsd/stats/timing_spec.rb +35 -0
  40. data/spec/inst_statsd/default_tracking_spec.rb +16 -0
  41. data/spec/matchers.rb +3 -0
  42. data/spec/setup_test_db.rb +41 -0
  43. data/spec/spec_helper.rb +63 -0
  44. metadata +327 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: ea3cc2f583c0d355a0ab7440affd123343fa8de1
4
+ data.tar.gz: 7fca023059959468544941e582d9d805caf2f3e1
5
+ SHA512:
6
+ metadata.gz: 116fada163459dab67b7d70a00d0a8191438636bffe7fa5c95e25b06ad45a1ac8a60e28f628b50333821f109cdca3e182d75a1eb145af54b9992b48d1f067a46
7
+ data.tar.gz: fe822cd09c1421b0d694b2ea8255e34f47e98b402275e9ba4c4b262f1b953127a00f122b58376299971c02a2b1fec9305529927db5ad49a817df55ca70749ef7
@@ -0,0 +1,26 @@
1
+ require 'inst-jobs'
2
+ require 'inst_statsd'
3
+
4
+ require_relative 'inst_jobs_statsd/version'
5
+
6
+ require_relative 'inst_jobs_statsd/default_tracking'
7
+ require_relative 'inst_jobs_statsd/jobs_tracker'
8
+
9
+ require_relative 'inst_jobs_statsd/naming'
10
+
11
+ require_relative 'inst_jobs_statsd/stats/counters'
12
+ require_relative 'inst_jobs_statsd/stats/counters/failed'
13
+ require_relative 'inst_jobs_statsd/stats/counters/orphaned'
14
+ require_relative 'inst_jobs_statsd/stats/counters/run'
15
+
16
+ require_relative 'inst_jobs_statsd/stats/periodic'
17
+ require_relative 'inst_jobs_statsd/stats/periodic/failed'
18
+ require_relative 'inst_jobs_statsd/stats/periodic/queue'
19
+ require_relative 'inst_jobs_statsd/stats/periodic/run'
20
+
21
+ require_relative 'inst_jobs_statsd/stats/timing'
22
+ require_relative 'inst_jobs_statsd/stats/timing/failed'
23
+ require_relative 'inst_jobs_statsd/stats/timing/perform'
24
+ require_relative 'inst_jobs_statsd/stats/timing/pop'
25
+
# Mix the jobs hook into InstStatsd::DefaultTracking; the module's
# `included` callback then adds the `track_jobs` class method to it.
::InstStatsd::DefaultTracking.include(InstJobsStatsd::DefaultTracking)
@@ -0,0 +1,15 @@
# Defines InstStatsd::DefaultTracking.track_jobs
# to be consistent with InstStatsd::DefaultTracking.track_sql etc
module InstJobsStatsd
  module DefaultTracking
    # Class-level methods added to whatever includes DefaultTracking.
    module ClassMethods
      # Returns the JobsTracker for the including class, building it on
      # the first call and reusing the same instance afterwards.
      def track_jobs
        return @jobs_tracker if @jobs_tracker

        @jobs_tracker = JobsTracker.new
      end
    end

    # Ruby include hook: extends the including class with ClassMethods.
    def self.included(base)
      base.extend(ClassMethods)
    end
  end
end
@@ -0,0 +1,25 @@
module InstJobsStatsd
  # Instantiating a JobsTracker enables every jobs stat defined by this
  # gem: counters, periodic gauges and timings.
  class JobsTracker
    # Creates a tracker, runs the given block, and returns the tracker.
    #
    # The tracker is kept in @current_tracking only while the block runs.
    # The +ensure+ clause clears it even when the block raises, so a
    # failing block cannot leave a stale tracker behind.
    #
    # @yield the work to run while the tracker is current
    # @return [JobsTracker, nil] the value of @current_tracking when the
    #   block finished (normally the tracker created here)
    def self.track
      @current_tracking = new
      yield
      @current_tracking
    ensure
      @current_tracking = nil
    end

    # Enables each stat module in turn.
    def initialize
      Stats::Counters::Failed.enable
      Stats::Counters::Orphaned.enable
      Stats::Counters::Run.enable

      Stats::Periodic::Failed.enable
      Stats::Periodic::Queue.enable
      Stats::Periodic::Run.enable

      Stats::Timing::Failed.enable
      Stats::Timing::Perform.enable
      Stats::Timing::Pop.enable
    end
  end
end
@@ -0,0 +1,49 @@
module InstJobsStatsd
  # Builds the statsd stat names used for job metrics.
  module Naming
    BASENAME = 'delayedjob'.freeze

    # The root prefix for all stat names
    # TODO: Make this configurable
    def self.basename
      BASENAME
    end

    # Returns the stat names to report for +stat_name+: always the
    # basename-qualified name, followed by the per-job tagged name when
    # one can be derived from +job+.
    def self.qualified_names(stat_name, job)
      base = "#{basename}.#{stat_name}"
      tagged = tagged_stat(base, job)
      tagged.present? ? [base, tagged] : [base]
    end

    # Given a stat name, add a suffix to it to make it unique per job
    # type -- "<stat>.tag.<object>" plus ".<method>" when the job tag has
    # a method part. Returns nil when there is no job or no usable tag.
    def self.tagged_stat(stat_name, job)
      return unless job

      obj_tag, method_tag = job_tags(job)
      return if obj_tag.blank?

      parts = [stat_name, 'tag', obj_tag]
      parts << method_tag if method_tag.present?
      parts.join('.')
    end

    # Converts a job tag such as "Foo#bar" or "Foo.bar" into "Foo" and
    # "bar", escaping both so they are valid in statsd names ("::" becomes
    # "-"). Returns nil when there is no job, no tag, or the tag contains
    # "Class:0x"; otherwise an array of one or two strings.
    def self.job_tags(job)
      tag = job && job.tag
      return unless tag
      return if tag =~ /Class:0x/

      obj_tag, method_tag = tag.split(/[\.#]/, 2).map do |piece|
        InstStatsd::Statsd.escape(piece).gsub('::', '-')
      end

      method_tag.present? ? [obj_tag, method_tag] : [obj_tag]
    end
  end
end
@@ -0,0 +1,28 @@
module InstJobsStatsd
  module Stats
    module Counters
      # Counts failed jobs by hooking the creation of
      # Delayed::Job::Failed records.
      module Failed
        # Mixin that registers an after_create callback on the including
        # class; the callback reports the new record as one failed job.
        module AfterCreateHook
          def self.included(base)
            base.after_create do
              InstJobsStatsd::Stats::Counters::Failed.report_failed_count(self)
            end
          end
        end

        def self.enable
          enable_failed_count
        end

        # Includes AfterCreateHook into Delayed::Job::Failed, unless it is
        # already among that class's ancestors (so the callback is
        # registered only once).
        def self.enable_failed_count
          failed_class = Delayed::Job::Failed
          failed_class.include(AfterCreateHook) unless failed_class < AfterCreateHook
        end

        # Reports a count of 1 for the :failed stat, tagged with +job+.
        def self.report_failed_count(job)
          Counters.report_count(:failed, 1, job: job)
        end
      end
    end
  end
end
@@ -0,0 +1,34 @@
module InstJobsStatsd
  module Stats
    module Counters
      # Counts jobs that appear to have been orphaned by a worker.
      module Orphaned
        def self.enable
          enable_orphaned_count
        end

        # The idea of the orphaned count: when a worker is about to
        # perform a job, any other jobs with the same locked_by and
        # locked_at must have been orphaned, because they are not going to
        # be picked up and run by the worker -- the work queue is designed
        # to only have one job locked_by a worker at a time.
        # This is based on the symptom seen in AMS-447, where multiple
        # rows of the jobs table can be (incorrectly) updated by the same
        # query.
        def self.enable_orphaned_count
          Delayed::Worker.lifecycle.before(:perform) { |_worker, job| report_orphaned_count(job) }
        end

        # Counts the other jobs sharing +job+'s locked_by and locked_at and
        # reports that number as the :orphaned count. Nothing is reported
        # when the count is zero.
        def self.report_orphaned_count(job)
          orphans = Delayed::Job
                    .where('locked_by = ? AND locked_at = ? AND id <> ?', job.locked_by, job.locked_at, job.id)
                    .count
          return if orphans.zero?

          Counters.report_count(:orphaned, orphans, job: job)
        end
      end
    end
  end
end
@@ -0,0 +1,21 @@
module InstJobsStatsd
  module Stats
    module Counters
      # Counts every job that is about to be performed.
      module Run
        def self.enable
          enable_run_count
        end

        # Registers a before(:perform) lifecycle hook that reports the job.
        def self.enable_run_count
          Delayed::Worker.lifecycle.before(:perform) { |_worker, job| report_run_count(job) }
        end

        # Reports a count of 1 for the :run stat, tagged with +job+.
        def self.report_run_count(job)
          Counters.report_count(:run, 1, job: job)
        end
      end
    end
  end
end
@@ -0,0 +1,10 @@
module InstJobsStatsd
  module Stats
    # Helpers shared by the counter stats.
    module Counters
      # Sends +value+ as a statsd count for every name that
      # Naming.qualified_names derives from +stat+ and +job+.
      def self.report_count(stat, value, job: nil, sample_rate: 1)
        InstStatsd::Statsd.count(Naming.qualified_names(stat, job), value, sample_rate)
      end
    end
  end
end
@@ -0,0 +1,21 @@
module InstJobsStatsd
  module Stats
    module Periodic
      # Periodic gauge for the number of failed-job records.
      module Failed
        def self.enable
          enable_failed_depth
        end

        # Makes sure the periodic callbacks exist, then registers the
        # failed-depth reporter with them.
        def self.enable_failed_depth
          Periodic.enable_callbacks
          reporter = lambda { report_failed_depth }
          Periodic.add(reporter)
        end

        # Reports Delayed::Job::Failed.count as the :failed_depth gauge.
        def self.report_failed_depth
          Periodic.report_gauge(:failed_depth, Delayed::Job::Failed.count)
        end
      end
    end
  end
end
@@ -0,0 +1,49 @@
module InstJobsStatsd
  module Stats
    module Periodic
      # Periodic gauges describing jobs that are queued but not running.
      module Queue
        def self.enable
          enable_queue_depth
          enable_queue_age
        end

        # Registers the queue-depth reporter with the periodic callbacks.
        def self.enable_queue_depth
          Periodic.enable_callbacks
          Periodic.add(-> { report_queue_depth })
        end

        # Registers the queue-age reporter with the periodic callbacks.
        def self.enable_queue_age
          Periodic.enable_callbacks
          Periodic.add(-> { report_queue_age })
        end

        # Reports the number of queued (not running) jobs as :queue_depth.
        def self.report_queue_depth
          # count = Delayed::Job.jobs_count(:current) <-- includes running / locked
          Periodic.report_gauge(:queue_depth, queued_jobs_scope.count)
        end

        # Reports the summed and the maximum age (seconds since run_at) of
        # the queued jobs as :queue_age_total and :queue_age_max.
        #
        # Limit the jobs included in this gauge to prevent blowing up
        # memory usage in iterating the list.
        # This has the adverse effect of artificially capping this
        # metric, but the limit should be high enough so that the
        # metric still has a meaningful range -- and even if
        # the count is capped, the metric will continue to grow
        # if the queue is actually stalled
        def self.report_queue_age
          jobs_run_at = queued_jobs_scope.limit(10_000).pluck(:run_at)
          # Read the clock once so every age uses the same reference time.
          now = Delayed::Job.db_time_now
          age_secs = jobs_run_at.map { |run_at| now - run_at }
          Periodic.report_gauge(:queue_age_total, age_secs.sum)
          # Array#max is nil for an empty queue; report 0 rather than a nil gauge.
          Periodic.report_gauge(:queue_age_max, age_secs.max || 0)
        end

        # Current jobs that are unlocked or locked by 'on_hold'.
        def self.queued_jobs_scope
          Delayed::Job
            .current
            .where("locked_at IS NULL OR locked_by = 'on_hold'") # not running
        end
      end
    end
  end
end
@@ -0,0 +1,38 @@
module InstJobsStatsd
  module Stats
    module Periodic
      # Periodic gauges describing jobs that are currently running.
      module Run
        def self.enable
          enable_run_depth
          enable_run_age
        end

        # Registers the run-depth reporter with the periodic callbacks.
        def self.enable_run_depth
          Periodic.enable_callbacks
          Periodic.add(-> { report_run_depth })
        end

        # Registers the run-age reporter with the periodic callbacks.
        def self.enable_run_age
          Periodic.enable_callbacks
          Periodic.add(-> { report_run_age })
        end

        # Reports the number of running jobs as :run_depth.
        def self.report_run_depth
          Periodic.report_gauge(:run_depth, running_jobs_scope.count)
        end

        # Reports the summed and the maximum age (seconds since run_at) of
        # up to 10,000 running jobs as :run_age_total and :run_age_max.
        def self.report_run_age
          jobs_run_at = running_jobs_scope.limit(10_000).pluck(:run_at)
          # Read the clock once so every age uses the same reference time.
          now = Delayed::Job.db_time_now
          age_secs = jobs_run_at.map { |run_at| now - run_at }
          Periodic.report_gauge(:run_age_total, age_secs.sum)
          # Array#max is nil when nothing is running; report 0 rather than a nil gauge.
          Periodic.report_gauge(:run_age_max, age_secs.max || 0)
        end

        def self.running_jobs_scope
          Delayed::Job.running
        end
      end
    end
  end
end
@@ -0,0 +1,91 @@
module InstJobsStatsd
  module Stats
    # Support for stats that are reported on a timer rather than per job.
    module Periodic
      # Creates the shared Callbacks instance on first use and returns it.
      def self.enable_callbacks
        @instance ||= Callbacks.new
      end

      # Registers +proc+ to be called on every periodic run. Does nothing
      # (and returns nil) when enable_callbacks has not been called yet.
      def self.add(proc)
        @instance.add(proc) if @instance
      end

      # Sends +value+ as a statsd gauge for every name that
      # Naming.qualified_names derives from +stat+ and +job+.
      def self.report_gauge(stat, value, job: nil, sample_rate: 1)
        InstStatsd::Statsd.gauge(Naming.qualified_names(stat, job), value, sample_rate)
      end

      # Holds the registered reporters and runs them when the timer fires.
      class Callbacks
        def initialize(min_interval = 60)
          @timer = Timer.new(min_interval)
          @procs = []
          register_lifecycle
        end

        # Appends +proc+ to the list of reporters; nil is ignored.
        def add(proc)
          return unless proc

          @procs << proc
        end

        # This hooks into the lifecycle events such that it will
        # get triggered periodically with reasonable certainty
        # -- as long as the rest of the inst-jobs processing
        # system is working, anyway.
        # This allows for stats to be reported periodically
        # without having to start a separate sideband process.
        # It means that reporting of those stats will run
        # inline (in the same process and thread) as the
        # regular job processing work, but it also means
        # no need for an additional database connection, and
        # no managing of additional threads or processes.
        #
        # The :work_queue_pop event is used because in production
        # mode, the 'parent_process' work queue is typically
        # going to be used, and this callback runs in the parent
        # process -- as opposed to having this callback run in
        # each of the subordinate worker processes, which would
        # not be ideal. In a dev env with the 'in_process' work queue,
        # there's typically only going to be a single worker process
        # anyway, so it works just as well.
        def register_lifecycle
          Delayed::Worker.lifecycle.after(:work_queue_pop) do |_q, _c|
            @timer.tick { run }
          end
        end

        # Calls every registered reporter inside one statsd batch.
        def run
          InstStatsd::Statsd.batch do
            @procs.each { |reporter| reporter.call }
          end
        end
      end

      # Decides when at least +min_interval+ seconds have elapsed.
      class Timer
        def initialize(min_interval)
          @min_interval = min_interval * 1.0
          @start_time = Delayed::Job.db_time_now
          update_next_run
        end

        # This is called as often as possible, based on the lifecycle callbacks.
        # When the required interval of time has passed, execute the given block
        def tick
          return if Delayed::Job.db_time_now < @next_run

          # Schedule the next run before yielding to the block.
          update_next_run
          yield
        end

        private

        # Target the next run time based on the original start time,
        # instead of just adding the run interval, to prevent drift
        # from the target interval as much as possible
        def update_next_run
          elapsed = Delayed::Job.db_time_now - @start_time
          completed_intervals = (elapsed / @min_interval).floor
          @next_run = @start_time + (completed_intervals + 1) * @min_interval
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
module InstJobsStatsd
  module Stats
    module Timing
      # Reports a timing for jobs that raise an error.
      module Failed
        def self.enable
          enable_failure_timing
        end

        # Registers a before(:error) lifecycle hook that hands the job to
        # Timing.report_job_timing_failed.
        def self.enable_failure_timing
          Delayed::Worker.lifecycle.before(:error) do |_worker, job, _exception|
            Timing.report_job_timing_failed(job)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,29 @@
module InstJobsStatsd
  module Stats
    module Timing
      # Timings around the execution of a job.
      module Perform
        def self.enable
          enable_batching
          enable_perform_timing
        end

        # Wraps each perform in InstStatsd::Statsd.batch.
        def self.enable_batching
          Delayed::Worker.lifecycle.around(:perform) do |worker, job, &block|
            InstStatsd::Statsd.batch { block.call(worker, job) }
          end
        end

        # Reports the job's queue timing, then times the perform itself
        # under the :perform stat.
        def self.enable_perform_timing
          Delayed::Worker.lifecycle.around(:perform) do |worker, job, &block|
            Timing.report_job_timing_queued(job)
            Timing.report_timing(:perform, job: job) { block.call(worker, job) }
          end
        end
      end
    end
  end
end
@@ -0,0 +1,28 @@
module InstJobsStatsd
  module Stats
    module Timing
      # Timings around popping work from the queue.
      module Pop
        def self.enable
          enable_pop_timing
          enable_workqueue_pop_timing
        end

        # Times the :pop lifecycle event under the :pop stat.
        def self.enable_pop_timing
          Delayed::Worker.lifecycle.around(:pop) do |worker, &block|
            Timing.report_timing(:pop) { block.call(worker) }
          end
        end

        # Times the :work_queue_pop lifecycle event under the
        # :workqueuepop stat.
        def self.enable_workqueue_pop_timing
          Delayed::Worker.lifecycle.around(:work_queue_pop) do |worker, config, &block|
            Timing.report_timing(:workqueuepop) { block.call(worker, config) }
          end
        end
      end
    end
  end
end
@@ -0,0 +1,27 @@
module InstJobsStatsd
  module Stats
    # Helpers shared by the timing stats.
    module Timing
      # Reports a timing for every name that Naming.qualified_names derives
      # from +stat+ and +job+. With a block, the block is timed via
      # InstStatsd::Statsd.time; without one, +timing+ is sent as-is via
      # InstStatsd::Statsd.timing.
      def self.report_timing(stat, job: nil, timing: nil, sample_rate: 1)
        stats = Naming.qualified_names(stat, job)
        return InstStatsd::Statsd.timing(stats, timing, sample_rate) unless block_given?

        InstStatsd::Statsd.time(stats, sample_rate) { yield }
      end

      # Reports, as :queue, the milliseconds between the job's run_at and
      # now. Does nothing when +job+ is nil.
      def self.report_job_timing_queued(job)
        report_timing(:queue, job: job, timing: millis_since_run_at(job)) if job
      end

      # Reports, as :failed_after, the milliseconds between the job's
      # run_at and now. Does nothing when +job+ is nil.
      def self.report_job_timing_failed(job)
        report_timing(:failed_after, job: job, timing: millis_since_run_at(job)) if job
      end

      # Whole milliseconds elapsed between +job.run_at+ and
      # Delayed::Job.db_time_now.
      def self.millis_since_run_at(job)
        ((Delayed::Job.db_time_now - job.run_at) * 1000).round
      end
      private_class_method :millis_since_run_at
    end
  end
end
@@ -0,0 +1,3 @@
module InstJobsStatsd
  # Version string of the gem.
  VERSION = '1.0.0'.freeze
end
@@ -0,0 +1,9 @@
FactoryGirl.define do
  # Plain object exposing the tag and run_at accessors; built by the
  # :job_fixture factory (also available as :job).
  class JobFixture
    attr_accessor :tag, :run_at
  end

  factory(:job_fixture, aliases: %i[job]) do
    tag('Test::Job.perform')
  end
end
@@ -0,0 +1,9 @@
FactoryGirl.define do
  # Plain object exposing a name accessor; built by the :worker_fixture
  # factory (also available as :worker).
  class WorkerFixture
    attr_accessor :name
  end

  factory(:worker_fixture, aliases: %i[worker]) do
    # Each built worker gets the next name in the sequence: worker-<n>.
    sequence(:name) do |n|
      "worker-#{n}"
    end
  end
end
@@ -0,0 +1,7 @@
# Gemfile for running the specs against Rails 4.2.
source 'https://rubygems.org'

gemspec path: '../../'

gem 'rails', '~> 4.2.5'
gem 'after_transaction_commit', '<2'
gem 'test_after_commit', '0.4.1'