postjob 0.4.5 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46)
  1. checksums.yaml +4 -4
  2. data/lib/postjob.rb +22 -13
  3. data/lib/postjob/cli/events.rb +60 -0
  4. data/lib/postjob/cli/heartbeat.rb +55 -0
  5. data/lib/postjob/cli/hosts.rb +67 -0
  6. data/lib/postjob/cli/ps.rb +1 -13
  7. data/lib/postjob/cli/sessions.rb +83 -0
  8. data/lib/postjob/job.rb +4 -15
  9. data/lib/postjob/migrations/003_postjobs.sql +10 -8
  10. data/lib/postjob/migrations/003b_processing_columns.sql +8 -8
  11. data/lib/postjob/migrations/005_helpers.sql +3 -1
  12. data/lib/postjob/migrations/006_enqueue.sql +3 -0
  13. data/lib/postjob/migrations/006a_processing.sql +6 -26
  14. data/lib/postjob/migrations/007_job_results.sql +32 -13
  15. data/lib/postjob/migrations/008_checkout_runnable.sql +15 -21
  16. data/lib/postjob/migrations/008a_childjobs.sql +13 -0
  17. data/lib/postjob/migrations/010_settings.sql +18 -3
  18. data/lib/postjob/migrations/011_null_uuid.sql +7 -0
  19. data/lib/postjob/migrations/012_hosts.sql +42 -0
  20. data/lib/postjob/migrations/013_worker_sessions.sql +44 -0
  21. data/lib/postjob/migrations/014_postjob_session_id.sql +17 -0
  22. data/lib/postjob/migrations/015_events.sql +76 -0
  23. data/lib/postjob/migrations/016_sessions_functions.sql +16 -0
  24. data/lib/postjob/migrations/017_zombie_check.sql +58 -0
  25. data/lib/postjob/migrations/018_heartbeat.sql +28 -0
  26. data/lib/postjob/migrations/019_heartbeat_indices.sql +5 -0
  27. data/lib/postjob/queue.rb +41 -27
  28. data/lib/postjob/queue/notifications.rb +5 -4
  29. data/lib/postjob/queue/search.rb +2 -0
  30. data/lib/postjob/queue/settings.rb +11 -1
  31. data/lib/postjob/record.rb +17 -0
  32. data/lib/postjob/runner.rb +9 -2
  33. data/lib/postjob/worker_session.rb +76 -0
  34. data/lib/postjob/workflow.rb +0 -4
  35. data/lib/tools/atomic_store.rb +17 -0
  36. data/lib/tools/heartbeat.rb +151 -0
  37. data/lib/tools/history.rb +25 -0
  38. data/spec/postjob/events/heartbeat_event_spec.rb +85 -0
  39. data/spec/postjob/events/job_event_spec.rb +80 -0
  40. data/spec/postjob/job_control/max_attempts_spec.rb +0 -2
  41. data/spec/postjob/queue/search_spec.rb +0 -14
  42. data/spec/postjob/worker_session_spec.rb +41 -0
  43. data/spec/spec_helper.rb +9 -0
  44. data/spec/support/test_helper.rb +11 -1
  45. metadata +43 -3
  46. data/spec/postjob/job_control/workflow_status_spec.rb +0 -52
@@ -34,6 +34,8 @@ module Postjob::Runner
34
34
  # returns a subjob within the current job, for a +runner+
35
35
  # description and +args+.
36
36
  def async(workflow, *args, timeout: nil, max_attempts:)
37
+ worker_session_id = Postjob.current_worker_session.id
38
+
37
39
  # if the workflow is a symbol, then we change it into "__manual__"
38
40
  # - there should never be a workflow with that name - or into
39
41
  # "CurrentWorkshop.#{workflow}", denoting the \a workflow method of the
@@ -47,7 +49,7 @@ module Postjob::Runner
47
49
  raise ArgumentError, "Unsupported workflow spec #{workflow.inspect}. Did you run await(fun(a, b)) instead of await(:fun, a, b)"
48
50
  end
49
51
 
50
- ::Postjob::Queue.find_or_create_childjob(self.current_job, workflow, args,
52
+ ::Postjob::Queue.find_or_create_childjob(worker_session_id, self.current_job, workflow, args,
51
53
  timeout: timeout,
52
54
  max_attempts: max_attempts)
53
55
  end
@@ -62,7 +64,7 @@ module Postjob::Runner
62
64
  throw :pending, :pending
63
65
  else
64
66
  childjobs = Postjob::Queue.childjobs(current_job)
65
- childjobs.map(&:resolve)
67
+ childjobs.each(&:resolve).count
66
68
  end
67
69
  when Job
68
70
  expect! args == []
@@ -100,6 +102,11 @@ module Postjob::Runner
100
102
  with_current_job(job) do
101
103
  status, value, shutdown = invoke_workflow workflow, job
102
104
  log_result! job, status, value
105
+ # If the status is ok the job finished processing. In that case
106
+ # we'll wait for all child jobs to finish.
107
+ if status == :ok
108
+ await :all
109
+ end
103
110
  [ workflow.workflow_version, status, value, shutdown ]
104
111
  end
105
112
  end
@@ -0,0 +1,76 @@
1
+ # rubocop:disable Lint/RescueException
2
+
3
+ require_relative "./record"
4
+
5
+ require "tools/heartbeat"
6
+ require "tools/atomic_store"
7
+
8
# Registry wrapper for the machine this worker runs on.
class Postjob::Host < Postjob::Record
  class << self
    # Registers a host entry in the queue and returns whatever the queue's
    # host_register implementation returns (presumably the host id — confirm
    # against Postjob::Queue).
    def register(attributes = {})
      Postjob.logger.debug "registering host w/#{attributes.inspect}"
      ::Postjob::Queue.host_register(attributes)
    end
  end
end
14
+
15
# A worker session: one worker process's registration with the queue,
# including the background heartbeat that keeps it alive.
class Postjob::WorkerSession < Postjob::Record
  # File in which the host id is persisted between worker runs.
  HOST_ID_STORE = ".postjob.host_id"

  class << self
    # Starts a worker session.
    #
    # Loads (or lazily registers) the host id via the lock-protected
    # HOST_ID_STORE file, opens a session in the queue for the given
    # workflows, and kicks off the background heartbeat monitor.
    def start!(workflows_with_versions)
      worker_session = nil

      AtomicStore.with(HOST_ID_STORE) do |host_id|
        host_id ||= ::Postjob::Host.register
        Postjob.logger.debug "Starting worker_session w/host_id #{host_id.inspect}"
        worker_session = ::Postjob::Queue.start_worker_session(workflows_with_versions, host_id: host_id)
        host_id # AtomicStore persists the block's return value
      end

      Postjob.logger.info "Starting worker_session #{worker_session.inspect}"

      start_heartbeat_monitor(worker_session.host_id)
      worker_session
    end

    # Starts a heartbeat monitor in the background (i.e. in a new thread).
    # The thread needs its own database connection; any error is reported
    # on STDERR instead of silently killing the thread.
    def start_heartbeat_monitor(host_id)
      Thread.new do
        begin
          Simple::SQL.connect!
          run_heartbeat_monitor(host_id)
        rescue Exception => err
          STDERR.puts "#{err}, from \n\t#{err.backtrace[0, 5].join("\n\t")}"
        end
      end
    end

    private

    # This method is used during specs: passing a block lets a spec stop
    # the monitor after a single beat by returning false from it.
    def run_heartbeat_monitor(host_id, &block)
      Heartbeat.monitor(60) do |measurement|
        Postjob::Queue.host_heartbeat(host_id, measurement)
        block ? yield : true
      end
    end
  end

  attribute :id
  attribute :host_id
  attribute :client_socket
  attribute :workflows
  attribute :attributes
  attribute :created_at

  # Short human-readable description of this session.
  def to_s
    "Session##{id}"
  end

  # Richer description, counting only workflows that carry a version suffix.
  def inspect
    versioned_workflows = workflows.grep(/\d$/)
    "<Session##{id} w/host_id: #{host_id}, client_socket: #{client_socket}, #{versioned_workflows.count} workflows>"
  end
end
@@ -39,10 +39,6 @@ module Postjob::Workflow
39
39
  ::Postjob::Queue.find_or_create_token(job)
40
40
  end
41
41
 
42
- def set_workflow_status(status)
43
- ::Postjob::Queue.set_workflow_status ::Postjob::Runner.current_job, status
44
- end
45
-
46
42
  def workflow_version
47
43
  @workflow_version || "0.0"
48
44
  end
@@ -0,0 +1,17 @@
1
# A tiny lock-protected, file-backed value store. AtomicStore.with yields the
# file's current content (nil when empty or missing) and writes back whatever
# the block returns, under an exclusive flock.
module AtomicStore
  def self.with(path)
    File.open(path, File::RDWR | File::CREAT, 0644) do |file|
      file.flock(File::LOCK_EX) # serialize concurrent readers/writers

      current = file.read
      current = nil if current == ""

      updated = yield current
      expect! updated => /./
      next if updated == current

      file.rewind
      file.write(updated)
      file.flush
      file.truncate(file.pos) # drop any remainder of a longer old value
    end
  end
end
@@ -0,0 +1,151 @@
1
+ # rubocop:disable Metrics/MethodLength
2
+
3
+ require_relative "./history"
4
+
5
+ # It seems that vmstat relies on loading specific subclasses automatically and/or deferredly.
6
+ # This should be fine, but it results in failing tests sometimes. By explicitely loading
7
+ # these we can circumvent this.
8
+ require "vmstat"
9
+ require "vmstat/disk"
10
+ require "vmstat/memory"
11
+ require "vmstat/cpu"
12
+ require "vmstat/load_average"
13
+ require "vmstat/cpu"
14
+ require "vmstat/disk"
15
+ require "vmstat/linux_disk"
16
+ require "vmstat/linux_memory"
17
+ require "vmstat/load_average"
18
+ require "vmstat/memory"
19
+ require "vmstat/network_interface"
20
+ require "vmstat/task"
21
+
22
+ #
23
+ # A heartbeat monitor
24
+ #
25
+ # The heartbeat monitor watches various machine metrics (disk usage, CPU load, network traffic)
26
+ # and yields current measurements once a second.
27
+ #
28
+ #
29
module Heartbeat
  # The heartbeat monitor watches various machine metrics (disk usage, CPU load, network traffic)
  # and yields current measurements once every cycle_length_seconds seconds.

  # This method yields hashes with these keys:
  #
  # {
  #   net_errors_1min: ... # network errors in the last minute
  #   net_errors_5min: ... # network errors in the last 5 minutes
  #   net_errors_15min: ... # network errors in the last 15 minutes
  #   net_in_1min: ... # incoming network traffic in the last minute (bytes)
  #   net_in_5min: ... # incoming network traffic in the last 5 minutes (bytes)
  #   net_in_15min: ... # incoming network traffic in the last 15 minutes (bytes)
  #   net_out_1min: ... # outgoing network traffic in the last minute (bytes)
  #   net_out_5min: ... # outgoing network traffic in the last 5 minutes (bytes)
  #   net_out_15min: ... # outgoing network traffic in the last 15 minutes (bytes)
  #
  #   uptime: ... # uptime (seconds)
  #   cpu_load_1min: ... # cpu load
  #   cpu_load_5min: ... # cpu load
  #   cpu_load_15min: ... # cpu load
  #
  #   disk_used: ... # used disk space, over all disks
  #   disk_available: ... # available disk space, over all disks
  # }

  # Runs the monitor loop forever, yielding one measurement hash per cycle.
  # The loop stops when the block returns exactly +false+.
  def self.monitor(cycle_length_seconds)
    monitor = Monitor.new(cycle_length_seconds)

    loop do
      do_continue = yield monitor.measure
      break if do_continue == false
      monitor.sleep
    end
  end

  class Monitor
    def initialize(cycle_length_seconds)
      # The history must hold 15 minutes worth of snapshots plus the current one.
      # NOTE: cycle lengths above 60 seconds would make the 1-minute lookback
      # degenerate to the current snapshot (integer division) — production uses 60.
      @history = History.new(15 * 60 / cycle_length_seconds + 1)
      @cycle_length_seconds = cycle_length_seconds

      @count = 0
      @started_at = Time.now
    end

    # Takes a new snapshot and returns the aggregated measurement hash
    # (see Heartbeat.monitor for the keys).
    def measure
      @count += 1
      take_measurement
    end

    # Sleeps until the next cycle is due. Scheduling is anchored on
    # @started_at + number of cycles, so slow cycles do not accumulate drift.
    def sleep
      sleep_time = (@started_at + @count * @cycle_length_seconds) - Time.now
      Kernel.sleep sleep_time if sleep_time > 0
    end

    private

    # Records the current snapshot in the history and builds the measurement
    # hash from the current values and the 1/5/15-minute-old snapshots.
    def take_measurement
      now = snapshot

      @history << now

      past_1min = @history.last_nth(60 / @cycle_length_seconds)
      past_5min = @history.last_nth(5 * 60 / @cycle_length_seconds)
      past_15min = @history.last_nth(15 * 60 / @cycle_length_seconds)

      {
        net_errors_1min: difference(:network_errors, past_1min, now),
        net_errors_5min: difference(:network_errors, past_5min, now),
        net_errors_15min: difference(:network_errors, past_15min, now),

        net_in_1min: difference(:network_in, past_1min, now),
        net_in_5min: difference(:network_in, past_5min, now),
        net_in_15min: difference(:network_in, past_15min, now),

        net_out_1min: difference(:network_out, past_1min, now),
        net_out_5min: difference(:network_out, past_5min, now),
        net_out_15min: difference(:network_out, past_15min, now),

        uptime: now[:uptime],
        cpu_load_1min: now[:cpu_load_1min],
        cpu_load_5min: now[:cpu_load_5min],
        cpu_load_15min: now[:cpu_load_15min],
        disk_used: now[:disk_used],
        disk_available: now[:disk_available]
      }
    end

    # Returns how much the cumulative +key+ counter grew between the +past+
    # snapshot and +now+, or nil when no snapshot from that far back exists yet.
    def difference(key, past, now)
      return nil unless past
      # BUGFIX: the snapshot values are cumulative totals, so the activity in
      # the window is the delta now - past (previously the two totals were
      # added, which double-counted instead of differencing).
      now[key] - past[key]
    end

    # collect current stats, and aggregate network and disk values into something
    # meaningful
    def snapshot
      snapshot = Vmstat.snapshot

      # [TODO] It would be great if we could filter based on the interface.type value.
      # vmstat's rdoc says "BSD numbers", but I couldn't find documentation.
      relevant_network_interfaces = snapshot.network_interfaces.reject { |interface| interface.name =~ /^lo/ }

      {
        network_in: sum(relevant_network_interfaces, &:in_bytes),
        network_out: sum(relevant_network_interfaces, &:out_bytes),
        network_errors: (sum(relevant_network_interfaces, &:in_errors) + sum(relevant_network_interfaces, &:out_errors)),

        uptime: snapshot.at - snapshot.boot_time,
        cpu_load_1min: snapshot.load_average.one_minute,
        cpu_load_5min: snapshot.load_average.five_minutes,
        cpu_load_15min: snapshot.load_average.fifteen_minutes,

        disk_used: sum(snapshot.disks, &:used_bytes),
        disk_available: sum(snapshot.disks, &:available_bytes)
      }
    end

    # A helper for snapshot. (Enumerable#sum exists since Ruby 2.4; this
    # hand-rolled version is kept for compatibility with older rubies.)
    def sum(ary)
      ary.inject(0) { |sum, e| sum + yield(e) }
    end
  end
end
@@ -0,0 +1,25 @@
1
# A History object, which holds the last +size+ objects in memory.
#
# Implemented as a ring buffer which wraps over. Note that last_nth(idx)
# returns the entry recorded +idx+ appends before the most recent one —
# i.e. last_nth(1) is the entry just *before* the latest append.
class History
  attr_reader :size

  def initialize(size)
    @size = size
    @slots = Array.new(size)
    @cursor = 0
  end

  # returns the nth-last entry (nil when nothing was recorded that far back)
  def last_nth(idx)
    raise ArgumentError, "RingBuffer size #{@size} is too small to hold #{idx} entries" if idx > @size

    @slots[(@cursor - idx) % @size]
  end

  # appends +element+, overwriting the oldest entry once the buffer is full
  def <<(element)
    @cursor = (@cursor + 1) % @size
    @slots[@cursor] = element
  end
end
@@ -0,0 +1,85 @@
1
+ require "spec_helper"
2
+
3
describe "Heartbeat Events" do
  include TestHelper

  # This test uses the nil host_id, which is configured during migration.
  let(:null_host_id) { "00000000-0000-0000-0000-000000000000" }

  # Runs a single heartbeat cycle synchronously. run_heartbeat_monitor is a
  # private class method on WorkerSession, hence the +send+.
  def heartbeat!(host_id: nil)
    host_id ||= null_host_id
    Postjob::WorkerSession.send(:run_heartbeat_monitor, host_id) do
      # By returning false we create only a single heartbeat event
      false
    end
  end

  describe "creation" do
    before { heartbeat! }

    it "creates a heartbeat event" do
      # Fetch the most recent heartbeat event row.
      event = Simple::SQL.ask <<~SQL, into: OpenStruct
        SELECT * FROM postjob.events
        WHERE name='heartbeat' ORDER BY created_at DESC
      SQL

      # Heartbeats carry no session, only the host that emitted them.
      expect(event.worker_session_id).to be_nil
      expect(event.host_id).to eq(null_host_id)

      # These keys mirror the measurement hash built by Heartbeat::Monitor.
      expected_keys = %w(
        cpu_load_1min cpu_load_5min cpu_load_15min
        disk_available disk_used
        net_errors_1min net_errors_5min net_errors_15min
        net_in_1min net_in_5min net_in_15min
        net_out_1min net_out_5min net_out_15min
        uptime
      )

      expect(event.attributes.keys).to include(*expected_keys)
    end

    it "creates a zombie event" do
      # NOTE(review): a zombie event appears to be written by the DB-side
      # zombie check on every heartbeat — confirm against 017_zombie_check.sql.
      event = Simple::SQL.ask <<~SQL, into: OpenStruct
        SELECT * FROM postjob.events
        WHERE name='zombie' ORDER BY created_at DESC
      SQL

      expect(event.worker_session_id).to be_nil
      expect(event.host_id).to eq(null_host_id)
      expect(event.attributes.keys).to eq([ "zombie_count" ])
      expect(event.attributes["zombie_count"]).to be_a(Integer)
    end
  end

  describe "zombie checking" do
    let!(:job_id) { Postjob.enqueue! "HeartbeatSpecWorkflow" }

    before do
      # change the job status to processing, and move all timestamps into the past.
      # This simulates a zombie situation.
      Simple::SQL.ask "UPDATE postjob.postjobs SET status='processing' WHERE id=$1", job_id
      Simple::SQL.ask "UPDATE postjob.events SET created_at = (now() at time zone 'utc' - interval '2 hours')"
    end

    context "when running with the real host_id" do
      it "detects zombies" do
        # Look up the session that processed the job so we can heartbeat
        # as the very host that owns the stale job.
        job = load_job(job_id)
        session = Simple::SQL.ask "SELECT * FROM postjob.worker_sessions WHERE id=$1", job.last_worker_session_id, into: OpenStruct

        heartbeat! host_id: session.host_id

        # The stale 'processing' job must have been failed as a zombie.
        job = Simple::SQL.ask "SELECT * FROM postjob.postjobs WHERE id=$1", job_id, into: Hash
        expect(job).to include(status: "err", failed_attempts: 1, error: "Zombie", error_message: "zombie")
      end
    end

    context "when running with a different host_id" do
      it "detects zombies" do
        # Same expectation, but the heartbeat comes from an unrelated host
        # (the null host) — zombie detection is host-independent.
        heartbeat!

        job = Simple::SQL.ask "SELECT * FROM postjob.postjobs WHERE id=$1", job_id, into: Hash
        expect(job).to include(status: "err", failed_attempts: 1, error: "Zombie", error_message: "zombie")
      end
    end
  end
end
@@ -0,0 +1,80 @@
1
+ require "spec_helper"
2
+
3
# Spec fixture: a workflow that spawns two child workflows, one succeeding
# and one persistently failing, so the resulting event stream contains both
# success and failure transitions.
module EventTestWorkflow
  # Child workflow: succeeds for odd ids, raises for even ones.
  module OddWorkflow
    def self.run(id)
      raise "this is for the odd" unless id.odd?
    end

    Postjob.register_workflow self
  end

  def self.run
    # await(OddWorkflow, 1) succeeds; await(OddWorkflow, 2) raises —
    # presumably retried until the child finally fails (see the expected
    # event lists in the spec below; confirm against max_attempts defaults).
    await OddWorkflow, 1
    await OddWorkflow, 2
  end

  Postjob.register_workflow self
end
19
+
20
describe "Job Events" do
  include TestHelper

  # Enqueue the fixture workflow and load its job record.
  let!(:job) do
    id = Postjob.enqueue! "EventTestWorkflow"
    load_job id
  end

  before do
    # Run the queue until nothing is left to process.
    TestHelper.resolve_all
  end

  # Returns all events for the given jobs, oldest first.
  def job_events(*jobs)
    events_query = "SELECT * FROM postjob.events WHERE postjob_id = ANY($1) ORDER BY created_at"
    Simple::SQL.all events_query, jobs.map(&:id), into: OpenStruct
  end

  it "creates events on each job status change" do
    # The two children of the toplevel job, in creation order:
    # the first (odd id) succeeds, the second (even id) keeps failing.
    good_child = load_job("SELECT * FROM postjob.postjobs WHERE parent_id=#{job.id} ORDER BY id")
    bad_child = load_job("SELECT * FROM postjob.postjobs WHERE parent_id=#{job.id} ORDER BY id OFFSET 1")

    expect(job_events(good_child).map(&:name)).to eq(["ready", "processing", "ok"])

    # Four err retries, then a final failure.
    bad_child_events = %w(
      ready
      processing err
      processing err
      processing err
      processing err
      processing failed
    )
    expect(job_events(bad_child).map(&:name)).to eq(bad_child_events)

    # The parent goes to sleep while awaiting children and wakes up again
    # for each retry round, then fails because the bad child failed.
    toplevel_events = %w(
      ready
      processing sleep ready
      processing sleep ready
      processing sleep ready
      processing sleep ready
      processing sleep ready
      processing sleep ready
      processing failed
    )
    expect(job_events(job).map(&:name)).to eq(toplevel_events)
  end

  describe "automatic event creation" do
    # The whole job tree rooted in the toplevel job, and all of its events.
    let(:jobs) { Simple::SQL.all("SELECT * FROM postjob.postjobs WHERE root_id=#{job.id}", into: Postjob::Job) }
    let(:events) { job_events(*jobs) }

    it "sets the job's last_worker_session_id" do
      last_worker_session_ids = jobs.map(&:last_worker_session_id)
      expect(last_worker_session_ids.uniq).to eq([Postjob.current_worker_session.id])
    end

    it "records the worker session id" do
      worker_session_ids = events.map(&:worker_session_id).uniq
      expect(worker_session_ids).to eq([Postjob.current_worker_session.id])
    end
  end
end