postjob 0.4.5 → 0.5.0

Files changed (46)
  1. checksums.yaml +4 -4
  2. data/lib/postjob.rb +22 -13
  3. data/lib/postjob/cli/events.rb +60 -0
  4. data/lib/postjob/cli/heartbeat.rb +55 -0
  5. data/lib/postjob/cli/hosts.rb +67 -0
  6. data/lib/postjob/cli/ps.rb +1 -13
  7. data/lib/postjob/cli/sessions.rb +83 -0
  8. data/lib/postjob/job.rb +4 -15
  9. data/lib/postjob/migrations/003_postjobs.sql +10 -8
  10. data/lib/postjob/migrations/003b_processing_columns.sql +8 -8
  11. data/lib/postjob/migrations/005_helpers.sql +3 -1
  12. data/lib/postjob/migrations/006_enqueue.sql +3 -0
  13. data/lib/postjob/migrations/006a_processing.sql +6 -26
  14. data/lib/postjob/migrations/007_job_results.sql +32 -13
  15. data/lib/postjob/migrations/008_checkout_runnable.sql +15 -21
  16. data/lib/postjob/migrations/008a_childjobs.sql +13 -0
  17. data/lib/postjob/migrations/010_settings.sql +18 -3
  18. data/lib/postjob/migrations/011_null_uuid.sql +7 -0
  19. data/lib/postjob/migrations/012_hosts.sql +42 -0
  20. data/lib/postjob/migrations/013_worker_sessions.sql +44 -0
  21. data/lib/postjob/migrations/014_postjob_session_id.sql +17 -0
  22. data/lib/postjob/migrations/015_events.sql +76 -0
  23. data/lib/postjob/migrations/016_sessions_functions.sql +16 -0
  24. data/lib/postjob/migrations/017_zombie_check.sql +58 -0
  25. data/lib/postjob/migrations/018_heartbeat.sql +28 -0
  26. data/lib/postjob/migrations/019_heartbeat_indices.sql +5 -0
  27. data/lib/postjob/queue.rb +41 -27
  28. data/lib/postjob/queue/notifications.rb +5 -4
  29. data/lib/postjob/queue/search.rb +2 -0
  30. data/lib/postjob/queue/settings.rb +11 -1
  31. data/lib/postjob/record.rb +17 -0
  32. data/lib/postjob/runner.rb +9 -2
  33. data/lib/postjob/worker_session.rb +76 -0
  34. data/lib/postjob/workflow.rb +0 -4
  35. data/lib/tools/atomic_store.rb +17 -0
  36. data/lib/tools/heartbeat.rb +151 -0
  37. data/lib/tools/history.rb +25 -0
  38. data/spec/postjob/events/heartbeat_event_spec.rb +85 -0
  39. data/spec/postjob/events/job_event_spec.rb +80 -0
  40. data/spec/postjob/job_control/max_attempts_spec.rb +0 -2
  41. data/spec/postjob/queue/search_spec.rb +0 -14
  42. data/spec/postjob/worker_session_spec.rb +41 -0
  43. data/spec/spec_helper.rb +9 -0
  44. data/spec/support/test_helper.rb +11 -1
  45. metadata +43 -3
  46. data/spec/postjob/job_control/workflow_status_spec.rb +0 -52
data/lib/postjob/runner.rb
@@ -34,6 +34,8 @@ module Postjob::Runner
   # returns a subjob within the current job, for a +runner+
   # description and +args+.
   def async(workflow, *args, timeout: nil, max_attempts:)
+    worker_session_id = Postjob.current_worker_session.id
+
     # if the workflow is a symbol, then we change it into "__manual__"
     # - there should never be a workflow with that name - or into
     # "CurrentWorkshop.#{workflow}", denoting the \a workflow method of the
@@ -47,7 +49,7 @@ module Postjob::Runner
       raise ArgumentError, "Unsupported workflow spec #{workflow.inspect}. Did you run await(fun(a, b)) instead of await(:fun, a, b)"
     end

-    ::Postjob::Queue.find_or_create_childjob(self.current_job, workflow, args,
+    ::Postjob::Queue.find_or_create_childjob(worker_session_id, self.current_job, workflow, args,
       timeout: timeout,
       max_attempts: max_attempts)
   end
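
The ArgumentError above encodes the calling convention: sub-workflows are awaited by symbol plus arguments, not by invoking a method and awaiting its return value. A minimal sketch of the distinction (MyWorkflow and :fetch_data are hypothetical names, not part of this gem):

    # Hedged sketch: a registered workflow awaiting a manual sub-step.
    module MyWorkflow
      def self.run(url)
        await :fetch_data, url    # correct: await(:fun, a, b)
        # await fetch_data(url)   # wrong: await(fun(a, b)) would raise the
        #                         # "Unsupported workflow spec" ArgumentError
      end
    end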
@@ -62,7 +64,7 @@ module Postjob::Runner
       throw :pending, :pending
     else
       childjobs = Postjob::Queue.childjobs(current_job)
-      childjobs.map(&:resolve)
+      childjobs.each(&:resolve).count
     end
   when Job
     expect! args == []
@@ -100,6 +102,11 @@ module Postjob::Runner
     with_current_job(job) do
       status, value, shutdown = invoke_workflow workflow, job
       log_result! job, status, value
+      # If the status is ok the job finished processing. In that case
+      # we'll wait for all child jobs to finish.
+      if status == :ok
+        await :all
+      end
       [ workflow.workflow_version, status, value, shutdown ]
     end
   end
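
The added `await :all` changes completion semantics: a job whose workflow method returns with status :ok now waits for all of its child jobs before it is finished. A hedged sketch of the effect (ParentWorkflow and :child_step are hypothetical; the keyword arguments follow the async signature shown above):

    module ParentWorkflow
      def self.run
        async :child_step, 1, max_attempts: 3
        async :child_step, 2, max_attempts: 3
        :done  # with this change the parent only completes once both
               # child jobs have resolved (implicit await :all)
      end
    end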
data/lib/postjob/worker_session.rb
@@ -0,0 +1,76 @@
+# rubocop:disable Lint/RescueException
+
+require_relative "./record"
+
+require "tools/heartbeat"
+require "tools/atomic_store"
+
+class Postjob::Host < Postjob::Record
+  def self.register(attributes = {})
+    Postjob.logger.debug "registering host w/#{attributes.inspect}"
+    ::Postjob::Queue.host_register(attributes)
+  end
+end
+
+# A worker session
+class Postjob::WorkerSession < Postjob::Record
+  HOST_ID_STORE = ".postjob.host_id"
+
+  class << self
+    # Starts a worker session.
+    def start!(workflows_with_versions)
+      worker_session = nil
+
+      AtomicStore.with(HOST_ID_STORE) do |host_id|
+        host_id ||= ::Postjob::Host.register
+        Postjob.logger.debug "Starting worker_session w/host_id #{host_id.inspect}"
+        worker_session = ::Postjob::Queue.start_worker_session(workflows_with_versions, host_id: host_id)
+        host_id
+      end
+
+      Postjob.logger.info "Starting worker_session #{worker_session.inspect}"
+
+      start_heartbeat_monitor(worker_session.host_id)
+      worker_session
+    end
+
+    # Starts a heartbeat monitor in the background (i.e. in a new thread).
+    def start_heartbeat_monitor(host_id)
+      Thread.new do
+        begin
+          Simple::SQL.connect!
+          run_heartbeat_monitor(host_id)
+        rescue Exception => e
+          STDERR.puts "#{e}, from \n\t#{e.backtrace[0, 5].join("\n\t")}"
+        end
+      end
+    end
+
+    private
+
+    # This method is used during specs.
+    def run_heartbeat_monitor(host_id, &block)
+      Heartbeat.monitor(60) do |measurement|
+        Postjob::Queue.host_heartbeat(host_id, measurement)
+        next true unless block
+        yield
+      end
+    end
+  end
+
+  attribute :id
+  attribute :host_id
+  attribute :client_socket
+  attribute :workflows
+  attribute :attributes
+  attribute :created_at
+
+  def to_s
+    "Session##{id}"
+  end
+
+  def inspect
+    versionized_workflows = workflows.grep(/\d$/)
+    "<Session##{id} w/host_id: #{host_id}, client_socket: #{client_socket}, #{versionized_workflows.count} workflows>"
+  end
+end
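
Taken together, start! resolves the host id exactly once per machine (persisting it in .postjob.host_id), opens a worker session in the queue, and spawns the heartbeat thread. A hedged usage sketch; the exact shape of workflows_with_versions is an assumption, inferred from the inspect method above which greps for version-suffixed strings:

    session = Postjob::WorkerSession.start!(["MyWorkflow", "MyWorkflow 1.0"])
    session.to_s   # => "Session#42" (the id is illustrative)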
data/lib/postjob/workflow.rb
@@ -39,10 +39,6 @@ module Postjob::Workflow
     ::Postjob::Queue.find_or_create_token(job)
   end

-  def set_workflow_status(status)
-    ::Postjob::Queue.set_workflow_status ::Postjob::Runner.current_job, status
-  end
-
   def workflow_version
     @workflow_version || "0.0"
   end
data/lib/tools/atomic_store.rb
@@ -0,0 +1,17 @@
+module AtomicStore
+  def self.with(path)
+    File.open(path, File::RDWR | File::CREAT, 0644) do |f|
+      f.flock(File::LOCK_EX)
+      value = f.read
+      value = nil if value == ""
+      new_value = yield value
+      expect! new_value => /./
+      next if new_value == value
+
+      f.rewind
+      f.write(new_value)
+      f.flush
+      f.truncate(f.pos)
+    end
+  end
+end
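
AtomicStore.with serializes read-modify-write cycles on a small state file behind an exclusive flock: the block receives the current value (nil when the file is empty) and must return a non-empty replacement, which is written back only when it changed. A minimal usage sketch along the lines of HOST_ID_STORE above:

    require "securerandom"

    # Reuse a stored host id across runs; generate one on first use.
    host_id = nil
    AtomicStore.with(".postjob.host_id") do |stored|
      host_id = stored || SecureRandom.uuid
    end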
data/lib/tools/heartbeat.rb
@@ -0,0 +1,151 @@
+# rubocop:disable Metrics/MethodLength
+
+require_relative "./history"
+
+# It seems that vmstat relies on loading specific subclasses automatically and/or lazily.
+# This should be fine, but it sometimes results in failing tests. By explicitly loading
+# these we can circumvent this.
+require "vmstat"
+require "vmstat/disk"
+require "vmstat/memory"
+require "vmstat/cpu"
+require "vmstat/load_average"
+require "vmstat/cpu"
+require "vmstat/disk"
+require "vmstat/linux_disk"
+require "vmstat/linux_memory"
+require "vmstat/load_average"
+require "vmstat/memory"
+require "vmstat/network_interface"
+require "vmstat/task"
+
+#
+# A heartbeat monitor
+#
+# The heartbeat monitor watches various machine metrics (disk usage, CPU load, network traffic)
+# and yields current measurements once per cycle.
+#
+#
+module Heartbeat
+  # The heartbeat monitor watches various machine metrics (disk usage, CPU load, network traffic)
+  # and yields current measurements once every cycle_length_seconds seconds.
+
+  # This method yields hashes with these keys:
+  #
+  # {
+  #   net_errors_1min:  ...  # network errors in the last minute
+  #   net_errors_5min:  ...  # network errors in the last 5 minutes
+  #   net_errors_15min: ...  # network errors in the last 15 minutes
+  #   net_in_1min:      ...  # incoming network traffic in the last minute (bytes)
+  #   net_in_5min:      ...  # incoming network traffic in the last 5 minutes (bytes)
+  #   net_in_15min:     ...  # incoming network traffic in the last 15 minutes (bytes)
+  #   net_out_1min:     ...  # outgoing network traffic in the last minute (bytes)
+  #   net_out_5min:     ...  # outgoing network traffic in the last 5 minutes (bytes)
+  #   net_out_15min:    ...  # outgoing network traffic in the last 15 minutes (bytes)
+  #
+  #   uptime:           ...  # uptime (seconds)
+  #   cpu_load_1min:    ...  # cpu load
+  #   cpu_load_5min:    ...  # cpu load
+  #   cpu_load_15min:   ...  # cpu load
+  #
+  #   disk_used:        ...  # used disk space, over all disks
+  #   disk_available:   ...  # available disk space, over all disks
+  # }
+
+  def self.monitor(cycle_length_seconds)
+    monitor = Monitor.new(cycle_length_seconds)
+
+    loop do
+      do_continue = yield monitor.measure
+      break if do_continue == false
+      monitor.sleep
+    end
+  end
+
+  class Monitor
+    def initialize(cycle_length_seconds)
+      @history = History.new(15 * 60 / cycle_length_seconds + 1)
+      @cycle_length_seconds = cycle_length_seconds
+
+      @count = 0
+      @started_at = Time.now
+    end
+
+    def measure
+      @count += 1
+      take_measurement
+    end
+
+    def sleep
+      sleep_time = (@started_at + @count * @cycle_length_seconds) - Time.now
+      Kernel.sleep sleep_time if sleep_time > 0
+    end
+
+    private

+    def take_measurement
+      now = snapshot
+
+      @history << now
+
+      past_1min  = @history.last_nth(60 / @cycle_length_seconds)
+      past_5min  = @history.last_nth(5 * 60 / @cycle_length_seconds)
+      past_15min = @history.last_nth(15 * 60 / @cycle_length_seconds)
+
+      {
+        net_errors_1min: difference(:network_errors, past_1min, now),
+        net_errors_5min: difference(:network_errors, past_5min, now),
+        net_errors_15min: difference(:network_errors, past_15min, now),
+
+        net_in_1min: difference(:network_in, past_1min, now),
+        net_in_5min: difference(:network_in, past_5min, now),
+        net_in_15min: difference(:network_in, past_15min, now),
+
+        net_out_1min: difference(:network_out, past_1min, now),
+        net_out_5min: difference(:network_out, past_5min, now),
+        net_out_15min: difference(:network_out, past_15min, now),
+
+        uptime: now[:uptime],
+        cpu_load_1min: now[:cpu_load_1min],
+        cpu_load_5min: now[:cpu_load_5min],
+        cpu_load_15min: now[:cpu_load_15min],
+        disk_used: now[:disk_used],
+        disk_available: now[:disk_available]
+      }
+    end
+
+    def difference(key, past, now)
+      return nil unless past
+      now[key] - past[key] # delta of cumulative counters
+    end
+
+    # collect current stats, and aggregate network and disk values into something
+    # meaningful
+    def snapshot
+      snapshot = Vmstat.snapshot
+
+      # [TODO] It would be great if we could filter based on the interface.type value.
+      # vmstat's rdoc says "BSD numbers", but I couldn't find documentation.
+      relevant_network_interfaces = snapshot.network_interfaces.reject { |interface| interface.name =~ /^lo/ }
+
+      {
+        network_in: sum(relevant_network_interfaces, &:in_bytes),
+        network_out: sum(relevant_network_interfaces, &:out_bytes),
+        network_errors: (sum(relevant_network_interfaces, &:in_errors) + sum(relevant_network_interfaces, &:out_errors)),
+
+        uptime: snapshot.at - snapshot.boot_time,
+        cpu_load_1min: snapshot.load_average.one_minute,
+        cpu_load_5min: snapshot.load_average.five_minutes,
+        cpu_load_15min: snapshot.load_average.fifteen_minutes,
+
+        disk_used: sum(snapshot.disks, &:used_bytes),
+        disk_available: sum(snapshot.disks, &:available_bytes)
+      }
+    end
+
+    # A helper for snapshot. Array#sum is in activesupport, but not in the standard library.
+    def sum(ary)
+      ary.inject(0) { |sum, e| sum + yield(e) }
+    end
+  end
+end
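
Heartbeat.monitor drives the Monitor in a loop: measure, yield the measurement, then sleep until the next cycle boundary; the loop stops as soon as the block returns false. A minimal usage sketch:

    # Print one measurement per 60-second cycle; return false to stop.
    Heartbeat.monitor(60) do |measurement|
      puts format("load %.2f, %d bytes free",
                  measurement[:cpu_load_1min], measurement[:disk_available])
      true
    end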
data/lib/tools/history.rb
@@ -0,0 +1,25 @@
+# A History object, which holds the last +size+ objects in memory.
+#
+# Implemented as a ring buffer which wraps over.
+class History
+  attr_reader :size
+
+  def initialize(size)
+    @buffer = Array.new(size)
+    @size = size
+    @writer = 0
+  end
+
+  # Returns the entry from +idx+ pushes ago; last_nth(0) is the most recent.
+  def last_nth(idx)
+    raise ArgumentError, "RingBuffer size #{@size} is too small to hold #{idx} entries" if idx > @size
+
+    pos = (@writer - idx) % @size
+    @buffer[pos]
+  end
+
+  def <<(element)
+    @writer = (@writer + 1) % @size
+    @buffer[@writer] = element
+  end
+end
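
Note the indexing convention: << advances @writer before storing, so last_nth(0) is the most recent entry and last_nth(n) is the entry pushed n steps earlier; this is exactly how the heartbeat monitor computes its 1/5/15-minute deltas. For illustration:

    h = History.new(4)
    h << :a
    h << :b
    h << :c
    h.last_nth(0)  # => :c (most recent)
    h.last_nth(1)  # => :b (one push earlier)
    h.last_nth(3)  # => nil (slot not yet written)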
data/spec/postjob/events/heartbeat_event_spec.rb
@@ -0,0 +1,85 @@
+require "spec_helper"
+
+describe "Heartbeat Events" do
+  include TestHelper
+
+  # This test uses the null host_id, which is configured during migration.
+  let(:null_host_id) { "00000000-0000-0000-0000-000000000000" }
+
+  def heartbeat!(host_id: nil)
+    host_id ||= null_host_id
+    Postjob::WorkerSession.send(:run_heartbeat_monitor, host_id) do
+      # By returning false we create only a single heartbeat event
+      false
+    end
+  end
+
+  describe "creation" do
+    before { heartbeat! }
+
+    it "creates a heartbeat event" do
+      event = Simple::SQL.ask <<~SQL, into: OpenStruct
+        SELECT * FROM postjob.events
+        WHERE name='heartbeat' ORDER BY created_at DESC
+      SQL
+
+      expect(event.worker_session_id).to be_nil
+      expect(event.host_id).to eq(null_host_id)
+
+      expected_keys = %w(
+        cpu_load_1min cpu_load_5min cpu_load_15min
+        disk_available disk_used
+        net_errors_1min net_errors_5min net_errors_15min
+        net_in_1min net_in_5min net_in_15min
+        net_out_1min net_out_5min net_out_15min
+        uptime
+      )
+
+      expect(event.attributes.keys).to include(*expected_keys)
+    end
+
+    it "creates a zombie event" do
+      event = Simple::SQL.ask <<~SQL, into: OpenStruct
+        SELECT * FROM postjob.events
+        WHERE name='zombie' ORDER BY created_at DESC
+      SQL
+
+      expect(event.worker_session_id).to be_nil
+      expect(event.host_id).to eq(null_host_id)
+      expect(event.attributes.keys).to eq([ "zombie_count" ])
+      expect(event.attributes["zombie_count"]).to be_a(Integer)
+    end
+  end
+
+  describe "zombie checking" do
+    let!(:job_id) { Postjob.enqueue! "HeartbeatSpecWorkflow" }
+
+    before do
+      # change the job status to processing, and move all timestamps into the past.
+      # This simulates a zombie situation.
+      Simple::SQL.ask "UPDATE postjob.postjobs SET status='processing' WHERE id=$1", job_id
+      Simple::SQL.ask "UPDATE postjob.events SET created_at = (now() at time zone 'utc' - interval '2 hours')"
+    end
+
+    context "when running with the real host_id" do
+      it "detects zombies" do
+        job = load_job(job_id)
+        session = Simple::SQL.ask "SELECT * FROM postjob.worker_sessions WHERE id=$1", job.last_worker_session_id, into: OpenStruct
+
+        heartbeat! host_id: session.host_id
+
+        job = Simple::SQL.ask "SELECT * FROM postjob.postjobs WHERE id=$1", job_id, into: Hash
+        expect(job).to include(status: "err", failed_attempts: 1, error: "Zombie", error_message: "zombie")
+      end
+    end
+
+    context "when running with a different host_id" do
+      it "detects zombies" do
+        heartbeat!
+
+        job = Simple::SQL.ask "SELECT * FROM postjob.postjobs WHERE id=$1", job_id, into: Hash
+        expect(job).to include(status: "err", failed_attempts: 1, error: "Zombie", error_message: "zombie")
+      end
+    end
+  end
+end
data/spec/postjob/events/job_event_spec.rb
@@ -0,0 +1,80 @@
+require "spec_helper"
+
+module EventTestWorkflow
+  module OddWorkflow
+    def self.run(id)
+      raise "this is for the odd" unless id.odd?
+    end
+
+    Postjob.register_workflow self
+  end
+
+  def self.run
+    await OddWorkflow, 1
+    await OddWorkflow, 2
+  end
+
+  Postjob.register_workflow self
+end
+
+describe "Job Events" do
+  include TestHelper
+
+  let!(:job) do
+    id = Postjob.enqueue! "EventTestWorkflow"
+    load_job id
+  end
+
+  before do
+    TestHelper.resolve_all
+  end
+
+  def job_events(*jobs)
+    events_query = "SELECT * FROM postjob.events WHERE postjob_id = ANY($1) ORDER BY created_at"
+    Simple::SQL.all events_query, jobs.map(&:id), into: OpenStruct
+  end
+
+  it "creates events on each job status change" do
+    good_child = load_job("SELECT * FROM postjob.postjobs WHERE parent_id=#{job.id} ORDER BY id")
+    bad_child = load_job("SELECT * FROM postjob.postjobs WHERE parent_id=#{job.id} ORDER BY id OFFSET 1")
+
+    expect(job_events(good_child).map(&:name)).to eq(["ready", "processing", "ok"])
+
+    bad_child_events = %w(
+      ready
+      processing err
+      processing err
+      processing err
+      processing err
+      processing failed
+    )
+    expect(job_events(bad_child).map(&:name)).to eq(bad_child_events)
+
+    toplevel_events = %w(
+      ready
+      processing sleep ready
+      processing sleep ready
+      processing sleep ready
+      processing sleep ready
+      processing sleep ready
+      processing sleep ready
+      processing failed
+    )
+    expect(job_events(job).map(&:name)).to eq(toplevel_events)
+  end
+
+  describe "automatic event creation" do
+    let(:jobs) { Simple::SQL.all("SELECT * FROM postjob.postjobs WHERE root_id=#{job.id}", into: Postjob::Job) }
+    let(:events) { job_events(*jobs) }
+
+    it "sets the job's last_worker_session_id" do
+      last_worker_session_ids = jobs.map(&:last_worker_session_id)
+      expect(last_worker_session_ids.uniq).to eq([Postjob.current_worker_session.id])
+    end
+
+    it "records the worker session id" do
+      worker_session_ids = events.map(&:worker_session_id).uniq
+      expect(worker_session_ids).to eq([Postjob.current_worker_session.id])
+    end
+  end
+end