postjob 0.4.5 → 0.5.0

Files changed (46)
  1. checksums.yaml +4 -4
  2. data/lib/postjob.rb +22 -13
  3. data/lib/postjob/cli/events.rb +60 -0
  4. data/lib/postjob/cli/heartbeat.rb +55 -0
  5. data/lib/postjob/cli/hosts.rb +67 -0
  6. data/lib/postjob/cli/ps.rb +1 -13
  7. data/lib/postjob/cli/sessions.rb +83 -0
  8. data/lib/postjob/job.rb +4 -15
  9. data/lib/postjob/migrations/003_postjobs.sql +10 -8
  10. data/lib/postjob/migrations/003b_processing_columns.sql +8 -8
  11. data/lib/postjob/migrations/005_helpers.sql +3 -1
  12. data/lib/postjob/migrations/006_enqueue.sql +3 -0
  13. data/lib/postjob/migrations/006a_processing.sql +6 -26
  14. data/lib/postjob/migrations/007_job_results.sql +32 -13
  15. data/lib/postjob/migrations/008_checkout_runnable.sql +15 -21
  16. data/lib/postjob/migrations/008a_childjobs.sql +13 -0
  17. data/lib/postjob/migrations/010_settings.sql +18 -3
  18. data/lib/postjob/migrations/011_null_uuid.sql +7 -0
  19. data/lib/postjob/migrations/012_hosts.sql +42 -0
  20. data/lib/postjob/migrations/013_worker_sessions.sql +44 -0
  21. data/lib/postjob/migrations/014_postjob_session_id.sql +17 -0
  22. data/lib/postjob/migrations/015_events.sql +76 -0
  23. data/lib/postjob/migrations/016_sessions_functions.sql +16 -0
  24. data/lib/postjob/migrations/017_zombie_check.sql +58 -0
  25. data/lib/postjob/migrations/018_heartbeat.sql +28 -0
  26. data/lib/postjob/migrations/019_heartbeat_indices.sql +5 -0
  27. data/lib/postjob/queue.rb +41 -27
  28. data/lib/postjob/queue/notifications.rb +5 -4
  29. data/lib/postjob/queue/search.rb +2 -0
  30. data/lib/postjob/queue/settings.rb +11 -1
  31. data/lib/postjob/record.rb +17 -0
  32. data/lib/postjob/runner.rb +9 -2
  33. data/lib/postjob/worker_session.rb +76 -0
  34. data/lib/postjob/workflow.rb +0 -4
  35. data/lib/tools/atomic_store.rb +17 -0
  36. data/lib/tools/heartbeat.rb +151 -0
  37. data/lib/tools/history.rb +25 -0
  38. data/spec/postjob/events/heartbeat_event_spec.rb +85 -0
  39. data/spec/postjob/events/job_event_spec.rb +80 -0
  40. data/spec/postjob/job_control/max_attempts_spec.rb +0 -2
  41. data/spec/postjob/queue/search_spec.rb +0 -14
  42. data/spec/postjob/worker_session_spec.rb +41 -0
  43. data/spec/spec_helper.rb +9 -0
  44. data/spec/support/test_helper.rb +11 -1
  45. metadata +43 -3
  46. data/spec/postjob/job_control/workflow_status_spec.rb +0 -52
data/lib/postjob/runner.rb
@@ -34,6 +34,8 @@ module Postjob::Runner
   # returns a subjob within the current job, for a +runner+
   # description and +args+.
   def async(workflow, *args, timeout: nil, max_attempts:)
+    worker_session_id = Postjob.current_worker_session.id
+
     # if the workflow is a symbol, then we change it into "__manual__"
     # - there should never be a workflow with that name - or into
     # "CurrentWorkshop.#{workflow}", denoting the \a workflow method of the
@@ -47,7 +49,7 @@ module Postjob::Runner
       raise ArgumentError, "Unsupported workflow spec #{workflow.inspect}. Did you run await(fun(a, b)) instead of await(:fun, a, b)"
     end

-    ::Postjob::Queue.find_or_create_childjob(self.current_job, workflow, args,
+    ::Postjob::Queue.find_or_create_childjob(worker_session_id, self.current_job, workflow, args,
       timeout: timeout,
       max_attempts: max_attempts)
   end
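
The ArgumentError above encodes the calling convention: sub-workflows are awaited by symbol plus arguments, not by invoking a method and awaiting its return value. A minimal sketch of the distinction (MyWorkflow and :fetch_data are hypothetical names, not part of this gem):

    # Hedged sketch: a registered workflow awaiting a manual sub-step.
    module MyWorkflow
      def self.run(url)
        await :fetch_data, url    # correct: await(:fun, a, b)
        # await fetch_data(url)   # wrong: await(fun(a, b)) would raise the
        #                         # "Unsupported workflow spec" ArgumentError
      end
    end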
@@ -62,7 +64,7 @@ module Postjob::Runner
       throw :pending, :pending
     else
       childjobs = Postjob::Queue.childjobs(current_job)
-      childjobs.map(&:resolve)
+      childjobs.each(&:resolve).count
     end
   when Job
     expect! args == []
@@ -100,6 +102,11 @@ module Postjob::Runner
     with_current_job(job) do
       status, value, shutdown = invoke_workflow workflow, job
       log_result! job, status, value
+      # If the status is ok the job finished processing. In that case
+      # we'll wait for all child jobs to finish.
+      if status == :ok
+        await :all
+      end
       [ workflow.workflow_version, status, value, shutdown ]
     end
   end
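
The added `await :all` changes completion semantics: a job whose workflow method returns with status :ok now waits for all of its child jobs before it is finished. A hedged sketch of the effect (ParentWorkflow and :child_step are hypothetical; the keyword arguments follow the async signature shown above):

    module ParentWorkflow
      def self.run
        async :child_step, 1, max_attempts: 3
        async :child_step, 2, max_attempts: 3
        :done  # with this change the parent only completes once both
               # child jobs have resolved (implicit await :all)
      end
    end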
data/lib/postjob/worker_session.rb
@@ -0,0 +1,76 @@
+# rubocop:disable Lint/RescueException
+
+require_relative "./record"
+
+require "tools/heartbeat"
+require "tools/atomic_store"
+
+class Postjob::Host < Postjob::Record
+  def self.register(attributes = {})
+    Postjob.logger.debug "registering host w/#{attributes.inspect}"
+    ::Postjob::Queue.host_register(attributes)
+  end
+end
+
+# A worker session
+class Postjob::WorkerSession < Postjob::Record
+  HOST_ID_STORE = ".postjob.host_id"
+
+  class << self
+    # Starts a worker session.
+    def start!(workflows_with_versions)
+      worker_session = nil
+
+      AtomicStore.with(HOST_ID_STORE) do |host_id|
+        host_id ||= ::Postjob::Host.register
+        Postjob.logger.debug "Starting worker_session w/host_id #{host_id.inspect}"
+        worker_session = ::Postjob::Queue.start_worker_session(workflows_with_versions, host_id: host_id)
+        host_id
+      end
+
+      Postjob.logger.info "Starting worker_session #{worker_session.inspect}"
+
+      start_heartbeat_monitor(worker_session.host_id)
+      worker_session
+    end
+
+    # Starts a heartbeat monitor in the background (i.e. in a new thread).
+    def start_heartbeat_monitor(host_id)
+      Thread.new do
+        begin
+          Simple::SQL.connect!
+          run_heartbeat_monitor(host_id)
+        rescue Exception => e
+          STDERR.puts "#{e}, from \n\t#{e.backtrace[0, 5].join("\n\t")}"
+        end
+      end
+    end
+
+    private
+
+    # This method is used during specs.
+    def run_heartbeat_monitor(host_id, &block)
+      Heartbeat.monitor(60) do |measurement|
+        Postjob::Queue.host_heartbeat(host_id, measurement)
+        next true unless block
+        yield
+      end
+    end
+  end
+
+  attribute :id
+  attribute :host_id
+  attribute :client_socket
+  attribute :workflows
+  attribute :attributes
+  attribute :created_at
+
+  def to_s
+    "Session##{id}"
+  end
+
+  def inspect
+    versionized_workflows = workflows.grep(/\d$/)
+    "<Session##{id} w/host_id: #{host_id}, client_socket: #{client_socket}, #{versionized_workflows.count} workflows>"
+  end
+end
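
Taken together, start! resolves the host id exactly once per machine (persisting it in .postjob.host_id), opens a worker session in the queue, and spawns the heartbeat thread. A hedged usage sketch; the exact shape of workflows_with_versions is an assumption, inferred from the inspect method above which greps for version-suffixed strings:

    session = Postjob::WorkerSession.start!(["MyWorkflow", "MyWorkflow 1.0"])
    session.to_s   # => "Session#42" (the id is illustrative)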
data/lib/postjob/workflow.rb
@@ -39,10 +39,6 @@ module Postjob::Workflow
     ::Postjob::Queue.find_or_create_token(job)
   end

-  def set_workflow_status(status)
-    ::Postjob::Queue.set_workflow_status ::Postjob::Runner.current_job, status
-  end
-
   def workflow_version
     @workflow_version || "0.0"
   end
data/lib/tools/atomic_store.rb
@@ -0,0 +1,17 @@
+module AtomicStore
+  def self.with(path)
+    File.open(path, File::RDWR | File::CREAT, 0644) do |f|
+      f.flock(File::LOCK_EX)
+      value = f.read
+      value = nil if value == ""
+      new_value = yield value
+      expect! new_value => /./
+      next if new_value == value
+
+      f.rewind
+      f.write(new_value)
+      f.flush
+      f.truncate(f.pos)
+    end
+  end
+end
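
AtomicStore.with serializes read-modify-write cycles on a small state file behind an exclusive flock: the block receives the current value (nil when the file is empty) and must return a non-empty replacement, which is written back only when it changed. A minimal usage sketch along the lines of HOST_ID_STORE above:

    require "securerandom"

    # Reuse a stored host id across runs; generate one on first use.
    host_id = nil
    AtomicStore.with(".postjob.host_id") do |stored|
      host_id = stored || SecureRandom.uuid
    end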
data/lib/tools/heartbeat.rb
@@ -0,0 +1,151 @@
+# rubocop:disable Metrics/MethodLength
+
+require_relative "./history"
+
+# It seems that vmstat relies on loading specific subclasses automatically and/or lazily.
+# This should be fine, but it sometimes results in failing tests. By explicitly loading
+# these we can circumvent this.
+require "vmstat"
+require "vmstat/disk"
+require "vmstat/memory"
+require "vmstat/cpu"
+require "vmstat/load_average"
+require "vmstat/cpu"
+require "vmstat/disk"
+require "vmstat/linux_disk"
+require "vmstat/linux_memory"
+require "vmstat/load_average"
+require "vmstat/memory"
+require "vmstat/network_interface"
+require "vmstat/task"
+
+#
+# A heartbeat monitor
+#
+# The heartbeat monitor watches various machine metrics (disk usage, CPU load, network traffic)
+# and yields current measurements once per cycle.
+#
+#
+module Heartbeat
+  # The heartbeat monitor watches various machine metrics (disk usage, CPU load, network traffic)
+  # and yields current measurements once every cycle_length_seconds seconds.
+
+  # This method yields hashes with these keys:
+  #
+  # {
+  #   net_errors_1min:  ...  # network errors in the last minute
+  #   net_errors_5min:  ...  # network errors in the last 5 minutes
+  #   net_errors_15min: ...  # network errors in the last 15 minutes
+  #   net_in_1min:      ...  # incoming network traffic in the last minute (bytes)
+  #   net_in_5min:      ...  # incoming network traffic in the last 5 minutes (bytes)
+  #   net_in_15min:     ...  # incoming network traffic in the last 15 minutes (bytes)
+  #   net_out_1min:     ...  # outgoing network traffic in the last minute (bytes)
+  #   net_out_5min:     ...  # outgoing network traffic in the last 5 minutes (bytes)
+  #   net_out_15min:    ...  # outgoing network traffic in the last 15 minutes (bytes)
+  #
+  #   uptime:           ...  # uptime (seconds)
+  #   cpu_load_1min:    ...  # cpu load
+  #   cpu_load_5min:    ...  # cpu load
+  #   cpu_load_15min:   ...  # cpu load
+  #
+  #   disk_used:        ...  # used disk space, over all disks
+  #   disk_available:   ...  # available disk space, over all disks
+  # }
+
+  def self.monitor(cycle_length_seconds)
+    monitor = Monitor.new(cycle_length_seconds)
+
+    loop do
+      do_continue = yield monitor.measure
+      break if do_continue == false
+      monitor.sleep
+    end
+  end
+
+  class Monitor
+    def initialize(cycle_length_seconds)
+      @history = History.new(15 * 60 / cycle_length_seconds + 1)
+      @cycle_length_seconds = cycle_length_seconds
+
+      @count = 0
+      @started_at = Time.now
+    end
+
+    def measure
+      @count += 1
+      take_measurement
+    end
+
+    def sleep
+      sleep_time = (@started_at + @count * @cycle_length_seconds) - Time.now
+      Kernel.sleep sleep_time if sleep_time > 0
+    end
+
+    private

+    def take_measurement
+      now = snapshot
+
+      @history << now
+
+      past_1min  = @history.last_nth(60 / @cycle_length_seconds)
+      past_5min  = @history.last_nth(5 * 60 / @cycle_length_seconds)
+      past_15min = @history.last_nth(15 * 60 / @cycle_length_seconds)
+
+      {
+        net_errors_1min: difference(:network_errors, past_1min, now),
+        net_errors_5min: difference(:network_errors, past_5min, now),
+        net_errors_15min: difference(:network_errors, past_15min, now),
+
+        net_in_1min: difference(:network_in, past_1min, now),
+        net_in_5min: difference(:network_in, past_5min, now),
+        net_in_15min: difference(:network_in, past_15min, now),
+
+        net_out_1min: difference(:network_out, past_1min, now),
+        net_out_5min: difference(:network_out, past_5min, now),
+        net_out_15min: difference(:network_out, past_15min, now),
+
+        uptime: now[:uptime],
+        cpu_load_1min: now[:cpu_load_1min],
+        cpu_load_5min: now[:cpu_load_5min],
+        cpu_load_15min: now[:cpu_load_15min],
+        disk_used: now[:disk_used],
+        disk_available: now[:disk_available]
+      }
+    end
+
+    def difference(key, past, now)
+      return nil unless past
+      now[key] - past[key] # delta of cumulative counters
+    end
+
+    # collect current stats, and aggregate network and disk values into something
+    # meaningful
+    def snapshot
+      snapshot = Vmstat.snapshot
+
+      # [TODO] It would be great if we could filter based on the interface.type value.
+      # vmstat's rdoc says "BSD numbers", but I couldn't find documentation.
+      relevant_network_interfaces = snapshot.network_interfaces.reject { |interface| interface.name =~ /^lo/ }
+
+      {
+        network_in: sum(relevant_network_interfaces, &:in_bytes),
+        network_out: sum(relevant_network_interfaces, &:out_bytes),
+        network_errors: (sum(relevant_network_interfaces, &:in_errors) + sum(relevant_network_interfaces, &:out_errors)),
+
+        uptime: snapshot.at - snapshot.boot_time,
+        cpu_load_1min: snapshot.load_average.one_minute,
+        cpu_load_5min: snapshot.load_average.five_minutes,
+        cpu_load_15min: snapshot.load_average.fifteen_minutes,
+
+        disk_used: sum(snapshot.disks, &:used_bytes),
+        disk_available: sum(snapshot.disks, &:available_bytes)
+      }
+    end
+
+    # A helper for snapshot. Array#sum is in activesupport, but not in the standard library.
+    def sum(ary)
+      ary.inject(0) { |sum, e| sum + yield(e) }
+    end
+  end
+end
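
Heartbeat.monitor drives the Monitor in a loop: measure, yield the measurement, then sleep until the next cycle boundary; the loop stops as soon as the block returns false. A minimal usage sketch:

    # Print one measurement per 60-second cycle; return false to stop.
    Heartbeat.monitor(60) do |measurement|
      puts format("load %.2f, %d bytes free",
                  measurement[:cpu_load_1min], measurement[:disk_available])
      true
    end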
data/lib/tools/history.rb
@@ -0,0 +1,25 @@
+# A History object, which holds the last +size+ objects in memory.
+#
+# Implemented as a ring buffer which wraps over.
+class History
+  attr_reader :size
+
+  def initialize(size)
+    @buffer = Array.new(size)
+    @size = size
+    @writer = 0
+  end
+
+  # Returns the entry from +idx+ pushes ago; last_nth(0) is the most recent.
+  def last_nth(idx)
+    raise ArgumentError, "RingBuffer size #{@size} is too small to hold #{idx} entries" if idx > @size
+
+    pos = (@writer - idx) % @size
+    @buffer[pos]
+  end
+
+  def <<(element)
+    @writer = (@writer + 1) % @size
+    @buffer[@writer] = element
+  end
+end
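
Note the indexing convention: << advances @writer before storing, so last_nth(0) is the most recent entry and last_nth(n) is the entry pushed n steps earlier; this is exactly how the heartbeat monitor computes its 1/5/15-minute deltas. For illustration:

    h = History.new(4)
    h << :a
    h << :b
    h << :c
    h.last_nth(0)  # => :c (most recent)
    h.last_nth(1)  # => :b (one push earlier)
    h.last_nth(3)  # => nil (slot not yet written)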
data/spec/postjob/events/heartbeat_event_spec.rb
@@ -0,0 +1,85 @@
+require "spec_helper"
+
+describe "Heartbeat Events" do
+  include TestHelper
+
+  # This test uses the null host_id, which is configured during migration.
+  let(:null_host_id) { "00000000-0000-0000-0000-000000000000" }
+
+  def heartbeat!(host_id: nil)
+    host_id ||= null_host_id
+    Postjob::WorkerSession.send(:run_heartbeat_monitor, host_id) do
+      # By returning false we create only a single heartbeat event
+      false
+    end
+  end
+
+  describe "creation" do
+    before { heartbeat! }
+
+    it "creates a heartbeat event" do
+      event = Simple::SQL.ask <<~SQL, into: OpenStruct
+        SELECT * FROM postjob.events
+        WHERE name='heartbeat' ORDER BY created_at DESC
+      SQL
+
+      expect(event.worker_session_id).to be_nil
+      expect(event.host_id).to eq(null_host_id)
+
+      expected_keys = %w(
+        cpu_load_1min cpu_load_5min cpu_load_15min
+        disk_available disk_used
+        net_errors_1min net_errors_5min net_errors_15min
+        net_in_1min net_in_5min net_in_15min
+        net_out_1min net_out_5min net_out_15min
+        uptime
+      )
+
+      expect(event.attributes.keys).to include(*expected_keys)
+    end
+
+    it "creates a zombie event" do
+      event = Simple::SQL.ask <<~SQL, into: OpenStruct
+        SELECT * FROM postjob.events
+        WHERE name='zombie' ORDER BY created_at DESC
+      SQL
+
+      expect(event.worker_session_id).to be_nil
+      expect(event.host_id).to eq(null_host_id)
+      expect(event.attributes.keys).to eq([ "zombie_count" ])
+      expect(event.attributes["zombie_count"]).to be_a(Integer)
+    end
+  end
+
+  describe "zombie checking" do
+    let!(:job_id) { Postjob.enqueue! "HeartbeatSpecWorkflow" }
+
+    before do
+      # change the job status to processing, and move all timestamps into the past.
+      # This simulates a zombie situation.
+      Simple::SQL.ask "UPDATE postjob.postjobs SET status='processing' WHERE id=$1", job_id
+      Simple::SQL.ask "UPDATE postjob.events SET created_at = (now() at time zone 'utc' - interval '2 hours')"
+    end
+
+    context "when running with the real host_id" do
+      it "detects zombies" do
+        job = load_job(job_id)
+        session = Simple::SQL.ask "SELECT * FROM postjob.worker_sessions WHERE id=$1", job.last_worker_session_id, into: OpenStruct
+
+        heartbeat! host_id: session.host_id
+
+        job = Simple::SQL.ask "SELECT * FROM postjob.postjobs WHERE id=$1", job_id, into: Hash
+        expect(job).to include(status: "err", failed_attempts: 1, error: "Zombie", error_message: "zombie")
+      end
+    end
+
+    context "when running with a different host_id" do
+      it "detects zombies" do
+        heartbeat!
+
+        job = Simple::SQL.ask "SELECT * FROM postjob.postjobs WHERE id=$1", job_id, into: Hash
+        expect(job).to include(status: "err", failed_attempts: 1, error: "Zombie", error_message: "zombie")
+      end
+    end
+  end
+end
data/spec/postjob/events/job_event_spec.rb
@@ -0,0 +1,80 @@
+require "spec_helper"
+
+module EventTestWorkflow
+  module OddWorkflow
+    def self.run(id)
+      raise "this is for the odd" unless id.odd?
+    end
+
+    Postjob.register_workflow self
+  end
+
+  def self.run
+    await OddWorkflow, 1
+    await OddWorkflow, 2
+  end
+
+  Postjob.register_workflow self
+end
+
+describe "Job Events" do
+  include TestHelper
+
+  let!(:job) do
+    id = Postjob.enqueue! "EventTestWorkflow"
+    load_job id
+  end
+
+  before do
+    TestHelper.resolve_all
+  end
+
+  def job_events(*jobs)
+    events_query = "SELECT * FROM postjob.events WHERE postjob_id = ANY($1) ORDER BY created_at"
+    Simple::SQL.all events_query, jobs.map(&:id), into: OpenStruct
+  end
+
+  it "creates events on each job status change" do
+    good_child = load_job("SELECT * FROM postjob.postjobs WHERE parent_id=#{job.id} ORDER BY id")
+    bad_child = load_job("SELECT * FROM postjob.postjobs WHERE parent_id=#{job.id} ORDER BY id OFFSET 1")
+
+    expect(job_events(good_child).map(&:name)).to eq(["ready", "processing", "ok"])
+
+    bad_child_events = %w(
+      ready
+      processing err
+      processing err
+      processing err
+      processing err
+      processing failed
+    )
+    expect(job_events(bad_child).map(&:name)).to eq(bad_child_events)
+
+    toplevel_events = %w(
+      ready
+      processing sleep ready
+      processing sleep ready
+      processing sleep ready
+      processing sleep ready
+      processing sleep ready
+      processing sleep ready
+      processing failed
+    )
+    expect(job_events(job).map(&:name)).to eq(toplevel_events)
+  end
+
+  describe "automatic event creation" do
+    let(:jobs) { Simple::SQL.all("SELECT * FROM postjob.postjobs WHERE root_id=#{job.id}", into: Postjob::Job) }
+    let(:events) { job_events(*jobs) }
+
+    it "sets the job's last_worker_session_id" do
+      last_worker_session_ids = jobs.map(&:last_worker_session_id)
+      expect(last_worker_session_ids.uniq).to eq([Postjob.current_worker_session.id])
+    end
+
+    it "records the worker session id" do
+      worker_session_ids = events.map(&:worker_session_id).uniq
+      expect(worker_session_ids).to eq([Postjob.current_worker_session.id])
+    end
+  end
+end