RubyGems - flapjack - Versions diffs - 0.7.27 → 0.7.28 - Mend

flapjack 0.7.27 → 0.7.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64)

data/.gitignore +1 -0
data/CHANGELOG.md +9 -0
data/bin/flapjack +22 -28
data/bin/flapjack-nagios-receiver +5 -27
data/bin/flapjack-populator +2 -2
data/bin/flapper +13 -14
data/bin/receive-events +3 -20
data/bin/simulate-failed-check +3 -20
data/etc/flapjack_config.yaml.example +119 -86
data/features/cli.feature +69 -0
data/features/events.feature +15 -0
data/features/packaging-lintian.feature +4 -6
data/features/rollup.feature +198 -0
data/features/steps/cli_steps.rb +81 -0
data/features/steps/events_steps.rb +26 -16
data/features/steps/notifications_steps.rb +2 -2
data/features/steps/packaging-lintian_steps.rb +2 -2
data/features/support/daemons.rb +113 -0
data/features/support/env.rb +26 -4
data/lib/flapjack/configuration.rb +2 -0
data/lib/flapjack/data/contact.rb +76 -5
data/lib/flapjack/data/entity_check.rb +16 -0
data/lib/flapjack/data/message.rb +11 -8
data/lib/flapjack/data/notification.rb +31 -3
data/lib/flapjack/data/notification_rule.rb +1 -1
data/lib/flapjack/filters/delays.rb +1 -5
data/lib/flapjack/gateways/api/contact_methods.rb +12 -6
data/lib/flapjack/gateways/email.rb +35 -26
data/lib/flapjack/gateways/email/alert.html.erb +4 -4
data/lib/flapjack/gateways/email/alert.text.erb +2 -2
data/lib/flapjack/gateways/email/alert_subject.text.erb +14 -0
data/lib/flapjack/gateways/email/rollup.html.erb +48 -0
data/lib/flapjack/gateways/email/rollup.text.erb +20 -0
data/lib/flapjack/gateways/email/rollup_subject.text.erb +19 -0
data/lib/flapjack/gateways/jabber.rb +97 -47
data/lib/flapjack/gateways/sms_messagenet.rb +26 -24
data/lib/flapjack/gateways/sms_messagenet/alert.text.erb +15 -0
data/lib/flapjack/gateways/sms_messagenet/rollup.text.erb +34 -0
data/lib/flapjack/gateways/web/views/contact.html.erb +16 -8
data/lib/flapjack/notifier.rb +17 -4
data/lib/flapjack/processor.rb +1 -1
data/lib/flapjack/version.rb +1 -1
data/spec/lib/flapjack/coordinator_spec.rb +19 -19
data/spec/lib/flapjack/data/contact_spec.rb +100 -25
data/spec/lib/flapjack/data/event_spec.rb +1 -1
data/spec/lib/flapjack/data/message_spec.rb +1 -1
data/spec/lib/flapjack/data/notification_spec.rb +11 -3
data/spec/lib/flapjack/gateways/api/contact_methods_spec.rb +36 -17
data/spec/lib/flapjack/gateways/api/entity_check_presenter_spec.rb +1 -1
data/spec/lib/flapjack/gateways/api/entity_methods_spec.rb +38 -38
data/spec/lib/flapjack/gateways/api/entity_presenter_spec.rb +15 -15
data/spec/lib/flapjack/gateways/email_spec.rb +4 -4
data/spec/lib/flapjack/gateways/jabber_spec.rb +13 -14
data/spec/lib/flapjack/gateways/oobetet_spec.rb +2 -2
data/spec/lib/flapjack/gateways/pagerduty_spec.rb +5 -5
data/spec/lib/flapjack/gateways/sms_messagenet.spec.rb +1 -1
data/spec/lib/flapjack/gateways/web/views/contact.html.erb_spec.rb +2 -2
data/spec/lib/flapjack/gateways/web_spec.rb +4 -4
data/spec/lib/flapjack/logger_spec.rb +3 -3
data/spec/lib/flapjack/pikelet_spec.rb +10 -10
data/spec/lib/flapjack/processor_spec.rb +4 -4
data/spec/lib/flapjack/redis_pool_spec.rb +1 -1
metadata +70 -5
checksums.yaml +0 -15

data/features/cli.feature ADDED

@@ -0,0 +1,69 @@
+@process
+Feature: command line utility
+  As a systems administrator
+  I should be able to manage Flapjack
+  From the command line
+  Background:
+    Given a file named "flapjack_cfg.yml" with:
+"""
+test:
+  redis:
+    db: 14
+  processor:
+    enabled: yes
+    logger:
+      level: warn
+"""
+    And a file named "flapjack_cfg_d.yml" with:
+"""
+test:
+  pid_file: tmp/cucumber_cli/flapjack_d.pid
+  log_file: tmp/cucumber_cli/flapjack_d.log
+  redis:
+    db: 14
+  processor:
+    enabled: yes
+    logger:
+      level: warn
+"""
+  Scenario: Starting flapjack
+    When I start flapjack with `flapjack start --config tmp/cucumber_cli/flapjack_cfg.yml`
+    Then flapjack should start within 15 seconds
+  Scenario: Stopping flapjack via SIGINT
+    When I start flapjack with `flapjack start --config tmp/cucumber_cli/flapjack_cfg.yml`
+    Then flapjack should start within 15 seconds
+    When I send a SIGINT to the flapjack process
+    Then flapjack should stop within 15 seconds
+  Scenario: Starting and stopping flapjack, daemonized
+    When I start flapjack (daemonised) with `flapjack start -d --config tmp/cucumber_cli/flapjack_cfg_d.yml`
+    Then flapjack should start within 15 seconds
+    When I stop flapjack with `flapjack stop --config tmp/cucumber_cli/flapjack_cfg_d.yml`
+    Then flapjack should stop within 15 seconds
+  Scenario: Starting, restarting and stopping flapjack, daemonized
+    When I start flapjack (daemonised) with `flapjack start -d --config tmp/cucumber_cli/flapjack_cfg_d.yml`
+    Then flapjack should start within 15 seconds
+    When I restart flapjack with `flapjack restart -d --config tmp/cucumber_cli/flapjack_cfg_d.yml`
+    Then flapjack should restart within 15 seconds
+    When I stop flapjack with `flapjack stop --config tmp/cucumber_cli/flapjack_cfg_d.yml`
+    Then flapjack should stop within 15 seconds
+  Scenario: Reloading flapjack configuration
+    When I start flapjack with `flapjack start --config tmp/cucumber_cli/flapjack_cfg.yml`
+    When I run `mv tmp/cucumber_cli/flapjack_cfg.yml tmp/cucumber_cli/flapjack_cfg.yml.bak`
+    Given a file named "flapjack_cfg.yml" with:
+"""
+test:
+  redis:
+    db: 14
+  processor:
+    enabled: no
+"""
+    When I send a SIGHUP to the flapjack process
+    # TODO how to test for config file change?
+    When I send a SIGINT to the flapjack process
+    Then flapjack should stop within 15 seconds

data/features/events.feature CHANGED

@@ -90,6 +90,21 @@ Feature: events
     And   a critical event is received
     Then  a notification should not be generated
+  @time
+  Scenario: Alert when coming out of scheduled maintenance
+    Given the check is in an ok state
+    And   the check is in scheduled maintenance for 3 hours
+    When  a critical event is received
+    And   1 minute passes
+    And   a critical event is received
+    Then  a notification should not be generated
+    And   2 hours passes
+    And   a critical event is received
+    Then  a notification should not be generated
+    When  1 hours passes
+    And   a critical event is received
+    Then  a notification should be generated
   @time
   Scenario: Check ok to critical for 1 minute when in unscheduled maintenance
     Given the check is in an ok state

data/features/packaging-lintian.feature CHANGED

@@ -3,15 +3,13 @@ Feature: Packagability
   It must be easily packagable
   Scenario: No rubygems references
-    Given I am at the project root
-    When I run "grep require lib/* bin/* -R |grep rubygems"
-    Then the exit status should be 1
+    When I run `grep require lib/* bin/* -R |grep rubygems`
+    Then the exit value should be 1
     And I should see 0 lines of output
   Scenario: A shebang that works everywhere
-    Given I am at the project root
-    When I run "find lib/ -type 'f' -name '*.rb'"
-    Then the exit status should be 0
+    When I run `find lib/ -type 'f' -name '*.rb'`
+    Then the exit value should be 0
     And every file in the output should start with "#!/usr/bin/env ruby"

data/features/rollup.feature ADDED

@@ -0,0 +1,198 @@
+@rollup @notification_rules @resque @processor @notifier @events
+Feature: Rollup on a per contact, per media basis
+  Background:
+    Given the following users exist:
+      | id  | first_name | last_name | email             | sms          | timezone         |
+      | 1   | Malak      | Al-Musawi | malak@example.com | +61400000001 | Asia/Baghdad     |
+    And the following entities exist:
+      | id  | name           | contacts |
+      | 1   | foo            | 1        |
+      | 2   | baz            | 1        |
+    And user 1 has the following notification intervals:
+      | email | sms |
+      | 15    | 15  |
+    And user 1 has the following notification rollup thresholds:
+      | email | sms |
+      | 1     | 2   |
+    And user 1 has the following notification rules:
+      | entities | unknown_media | warning_media | critical_media   |
+      |          |               | email         | sms,email        |
+  @time
+  Scenario: Rollup threshold of 1 means first alert is a rollup
+    Given the check is check 'ping' on entity 'foo'
+    And   the check is in an ok state
+    When  a critical event is received
+    Then  no email alerts should be queued for malak@example.com
+    When  1 minute passes
+    And   a critical event is received
+    Then  1 email alert of type problem and rollup problem should be queued for malak@example.com
+    When  1 minute passes
+    And   an ok event is received
+    Then  1 email alert of type recovery and rollup recovery should be queued for malak@example.com
+  @time
+  Scenario: Acknowledgement ending rollup generates rollup recovery message ignoring interval
+    Given the check is check 'ping' on entity 'foo'
+    And   the check is in an ok state
+    When  a critical event is received
+    Then  no email alerts should be queued for malak@example.com
+    When  1 minute passes
+    And   a critical event is received
+    Then  1 email alert of type problem and rollup problem should be queued for malak@example.com
+    When  10 minutes passes
+    And   an acknowledgement event is received
+    Then  1 email alert of rollup recovery should be queued for malak@example.com
+    And   2 email alerts should be queued for malak@example.com
+  @time
+  Scenario: Transition to rollup when threshold is met
+    Given check 'ping' for entity 'foo' is in an ok state
+    And   check 'ping' for entity 'baz' is in an ok state
+    When  a critical event is received for check 'ping' on entity 'foo'
+    Then  no sms alerts should be queued for +61400000001
+    When  1 minute passes
+    And   a critical event is received for check 'ping' on entity 'foo'
+    Then  1 sms alert of type problem and rollup none should be queued for +61400000001
+    When  5 minutes passes
+    And   a critical event is received for check 'ping' on entity 'baz'
+    And   1 minute passes
+    And   a critical event is received for check 'ping' on entity 'baz'
+    Then  1 sms alert of type problem and rollup none should be queued for +61400000001
+    And   1 sms alert of type problem and rollup problem should be queued for +61400000001
+    When  1 minute passes
+    And   an ok event is received for check 'ping' on entity 'foo'
+    Then  no sms alerts of type recovery and rollup none should be queued for +61400000001
+    And   1 sms alert of type recovery and rollup recovery should be queued for +61400000001
+    And   3 sms alerts should be queued for +61400000001
+    When  1 minute passes
+    And   an ok event is received for check 'ping' on entity 'baz'
+    Then  1 sms alert of type recovery and rollup none should be queued for +61400000001
+    And   1 sms alert of type recovery and rollup recovery should be queued for +61400000001
+    And   4 sms alerts should be queued for +61400000001
+  @time
+  Scenario: Acknowledgement delays rollup kick-in
+    Given check 'ping' for entity 'foo' is in an ok state
+    And   check 'ping' for entity 'baz' is in an ok state
+    When  a critical event is received for check 'ping' on entity 'foo'
+    Then  no sms alerts should be queued for +61400000001
+    When  1 minute passes
+    And   a critical event is received for check 'ping' on entity 'foo'
+    Then  1 sms alert of type problem and rollup none should be queued for +61400000001
+    When  5 minutes passes
+    And   an acknowledgement event is received for check 'ping' on entity 'foo'
+    Then  1 sms alert of type acknowledgement and rollup none should be queued for +61400000001
+    And   2 sms alerts should be queued for +61400000001
+    When  a critical event is received for check 'ping' on entity 'baz'
+    And   1 minute passes
+    And   a critical event is received for check 'ping' on entity 'baz'
+    Then  2 sms alerts of type problem and rollup none should be queued for +61400000001
+    And   3 sms alerts should be queued for +61400000001
+  @time
+  Scenario: Acknowledgement hastens rollup recovery
+    Given check 'ping' for entity 'foo' is in an ok state
+    And   check 'ping' for entity 'baz' is in an ok state
+    When  a critical event is received for check 'ping' on entity 'foo'
+    And   1 minute passes
+    And   a critical event is received for check 'ping' on entity 'foo'
+    Then  1 sms alerts of type problem and rollup none should be queued for +61400000001
+    When  5 minutes passes
+    And   a critical event is received for check 'ping' on entity 'baz'
+    And   1 minute passes
+    And   a critical event is received for check 'ping' on entity 'baz'
+    Then  1 sms alert of type problem and rollup problem should be queued for +61400000001
+    And   2 sms alerts should be queued for +61400000001
+    When  an acknowledgement event is received for check 'ping' on entity 'foo'
+    Then  1 sms alert of type acknowledgement and rollup recovery should be queued for +61400000001
+    And   3 sms alerts should be queued for +61400000001
+    When  30 minutes passes
+    And   a critical event is received for check 'ping' on entity 'baz'
+    Then  2 sms alerts of type problem and rollup none should be queued for +61400000001
+    And   4 sms alerts should be queued for +61400000001
+  @time
+  Scenario: Scheduled maintenance hastens rollup recovery
+    Given check 'ping' for entity 'foo' is in an ok state
+    And   check 'ping' for entity 'baz' is in an ok state
+    When  a critical event is received for check 'ping' on entity 'foo'
+    And   1 minute passes
+    And   a critical event is received for check 'ping' on entity 'foo'
+    Then  1 sms alerts of type problem and rollup none should be queued for +61400000001
+    When  5 minutes passes
+    And   a critical event is received for check 'ping' on entity 'baz'
+    And   1 minute passes
+    And   a critical event is received for check 'ping' on entity 'baz'
+    Then  1 sms alert of type problem and rollup problem should be queued for +61400000001
+    And   2 sms alerts should be queued for +61400000001
+    When  check 'ping' for entity 'foo' is in scheduled maintenance for 1 day
+    And   30 minutes passes
+    And   a critical event is received for check 'ping' on entity 'baz'
+    Then  1 sms alert of rollup recovery should be queued for +61400000001
+  @time
+  Scenario: Unscheduled maintenance ending promotes rollup
+    Given check 'ping' for entity 'foo' is in unscheduled maintenance
+    And   check 'ping' for entity 'baz' is in an ok state
+    When  a critical event is received for check 'ping' on entity 'foo'
+    And   1 minute passes
+    And   a critical event is received for check 'ping' on entity 'foo'
+    Then  0 sms alerts should be queued for +61400000001
+    When  5 minutes passes
+    And   a critical event is received for check 'ping' on entity 'baz'
+    And   1 minute passes
+    And   a critical event is received for check 'ping' on entity 'baz'
+    Then  1 sms alert of type problem and rollup none should be queued for +61400000001
+    And   1 sms alerts should be queued for +61400000001
+    When  4 hours passes
+    And   a critical event is received for check 'ping' on entity 'foo'
+    Then  1 sms alert of type problem and rollup problem should be queued for +61400000001
+    And   2 sms alerts should be queued for +61400000001
+  @time
+  Scenario: Scheduled maintenance ending promotes rollup
+    Given check 'ping' for entity 'foo' is in an ok state
+    Given check 'ping' for entity 'foo' is in scheduled maintenance for 4 hours
+    And   check 'ping' for entity 'baz' is in an ok state
+    When  a critical event is received for check 'ping' on entity 'foo'
+    And   1 minute passes
+    And   a critical event is received for check 'ping' on entity 'foo'
+    Then  0 sms alerts should be queued for +61400000001
+    When  5 minutes passes
+    And   a critical event is received for check 'ping' on entity 'baz'
+    And   1 minute passes
+    And   a critical event is received for check 'ping' on entity 'baz'
+    Then  1 sms alert of type problem and rollup none should be queued for +61400000001
+    And   1 sms alerts should be queued for +61400000001
+    When  4 hours passes
+    And   a critical event is received for check 'ping' on entity 'foo'
+    And   1 minute passes
+    And   a critical event is received for check 'ping' on entity 'foo'
+    Then  1 sms alert of type problem and rollup problem should be queued for +61400000001
+    And   2 sms alerts should be queued for +61400000001
+#  @time
+#  Scenario: Contact ceases to be a contact on an entity that they were being alerted for
+#    Given check 'ping' for entity 'foo' is in an ok state
+#    And   check 'ping' for entity 'baz' is in an ok state
+#    When  a critical event is received for check 'ping' on entity 'foo'
+#    And   1 minute passes
+#    And   a critical event is received for check 'ping' on entity 'foo'
+#    Then  1 sms alerts of type problem and rollup none should be queued for +61400000001
+#    When  5 minutes passes
+#    And   a critical event is received for check 'ping' on entity 'baz'
+#    And   1 minute passes
+#    And   a critical event is received for check 'ping' on entity 'baz'
+#    Then  1 sms alert of type problem and rollup problem should be queued for +61400000001
+#    And   2 sms alerts should be queued for +61400000001
+#    When  1 minute passes
+#    And   user 1 ceases to be a contact of entity 'foo'
+#    And   a critical event is received for check 'ping' on entity 'baz'
+#    Then  1 sms alert of rollup recovery should be queued for +61400000001

data/features/steps/cli_steps.rb ADDED

@@ -0,0 +1,81 @@
+Given /^PENDING/ do
+  pending
+end
+Given /^a file named "([^"]*)" with:$/ do |file_name, file_content|
+  write_file(file_name, file_content)
+end
+When /^I ((?:re)?start|stop) flapjack( \(daemonised\))? with `(.+)`$/ do |start_stop_restart, daemonise, cmd|
+  @root = Pathname.new(File.dirname(__FILE__)).parent.parent.expand_path
+  command = "#{@root.join('bin')}/#{cmd}"
+  case start_stop_restart
+  when 'start'
+    @process_h = spawn_process(command,
+                  :daemon_pidfile => (daemonise.nil? || daemonise.empty?) ? nil : 'tmp/cucumber_cli/flapjack_d.pid')
+  when 'stop', 'restart'
+    `#{command}`
+  end
+end
+When /^I send a SIG(\w+) to the flapjack process$/ do |signal|
+  process = @process_h[:process]
+  pid     = process ? process.pid : @process_h[:pid]
+  Process.kill(signal, pid)
+end
+Then /^flapjack should ((?:re)?start|stop) within (\d+) seconds$/ do |start_stop_restart, seconds|
+  process = @process_h[:process]
+  pid     = process ? process.pid : @process_h[:pid]
+  running = nil
+  attempts = 0
+  max_attempts = seconds.to_i * 200
+  case start_stop_restart
+  when 'start'
+    begin
+      Process.kill(0, pid)
+      running = true
+    rescue Errno::EINVAL, Errno::ESRCH, RangeError, Errno::EPERM => e
+      attempts += 1; sleep 0.1; retry if attempts < max_attempts
+      running = false
+    end
+    running.should be_true
+  when 'stop'
+    if process
+      # it's a child process, so we can use waitpid
+      begin
+        Timeout::timeout(seconds.to_i) do
+          Process.waitpid(pid)
+          running = false
+        end
+      rescue Timeout::Error
+        running = true
+      end
+    else
+      # started via dante, so we'll need to monitor externally
+      while (running != false) && (attempts < max_attempts)
+        begin
+          Process.kill(0, pid)
+          attempts += 1; sleep 0.1
+          running = true
+        rescue Errno::EINVAL, Errno::ESRCH, RangeError, Errno::EPERM => e
+          running = false
+        end
+      end
+    end
+    running.should be_false
+  when 'restart'
+    read_pid = nil
+    while attempts < max_attempts
+      time_and_pid = time_and_pid_from_file('tmp/cucumber_cli/flapjack_d.pid')
+      read_pid = time_and_pid.last
+      break if read_pid != pid
+      attempts += 1; sleep 0.1
+    end
+    read_pid.should_not == pid
+  end
+end

data/features/steps/events_steps.rb CHANGED

@@ -22,7 +22,7 @@ def submit_event(event)
   @redis.rpush 'events', event.to_json
 end
-def set_scheduled_maintenance(entity, check, duration = 60*60*2)
+def set_scheduled_maintenance(entity, check, duration)
   entity_check = Flapjack::Data::EntityCheck.for_entity_name(entity, check, :redis => @redis)
   t = Time.now.to_i
   entity_check.create_scheduled_maintenance(t, duration, :summary => "upgrading everything")
@@ -200,11 +200,12 @@ Given /^(?:the check|check '([\w\.\-]+)' for entity '([\w\.\-]+)') is in a criti
   set_critical_state(entity, check)
 end
-Given /^(?:the check|check '([\w\.\-]+)' for entity '([\w\.\-]+)') is in scheduled maintenance$/ do |check, entity|
+Given /^(?:the check|check '([\w\.\-]+)' for entity '([\w\.\-]+)') is in scheduled maintenance(?: for (.+))?$/ do |check, entity, duration|
   check  ||= @check
   entity ||= @entity
+  durn = duration ? ChronicDuration.parse(duration) : 60*60*2
   remove_unscheduled_maintenance(entity, check)
-  set_scheduled_maintenance(entity, check)
+  set_scheduled_maintenance(entity, check, durn)
 end
 # TODO set the state directly rather than submit & drain
@@ -345,6 +346,14 @@ Given /^user (\d+) has the following notification intervals:$/ do |contact_id, i
   end
 end
+Given /^user (\d+) has the following notification rollup thresholds:$/ do |contact_id, rollup_thresholds|
+  contact = Flapjack::Data::Contact.find_by_id(contact_id, :redis => @redis)
+  rollup_thresholds.hashes.each do |rollup_threshold|
+    contact.set_rollup_threshold_for_media('email', rollup_threshold['email'].to_i)
+    contact.set_rollup_threshold_for_media('sms',   rollup_threshold['sms'].to_i)
+  end
+end
 Given /^user (\d+) has the following notification rules:$/ do |contact_id, rules|
   contact = Flapjack::Data::Contact.find_by_id(contact_id, :redis => @redis)
   timezone = contact.timezone
@@ -395,24 +404,25 @@ Then /^all alert dropping keys for user (\d+) should have expired$/ do |contact_
   @redis.keys("drop_alerts_for_contact:#{contact_id}*").should be_empty
 end
-Then /^(.*) email alert(?:s)? should be queued for (.*)$/ do |num_queued, address|
+Then /^(\w+) (\w+) alert(?:s)?(?: of)?(?: type (\w+))?(?: and)?(?: rollup (\w+))? should be queued for (.*)$/ do |num_queued, media, notification_type, rollup, address|
   check  = check  ? check  : @check
   entity = entity ? entity : @entity
   case num_queued
   when 'no'
     num_queued = 0
   end
-  queue  = Resque.peek('email_notifications', 0, 30)
-  queue.find_all {|n| n['args'].first['address'] == address }.length.should == num_queued.to_i
+  queue = Resque.peek("#{media}_notifications", 0, 30)
+  queue.find_all {|n|
+    type_ok = notification_type ? ( n['args'].first['notification_type'] == notification_type ) : true
+    rollup_ok = true
+    if rollup
+      if rollup == 'none'
+        rollup_ok = n['args'].first['rollup'].nil?
+      else
+        rollup_ok = n['args'].first['rollup'] == rollup
+      end
+    end
+    type_ok && rollup_ok && ( n['args'].first['address'] == address )
+  }.length.should == num_queued.to_i
 end
-Then /^(.*) sms alert(?:s)? should be queued for (.*)$/ do |num_queued, address|
-  check  = check  ? check  : @check
-  entity = entity ? entity : @entity
-  case num_queued
-  when 'no'
-    num_queued = 0
-  end
-  queue  = Resque.peek('sms_notifications', 0, 30)
-  queue.find_all {|n| n['args'].first['address'] == address }.length.should == num_queued.to_i
-end