scout_agent 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. data/AUTHORS +4 -0
  2. data/CHANGELOG +3 -0
  3. data/COPYING +340 -0
  4. data/INSTALL +17 -0
  5. data/LICENSE +6 -0
  6. data/README +3 -0
  7. data/Rakefile +123 -0
  8. data/TODO +3 -0
  9. data/bin/scout_agent +11 -0
  10. data/lib/scout_agent.rb +73 -0
  11. data/lib/scout_agent/agent.rb +42 -0
  12. data/lib/scout_agent/agent/communication_agent.rb +85 -0
  13. data/lib/scout_agent/agent/master_agent.rb +301 -0
  14. data/lib/scout_agent/api.rb +241 -0
  15. data/lib/scout_agent/assignment.rb +105 -0
  16. data/lib/scout_agent/assignment/configuration.rb +30 -0
  17. data/lib/scout_agent/assignment/identify.rb +110 -0
  18. data/lib/scout_agent/assignment/queue.rb +95 -0
  19. data/lib/scout_agent/assignment/reset.rb +91 -0
  20. data/lib/scout_agent/assignment/snapshot.rb +92 -0
  21. data/lib/scout_agent/assignment/start.rb +149 -0
  22. data/lib/scout_agent/assignment/status.rb +44 -0
  23. data/lib/scout_agent/assignment/stop.rb +60 -0
  24. data/lib/scout_agent/assignment/upload_log.rb +61 -0
  25. data/lib/scout_agent/core_extensions.rb +260 -0
  26. data/lib/scout_agent/database.rb +386 -0
  27. data/lib/scout_agent/database/mission_log.rb +282 -0
  28. data/lib/scout_agent/database/queue.rb +126 -0
  29. data/lib/scout_agent/database/snapshots.rb +187 -0
  30. data/lib/scout_agent/database/statuses.rb +65 -0
  31. data/lib/scout_agent/dispatcher.rb +157 -0
  32. data/lib/scout_agent/id_card.rb +143 -0
  33. data/lib/scout_agent/lifeline.rb +243 -0
  34. data/lib/scout_agent/mission.rb +212 -0
  35. data/lib/scout_agent/order.rb +58 -0
  36. data/lib/scout_agent/order/check_in_order.rb +32 -0
  37. data/lib/scout_agent/order/snapshot_order.rb +33 -0
  38. data/lib/scout_agent/plan.rb +306 -0
  39. data/lib/scout_agent/server.rb +123 -0
  40. data/lib/scout_agent/tracked.rb +59 -0
  41. data/lib/scout_agent/wire_tap.rb +513 -0
  42. data/setup.rb +1360 -0
  43. data/test/tc_core_extensions.rb +89 -0
  44. data/test/tc_id_card.rb +115 -0
  45. data/test/tc_plan.rb +285 -0
  46. data/test/test_helper.rb +22 -0
  47. data/test/ts_all.rb +7 -0
  48. metadata +171 -0
@@ -0,0 +1,65 @@
1
+ #!/usr/bin/env ruby -wKU
2
+
3
module ScoutAgent
  class Database
    #
    # A Database holding the +statuses+ table: one row per named process with
    # its PID, a free-form status message, and a +last_updated_at+ column.
    #
    # All readers and writers below log SQLite errors instead of raising them.
    #
    class Statuses < Database
      #
      # Returns the SQL that upgrades the schema from +version+ (defaults to
      # +schema_version+), or +nil+ when no upgrade is defined for it.  Only
      # version +0+ has an upgrade:  the initial table definition.
      #
      # NOTE(review):  REQUIRED_TEXT_TYPE, DATETIME_TYPE, and
      # DEFAULT_LOCALTIME_TRIGGER are not SQL; presumably Database expands
      # them before execution -- confirm against the parent class.
      #
      def update_schema(version = schema_version)
        return unless version == 0

        <<-END_INITIAL_SCHEMA.trim
        CREATE TABLE statuses (
        name TEXT NOT NULL PRIMARY KEY
        CHECK( name IN ( 'lifeline', 'master', 'mission',
        'communication', 'queue', 'snapshot' ) ),
        pid INTEGER NOT NULL,
        status REQUIRED_TEXT_TYPE,
        last_updated_at DATETIME_TYPE
        );
        DEFAULT_LOCALTIME_TRIGGER statuses last_updated_at
        END_INITIAL_SCHEMA
      end

      #
      # Inserts or replaces the row for +name+ (the current IDCard's process
      # name by default) with this process's PID and the given +status+.
      #
      # A SQLite error is logged and otherwise ignored so the caller can simply
      # try again on its next update.
      #
      def update_status(status, name = IDCard.me && IDCard.me.process_name)
        sql = <<-END_UPDATE_STATUS.trim
        INSERT OR REPLACE INTO statuses(name, pid, status, last_updated_at)
        VALUES( ?, ?, ?, null)
        END_UPDATE_STATUS
        write_to_sqlite { |db| db.execute(sql, name, Process.pid, status) }
      rescue Amalgalite::SQLite3::Error => failure  # failed to update status
        # do nothing:  try again later
        log.error("Database status update error:  #{failure.message}.")
      end

      #
      # Deletes the row for +name+ (the current IDCard's process name by
      # default).  A SQLite error is logged and otherwise ignored because a
      # new process replaces the row anyway.
      #
      def clear_status(name = IDCard.me && IDCard.me.process_name)
        write_to_sqlite { |db|
          db.execute("DELETE FROM statuses WHERE name = ?", name)
        }
      rescue Amalgalite::SQLite3::Error => failure  # failed to delete status
        # do nothing:  new process will replace
        log.error("Database status clearing error:  #{failure.message}.")
      end

      #
      # Returns whatever query() yields for all status rows, ordered by ROWID.
      # On a SQLite error the problem is logged and an empty Array is returned.
      #
      def current_statuses
        sql = <<-END_FIND_STATUSES.trim
        SELECT name, pid, status, last_updated_at FROM statuses ORDER BY ROWID
        END_FIND_STATUSES
        query(sql)
      rescue Amalgalite::SQLite3::Error => failure  # failed to find statuses
        log.error("Database statuses error:  #{failure.message}.")
        [ ]  # return empty results
      end

      #
      # Returns the status message stored for +name+ (the current IDCard's
      # process name by default).  On a SQLite error the problem is logged and
      # +nil+ is returned.
      #
      def current_status(name = IDCard.me && IDCard.me.process_name)
        sql = <<-END_FIND_STATUS
        SELECT status FROM statuses WHERE name = ?
        END_FIND_STATUS
        read_from_sqlite do |db|
          db.first_value_from(sql, name)
        end
      rescue Amalgalite::SQLite3::Error => failure  # failed to find status
        log.error("Database current status error:  #{failure.message}.")
        nil  # return no results
      end
    end
  end
end
@@ -0,0 +1,157 @@
1
+ #!/usr/bin/env ruby -wKU
2
+
3
module ScoutAgent
  #
  # The command-line front end.  It parses switches, works out which
  # assignment (command) was requested, loads the matching file from
  # <tt>LIB_DIR/assignment</tt>, and runs it.
  #
  module Dispatcher
    module_function

    #
    # Runs the full pipeline for +args+ (ARGV by default):  switches are
    # parsed and removed, the next argument selects the assignment, and the
    # remaining arguments are handed to the assignment itself.
    #
    def dispatch(args = ARGV)
      switches   = parse_switches(args)
      assignment = parse_assignment(args)
      code       = load_assignment(assignment)
      execute_assignment(assignment, code, switches, args)
    end

    #
    # Parses and removes all switches from +args+, applies them to the Plan,
    # and returns them as a Hash keyed by Symbol.
    #
    # Usage is printed and the process exits for <tt>--help</tt> and for any
    # switch that fails to parse; <tt>--version</tt> prints the version and
    # exits.
    #
    def parse_switches(args)
      switches = { }

      # only ARGV is Arguable by default, so teach other arrays about options()
      args.extend(OptionParser::Arguable) unless args.respond_to?(:options)

      args.options do |opts|
        opts.banner = <<-END_USAGE.trim
        Usage:

        [sudo] #{ScoutAgent.agent_name} [OPTIONS] COMMAND

        Use the commands identify, start, and stop to prepare, launch, and
        shutdown the agent respectively. Those require super user privileges.
        You can also use the status command to check in on a running agent.

        END_USAGE

        opts.separator "Basic Options:"
        opts.on( "-s", "--server URL", String,
                 "The URL for the server to report to." ) do |url|
          switches[:server_url] = url
        end
        opts.on( "-d", "--[no-]daemon",
                 "Run in the background as a daemon." ) do |boolean|
          switches[:run_as_daemon] = boolean
        end
        opts.on( "-l", "--logging-level LEVEL", %w[DEBUG INFO WARN ERROR FATAL],
                 "The minimum level of log message to record." ) do |level|
          switches[:logging_level] = level
        end
        opts.on( "-t", "--[no-]test-mode",
                 "Used in agent development." ) do |boolean|
          # test mode also forces a local server and foreground execution
          if switches[:test_mode] = boolean
            switches[:server_url]    = "http://localhost:4567"
            switches[:run_as_daemon] = false
          end
        end

        opts.separator "Expert Options:"
        opts.on( "--users NAME1,NAME2,...", Array,
                 "A list of users to try switching to." ) do |users|
          switches[:user_choices] = users
        end
        opts.on( "--groups NAME1,NAME2,...", Array,
                 "A list of groups to try switching to." ) do |groups|
          switches[:group_choices] = groups
        end
        opts.on( "--prefix PATH", String,
                 "A prefix path prepended to all other paths." ) do |path|
          switches[:prefix_path] = path
        end
        # one --os-*-path switch per OS specific directory
        [ %w[os_config_path configuration],
          %w[os_db_path     databases],
          %w[os_pid_path    PID\ files],
          %w[os_log_path    log\ files] ].each do |name, used_for|
          opts.on( "--#{name.tr('_', '-')} PATH", String,
                   "The path your OS uses for #{used_for}." ) do |path|
            switches[name.to_sym] = path
          end
        end

        opts.separator "Application Options:"
        opts.on( "-h", "--help",
                 "Show this message." ) do
          puts opts  # show usage
          exit
        end
        opts.on( "-v", "--version",
                 "Display the current version." ) do
          puts "#{ScoutAgent.proper_agent_name} v#{ScoutAgent::VERSION}"
          exit
        end

        begin
          opts.parse!
        rescue OptionParser::ParseError  # failed to parse options
          puts opts  # show usage
          exit
        end
      end

      # apply switches so paths will be set correctly for load checks
      Plan.update_from_switches(switches)

      switches
    end

    #
    # Removes the next argument from +args+ and returns it, downcased, as the
    # assignment name.
    #
    # When no argument is left a default is chosen:  <tt>"identify"</tt> if
    # Plan.present?() is false, otherwise <tt>"status"</tt> if the lifeline's
    # PID file exists and <tt>"start"</tt> if it does not.  A name containing
    # anything but word characters aborts the process.
    #
    def parse_assignment(args)
      assignment = args.shift.to_s.downcase
      if assignment.empty?
        return "identify" unless Plan.present?
        return IDCard.new(:lifeline).pid_file.exist? ? "status" : "start"
      end
      abort_with_unknown_assignment(assignment) unless assignment =~ /\A\w+\z/
      assignment
    end

    #
    # Finds the file in <tt>LIB_DIR/assignment</tt> matching +assignment+ and
    # returns its path.  The process is aborted when several files match or
    # when none does.
    #
    # Entries are compared by their String form.  Matching the Pathname
    # objects directly only worked while Pathname still had to_str() (Ruby
    # 1.8); without it nothing matches and every command looks unknown.
    #
    # NOTE(review):  the pattern is not anchored at the start, so +assignment+
    # may match in the middle of a file name (for example "log" finds
    # "upload_log.rb").  That is kept as is -- confirm whether it is intended.
    #
    def load_assignment(assignment)
      dir     = LIB_DIR + "assignment"
      pattern = /#{Regexp.escape(assignment)}\w*\.rb\z/
      matches = dir.entries.select { |entry| entry.to_s =~ pattern }
      if matches.size > 1
        abort_with_ambiguous_assignment(assignment, matches)
      elsif matches.first and (code = dir + matches.first).exist?
        return code
      else
        abort_with_unknown_assignment(assignment)
      end
    end

    #
    # Requires the file at +code+, looks up the class of the same name (in
    # CamelCase) under Assignment, and runs a new instance of it built from
    # +switches+ and +other_args+.  The process is aborted if the file did not
    # define that class.  +assignment+ is accepted but not used here.
    #
    def execute_assignment(assignment, code, switches, other_args)
      require code.to_s  # to_s keeps this independent of Pathname conversions
      class_name = code.basename(".rb").to_s.CamelCase
      begin
        loaded = Assignment.const_get(class_name)
      rescue NameError  # can't load module
        abort_with_missing_code(class_name)
      end
      loaded.new(switches, other_args).prepare_and_execute
    end

    #
    # Aborts the process with a message listing the quoted base names of all
    # +matches+ (Pathname objects) as candidates for +assignment+.
    #
    def abort_with_ambiguous_assignment(assignment, matches)
      choices          = matches.map { |m| "'#{m.basename('.rb')}'" }
      choices[-2..-1]  = choices[-2..-1].join(", or ")  # "..., 'a', or 'b'"
      abort <<-END_AMBIGUOUS
      Ambiguous command '#{assignment}'. Did you mean #{choices.join(', ')}?
      END_AMBIGUOUS
    end

    # Aborts the process because +assignment+ matched no known command.
    def abort_with_unknown_assignment(assignment)
      abort "Unknown command '#{assignment}'."
    end

    # Aborts the process because the loaded file did not define +class_name+.
    def abort_with_missing_code(class_name)
      abort "Failed to load '#{class_name}'."
    end
  end
end
@@ -0,0 +1,143 @@
1
+ #!/usr/bin/env ruby -wKU
2
+
3
module ScoutAgent
  #
  # This class encapsulates a named process.  It is used to ensure exclusive
  # execution and to signal other processes.
  #
  class IDCard
    class << self
      #
      # This global attribute should contain the IDCard of the current
      # process.  It is set during a successful authorize().
      #
      # <b>Warning:</b>  Be sure to clear this attribute immediately after a
      # fork() so you don't keep the parent's identity.
      #
      attr_accessor :me
    end

    #
    # Pass in the +process_name+ of the process you want to signal() or the
    # +process_name+ you wish to authorize() for yourself.
    #
    def initialize(process_name)
      @process_name = process_name
    end

    # The name this card was created with.
    attr_reader :process_name

    # A String representation of this process, with PID.
    def to_s
      "#{@process_name} (#{pid || 'unauthorized'})"
    end

    #
    # Returns the path to the unique PID file for this process, based on the
    # current Plan.
    #
    def pid_file
      Plan.pid_dir + "#{@process_name}.pid"
    end

    #
    # Returns the PID for the named process, or +nil+ if it cannot be read.
    #
    # An empty or malformed PID file also yields +nil+.  Converting such
    # content with to_i() would give +0+, and a PID of +0+ makes
    # Process.kill() signal the caller's whole process group.
    #
    def pid
      return nil unless pid_file.read =~ /\A\s*(\d+)/
      id = $1.to_i
      id > 0 ? id : nil
    rescue StandardError  # missing or unreadable file
      nil
    end

    #
    # Tries to send +message+ as a signal to the process represented by this
    # instance.  You can pass any message Process.kill() would understand.
    #
    # Returns +true+ if the signal was sent, or +false+ if no valid PID could
    # be read from the PID file.  Any Exception raised during the send, such
    # as Errno::ESRCH for a missing process, will bubble up to the calling
    # code.
    #
    def signal(message)
      id = pid
      return false unless id
      Process.kill(message, id)
      true
    end

    #
    # Claims this identity for this process.  The claim is made by exclusively
    # creating the PID file, so it fails if another process has claimed this
    # identity.  However, stale claims are ignored and replaced, if possible.
    #
    # If a block is given it is called while the new PID file is locked and
    # the claim only goes ahead when it returns a true value.  This allows
    # for daemonization before the PID is recorded.
    #
    # This method returns +true+ if the claim succeeded and +false+ if it
    # could not be made.  A return of +true+ indicates that me() has been
    # updated and an exit handler has been installed to revoke() this claim
    # as the process ends.
    #
    def authorize(&block)
      File.open(pid_file, File::CREAT | File::EXCL | File::WRONLY) do |file|
        file.flock(File::LOCK_EX)
        if !block_given? || yield  # allows for daemonization
          file.puts Process.pid
        else
          file.flock(File::LOCK_UN)
          revoke  # remove this file if anything went wrong
          return false
        end
        file.flock(File::LOCK_UN)
      end

      self.class.me = self

      at_my_exit do
        unless revoke
          # the PID file could not be removed; nothing more we can do here
        end
      end
      true
    rescue Errno::EEXIST  # pid_file already exists
      File.open(pid_file) do |file|
        # could not grab a lock to verify the existing PID file
        return false unless file.flock(File::LOCK_EX | File::LOCK_NB)

        if file.read =~ /\A\d+/
          begin
            signal(0)  # raises Errno::ESRCH if the owner is gone
          rescue Errno::ESRCH  # no such process:  the PID file is stale
            if revoke
              file.flock(File::LOCK_UN)  # release the lock before we recurse
              return authorize(&block)   # try again, keeping the caller's block
            end
          rescue Errno::EPERM, Errno::EACCES  # don't have permission
            # the process exists but isn't ours:  give up
          end
        end
        file.flock(File::LOCK_UN)  # release the lock
      end
      false  # the identity is held by someone else
    end

    #
    # Releases a held claim on a process name.  Returns +true+ if successful
    # or +false+ if the PID file didn't exist or couldn't be destroyed.
    #
    def revoke
      pid_file.unlink
      true
    rescue StandardError
      false
    end
  end
end
@@ -0,0 +1,243 @@
1
+ #!/usr/bin/env ruby -wKU
2
+
3
module ScoutAgent
  #
  # A Lifeline forks a named agent into a child process, listens on a pipe
  # for the child's regular check-ins, and stops and relaunches the child
  # whenever the check-ins stop or are malformed.
  #
  # NOTE(review):  status(), clear_status(), and status_database() are not
  # defined here; presumably they come from Tracked -- confirm.
  #
  class Lifeline
    include Tracked

    # seconds the parent waits for a check-in before giving up on the child
    NO_CONTACT_TIMEOUT   = 3
    # seconds between check-ins; gives us three check ins before a cutoff
    CHECK_IN_FREQUENCY   = 0.99
    # passed to Process.term_or_kill() when stopping the child
    TERM_TO_KILL_PAUSE   = 1
    # minutes a child must have been up before each successive relaunch
    RELAUNCH_FREQUENCIES = [0, 1, 1, 2, 3, 5, 8, 13]

    #################
    ### Interface ###
    #################

    #
    # Prepares a Lifeline for the agent named +agent+ that reports to +log+.
    # The creating process is remembered as the parent, and an exit handler
    # is registered that calls clear_status().
    #
    def initialize(agent, log = WireTap.new(nil))
      @agent      = agent
      @log        = log
      @parent_pid = Process.pid
      @child_pid  = nil

      @reader = @writer = nil
      @launch_and_monitor_thread   = nil
      @check_in_with_parent_thread = nil

      @code           = nil
      @last_launch    = nil
      @relaunch_index = 0

      at_my_exit { clear_status }
    end

    # The log this Lifeline writes to.
    attr_reader :log

    #
    # Starts a Thread that endlessly launches the child, monitors it until a
    # check-in is missed, and then stops it so the cycle can begin again.
    # Any exception in that Thread aborts the process.
    #
    def launch_and_monitor
      @launch_and_monitor_thread = Thread.new {
        Thread.current.abort_on_exception = true
        loop {
          wait_for_launch
          prepare_pipe
          launch_child
          close_writer
          monitor_child
          restart_child
        }
      }
    end

    #
    # In the parent process only:  stops the monitoring Thread and sends TERM
    # to the agent.  A child that is already gone is not an error.
    #
    def terminate
      return unless Process.pid == @parent_pid

      # stop monitoring
      log.info("Stopping the monitoring for '#{@agent}'.")
      @launch_and_monitor_thread.exit if @launch_and_monitor_thread
      # ask child process to exit
      log.info("Asking '#{@agent}' to stop.")
      IDCard.new(@agent).signal("TERM")
    rescue Errno::ESRCH  # no such process
      # it already exited, so we are fine
    end

    # In the parent process only:  waits for the monitoring Thread to finish.
    def join
      monitoring = Process.pid == @parent_pid && @launch_and_monitor_thread
      @launch_and_monitor_thread.join if monitoring
    end

    #######
    private
    #######

    ##############
    ### Parent ###
    ##############

    #
    # Throttles relaunches.  Nothing happens before the first launch.
    # Afterwards the child must have been up for the number of minutes found
    # at the current position in RELAUNCH_FREQUENCIES; any shortfall is slept
    # off and the position advances (stopping at the last entry).  A child
    # that outlived that time by more than the next entry resets the position
    # to the start instead.
    #
    def wait_for_launch
      return unless @last_launch

      uptime    = Time.now - @last_launch
      remaining = RELAUNCH_FREQUENCIES[@relaunch_index] * 60 - uptime
      next_step = RELAUNCH_FREQUENCIES[@relaunch_index + 1] ||
                  RELAUNCH_FREQUENCIES.last

      if remaining > 0
        log.info("Waiting #{remaining} seconds before relaunching '#{@agent}'.")
        sleep remaining
      elsif remaining.abs > next_step * 60
        @relaunch_index = 0
        return
      end

      last_index       = RELAUNCH_FREQUENCIES.size - 1
      @relaunch_index += 1 unless @relaunch_index == last_index
    end

    # Opens the pipe the child will use to check in with the parent.
    def prepare_pipe
      @reader, @writer = IO.pipe
    end

    #
    # Records the launch, then forks the child.  The child resets its
    # environment, loads and authorizes the agent code, starts checking in
    # with the parent, and finally runs the agent.
    #
    def launch_child
      log.info("Launching '#{@agent}'.")
      status(@agent)
      @last_launch = Time.now
      @child_pid   = fork {
        reset_environment
        close_reader
        load_code
        authorize_code
        check_in_with_parent
        run_code
      }
    end

    # Closes the writing end of the pipe, if there is one and it is open.
    def close_writer
      @writer.close
    rescue IOError        # already closed
      # it's closed so we're OK
    rescue NoMethodError  # @writer is nil
      # it wasn't set so there's nothing to close
    end

    #
    # Reads check-ins from the child until one fails to arrive within
    # NO_CONTACT_TIMEOUT seconds, the pipe is closed, or a line does not look
    # like <tt>"PID: YYYY-MM-DD HH:MM"</tt> for the current child.  Each of
    # those problems is logged before returning.
    #
    def monitor_child
      expected = /\A#{@child_pid}: \d{4}-\d{2}-\d{2} \d{2}:\d{2}\Z/
      loop do
        line = nil
        begin
          Timeout.timeout(NO_CONTACT_TIMEOUT) { line = @reader.gets }
          log.error("'#{@agent}' monitor channel has closed.") if line.nil?
        rescue Timeout::Error
          # line will stay nil
          log.error("'#{@agent}' failed to check-in in time.")
        end
        next if line.to_s =~ expected

        log.error("'#{@agent}' check-in was malformed.") unless line.nil?
        break
      end
    end

    #
    # Records that the child is restarting, closes our end of the pipe, and
    # stops the child with Process.term_or_kill().
    #
    def restart_child
      log.info("Stopping '#{@agent}'.")
      status(@agent, :restarting)
      close_reader
      Process.term_or_kill(@child_pid, TERM_TO_KILL_PAUSE)
    end

    #
    # Updates the stored status for +process+ by passing a message on to the
    # inherited status().  The stored status is either "Monitoring processes"
    # or "Restarting A and B" for every process currently being restarted:
    # +process+ is added to that list when +restarting+ is true and removed
    # from it otherwise.  Nothing happens without a status database.
    #
    def status(process, restarting = false)
      db = status_database
      return unless db

      db.write_to_sqlite do |sqlite|
        pending =
          if db.current_status =~ /\ARestarting (.+)\z/
            $1.split(" and ")
          else
            [ ]
          end

        if restarting
          pending << process unless pending.include? process
        else
          pending.delete(process)
        end

        if pending.empty?
          super("Monitoring processes")
        else
          super("Restarting #{pending.join(' and ')}")
        end
      end
    end

    #############
    ### Child ###
    #############

    #
    # Replaces the handlers inherited from the parent (shutdown now finishes
    # the agent code, ALRM alerts it) and drops the parent's identity.
    #
    def reset_environment
      # swap out our parent's signal handlers
      install_shutdown_handler { finish_code }
      trap("ALRM") { alert_code }

      # clear the parent's identity
      IDCard.me = nil
    end

    # Closes the reading end of the pipe, if there is one and it is open.
    def close_reader
      @reader.close
    rescue IOError        # already closed
      # it's closed so we're OK
    rescue NoMethodError  # @reader is nil
      # it wasn't set so there's nothing to close
    end

    #
    # Requires the agent code and instantiates the class named after the
    # agent (<tt>master</tt> becomes <tt>Agent::MasterAgent</tt>).
    #
    def load_code
      require LIB_DIR + "agent"
      require LIB_DIR + "agent/#{@agent}_agent"
      agent_class = ScoutAgent::Agent.const_get("#{@agent.CamelCase}Agent")
      @code       = agent_class.new
    end

    # Asks the loaded agent code to authorize itself.
    def authorize_code
      @code.authorize
    end

    #
    # Starts a Thread that writes <tt>"PID: YYYY-MM-DD HH:MM"</tt> to the pipe
    # every CHECK_IN_FREQUENCY seconds.  When the pipe can no longer be
    # written to, the agent code is finished and the Thread ends.
    #
    def check_in_with_parent
      @check_in_with_parent_thread = Thread.new {
        Thread.current.abort_on_exception = true
        loop {
          begin
            @writer.puts "#{Process.pid}: #{Time.now.strftime('%Y-%m-%d %H:%M')}"
          rescue Errno::EPIPE, IOError  # parent closed our pipe or exited
            break finish_code
          end
          sleep CHECK_IN_FREQUENCY
        }
      }
    end

    # Runs the loaded agent code.
    def run_code
      @code.run
    end

    #
    # Closes the pipe to the parent and finishes the agent code, or exits
    # right away if no code has been loaded yet.
    #
    def finish_code
      close_writer
      @code ? @code.finish : exit
    end

    # Tells the agent code, if it is loaded, to notice changes.
    def alert_code
      @code.notice_changes if @code
    end
  end
end