scout_agent 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (48) hide show
  1. data/AUTHORS +4 -0
  2. data/CHANGELOG +3 -0
  3. data/COPYING +340 -0
  4. data/INSTALL +17 -0
  5. data/LICENSE +6 -0
  6. data/README +3 -0
  7. data/Rakefile +123 -0
  8. data/TODO +3 -0
  9. data/bin/scout_agent +11 -0
  10. data/lib/scout_agent.rb +73 -0
  11. data/lib/scout_agent/agent.rb +42 -0
  12. data/lib/scout_agent/agent/communication_agent.rb +85 -0
  13. data/lib/scout_agent/agent/master_agent.rb +301 -0
  14. data/lib/scout_agent/api.rb +241 -0
  15. data/lib/scout_agent/assignment.rb +105 -0
  16. data/lib/scout_agent/assignment/configuration.rb +30 -0
  17. data/lib/scout_agent/assignment/identify.rb +110 -0
  18. data/lib/scout_agent/assignment/queue.rb +95 -0
  19. data/lib/scout_agent/assignment/reset.rb +91 -0
  20. data/lib/scout_agent/assignment/snapshot.rb +92 -0
  21. data/lib/scout_agent/assignment/start.rb +149 -0
  22. data/lib/scout_agent/assignment/status.rb +44 -0
  23. data/lib/scout_agent/assignment/stop.rb +60 -0
  24. data/lib/scout_agent/assignment/upload_log.rb +61 -0
  25. data/lib/scout_agent/core_extensions.rb +260 -0
  26. data/lib/scout_agent/database.rb +386 -0
  27. data/lib/scout_agent/database/mission_log.rb +282 -0
  28. data/lib/scout_agent/database/queue.rb +126 -0
  29. data/lib/scout_agent/database/snapshots.rb +187 -0
  30. data/lib/scout_agent/database/statuses.rb +65 -0
  31. data/lib/scout_agent/dispatcher.rb +157 -0
  32. data/lib/scout_agent/id_card.rb +143 -0
  33. data/lib/scout_agent/lifeline.rb +243 -0
  34. data/lib/scout_agent/mission.rb +212 -0
  35. data/lib/scout_agent/order.rb +58 -0
  36. data/lib/scout_agent/order/check_in_order.rb +32 -0
  37. data/lib/scout_agent/order/snapshot_order.rb +33 -0
  38. data/lib/scout_agent/plan.rb +306 -0
  39. data/lib/scout_agent/server.rb +123 -0
  40. data/lib/scout_agent/tracked.rb +59 -0
  41. data/lib/scout_agent/wire_tap.rb +513 -0
  42. data/setup.rb +1360 -0
  43. data/test/tc_core_extensions.rb +89 -0
  44. data/test/tc_id_card.rb +115 -0
  45. data/test/tc_plan.rb +285 -0
  46. data/test/test_helper.rb +22 -0
  47. data/test/ts_all.rb +7 -0
  48. metadata +171 -0
@@ -0,0 +1,65 @@
1
+ #!/usr/bin/env ruby -wKU
2
+
3
module ScoutAgent
  class Database
    #
    # A small database holding one row per agent process:  the process name,
    # its PID, a free-form status message, and the time of the last update.
    #
    class Statuses < Database
      #
      # Returns the SQL used to build the schema when +version+ is +0+.  Any
      # other +version+ has nothing to apply and yields +nil+.
      #
      def update_schema(version = schema_version)
        return unless 0 == version

        <<-END_INITIAL_SCHEMA.trim
        CREATE TABLE statuses (
          name            TEXT NOT NULL PRIMARY KEY
            CHECK( name IN ( 'lifeline', 'master', 'mission',
                             'communication', 'queue', 'snapshot' ) ),
          pid             INTEGER NOT NULL,
          status          REQUIRED_TEXT_TYPE,
          last_updated_at DATETIME_TYPE
        );
        DEFAULT_LOCALTIME_TRIGGER statuses last_updated_at
        END_INITIAL_SCHEMA
      end

      #
      # Records +status+ for the process called +name+ (the current process,
      # as known to IDCard, by default), replacing any row already stored
      # under that name.  The current PID is saved alongside the message.
      #
      # Database errors are logged and otherwise ignored.
      #
      def update_status(status, name = IDCard.me && IDCard.me.process_name)
        insert_sql = <<-END_UPDATE_STATUS.trim
        INSERT OR REPLACE INTO statuses(name, pid, status, last_updated_at)
        VALUES( ?, ?, ?, null)
        END_UPDATE_STATUS
        write_to_sqlite do |db|
          db.execute(insert_sql, name, Process.pid, status)
        end
      rescue Amalgalite::SQLite3::Error => failure  # failed to update status
        # nothing more to do here:  a later update can try again
        log.error("Database status update error: #{failure.message}.")
      end

      #
      # Removes the row stored for +name+ (the current process by default).
      #
      # Database errors are logged and otherwise ignored.
      #
      def clear_status(name = IDCard.me && IDCard.me.process_name)
        delete_sql = "DELETE FROM statuses WHERE name = ?"
        write_to_sqlite { |db| db.execute(delete_sql, name) }
      rescue Amalgalite::SQLite3::Error => failure  # failed to delete status
        # nothing more to do here:  a new process will replace the row
        log.error("Database status clearing error: #{failure.message}.")
      end

      #
      # Fetches the name, PID, status, and update time of every stored row, in
      # ROWID order.  An empty Array is returned if the lookup fails.
      #
      def current_statuses
        query(<<-END_FIND_STATUSES.trim)
        SELECT name, pid, status, last_updated_at FROM statuses ORDER BY ROWID
        END_FIND_STATUSES
      rescue Amalgalite::SQLite3::Error => failure  # failed to find statuses
        log.error("Database statuses error: #{failure.message}.")
        []  # return empty results
      end

      #
      # Fetches just the status message stored for +name+ (the current process
      # by default).  Returns +nil+ if the lookup fails.
      #
      def current_status(name = IDCard.me && IDCard.me.process_name)
        read_from_sqlite do |db|
          db.first_value_from(<<-END_FIND_STATUS, name)
          SELECT status FROM statuses WHERE name = ?
          END_FIND_STATUS
        end
      rescue Amalgalite::SQLite3::Error => failure  # failed to find status
        log.error("Database current status error: #{failure.message}.")
        nil  # return no results
      end
    end
  end
end
@@ -0,0 +1,157 @@
1
+ #!/usr/bin/env ruby -wKU
2
+
3
module ScoutAgent
  #
  # The command-line front end.  It parses switches, works out which
  # assignment (command) was requested, loads the file implementing it from
  # the "assignment" directory under LIB_DIR, and runs it.
  #
  module Dispatcher
    module_function

    #
    # The main entry point:  parses +args+ (ARGV by default) and executes the
    # requested assignment.  Switches and the command name are removed from
    # +args+ as they are parsed;  whatever remains is handed to the
    # assignment.
    #
    def dispatch(args = ARGV)
      switches   = parse_switches(args)
      assignment = parse_assignment(args)
      code       = load_assignment(assignment)
      execute_assignment(assignment, code, switches, args)
    end

    #
    # Parses the command-line switches out of +args+ and returns them as a
    # Hash.  The switches are also passed to Plan.update_from_switches()
    # before returning so that later path lookups see them.
    #
    # +args+ must respond to options(), which ARGV does once optparse is
    # loaded.  Usage is printed and the process exits for <tt>--help</tt> or
    # any switch that fails to parse;  <tt>--version</tt> prints the version
    # and exits.
    #
    def parse_switches(args)
      switches = { }

      args.options do |opts|
        opts.banner = <<-END_USAGE.trim
        Usage:

        [sudo] #{ScoutAgent.agent_name} [OPTIONS] COMMAND

        Use the commands identify, start, and stop to prepare, launch, and
        shutdown the agent respectively.  Those require super user privileges.
        You can also use the status command to check in on a running agent.

        END_USAGE

        opts.separator "Basic Options:"
        opts.on( "-s", "--server URL", String,
                 "The URL for the server to report to." ) do |url|
          switches[:server_url] = url
        end
        opts.on( "-d", "--[no-]daemon",
                 "Run in the background as a daemon." ) do |boolean|
          switches[:run_as_daemon] = boolean
        end
        opts.on( "-l", "--logging-level LEVEL", %w[DEBUG INFO WARN ERROR FATAL],
                 "The minimum level of log message to record." ) do |level|
          switches[:logging_level] = level
        end
        opts.on( "-t", "--[no-]test-mode",
                 "Used in agent development." ) do |boolean|
          switches[:test_mode] = boolean
          if boolean  # test mode implies a local server and no daemon
            switches[:server_url]    = "http://localhost:4567"
            switches[:run_as_daemon] = false
          end
        end

        opts.separator "Expert Options:"
        opts.on( "--users NAME1,NAME2,...", Array,
                 "A list of users to try switching to." ) do |users|
          switches[:user_choices] = users
        end
        opts.on( "--groups NAME1,NAME2,...", Array,
                 "A list of groups to try switching to." ) do |groups|
          switches[:group_choices] = groups
        end
        opts.on( "--prefix PATH", String,
                 "A prefix path prepended to all other paths." ) do |path|
          switches[:prefix_path] = path
        end
        [ %w[os_config_path configuration],
          %w[os_db_path     databases],
          %w[os_pid_path    PID\ files],
          %w[os_log_path    log\ files] ].each do |name, used_for|
          opts.on( "--#{name.tr('_', '-')} PATH", String,
                   "The path your OS uses for #{used_for}." ) do |path|
            switches[name.to_sym] = path
          end
        end

        opts.separator "Application Options:"
        opts.on( "-h", "--help",
                 "Show this message." ) do
          puts opts  # show usage
          exit
        end
        opts.on( "-v", "--version",
                 "Display the current version." ) do
          puts "#{ScoutAgent.proper_agent_name} v#{ScoutAgent::VERSION}"
          exit
        end

        begin
          opts.parse!
        rescue OptionParser::ParseError  # failed to parse options
          puts opts  # show usage
          exit
        end
      end

      # apply switches so paths will be set correctly for load checks
      Plan.update_from_switches(switches)

      switches
    end

    #
    # Removes the first remaining element of +args+ and returns it, downcased,
    # as the assignment name.
    #
    # When no command was given a default is chosen:  "identify" if no Plan
    # is present, "status" if the lifeline PID file exists, and "start"
    # otherwise.  Names containing anything other than word characters abort
    # the process with an unknown command message.
    #
    def parse_assignment(args)
      assignment = args.shift.to_s.downcase
      if assignment.empty?
        if Plan.present?
          if IDCard.new(:lifeline).pid_file.exist?
            return "status"
          else
            return "start"
          end
        else
          return "identify"
        end
      end
      unless assignment =~ /\A\w+\z/
        abort_with_unknown_assignment(assignment)
      end
      assignment
    end

    #
    # Finds the file implementing +assignment+ in the "assignment" directory
    # under LIB_DIR and returns its path.  Any unambiguous prefix of a
    # command name is accepted as an abbreviation ("stat" for "status").
    #
    # The process is aborted with a message when nothing matches or when
    # several commands match and none of them is an exact match.
    #
    def load_assignment(assignment)
      dir = LIB_DIR + "assignment"
      # anchor at the start so only prefixes match ("art" is not "start")
      pattern = /\A#{Regexp.escape(assignment)}\w*\.rb\z/
      # compare file names as Strings:  a Regexp never matches a Pathname
      matches = dir.entries.map { |entry| entry.to_s }.grep(pattern).sort
      # a fully spelled out command always wins over longer names
      exact   = "#{assignment}.rb"
      matches = [exact] if matches.include? exact
      if matches.size > 1
        abort_with_ambiguous_assignment(assignment, matches)
      elsif matches.first and (code = dir + matches.first).exist?
        return code
      else
        abort_with_unknown_assignment(assignment)
      end
    end

    #
    # Requires the file at +code+, looks up the class it is expected to
    # define under Assignment (the CamelCase form of the file name), and runs
    # a new instance built from +switches+ and +other_args+.  The process is
    # aborted if that class cannot be found.
    #
    def execute_assignment(assignment, code, switches, other_args)
      require code
      class_name = code.basename(".rb").to_s.CamelCase
      begin
        loaded = Assignment.const_get(class_name)
      rescue NameError  # can't load module
        abort_with_missing_code(class_name)
      end
      loaded.new(switches, other_args).prepare_and_execute
    end

    #
    # Aborts the process, listing the commands in +matches+ (file names or
    # paths ending in ".rb") that +assignment+ could have meant.
    #
    def abort_with_ambiguous_assignment(assignment, matches)
      choices = matches.map { |m| "'#{File.basename(m.to_s, '.rb')}'" }
      # fold the last two choices together so the list reads "..., or 'x'"
      choices[-2..-1] = choices[-2..-1].join(", or ")
      abort "Ambiguous command '#{assignment}'.  " +
            "Did you mean #{choices.join(', ')}?"
    end

    # Aborts the process, reporting +assignment+ as an unknown command.
    def abort_with_unknown_assignment(assignment)
      abort "Unknown command '#{assignment}'."
    end

    # Aborts the process, reporting that +class_name+ could not be loaded.
    def abort_with_missing_code(class_name)
      abort "Failed to load '#{class_name}'."
    end
  end
end
@@ -0,0 +1,143 @@
1
+ #!/usr/bin/env ruby -wKU
2
+
3
module ScoutAgent
  #
  # This class encapsulates a named process.  It is used to ensure exclusive
  # execution and to signal other processes.
  #
  class IDCard
    class << self
      #
      # This global attribute should contain the name of the current process.
      # It is set during a successful authorization.
      #
      # <b>Warning:</b>  Be sure to clear this attribute immediately after a
      # fork() so you don't keep the parent's identity.
      #
      attr_accessor :me
    end

    #
    # Pass in the +process_name+ of the process you want to signal() or the
    # +process_name+ you wish to authorize() for yourself.
    #
    def initialize(process_name)
      @process_name = process_name
    end

    # The name this instance was created with.
    attr_reader :process_name

    # A String representation of this process, with PID.
    def to_s
      "#{@process_name} (#{pid || 'unauthorized'})"
    end

    #
    # Returns the path to the unique PID file for this process, based on the
    # current Plan.
    #
    def pid_file
      Plan.pid_dir + "#{@process_name}.pid"
    end

    #
    # Returns the PID for the named process, or +nil+ if it cannot be read.
    #
    # Only a positive integer at the start of the file counts as a PID.  An
    # empty or garbled file gives +nil+ instead of +0+ or a negative number,
    # because Process.kill() treats those as process groups and signal()
    # would then reach far more than the named process.
    #
    def pid
      id = pid_file.read[/\A\s*(\d+)/, 1].to_i
      id > 0 ? id : nil
    rescue StandardError  # missing or unreadable file
      nil
    end

    #
    # Tries to send +message+ as a signal to the process represented by this
    # instance.  You can pass any message Process.kill() would understand.
    #
    # Returns +true+ if the signal was sent, or +false+ if the PID file could
    # not be read.  Any Exception raised during the send, such as Errno::ESRCH
    # for a missing process, will bubble up to the calling code.
    #
    def signal(message)
      if id = pid
        Process.kill(message, id)
        true
      else
        false
      end
    end

    #
    # Claims this identity for this process.  This method is multiprocess-safe
    # and will fail if another process has claimed this identity.  However,
    # stale claims are ignored and replaced, if possible.
    #
    # If a block is given it is called while the new PID file is held, before
    # the PID is written (this allows for daemonization).  A false result from
    # the block cancels the claim.  The block is also honored when the claim
    # has to be retried after clearing a stale PID file.
    #
    # This method returns +true+ if the claim succeeded and +false+ if it
    # could not happen because the identity is taken or the block refused.  A
    # return of +true+ indicates that me() has been updated and an exit
    # handler has been installed to revoke() this claim as the process ends.
    #
    def authorize(&block)
      File.open(pid_file, File::CREAT | File::EXCL | File::WRONLY) do |file|
        file.flock(File::LOCK_EX)
        if not block_given? or yield  # allows for daemonization
          file.puts Process.pid
        else
          file.flock(File::LOCK_UN)
          revoke  # remove this file if anything went wrong
          return false
        end
        file.flock(File::LOCK_UN)
      end

      self.class.me = self

      at_my_exit do
        revoke  # a failure to unlink the PID file is ignored here
      end
      true
    rescue Errno::EEXIST  # pid_file already exists
      File.open(pid_file) do |file|
        unless file.flock(File::LOCK_EX | File::LOCK_NB)
          # couldn't grab a file lock to verify the existing PID file
          return false
        end
        if file.read =~ /\A\d+/
          begin
            signal(0)  # raises if the recorded process cannot be signaled
          rescue Errno::ESRCH  # no such process
            # stale PID file found:  clear it and try the claim again
            if revoke
              file.flock(File::LOCK_UN)  # release the lock before we recurse
              return authorize(&block)   # try again, with the caller's block
            end
          rescue Errno::EACCES  # don't have permission
            # nothing we can do so give up
          end
        end
        file.flock(File::LOCK_UN)  # release the lock
      end
      # the process was already running or the claim could not be cleared
      false
    end

    #
    # Releases a held claim on a process name.  Returns +true+ if successful or
    # +false+ if the PID file didn't exist or couldn't be destroyed.
    #
    def revoke
      pid_file.unlink
      true
    rescue StandardError  # missing file or no permission to remove it
      false
    end
  end
end
@@ -0,0 +1,243 @@
1
+ #!/usr/bin/env ruby -wKU
2
+
3
module ScoutAgent
  #
  # A Lifeline launches a named agent in a forked child process and watches
  # over it.  The child writes regular check-ins to a pipe;  when they stop,
  # or are malformed, the parent stops the child and launches a replacement.
  #
  class Lifeline
    # Seconds the parent waits for a check-in before giving up on the child.
    NO_CONTACT_TIMEOUT   = 3
    # Seconds the child sleeps between check-ins.
    CHECK_IN_FREQUENCY   = 0.99  # gives us three check ins before a cutoff
    # Passed to Process.term_or_kill() when stopping the child.
    TERM_TO_KILL_PAUSE   = 1
    # Minimum minutes between launches, indexed by consecutive relaunches.
    RELAUNCH_FREQUENCIES = [0, 1, 1, 2, 3, 5, 8, 13].freeze

    #################
    ### Interface ###
    #################

    #
    # Prepares to manage the agent named +agent+, sending messages to +log+.
    # The process creating the instance is remembered as the parent, and an
    # exit handler is installed to clear this process's status.
    #
    def initialize(agent, log = WireTap.new(nil))
      @agent                       = agent
      @log                         = log
      @parent_pid                  = Process.pid
      @child_pid                   = nil
      @reader                      = nil
      @writer                      = nil
      @launch_and_monitor_thread   = nil
      @check_in_with_parent_thread = nil
      @code                        = nil
      @last_launch                 = nil
      @relaunch_index              = 0

      at_my_exit do
        clear_status
      end
    end

    include Tracked

    # The log messages are written to.
    attr_reader :log

    #
    # Starts a Thread that launches the agent, monitors it until it stops
    # checking in, stops it, and repeats.  An Exception in the Thread will
    # abort the process.
    #
    def launch_and_monitor
      @launch_and_monitor_thread = Thread.new do
        Thread.current.abort_on_exception = true
        loop do
          wait_for_launch
          prepare_pipe
          launch_child
          close_writer
          monitor_child
          restart_child
        end
      end
    end

    #
    # Stops the monitoring Thread and sends the agent a TERM signal, using the
    # PID recorded for it by IDCard.  This does nothing outside the parent
    # process.
    #
    def terminate
      if Process.pid == @parent_pid
        # stop monitoring
        log.info("Stopping the monitoring for '#{@agent}'.")
        @launch_and_monitor_thread.exit if @launch_and_monitor_thread
        # ask child process to exit
        log.info("Asking '#{@agent}' to stop.")
        IDCard.new(@agent).signal("TERM")
      end
    rescue Errno::ESRCH  # no such process
      # it already exited, so we are fine
    end

    # Waits on the monitoring Thread, when called in the parent process.
    def join
      if Process.pid == @parent_pid and @launch_and_monitor_thread
        @launch_and_monitor_thread.join
      end
    end

    #######
    private
    #######

    ##############
    ### Parent ###
    ##############

    #
    # Spaces out relaunches.  Nothing happens before the first launch.  After
    # that, the time since the last launch is compared to the current entry in
    # RELAUNCH_FREQUENCIES (minutes):  if the agent stopped sooner than that,
    # we sleep for the difference.  If instead it ran past that entry by more
    # than the next entry, the backoff is reset.  Otherwise we move on to the
    # next entry, stopping at the end of the list.
    #
    def wait_for_launch
      if @last_launch
        seconds_ran   = Time.now - @last_launch
        relaunch_wait = RELAUNCH_FREQUENCIES[@relaunch_index] * 60 - seconds_ran

        if relaunch_wait > 0
          log.info( "Waiting #{relaunch_wait} seconds before relaunching " +
                    "'#{@agent}'." )
          sleep relaunch_wait
        elsif relaunch_wait.abs > ( RELAUNCH_FREQUENCIES[@relaunch_index + 1] ||
                                    RELAUNCH_FREQUENCIES.last ) * 60
          @relaunch_index = 0
          return
        end

        unless @relaunch_index == RELAUNCH_FREQUENCIES.size - 1
          @relaunch_index += 1
        end
      end
    end

    # Opens the pipe the child will use to check in with the parent.
    def prepare_pipe
      @reader, @writer = IO.pipe
    end

    #
    # Forks the child process.  The child resets what it inherited from the
    # parent, loads and authorizes the agent code, starts checking in, and
    # then runs the agent.
    #
    def launch_child
      log.info("Launching '#{@agent}'.")
      status(@agent)
      @last_launch = Time.now
      @child_pid   = fork do
        reset_environment
        close_reader
        load_code
        authorize_code
        check_in_with_parent
        run_code
      end
    end

    # Closes the write end of the pipe, if it is open.
    def close_writer
      @writer.close if @writer  # nothing to close before the pipe is made
    rescue IOError  # already closed
      # it's closed so we're OK
    end

    #
    # Reads check-ins from the child until one fails to arrive within
    # NO_CONTACT_TIMEOUT seconds, the pipe closes, or a line doesn't look like
    # "PID: YYYY-MM-DD HH:MM" for the PID we launched.
    #
    def monitor_child
      loop do
        check_in = nil
        begin
          Timeout.timeout(NO_CONTACT_TIMEOUT) { check_in = @reader.gets }
          log.error("'#{@agent}' monitor channel has closed.") if check_in.nil?
        rescue Timeout::Error
          # check_in will stay nil
          log.error("'#{@agent}' failed to check-in in time.")
        end
        unless check_in.to_s =~
               /\A#{@child_pid}: \d{4}-\d{2}-\d{2} \d{2}:\d{2}\Z/
          log.error("'#{@agent}' check-in was malformed.") unless check_in.nil?
          break
        end
      end
    end

    #
    # Notes the restart in our status, closes our end of the pipe, and stops
    # the child.  The relaunch itself happens on the next pass through the
    # loop in launch_and_monitor().
    #
    def restart_child
      log.info("Stopping '#{@agent}'.")
      status(@agent, :restarting)
      close_reader
      Process.term_or_kill(@child_pid, TERM_TO_KILL_PAUSE)
    end

    #
    # Updates this process's status message to reflect +process+.  The stored
    # message is either "Monitoring processes" or "Restarting A and B", where
    # the names are the processes currently being restarted.  A true
    # +restarting+ adds +process+ to that list, while a false one removes it.
    #
    # The new message is saved by calling super(), and nothing is done when no
    # status database is available.
    #
    def status(process, restarting = false)
      if db = status_database
        db.write_to_sqlite do |sqlite|
          old_status = db.current_status
          if old_status =~ /\ARestarting (.+)\z/
            processes = $1.split(" and ")
            if restarting
              processes << process unless processes.include? process
              super("Restarting #{processes.join(' and ')}")
            else
              processes.delete(process)
              if processes.empty?
                super("Monitoring processes")
              else
                super("Restarting #{processes.join(' and ')}")
              end
            end
          else
            if restarting
              super("Restarting #{process}")
            else
              super("Monitoring processes")
            end
          end
        end
      end
    end

    #############
    ### Child ###
    #############

    #
    # Replaces the signal handlers and the identity inherited from the parent
    # process.
    #
    def reset_environment
      # swap out our parent's signal handlers
      install_shutdown_handler { finish_code }
      trap("ALRM")             { alert_code }

      # clear the parent's identity
      IDCard.me = nil
    end

    # Closes the read end of the pipe, if it is open.
    def close_reader
      @reader.close if @reader  # nothing to close before the pipe is made
    rescue IOError  # already closed
      # it's closed so we're OK
    end

    #
    # Requires the code for the agent and builds an instance of the class
    # named after it under ScoutAgent::Agent.
    #
    def load_code
      require LIB_DIR + "agent"
      require LIB_DIR + "agent/#{@agent}_agent"
      @code = ScoutAgent::Agent.const_get("#{@agent.CamelCase}Agent").new
    end

    # Asks the loaded agent to authorize itself.
    def authorize_code
      @code.authorize
    end

    #
    # Starts a Thread that writes "PID: YYYY-MM-DD HH:MM" to the pipe every
    # CHECK_IN_FREQUENCY seconds.  If the pipe can no longer be written to,
    # the agent is asked to finish and the Thread ends.
    #
    def check_in_with_parent
      @check_in_with_parent_thread = Thread.new do
        Thread.current.abort_on_exception = true
        loop do
          begin
            @writer.puts "#{Process.pid}: " +
                         Time.now.strftime('%Y-%m-%d %H:%M')
          rescue Errno::EPIPE, IOError  # parent closed our pipe or exited
            break finish_code
          end
          sleep CHECK_IN_FREQUENCY
        end
      end
    end

    # Runs the loaded agent.
    def run_code
      @code.run
    end

    #
    # Closes our end of the pipe and asks the agent to finish.  The process
    # simply exits if the agent code was never loaded.
    #
    def finish_code
      close_writer

      if @code
        @code.finish
      else
        exit
      end
    end

    # Passes an ALRM signal on to the agent, if it has been loaded.
    def alert_code
      if @code
        @code.notice_changes
      end
    end
  end
end