spreadsheet_agent 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,418 @@
1
+ # Author: Darin London
2
+ # The license of this source is "MIT Licence"
3
+
4
+ require 'spreadsheet_agent/db'
5
+ require 'socket'
6
+ require 'open3'
7
+ require 'capture_io'
8
+ require 'mail'
9
+
10
+ # A Distributed Agent System using Google Spreadsheets
11
+ #
12
+ # Version 0.01
13
+ #
14
+ # SpreadsheetAgent is a framework for creating massively distributed pipelines
15
+ # across many different servers, each using the same google spreadsheet as a
16
+ # control panel. It is extensible, and flexible. It doesn't specify what
17
+ # goals any pipeline should be working towards, or which goals are prerequisites
18
+ # for other goals, but it does provide logic for easily defining these relationships
19
+ # based on your own needs. It does this by providing a subsumption architecture,
20
+ # whereby many small, highly focused agents are written to perform specific goals,
21
+ # and also know what resources they require to perform them. Agents can be coded to
22
+ # subsume other agents upon successful completion. In addition, it is
23
+ # designed from the beginning to support the creation of simple human-computational
24
+ # workflows.
25
+ #
26
+ # SpreadsheetAgent requires GoogleDrive[http://rubygems.org/gems/google_drive], and works with a Google Spreadsheet with some or all worksheets
27
+ # formatted according to the following:
28
+ # * The top row of a page to be processed names the fields for all entry records in subsequent rows
29
+ # * You can define any fields necessary, but you must specify a 'ready' and a 'complete' field
30
+ # * You must define at least 1 key field, and the key field must be specified as required in the :config (see SpreadsheetAgent::Db)
31
+ # * You should then define fields named for agent_bin/#{ field_name }_agent.rb for each agent that you plan to deploy in your pipeline
32
+ #
33
+ module SpreadsheetAgent
34
+
35
+ # SpreadsheetAgent::Agent is designed to make it easy to create a single task which connects to
36
+ # a field within a record on a page within the configured SpreadsheetAgent compatible Google Spreadsheet,
37
+ # runs, and reports whether the job completed or ended in error. An agent can be configured to only run
38
+ # when certain prerequisite fields have completed. The data in these fields can be filled in by other
39
+ # SpreadsheetAgent::Agents, SpreadsheetAgent::Runners, or humans. Compute node configuration is available
40
+ # to prevent the agent from running more than a certain number of instances of itself, or not run if certain
41
+ # other agents or processes are running on the node. Finally, an agent can be configured to subsume another
42
+ # agent, and fill in the completion field for that agent in addition to its own when it completes successfully.
43
+ #
44
+ # extends SpreadsheetAgent::Db
45
+ class Agent < SpreadsheetAgent::Db
46
+
47
+ # The name of the field in the page to which the agent should report status
48
+ attr_accessor :agent_name
49
+
50
+ # The name of the Page on the Google Spreadsheet that contains the record to be worked on by the agent
51
+ attr_accessor :page_name
52
+
53
+ # hash of key-value pairs. The keys are defined in config/agent.conf.yml. The values
54
+ # specify the values for those fields in the record on the page for which the agent is running.
55
+ # All keys configured as 'required: 1' in config/agent.conf.yml must be included in the keys hash
56
+ attr_accessor :keys
57
+
58
+ # Boolean. When true, the agent code will print verbosely to STDERR. When false, and the process!
59
+ # returns a failure status, the agent will email all stdout and stderr to the email specified in the
60
+ # :config send_to value
61
+ attr_accessor :debug
62
+
63
+ # Optional array of prerequisite fields that must contain a 1 in them for the record on the page before
64
+ # the agent will attempt to run
65
+ attr_accessor :prerequisites
66
+
67
+ # Optional integer. This works on Linux with ps. The agent will not attempt to run if there are
68
+ # max_selves instances running
69
+ attr_accessor :max_selves
70
+
71
+ # Hash of process_name to number of max_instances. This works on Linux with ps. If the agent detects
72
+ # the specified number of max_instances of the given process (based on a line match), it will not
73
+ # attempt to run
74
+ attr_accessor :conflicts_with
75
+
76
+ # Array of fields on the record which this agent subsumes. If the agent completes successfully these
77
+ # fields will be updated with a 1 in addition to the field for the agent
78
+ attr_accessor :subsumes
79
+
80
+ # Readonly access to the GoogleDrive::Worksheet that is being accessed by the agent.
81
+ attr_reader :worksheet
82
+
83
# Creates a new SpreadsheetAgent::Agent with the following:
# == required configuration parameters:
# * agent_name
# * page_name
# * keys
#
# == optional parameters:
# * config_file: (see SpreadsheetAgent::Db)
# * debug
# * prerequisites
# * max_selves
# * conflicts_with
# * subsumes
#
# Raises SpreadsheetAgentError unless agent_name, page_name and keys are all supplied.
# The keys, prerequisites, conflicts_with and subsumes containers are shallow-copied,
# so later changes to the caller's own containers do not reach the agent.
def initialize(attributes)
  @agent_name = attributes[:agent_name]
  @page_name = attributes[:page_name]
  supplied_keys = attributes[:keys]

  # Validate before copying: nil.clone raises TypeError on Ruby < 2.4, which would
  # hide the intended error whenever :keys is missing.
  unless @agent_name && @page_name && supplied_keys
    raise SpreadsheetAgentError, "agent_name, page_name, and keys attributes are required!"
  end

  @keys = supplied_keys.clone
  @config_file = attributes[:config_file]
  build_db

  # Look the page up by its title in the spreadsheet opened by build_db.
  @worksheet = @db.worksheet_by_title(@page_name)
  @debug = attributes[:debug]
  @max_selves = attributes[:max_selves]

  @prerequisites = attributes[:prerequisites].clone if attributes[:prerequisites]
  @conflicts_with = attributes[:conflicts_with].clone if attributes[:conflicts_with]
  @subsumes = attributes[:subsumes].clone if attributes[:subsumes]
end
121
+
122
# If the agent does not have any conflicting processes (max_selves or conflicts_with)
# and if the entry is ready (field 'ready' has a 1), and all prerequisite fields have a 1,
# gets the record for the configured keys and passes it to the supplied agent_code block.
# The block must return a required boolean indicating success or failure, and an optional
# hash of key - value fields that will be updated on the record. Note, the updates
# are made regardless of the value of success, so the block can update different fields
# based on success or failure. Any value can be stored in the hash, which lets the agent
# communicate useful information to the google spreadsheet for other agents
# (SpreadsheetAgent::Agent, SpreadsheetAgent::Runner, or human) to use. The block should
# avoid raising. If an error is encountered, it should return false for the success field.
#
# Unless debug is set, STDOUT and STDERR are captured while the agent runs. The capture is
# always stopped before this method returns, and the captured output is mailed only when an
# exception was raised.
#
# Returns false when the record cannot be found or an exception was raised, true otherwise
# (including when the agent declined to run, or the block reported failure, which is
# recorded on the spreadsheet).
#
# Exits successfully, enters a 1 in the agent_name field
#   agent.process! do |entry|
#     true
#   end
#
# Same, but also updates the 'notice' field in the record along with the 1 in the agent_name field
#   agent.process! do |entry|
#     [true, {:notice => 'There were 30 files processed'}]
#   end
#
# Fails, enters F:#{hostname} in the agent_name field
#   agent.process! do |entry|
#     false
#   end
#
# Same, but also updates the 'notice' field in the record along with the failure notice
#   agent.process! do |entry|
#     [false, {:notice => 'There were 10 files left to process!' }]
#   end
#
# This agent passes different parameters based on success or failure
#   agent.process! do |entry|
#     if success
#       true
#     else
#       [ false, {:notice => 'there were 10 remaining files'}]
#     end
#   end
#
def process!(&agent_code)
  @worksheet.reload

  capture_output = nil
  unless @debug
    capture_output = CaptureIO.new
    capture_output.start
  end

  result = true
  errored = false
  begin
    unless has_conflicts
      runnable, entry = run_entry
      if !entry
        # No record matches the keys on this page.
        result = false
      elsif runnable
        success, update_entry = agent_code.call(entry)
        if success
          complete_entry(update_entry)
        else
          fail_entry(update_entry)
        end
      end
    end
  rescue => e
    $stderr.puts e.to_s
    errored = true
    result = false
  end

  # Stop capturing on every path, not only when the agent code actually ran.
  if capture_output
    captured = capture_output.stop
    mail_error(captured) if errored
  end

  result
end
197
+
198
# Returns the first row of the worksheet's list whose key fields equal the values in keys,
# or nil when there is no worksheet or no row matches.
#
# A key field takes part in the comparison when it is configured as required, or when
# the caller supplied a value for it in keys.
def get_entry
  return nil unless @worksheet

  key_fields = @config['key_fields']
  # The set of compared fields is the same for every row, so work it out once.
  compared_fields = key_fields.keys.select do |key_field|
    key_fields[key_field]["required"] || @keys[key_field]
  end

  @worksheet.list.each do |this_row|
    return this_row if compared_fields.all? { |key| this_row[key] == @keys[key] }
  end

  # Without this the method would return the value of #each (truthy) when nothing matched.
  nil
end
218
+
219
+ private
220
+
221
# Decides whether the processes currently running on this node forbid the agent from
# running, based on the output of `ps -eo pid,command`.
#
# * max_selves: conflicts once that many other processes mention this script's file name.
# * conflicts_with: hash of pattern => max_instances; conflicts once max_instances
#   process lines match the pattern.
#
# Lines that look like an editor session (emacs, vi/vim, pico) and the line of the
# current process itself are ignored.
#
# Returns false when neither limit is configured or no limit is reached, true when a
# limit is reached. Also returns true when the process list cannot be inspected, so the
# agent does not run when the check cannot be made.
def has_conflicts
  return false unless @max_selves || @conflicts_with # nothing conflicts here

  self_name = File.basename($0)
  own_pid = Process.pid.to_s
  running_conflicters = Hash.new(0)

  # capture2 reads the whole listing, closes every pipe and reaps the child for us.
  listing, _status = Open3.capture2('ps', '-eo', 'pid,command')

  listing.each_line do |line|
    next if line.match(/emacs\s+|vim*\s+|pico\s+/)
    # Compare the pid column exactly; a substring match would also hide unrelated
    # processes whose pid or command merely contains our pid's digits.
    next if line.split.first == own_pid

    if @max_selves && line.include?(self_name)
      running_conflicters[@agent_name] += 1
      if running_conflicters[@agent_name] >= @max_selves
        $stderr.puts "max_selves limit reached" if @debug
        return true
      end
    end

    next unless @conflicts_with

    @conflicts_with.each do |conflicter, max_instances|
      next unless line.match(conflicter)

      running_conflicters[conflicter] += 1
      if running_conflicters[conflicter] >= max_instances
        $stderr.puts "conflicts with #{ conflicter }" if @debug
        return true
      end
    end
  end

  false
rescue => e
  $stderr.puts "Couldnt check conflicts #{ e }" if @debug
  true
end
275
+
276
# This call initiates a race resistant attempt to make sure that there is only 1
# clear 'winner' among N potential agents attempting to run the same goal on the
# same spreadsheet agent's cell.
#
# Returns nil when no record matches the keys. Otherwise returns [runnable, entry]:
# runnable is false when the record is not ready, is already complete, the agent's
# field already shows running (r), done (1) or failed (F), a prerequisite field is
# not 1, or the claim on the cell was lost; it is true when this host's claim
# ("r:<hostname>") is still in the cell after the worksheet was saved and reloaded.
def run_entry
  entry = get_entry
  described_keys = @keys.keys.select { |k| @config['key_fields'][k] && @keys[k] }
  output = described_keys.map { |key| [ key, @keys[key] ].join(' ') + " " }.join

  unless entry
    $stderr.puts "#{ output } is not supported on #{ @page_name }" if @debug
    return
  end

  unless entry['ready'] == "1"
    $stderr.puts "#{ output } is not ready to run #{ @agent_name }" if @debug
    return false, entry
  end

  if entry['complete'] == "1"
    $stderr.puts "All goals are completed for #{ output }" if @debug
    return false, entry
  end

  if entry[@agent_name]
    status, running_hostname = entry[@agent_name].split(':')

    case status
    when 'r'
      $stderr.puts " #{ output } is already running #{ @agent_name } on #{ running_hostname }" if @debug
      return false, entry
    when "1"
      $stderr.puts " #{ output } has already run #{ @agent_name }" if @debug
      return false, entry
    when 'F'
      $stderr.puts " #{ output } has already Failed #{ @agent_name }" if @debug
      return false, entry
    end
  end

  unmet = (@prerequisites || []).find { |prereq_field| entry[prereq_field] != "1" }
  if unmet
    $stderr.puts " #{ output } has not finished #{ unmet }" if @debug
    return false, entry
  end

  # Claim the cell with this machine's hostname, give competing agents a moment to
  # write theirs, then re-read the sheet to see whose claim survived.
  hostname = Socket.gethostname
  begin
    entry.update @agent_name => "r:#{ hostname }"
    @worksheet.save
    sleep 3
    @worksheet.reload
  rescue GoogleDrive::Error
    # this is a collision, which is to be treated as if it is not runnable
    $stderr.puts " #{ output } lost #{ @agent_name } on #{hostname}" if @debug
    return false, entry
  end

  # The claim only stands if the cell still reads "r:<this host>"; a blank cell or a
  # different status for this host does not count as winning.
  status, running_hostname = entry[@agent_name].to_s.split(':')
  return true, entry if status == 'r' && running_hostname == hostname

  $stderr.puts " #{ output } lost #{ @agent_name } on #{hostname}" if @debug
  return false, entry
end
357
+
358
# Records successful completion on the spreadsheet: applies the optional update_entry
# fields, then writes a 1 into every subsumed agent's field and into this agent's own
# field, and saves the worksheet.
#
# update_entry may be nil. The hash passed in is copied, never modified.
# Returns whatever the worksheet's save returns.
def complete_entry(update_entry)
  # Copy so the hash handed back by the agent code is left as the caller built it.
  updates = update_entry.nil? ? {} : update_entry.dup

  (@subsumes || []).each do |subsumed_agent|
    updates[subsumed_agent] = 1
  end
  updates[@agent_name] = 1

  entry = get_entry
  entry.update updates
  @worksheet.save
end
374
+
375
# Records failure on the spreadsheet: applies the optional update_entry fields, writes
# "F:<hostname>" into this agent's field, and saves the worksheet.
#
# update_entry may be nil. The hash passed in is copied, never modified.
# Returns whatever the worksheet's save returns.
def fail_entry(update_entry)
  # Copy so the hash handed back by the agent code is left as the caller built it.
  updates = update_entry.nil? ? {} : update_entry.dup
  updates[@agent_name] = "F:#{ Socket.gethostname }"

  entry = get_entry
  entry.update updates
  @worksheet.save
end
385
+
386
# Mails error_message through smtp.gmail.com using the guser / gpass credentials, from
# the reply_email address to the send_to address, all taken from the config. The subject
# is the hostname, the agent's keys and the agent name.
#
# Delivery is best effort: a failure is reported on STDERR and never raised.
def mail_error(error_message)
  output = ''
  @keys.keys.each do |key|
    output += [key, @keys[key] ].join(' ') + " "
  end

  prefix = [Socket.gethostname, output, @agent_name ].join(' ')

  # The Mail blocks below are evaluated with instance_eval, so instance variables
  # inside them belong to the Mail objects, not to this agent. Everything the blocks
  # need is therefore copied into local variables first.
  config = @config
  hostname = Socket.gethostname
  message_body = error_message.to_s

  begin
    Mail.defaults do
      delivery_method :smtp, {
        :address => "smtp.gmail.com",
        :port => 587,
        :domain => hostname,
        :user_name => config['guser'],
        :password => config['gpass'],
        :authentication => 'plain',
        :enable_starttls_auto => true }
    end

    mail = Mail.new do
      from config['reply_email']
      to config['send_to']
      subject prefix
      body message_body
    end

    mail.deliver!
  rescue => e
    # Never let a mail problem take the agent down, but do not hide it either.
    $stderr.puts "Could not mail error report: #{ e.message }"
  end
end
417
+ end
418
+ end
@@ -0,0 +1,55 @@
1
+ # Author: Darin London
2
+ # The license of this source is "MIT Licence"
3
+
4
+ require 'google_drive'
5
+ require 'psych'
6
+
7
+ module SpreadsheetAgent
8
+
9
+ # SpreadsheetAgent::Db is a class that is meant to be extended by SpreadsheetAgent classes. It
10
+ # stores shared code to instantiate and provide access to a GoogleDrive object and
11
+ # GoogleDrive::Spreadsheet object for use by the extending classes to access their Google Spreadsheets
12
class Db

  # This holds the GoogleDrive::Spreadsheet object that can be used to query information from the google
  # spreadsheet using its API. It cannot be changed after the object is constructed
  attr_reader :db

  # This holds the GoogleDrive object instantiated with the guser and gpass in the :config. It
  # cannot be changed after the object is constructed
  attr_reader :session

  # This holds the hash that is constructed from the YAML :config_file. It
  # cannot be changed after the object is constructed
  attr_reader :config

  # Passing this attribute to the constructor will override the location of config/agent.conf.yml.
  # If passed, it must be a path to a file which matches the template in config/agent.conf.yml.
  # The default is to load ../config/agent.conf.yml relative to the directory containing the
  # calling script $0. This cannot be changed after the object is constructed
  attr_reader :config_file

  # This is for internal use by SpreadsheetAgent classes that extend SpreadsheetAgent::Db.
  #
  # Loads the configuration, logs in with the configured guser / gpass and looks up the
  # spreadsheet named by spreadsheet_name, filling in config, session and db.
  #
  # Raises SpreadsheetAgentError when the configuration does not define at least one
  # key_fields entry marked as required (including when key_fields is missing altogether).
  def build_db
    build_config
    unless required_key_field?
      raise SpreadsheetAgentError, "Your configuration must have at least one required key_fields key"
    end
    @session = GoogleDrive.login(@config['guser'], @config['gpass'])
    @db = @session.spreadsheet_by_title(@config['spreadsheet_name'])
  end

  private

  # Loads the YAML config file into @config, defaulting the path to
  # ../config/agent.conf.yml relative to the directory of the calling script.
  def build_config
    @config_file = find_bin + '../config/agent.conf.yml' if @config_file.nil?
    @config = Psych.load_file(@config_file)
  end

  # True when the loaded config has a key_fields hash with at least one entry whose
  # settings mark it as required. Tolerates an empty config or a missing key_fields.
  def required_key_field?
    key_fields = @config.is_a?(Hash) ? @config['key_fields'] : nil
    return false unless key_fields.is_a?(Hash)

    key_fields.any? { |_name, settings| settings.is_a?(Hash) && settings['required'] }
  end

  # Absolute path, with a trailing slash, of the directory containing the calling script.
  def find_bin
    File.expand_path(File.dirname( $0 )) + '/'
  end
end
55
+ end