spreadsheet_agent 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,418 @@
1
+ # Author: Darin London
2
+ # The license of this source is "MIT Licence"
3
+
4
+ require 'spreadsheet_agent/db'
5
+ require 'socket'
6
+ require 'open3'
7
+ require 'capture_io'
8
+ require 'mail'
9
+
10
+ # A Distributed Agent System using Google Spreadsheets
11
+ #
12
+ # Version 0.01
13
+ #
14
+ # SpreadsheetAgent is a framework for creating massively distributed pipelines
15
+ # across many different servers, each using the same google spreadsheet as a
16
+ # control panel. It is extensible and flexible. It does not specify what
17
+ # goals any pipeline should be working towards, or which goals are prerequisites
18
+ # for other goals, but it does provide logic for easily defining these relationships
19
+ # based on your own needs. It does this by providing a subsumption architecture,
20
+ # whereby many small, highly focused agents are written to perform specific goals,
21
+ # and also know what resources they require to perform them. Agents can be coded to
22
+ # subsume other agents upon successful completion. In addition, it is
23
+ # designed from the beginning to support the creation of simple human-computational
24
+ # workflows.
25
+ #
26
+ # SpreadsheetAgent requires GoogleDrive[http://rubygems.org/gems/google_drive], and works with a Google Spreadsheet with some or all worksheets
27
+ # formatted according to the following:
28
+ # * The top row of a page to be processed has fields for all entry records in subsequent rows
29
+ # * You can define any fields necessary, but you must specify a 'ready' and a 'complete' field
30
+ # * You must define at least 1 key field, and the key field must be specified as required in the :config (see SpreadsheetAgent::Db)
31
+ # * You should then define fields named for agent_bin/#{ field_name }_agent.rb for each agent that you plan to deploy in your pipeline
32
+ #
33
+ module SpreadsheetAgent
34
+
35
+ # SpreadsheetAgent::Agent is designed to make it easy to create a single task which connects to
36
+ # a field within a record on a page within the configured SpreadsheetAgent compatible Google Spreadsheet,
37
+ # runs, and reports whether the job completed or ended in error. An agent can be configured to only run
38
+ # when certain prerequisite fields have completed. The data in these fields can be filled in by other
39
+ # SpreadsheetAgent::Agents, SpreadsheetAgent::Runners, or humans. Compute node configuration is available
40
+ # to prevent the agent from running more than a certain number of instances of itself, or not run if certain
41
+ # other agents or processes are running on the node. Finally, an agent can be configured to subsume another
42
+ # agent, and fill in the completion field for that agent in addition to its own when it completes successfully.
43
+ #
44
+ # extends SpreadsheetAgent::Db
45
+ class Agent < SpreadsheetAgent::Db
46
+
47
+ # The name of the field in the page to which the agent should report status
48
+ attr_accessor :agent_name
49
+
50
+ # The name of the Page on the Google Spreadsheet that contains the record to be worked on by the agent
51
+ attr_accessor :page_name
52
+
53
+ # hash of key-value pairs. The keys are defined in config/agent.conf.yml. The values
54
+ # specify the values for those fields in the record on the page for which the agent is running.
55
+ # All keys configured as 'required: 1' in config/agent.conf.yml must be included in the keys hash
56
+ attr_accessor :keys
57
+
58
+ # Boolean. When true, the agent code will print verbosely to STDERR. When false, and the process!
59
+ # returns a failure status, the agent will email all stdout and stderr to the email specified in the
60
+ # :config send_to value
61
+ attr_accessor :debug
62
+
63
+ # Optional array of prerequisite fields that must contain a 1 in them for the record on the page before
64
+ # the agent will attempt to run
65
+ attr_accessor :prerequisites
66
+
67
+ # Optional integer. This works on Linux with ps. The agent will not attempt to run if there are
68
+ # max_selves instances running
69
+ attr_accessor :max_selves
70
+
71
+ # Hash of process_name to number of max_instances. This works on Linux with ps. If the agent detects
72
+ # the specified number of max_instances of the given process (based on a line match), it will not
73
+ # attempt to run
74
+ attr_accessor :conflicts_with
75
+
76
+ # Array of fields on the record which this agent subsumes. If the agent completes successfully these
77
+ # fields will be updated with a 1 in addition to the field for the agent
78
+ attr_accessor :subsumes
79
+
80
+ # Read-only access to the GoogleDrive::Worksheet that is being accessed by the agent.
81
+ attr_reader :worksheet
82
+
83
+ # create a new SpreadsheetAgent::Agent with the following:
84
+ # == required configuration parameters:
85
+ # * agent_name
86
+ # * page_name
87
+ # * keys
88
+ #
89
+ # == optional parameters:
90
+ # * config_file: (see SpreadsheetAgent::Db)
91
+ # * debug
92
+ # * prerequisites
93
+ # * max_selves
94
+ # * conflicts_with
95
+ # * subsumes
96
+ #
97
# Builds an agent from the attributes hash: stores the three mandatory
# settings, connects to the spreadsheet via build_db, looks up the worksheet
# named by page_name, and records the optional settings. Hash/array
# arguments are cloned so the agent keeps its own (shallow) copies.
#
# Raises SpreadsheetAgentError when agent_name, page_name or keys is missing.
def initialize(attributes)
  @agent_name = attributes[:agent_name]
  @page_name = attributes[:page_name]
  @keys = attributes[:keys].clone

  all_required_given = @agent_name && @page_name && @keys
  unless all_required_given
    raise SpreadsheetAgentError, "agent_name, page_name, and keys attributes are required!"
  end

  # build_db reads @config_file (nil means "use the default location").
  @config_file = attributes[:config_file]
  build_db
  @worksheet = @db.worksheet_by_title(@page_name)

  @debug = attributes[:debug]
  @max_selves = attributes[:max_selves]

  # Optional collections are only set when the caller supplied them.
  @prerequisites = attributes[:prerequisites].clone if attributes[:prerequisites]
  @conflicts_with = attributes[:conflicts_with].clone if attributes[:conflicts_with]
  @subsumes = attributes[:subsumes].clone if attributes[:subsumes]
end
121
+
122
+ # If the agent does not have any conflicting processes (max_selves or conflicts_with)
123
+ # and if the entry is ready (field 'ready' has a 1), and all prerequisite fields have a 1,
124
+ # gets the GoogleDrive::List record, and passes it to the supplied agent_code PROC as argument.
125
+ # This PROC must return a required boolean field indicating success or failure, and an optional
126
+ # hash of key - value fields that will be updated on the GoogleDrive::List record. Note, the updates
127
+ # are made regardless of the value of success. In fact, the agent can be configured to update
128
+ # different fields based on success or failure. Also, note that any value can be stored in the
129
+ # hash. This allows the agent to communicate any useful information to the google spreadsheet for other
130
+ # agents (SpreadsheetAgent::Agent, SpreadsheetAgent::Runner, or human) to use. The PROC must try at all
131
+ # costs to avoid terminating. If an error is encountered, it should return false for the success field
132
+ # to signal that the process failed. If no errors are encountered it should return true for the success
133
+ # field.
134
+ #
135
+ # Exits successfully, enters a 1 in the agent_name field
136
+ # agent.process! do |entry|
137
+ # true
138
+ # end
139
+ #
140
+ # Same, but also updates the 'notice' field in the record along with the 1 in the agent_name field
141
+ # agent.process! do |entry|
142
+ # [true, {:notice => 'There were 30 files processed'}]
143
+ # end
144
+ #
145
+ # Fails, enters F:#{hostname} in the agent_name field
146
+ # agent.process! do |entry|
147
+ # false
+ # end
148
+ #
149
+ # Same, but also updates the 'notice' field in the record along with the failure notice
150
+ # agent.process! do |entry|
151
+ # [false, {:notice => 'There were 10 files left to process!' }]
152
+ # end
153
+ #
154
+ # This agent passes different parameters based on success or failure
155
+ # agent.process! do |entry|
156
+ # if $success
157
+ # true
158
+ # else
159
+ # [ false, {:notice => 'there were 10 remaining files'}]
160
+ # end
161
+ # end
162
+ #
163
# Runs the supplied block against this agent's record when the node has no
# conflicting processes and the record is runnable.
#
# Returns:
# * true  - the block ran (whether it reported success or failure), or the
#           agent stood down because of a conflict or a non-runnable record
# * false - no record matches the keys, or an exception was raised
#
# Unless @debug is set, stdout/stderr are captured for the duration of the
# call. The capture is always stopped before returning, including on the
# early "nothing to do" outcomes; the captured text is mailed only when an
# exception was raised.
def process!(&agent_code)
  @worksheet.reload

  capture_output = nil
  unless @debug
    capture_output = CaptureIO.new
    capture_output.start
  end

  raised = false
  outcome =
    begin
      if has_conflicts
        true
      else
        runnable, entry = run_entry
        if !entry
          false
        elsif !runnable
          true
        else
          success, update_entry = agent_code.call(entry)
          if success
            complete_entry(update_entry)
          else
            fail_entry(update_entry)
          end
          true
        end
      end
    rescue
      # Written while the capture is still active so it is part of the report.
      $stderr.puts "#{ $! }"
      raised = true
      false
    end

  # Single exit point: whatever happened above, release the captured streams.
  unless capture_output.nil?
    captured = capture_output.stop
    mail_error(captured) if raised
  end

  outcome
end
197
+
198
+ # Returns the GoogleDrive::List object for the specified keys
199
# Scans the worksheet's list rows and returns the first row whose key fields
# equal the values in @keys. Returns nil when there is no worksheet or when
# no row matches.
#
# A key field takes part in the comparison when it is marked "required" in
# @config['key_fields'] or when the caller supplied a value for it in @keys.
def get_entry
  return nil unless @worksheet

  key_fields = @config['key_fields']
  # The set of fields to compare does not depend on the row, so compute it once.
  compared_fields = key_fields.keys.reject do |key_field|
    !(key_fields[key_field]["required"]) && !(@keys[key_field])
  end

  @worksheet.list.each do |this_row|
    return this_row if compared_fields.all? { |key| this_row[key] == @keys[key] }
  end

  # Explicit nil: falling off the end would return the result of `each`
  # (typically the collection itself, which is truthy) and callers test the
  # result with `unless entry`.
  nil
end
218
+
219
+ private
220
+
221
# Reports whether the agent must stand down because of what is already
# running on this node, judged from the output of `ps -eo pid,command`.
#
# Returns true when
# * @max_selves is set and that many other processes mention this script's
#   file name, or
# * any pattern in @conflicts_with (pattern => max instances) is matched by
#   at least its configured number of processes, or
# * the process list could not be read (fail safe: do not run).
# Returns false otherwise, including when neither limit is configured.
#
# Lines for this very process (compared on the pid column) and for editors
# (emacs/vi/vim/pico) are ignored.
def has_conflicts
  return false unless @max_selves || @conflicts_with # nothing conflicts here

  running_conflicters = Hash.new(0)
  self_name = File.basename($0)

  begin
    # capture2 waits for ps to finish and closes every pipe it opened.
    process_list, _status = Open3.capture2('ps', '-eo', 'pid,command')

    process_list.each_line do |line|
      pid, command = line.strip.split(/\s+/, 2)
      next if command.nil?
      # Exact pid comparison: a substring test would also skip unrelated
      # processes whose line merely contains the same digits.
      next if pid.to_i == $$
      next if command.match(/emacs\s+|vim*\s+|pico\s+/)

      if @max_selves && command.include?(self_name)
        running_conflicters[@agent_name] += 1
        if running_conflicters[@agent_name] == @max_selves
          $stderr.puts "max_selves limit reached" if @debug
          return true
        end
      end

      next unless @conflicts_with

      @conflicts_with.keys.each do |conflicter|
        next unless command.match(conflicter)

        running_conflicters[conflicter] += 1
        if running_conflicters[conflicter] >= @conflicts_with[conflicter]
          $stderr.puts "conflicts with #{ conflicter }" if @debug
          return true
        end
      end
    end

    false
  rescue
    $stderr.puts "Couldnt check conflicts #{ $! }" if @debug
    true
  end
end
275
+
276
+ # this call initiates a race resistant attempt to make sure that there is only 1
277
+ # clear 'winner' among N potential agents attempting to run the same goal on the
278
+ # same spreadsheet agent's cell
279
# Decides whether this agent may work on its record and, if so, tries to
# claim it.
#
# Returns nil when no record matches @keys. Otherwise returns
# [runnable, entry]: runnable is false when the record is not ready, is
# already complete, already carries a running / done / failed marker for
# this agent, has an unfinished prerequisite, or when the claim was lost.
#
# Claiming writes "r:<hostname>" into the agent's field, saves, waits three
# seconds, reloads the worksheet and only reports runnable when the field
# still names this host.
def run_entry
  entry = get_entry

  # Human readable "key value key value " label used in debug messages.
  label = ''
  @keys.each_key do |key|
    next unless @config['key_fields'][key] && @keys[key]
    label += [key, @keys[key]].join(' ') + " "
  end

  if !entry
    $stderr.puts "#{ label } is not supported on #{ @page_name }" if @debug
    return
  end

  if entry['ready'] != "1"
    $stderr.puts "#{ label } is not ready to run #{ @agent_name }" if @debug
    return false, entry
  end

  if entry['complete'] == "1"
    $stderr.puts "All goals are completed for #{ label }" if @debug
    return false, entry
  end

  current_value = entry[@agent_name]
  if current_value
    status, running_hostname = current_value.split(':')

    if status == 'r'
      $stderr.puts " #{ label } is already running #{ @agent_name } on #{ running_hostname }" if @debug
      return false, entry
    elsif status == "1"
      $stderr.puts " #{ label } has already run #{ @agent_name }" if @debug
      return false, entry
    elsif status == 'F'
      $stderr.puts " #{ label } has already Failed #{ @agent_name }" if @debug
      return false, entry
    end
  end

  (@prerequisites || []).each do |prereq_field|
    next if entry[prereq_field] == "1"

    $stderr.puts " #{ label } has not finished #{ prereq_field }" if @debug
    return false, entry
  end

  # Claim the cell with this host's name, then check that the claim survived.
  hostname = Socket.gethostname
  begin
    entry.update @agent_name => "r:#{ hostname }"
    @worksheet.save
    sleep 3
    @worksheet.reload
  rescue GoogleDrive::Error
    # this is a collision, which is to be treated as if it is not runnable
    $stderr.puts " #{ label } lost #{ @agent_name } on #{hostname}" if @debug
    return false, entry
  end

  _status, claimed_by = entry[@agent_name].split(':')
  return true, entry if hostname == claimed_by

  $stderr.puts " #{ label } lost #{ @agent_name } on #{hostname}" if @debug
  return false, entry
end
357
+
358
# Marks the record as successfully processed: writes a 1 into this agent's
# field and into every field listed in @subsumes, together with any extra
# field => value pairs in update_entry (may be nil), then saves the worksheet.
#
# The hash passed in is never modified; the updates are applied from a copy,
# so the agent's block may return a frozen or shared hash.
def complete_entry(update_entry)
  updates = update_entry.nil? ? {} : update_entry.dup

  (@subsumes || []).each do |subsumed_agent|
    updates[subsumed_agent] = 1
  end
  updates[@agent_name] = 1

  entry = get_entry
  entry.update updates
  @worksheet.save
end
374
+
375
# Marks the record as failed: writes "F:<hostname>" into this agent's field,
# together with any extra field => value pairs in update_entry (may be nil),
# then saves the worksheet.
#
# The hash passed in is never modified; the updates are applied from a copy,
# so the agent's block may return a frozen or shared hash.
def fail_entry(update_entry)
  updates = update_entry.nil? ? {} : update_entry.dup
  updates[@agent_name] = "F:#{ Socket.gethostname }"

  entry = get_entry
  entry.update updates
  @worksheet.save
end
385
+
386
# Mails error_message (the captured output of a failed run) to the
# configured 'send_to' address through Gmail SMTP, using the 'guser',
# 'gpass' and 'reply_email' values from @config. The subject is
# "<hostname> <key value pairs> <agent_name>".
#
# Sending is best effort: any error is reported on $stderr and otherwise
# ignored, so a broken mail setup never takes the agent down.
def mail_error(error_message)
  key_summary = @keys.keys.map { |key| [key, @keys[key]].join(' ') + " " }.join
  prefix = [Socket.gethostname, key_summary, @agent_name].join(' ')

  # Mail evaluates the blocks below with instance_eval, so an instance
  # variable inside them would be looked up on Mail's own objects (and be
  # nil). Local variables are still visible, hence this copy.
  config = @config

  begin
    Mail.defaults do
      delivery_method :smtp, {
        :address => "smtp.gmail.com",
        :port => 587,
        :domain => Socket.gethostname,
        :user_name => config['guser'],
        :password => config['gpass'],
        :authentication => 'plain',
        :enable_starttls_auto => true }
    end

    mail = Mail.new do
      from config['reply_email']
      to config['send_to']
      subject prefix
      body error_message.to_s
    end

    mail.deliver!
  rescue => error
    $stderr.puts "Could not mail error report: #{ error }"
  end
end
417
+ end
418
+ end
@@ -0,0 +1,55 @@
1
+ # Author: Darin London
2
+ # The license of this source is "MIT Licence"
3
+
4
+ require 'google_drive'
5
+ require 'psych'
6
+
7
+ module SpreadsheetAgent
8
+
9
+ # SpreadsheetAgent::Db is a class that is meant to be extended by SpreadsheetAgent classes. It
10
+ # stores shared code to instantiate and provide access to a GoogleDrive object and
11
+ # GoogleDrive::Spreadsheet object for use by the extending classes to access their Google Spreadsheets
12
+ class Db
13
+
14
+ # This holds the GoogleDrive::Spreadsheet object that can be used to query information from the google
15
+ # spreadsheet using its API. It cannot be changed after the object is constructed
16
+ attr_reader :db
17
+
18
+ # This holds the GoogleDrive object instantiated with the guser and gpass in the :config. It
19
+ # cannot be changed after the object is constructed
20
+ attr_reader :session
21
+
22
+ # This holds the hash that is constructed from the YAML :config_file. It
23
+ # cannot be changed after the object is constructed
24
+ attr_reader :config
25
+
26
+ # Passing this attribute to the constructor will override the location of config/agent.conf.yml.
27
+ # If passed, it must be a path to a file which matches the template in config/agent.conf.yml.
28
+ # The default is to load ../config/agent.conf.yml relative to the directory containing the
29
+ # calling script $0. This cannot be changed after the object is constructed
30
+ attr_reader :config_file
31
+
32
+ # This is for internal use by SpreadsheetAgent classes that extend SpreadsheetAgent::Db
33
# Loads the configuration, checks that at least one entry under
# 'key_fields' is marked 'required', then logs in to Google Drive with the
# configured 'guser'/'gpass' and opens the spreadsheet named by
# 'spreadsheet_name'. Sets @session and @db.
#
# Raises SpreadsheetAgentError when no key field is marked required.
def build_db
  build_config

  required_fields = @config['key_fields'].select { |_name, settings| settings['required'] }
  if required_fields.empty?
    raise SpreadsheetAgentError, "Your configuration must have at least one required key_fields key"
  end

  @session = GoogleDrive.login(@config['guser'], @config['gpass'])
  @db = @session.spreadsheet_by_title(@config['spreadsheet_name'])
end
41
+
42
+ private
43
+
44
# Parses the YAML configuration into @config. When no config file was
# given, falls back to ../config/agent.conf.yml relative to the directory
# returned by find_bin and remembers that path in @config_file.
def build_config
  @config_file = "#{find_bin}../config/agent.conf.yml" if @config_file.nil?
  @config = Psych.load_file(@config_file)
end
50
+
51
# Absolute path of the directory containing the running script ($0),
# with a trailing slash.
def find_bin
  script_dir = File.dirname($0)
  "#{File.expand_path(script_dir)}/"
end
54
+ end
55
+ end