typingpool 0.7.0 → 0.7.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (37) hide show
  1. data/LICENSE +20 -0
  2. data/README.markdown +452 -0
  3. data/lib/typingpool/amazon/hit/assignment/empty.rb +19 -0
  4. data/lib/typingpool/amazon/hit/assignment.rb +43 -0
  5. data/lib/typingpool/amazon/hit/full/fromsearchhits.rb +44 -0
  6. data/lib/typingpool/amazon/hit/full.rb +105 -0
  7. data/lib/typingpool/amazon/hit.rb +458 -0
  8. data/lib/typingpool/amazon/question.rb +45 -0
  9. data/lib/typingpool/amazon.rb +3 -677
  10. data/lib/typingpool/app/cli/formatter.rb +16 -0
  11. data/lib/typingpool/app/cli.rb +64 -0
  12. data/lib/typingpool/app/friendlyexceptions.rb +34 -0
  13. data/lib/typingpool/app.rb +2 -97
  14. data/lib/typingpool/config/root.rb +114 -0
  15. data/lib/typingpool/config.rb +13 -119
  16. data/lib/typingpool/filer/audio.rb +84 -0
  17. data/lib/typingpool/filer/csv.rb +57 -0
  18. data/lib/typingpool/filer/dir.rb +76 -0
  19. data/lib/typingpool/filer/files/audio.rb +63 -0
  20. data/lib/typingpool/filer/files.rb +55 -0
  21. data/lib/typingpool/filer.rb +4 -313
  22. data/lib/typingpool/project/local.rb +117 -0
  23. data/lib/typingpool/project/remote/s3.rb +135 -0
  24. data/lib/typingpool/project/remote/sftp.rb +100 -0
  25. data/lib/typingpool/project/remote.rb +65 -0
  26. data/lib/typingpool/project.rb +2 -396
  27. data/lib/typingpool/template/assignment.rb +17 -0
  28. data/lib/typingpool/template/env.rb +77 -0
  29. data/lib/typingpool/template.rb +2 -87
  30. data/lib/typingpool/test/script.rb +310 -0
  31. data/lib/typingpool/test.rb +1 -306
  32. data/lib/typingpool/transcript/chunk.rb +129 -0
  33. data/lib/typingpool/transcript.rb +1 -125
  34. data/lib/typingpool/utility/castable.rb +65 -0
  35. data/lib/typingpool/utility.rb +1 -61
  36. data/test/test_integration_script_6_tp_finish.rb +1 -0
  37. metadata +135 -81
@@ -0,0 +1,458 @@
1
+ module Typingpool
2
+ class Amazon
3
+ #Class representing an Amazon Mechanical Turk Human Intelligence
4
+ #Task (HIT).
5
+ #
6
+ #We go above and beyond RTurk::Hit for several practical reasons:
7
+ # * To allow easy serialization. Caching is a very useful way of
8
+ # reducing network calls to Amazon, and thus of speeding up
9
+ # Typingpool. RTurk::Hit objects cannot be dumped via Marshal,
10
+ # apparently due to some Nokogiri objects they
11
+ # contain. Typingpool::Amazon::HIT objects, in contrast, are
12
+ # designed to be easily and compactly serialized. They store the
13
+ # minimal subset of information we need via simple
14
+ # attributes. (Presently we serialize via PStore.)
15
+ # * To attach convenience methods. RTurk does not make it easy,
16
+ # for example, to get HITs beyond the first "page" returned by
17
+ # Amazon. This class provides methods that make it easy to get
18
+ # ALL HITs returned by various operations.
19
+ # * To attach methods specific to Typingpool. For example, the url
20
+ # and project_id methods read params we've embedded in the
21
+ # annotation or in hidden fields on an external question, while
22
+ # the underlying stashed_params method optimizes its lookup of
23
+ # these variables based on how the app is most likely to be
24
+ # used. See also the ours? and cacheable? methods.
25
+ # * To simplify. Typingpool HITs are constrained such that we can
26
+ # assume they all contain only one assignment and thus only a
27
+ # maximum of one answer. Also, once we've determined that a HIT
28
+ # does not belong to Typingpool, it is safe to cache it forever
29
+ # and never download it again from Amazon.
30
+ # * To clearly partition methods that result in network
31
+ # calls. When you access an attribute under hit.full, like
32
+ # hit.full.status, it is clear you are doing something
33
+ # potentially expensive to obtain your hit status. Same thing
34
+ # with accessing an attribute under hit.assignment, like
35
+ # hit.assignment.worker_id -- it is clear an assignment object
36
+ # will need to be created, implying a network call. Calling
37
+ # hit.id, in contrast, is always fast. (Caveat: Accessing
38
+ # partitioned attributes often, but not always, results in a
39
+ # network call. In some cases, hit.full is generated at the same
40
+ # time we create the hit, since we've obtained a full HIT
41
+ # serialization from Amazon. In other cases, we only have a HIT
42
+ # id, so accessing anything under hit.full generates a network
43
+ # call.)
44
+ class HIT
45
+ require 'set'
46
+ require 'uri'
47
+ require 'typingpool/amazon/hit/full'
48
+ require 'typingpool/amazon/hit/assignment'
49
+
50
+ class << self
51
+
52
+ #Constructor. Creates an Amazon Mechanical Turk HIT.
53
+ #** Warning: This method can spend your money! **
54
+ # ==== Params
55
+ # [question] Typingpool::Amazon::Question instance, used not
56
+ # only to generate the (external) question but
57
+ # also parsed to provide one or more core HIT
58
+ # attributes. Must include a non-nil
59
+ # annotation attribute. Provides fallback
60
+ # values for HIT title and description.
61
+ # [config_assign] The 'assign' attribute of a
62
+ # Typingpool::Config instance (that is, a
63
+ # Typingpool::Config::Root::Assign
64
+ # instance). Must include values for reward,
65
+ # lifetime, duration, and approval. May
66
+ # include values for keywords and
67
+ # qualifications. Preferred source for HIT
68
+ # title and description. See
69
+ # Typingpool::Config documentation for further
70
+ # details.
71
+ # ==== Returns
72
+ # Typingpool::Amazon::HIT instance corresponding to the new
73
+ # Mechanical Turk HIT.
74
def create(question, config_assign)
  # Title and description prefer the config; the question is the fallback.
  rturk_hit = RTurk::Hit.create(:title => config_assign.title || question.title) do |new_hit|
    new_hit.description = config_assign.description || question.description
    new_hit.question(question.url)
    # `or` binds looser than `=`, so each required value is assigned
    # first and a nil/false value then triggers the raise.
    new_hit.note = question.annotation or raise Error, "Missing annotation from question"
    new_hit.reward = config_assign.reward or raise Error, "Missing reward config"
    new_hit.assignments = 1
    new_hit.lifetime = config_assign.lifetime or raise Error, "Missing lifetime config"
    new_hit.duration = config_assign.deadline or raise Error, "Missing deadline config"
    new_hit.auto_approval = config_assign.approval or raise Error, "Missing approval config"
    # Keywords and qualifications are optional.
    new_hit.keywords = config_assign.keywords if config_assign.keywords
    if config_assign.qualify
      config_assign.qualify.each do |qualification|
        new_hit.qualifications.add(*qualification.to_arg)
      end
    end
  end
  new(rturk_hit)
end
88
+
89
+ #Name of the hidden HTML form field used to provide the
90
+ #project_id in an external question or (form-encoded)
91
+ #annotation. Hard coded to typingpool_project_id but
92
+ #overridable in a subclass.
93
def id_at
  # Reuse the memoized field name when one is already set.
  return @@id_at if defined?(@@id_at) && @@id_at
  @@id_at = 'typingpool_project_id'
end
96
+
97
+ #Name of the hidden HTML form field used to provide the
98
+ #(audio) url in an external question or (form-encoded)
99
+ #annotation. Hard coded to typingpool_url but overridable in a
100
+ #subclass.
101
def url_at
  # Reuse the memoized field name when one is already set.
  return @@url_at if defined?(@@url_at) && @@url_at
  @@url_at = 'typingpool_url'
end
104
+
105
+ #Takes an array of HIT ids, returns Typingpool::Amazon::HIT
106
+ #instances corresponding to those ids.
107
def with_ids(ids)
  ids.map do |hit_id|
    # Wrap the bare id in an RTurk hit, then prefer a cached copy of it.
    bare_hit = RTurk::Hit.new(hit_id)
    cached_or_new(bare_hit)
  end
end
110
+
111
+ #Returns all Typingpool HITs that have been approved, as an
112
+ #array of Typingpool::Amazon::HIT instances.
113
def all_approved
  all_reviewable do |candidate|
    begin
      # Optimization: approved? is checked first on the assumption that
      # unapproved HITs are more common than approved HITs belonging
      # to some other app.
      candidate.approved? && candidate.ours?
    rescue RestClient::ServiceUnavailable => e
      # Best effort: warn and leave this HIT out rather than abort the listing.
      warn "Warning: Service unavailable error, skipped HIT #{candidate.id}. (Error: #{e})"
      false
    end
  end
end
127
+
128
+ #Returns as an array of Typingpool::Amazon::HIT instances all
129
+ #HITs returned by Amazon's GetReviewableHITs operation (which
130
+ #have HIT status == 'Reviewable'). Takes an optional filter
131
+ #block (which should return true for HITs to be included in
132
+ #the final results). If not supplied, will filter so the
133
+ #returned hits are all Typingpool HITs (hit.ours? == true).
134
def all_reviewable(&filter)
  reviewable = each_page do |page_number|
    hit_ids = RTurk.GetReviewableHITs(:page_number => page_number).hit_ids
    hit_ids.map { |hit_id| cached_or_new(RTurk::Hit.new(hit_id)) }
  end
  # With no block given, filter_ours keeps only Typingpool HITs.
  filter_ours(reviewable, &filter)
end
140
+
141
+ #Takes a Typingpool::Project::Local#id and returns all HITs
142
+ #associated with that project, as an array of
143
+ #Typingpool::Amazon::HIT instances.
144
def all_for_project(id)
  all do |hit|
    # ours? goes first so project_id is only read for Typingpool HITs.
    hit.ours? && hit.project_id == id
  end
end
147
+
148
+ #Returns all HITs associated with your AWS account as an array
149
+ #of Typingpool::Amazon::HIT instances. Takes an optional
150
+ #filter block (which should return true for HITs to be
151
+ #included in the final results). If not supplied, will filter
152
+ #so the returned hits are all Typingpool HITs (hit.ours? ==
153
+ #true).
154
def all(&filter)
  hits = each_page do |page_number|
    page = RTurk::SearchHITs.create(:page_number => page_number)
    # The annotation is read from the raw XML, one <HIT> node per RTurk
    # hit, consumed in lockstep with page.hits.
    # NOTE(review): assumes page.hits and the //HIT nodes share the same
    # order -- confirm against RTurk.
    raw_hits = page.xml.xpath('//HIT')
    page.hits.map do |rturk_hit|
      annotation = raw_hits.shift.xpath('RequesterAnnotation').inner_text.strip
      # cached_or_new_from_searchhits builds the Full::FromSearchHITs
      # object itself, and only on a cache miss, so none is built here.
      cached_or_new_from_searchhits(rturk_hit, annotation)
    end
  end
  # With no block given, filter_ours keeps only Typingpool HITs.
  filter_ours(hits, &filter)
end
166
+
167
+ #protected
168
+
169
+ #Constructor. Takes an RTurk::Hit instance. Returns a
170
+ #Typingpool::Amazon::HIT instance, preferably from the cache.
171
def cached_or_new(rturk_hit)
  cached = from_cache(rturk_hit.id)
  if cached
    cached
  else
    # Cache miss: wrap the RTurk hit in a fresh instance.
    new(rturk_hit)
  end
end
174
+
175
+ #Constructor. Same as cached_or_new, but handles peculiarities
176
+ #of objects returned by RTurk::SearchHITs. Such objects map
177
+ #two Amazon HIT fields to different names than those used by
178
+ #other RTurk HIT instances. They also do not bother to extract
179
+ #the annotation from the Amazon HIT, so we have to do that
180
+ #ourselves (elsewhere) and take it as a param here. Finally,
181
+ #on the bright side, RTurk::SearchHITs already contain a big
182
+ #chunk of hit.full attributes, potentially obviating the need
183
+ #for an additional network call to flesh out the HIT, so this
184
+ #method pre-fleshes-out the HIT.
185
def cached_or_new_from_searchhits(rturk_hit, annotation)
  cached = from_cache(rturk_hit.id)
  return cached if cached
  # Cache miss: build the HIT and attach the data SearchHITs already
  # returned as its "full" attribute.
  fresh = new(rturk_hit)
  fresh.full(Amazon::HIT::Full::FromSearchHITs.new(rturk_hit, annotation))
  fresh
end
192
+
193
#Returns whatever is stored in the cache under the key built from
#hit_id, id_at and url_at.
#NOTE(review): assumes Amazon.cache is a PStore, as the class
#documentation states -- confirm. If so, a missing key yields nil.
def from_cache(hit_id, id_at=self.id_at, url_at=self.url_at)
  key = cache_key(hit_id, id_at, url_at)
  # This is a pure lookup, so open the transaction read-only (the
  # `true` argument) rather than read-write.
  Amazon.cache.transaction(true) { Amazon.cache[key] }
end
198
+
199
#Removes the cache entry for the given HIT id, if one is stored.
#Returns the result of the cache's delete call, or nil when there
#was nothing to delete.
def delete_cache(hit_id, id_at=self.id_at, url_at=self.url_at)
  Amazon.cache.transaction do
    entry_key = cache_key(hit_id, id_at, url_at)
    # Only call delete when an entry is actually present.
    Amazon.cache.delete(entry_key) unless Amazon.cache[entry_key].nil?
  end
end
206
+
207
#Builds the cache key string for a HIT id. Note that url_at comes
#before id_at in the key, the reverse of the parameter order.
def cache_key(hit_id, id_at=self.id_at, url_at=self.url_at)
  format('RESULT///%s///%s///%s', hit_id, url_at, id_at)
end
210
+
211
#Yields successive page numbers, starting at 1, to the block and
#collects the elements of each returned batch into one array. Stops
#after the first batch that comes back empty. Returns the array.
def each_page
  collected = []
  page_number = 1
  loop do
    batch = yield(page_number)
    collected.push(*batch)
    # An empty batch means the pages are exhausted.
    break unless batch.count > 0
    page_number += 1
  end
  collected
end
221
+
222
#Selects the hits for which the filter block returns true, defaulting
#to hit.ours? when no block is given. Calls to_cache on every hit,
#selected or not, right after the filter has run on it.
def filter_ours(hits, &filter)
  predicate = filter || lambda { |candidate| candidate.ours? }
  hits.select do |candidate|
    keep = predicate.call(candidate)
    candidate.to_cache
    keep
  end
end
230
+ end #class << self
231
+
232
+ #Corresponds to the Amazon Mechanical Turk HIT#HITId
233
+ attr_reader :id
234
+
235
+ #Constructor. Takes an RTurk::Hit instance.
236
def initialize(rturk_hit)
  # Only the id is copied from the RTurk hit.
  hit_id = rturk_hit.id
  @id = hit_id
end
239
+
240
+ #URL of the audio file associated with this HIT (the audio file
241
+ #to be transcribed). Extracted from the annotation (when the HIT
242
+ #was assigned via Typingpool) or from a hidden field in the HTML
243
+ #form on the external question (when the HIT was assigned via
244
+ #the Amazon Mechanical Turk RUI).
245
def url
  # Memoized once truthy; a nil or false result is looked up again
  # on the next call.
  return @url if @url
  @url = stashed_param(self.class.url_at)
end
248
+
249
+ #The Typingpool::Project::Local#id associated with this
250
+ #HIT. Extracted as described for the url method.
251
def project_id
  # Memoized once truthy; a nil or false result is looked up again
  # on the next call.
  return @project_id if @project_id
  @project_id = stashed_param(self.class.id_at)
end
254
+
255
+ #Returns the Typingpool::Project#name associated with this HIT
256
+ #by parsing the #url. May be dropped in a future release.
257
def project_title_from_url(url=self.url)
  # Raises Error::Argument::Format when the url does not match
  # Project.url_regex.
  matches = Project.url_regex.match(url) or raise Error::Argument::Format, "Unexpected format to url '#{url}'"
  # URI.unescape was removed in Ruby 3.0. The default parser's unescape
  # does the same percent-decoding and leaves '+' untouched. The title
  # is taken from the regex's second capture group.
  URI::DEFAULT_PARSER.unescape(matches[2])
end
261
+
262
+ #Returns true if this HIT has an approved assignment associated
263
+ #with it. (Attached to Typingpool::Amazon::HIT rather than
264
+ #Typingpool::Amazon::HIT::Assignment because sometimes we can
265
+ #tell simply from looking at hit.full that there are no approved
266
+ #assignments -- hit.full.assignments_completed == 0. This check
267
+ #is only performed when hit.full has already been loaded.)
268
def approved?
  # Delegates to the shared assignment status comparison.
  wanted_status = 'Approved'
  assignment_status_match?(wanted_status)
end
271
+
272
+ #Returns true if this HIT has a rejected assignment associated
273
+ #with it. (For an explanation of why this is not attached to
274
+ #Typingpool::Amazon::HIT::Assignment, see the documentation for
275
+ #approved?.)
276
def rejected?
  # Delegates to the shared assignment status comparison.
  wanted_status = 'Rejected'
  assignment_status_match?(wanted_status)
end
279
+
280
+ #Returns true if this HIT has a submitted assignment associated
281
+ #with it. (For an explanation of why this is not attached to
282
+ #Typingpool::Amazon::HIT::Assignment, see the documentation for
283
+ #approved?.)
284
def submitted?
  # Delegates to the shared assignment status comparison.
  wanted_status = 'Submitted'
  assignment_status_match?(wanted_status)
end
287
+
288
+
289
+ #Returns true if this HIT is associated with Typingpool. One
290
+ #Amazon account can be used for many tasks, so it's important to
291
+ #check whether the HIT belongs to this software. (Presently,
292
+ #this is determined by looking for a stashed param like url or
293
+ #project_id).
294
def ours?
  # Memoize with an explicit nil check. `@ours ||= ...` never retains
  # a false answer, so a HIT that is not ours would repeat the url
  # lookup on every call.
  @ours = !url.to_s.empty? if @ours.nil?
  @ours
end
297
+
298
+ #Returns a Typingpool::Transcript::Chunk instance built using
299
+ #this HIT and its associated assignment.
300
def transcript
  # Build the chunk from the assignment body, then copy over the
  # identifying details of this HIT and its assignment.
  Transcript::Chunk.new(assignment.body).tap do |chunk|
    chunk.url = url
    chunk.project = project_id
    chunk.worker = assignment.worker_id
    chunk.hit = @id
  end
end
308
+
309
+ #If this HIT is cacheable, serializes it to the cache file
310
+ #specified in the config passed to Amazon.setup, or specified in
311
+ #the default config file. In short, a HIT is cacheable if it
312
+ #does not belong to Typingpool (ours? == false), if it is
313
+ #approved or rejected (approved? || rejected?), or if it is
314
+ #expired (full.expired_and_overdue?). See also cacheable? code.
315
+ #
316
+ # When available, cached HITs are used by
317
+ # Typingpool::Amazon::HIT.all,
318
+ # Typingpool::Amazon::HIT.all_approved, and all the other class
319
+ # methods that retrieve HITs. These methods call to_cache for
320
+ # you at logical times (after downloading and filtering, when
321
+ # the HIT is most fleshed out), so you should not need to call
322
+ # this yourself. But if you have an operation that makes network
323
+ # calls to further flesh out the HIT, calling to_cache may be
324
+ # worthwhile.
325
def to_cache
  # Reminder: any object containing a Nokogiri object cannot be stored
  # in PStore, so this instance must stay free of them.
  return unless cacheable?
  Amazon.cache.transaction do
    Amazon.cache[self.class.cache_key(@id)] = self
  end
end
334
+
335
+ #Returns an RTurk::Hit instance corresponding to this HIT.
336
def at_amazon
  # Looked up by id on every call; nothing is memoized here.
  hit_id = @id
  Amazon.rturk_hit_full(hit_id)
end
339
+
340
+ #Deletes the HIT from Amazon's servers. Examines the HIT and
341
+ #assignment status to determine whether calling the DisposeHIT
342
+ #or DisableHIT operation is most appropriate. If the HIT has
343
+ #been submitted but not approved or rejected, will raise an
344
+ #exception of type
345
+ #Typingpool::Error::Amazon::UnreviewedContent. Catch this
346
+ #exception in your own code if you'd like to automatically
347
+ #approve such HITs before removing them.
348
def remove_from_amazon
  # A HIT that is not Reviewable is disabled rather than disposed.
  return at_amazon.disable! unless full.status == 'Reviewable'
  # Refuse to dispose of work nobody has approved or rejected yet.
  if assignment.status == 'Submitted'
    raise Error::Amazon::UnreviewedContent, "There is an unreviewed submission for #{url}"
  end
  at_amazon.dispose!
end
358
+
359
+ #Returns "the full hit" - a Typingpool::Amazon::HIT::Full
360
+ #instance associated with this HIT. If the instance is being
361
+ #created for the first time, this will trigger an HTTP request
362
+ #to Amazon's servers. "Full" hit fields segregated because
363
+ #accessing any one of them is expensive if we only have a hit id
364
+ #(but after fetching one all are cheap). Accepts an optional
365
+ #Typingpool::Amazon::HIT::Full (or subclass) to set for this
366
+ #attribute, preventing the need to create one. This is useful in
367
+ #cases in which extensive HIT data was returned by an Amazon
368
+ #operation (for example, SearchHITs returns lots of HIT data)
369
def full(full_hit=nil)
  # Once set, the stored instance is returned and any full_hit passed
  # on a later call is ignored.
  return @full unless @full.nil?
  @full = full_hit || Full.new(at_amazon)
end
375
+
376
+ #Returns the assignment associated with this HIT - a
377
+ #Typingpool::Amazon::HIT::Assignment instance. The first time
378
+ #this is called, an Amazon HTTP request is typically (but not
379
+ #always) sent.
380
def assignment
  return @assignment unless @assignment.nil?
  # With hit.full already loaded and reporting zero completed
  # assignments, an empty placeholder is used instead of fetching one.
  # NOTE(review): this would be dangerous if such a HIT were cached
  # while still active, since the assignment would never be checked
  # again -- see cacheable? for when caching is allowed.
  known_empty = @full && full.assignments_completed == 0
  @assignment = known_empty ? Assignment::Empty.new : Assignment.new(at_amazon) #expensive
end
395
+
396
+
397
+ #private
398
+
399
#Looks up a stashed param (such as the url or project id), trying
#sources in this order: an already-loaded assignment's answers, the
#HIT annotation, the assignment's answers, the external question.
def stashed_param(param)
  # 1. An assignment is already loaded, so checking its answers is free.
  if @assignment
    answered = assignment.answers[param]
    return answered if answered
  end

  # 2. A question assigned through this software carries the param in
  #    its annotation. May be expensive: may result in an HTTP request
  #    to fetch the HIT fields. The HIT is preferred over the assignment
  #    on the assumption that HITs with no answers are more common than
  #    HITs assigned through the RUI (which lack our annotation).
  annotated = full.annotation[param]
  return annotated if annotated

  if full.assignments_completed.to_i >= 1
    # 3. Assigned through Amazon's RUI with an answer submitted. The
    #    assignment is preferred over the external question because its
    #    other data (e.g. status) may be needed later anyway. If the
    #    answers lack the param too, the HIT does not belong to this
    #    software and nil is the right result.
    assignment.answers[param]
  else
    # 4. Assigned through Amazon's RUI with no answer submitted.
    #    Expensive: results in an HTTP request to fetch the external
    #    question.
    full.external_question_param(param)
  end
end
433
+
434
#Returns true when the assignment's status equals the given status.
#When hit.full is already loaded it is consulted first, so the
#assignment is not touched in cases that can be ruled out from it.
def assignment_status_match?(status)
  if @full
    # No completed assignment, or a HIT that is not Reviewable,
    # cannot match.
    ruled_out = full.assignments_completed == 0 || full.status != 'Reviewable'
    return false if ruled_out
  end
  assignment.status == status
end
441
+
442
+
443
#Assignment statuses that make a HIT safe to cache.
@@cacheable_assignment_status = Set.new %w(Approved Rejected)

#Returns true when this HIT may be written to the cache. Only state
#that is already loaded is consulted; neither hit.full nor the
#assignment is created here.
def cacheable?
  # Explicitly false, not merely unset: already determined not to be ours.
  return true if @ours == false
  return true if @full && full.expired_and_overdue?
  return true if @assignment && assignment.status && @@cacheable_assignment_status.include?(assignment.status)
  false
end
456
+ end #HIT
457
+ end #Amazon
458
+ end #Typingpool
@@ -0,0 +1,45 @@
1
module Typingpool
  class Amazon

    #Class encapsulating the HTML form presented to Mechanical Turk workers
    #transcribing a Typingpool audio chunk.
    class Question
      require 'nokogiri'
      require 'uri'
      require 'cgi'
      attr_reader :url, :html

      #Constructor. Takes the URL of where the question HTML has been
      #uploaded, followed by the question HTML itself.
      def initialize(url, html)
        @url = url
        @html = html
      end

      #Returns URL-encoded key-value pairs that can be used as the
      #text for a HIT#annotation. The key-value pairs correspond to
      #the hidden HTML form fields in the question HTML whose name
      #starts with 'typingpool_'. The result is HTML-escaped. Hidden
      #fields with no name attribute are skipped.
      def annotation
        pairs = noko.css('input[type="hidden"]').map{|input| [input['name'], input['value']] }
        #to_s guards against a hidden input with no name attribute,
        #for which input['name'] is nil.
        pairs = pairs.select{|name, _value| name.to_s.start_with?('typingpool_') }
        #Going through a Hash keeps only the last value for a repeated name.
        CGI.escapeHTML(URI.encode_www_form(Hash[pairs]))
      end

      #Returns the title, extracted from the title element of the
      #HTML, or nil if the HTML has no title element.
      def title
        content_of('title')
      end

      #Returns the description, extracted from the element with the id
      #'description' in the HTML, or nil if there is no such element.
      def description
        content_of('#description')
      end

      protected

      #Parses the HTML as UTF-8. The document is parsed afresh on
      #every call; nothing is stored on the instance.
      def noko(html=@html)
        Nokogiri::HTML(html, nil, 'UTF-8')
      end

      #Returns the text content of the first element matching the CSS
      #selector, or nil when nothing matches.
      def content_of(selector)
        node = noko.css(selector).first
        node && node.content
      end
    end #Question
  end #Amazon
end #Typingpool