typingpool 0.7.0 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. data/LICENSE +20 -0
  2. data/README.markdown +452 -0
  3. data/lib/typingpool/amazon/hit/assignment/empty.rb +19 -0
  4. data/lib/typingpool/amazon/hit/assignment.rb +43 -0
  5. data/lib/typingpool/amazon/hit/full/fromsearchhits.rb +44 -0
  6. data/lib/typingpool/amazon/hit/full.rb +105 -0
  7. data/lib/typingpool/amazon/hit.rb +458 -0
  8. data/lib/typingpool/amazon/question.rb +45 -0
  9. data/lib/typingpool/amazon.rb +3 -677
  10. data/lib/typingpool/app/cli/formatter.rb +16 -0
  11. data/lib/typingpool/app/cli.rb +64 -0
  12. data/lib/typingpool/app/friendlyexceptions.rb +34 -0
  13. data/lib/typingpool/app.rb +2 -97
  14. data/lib/typingpool/config/root.rb +114 -0
  15. data/lib/typingpool/config.rb +13 -119
  16. data/lib/typingpool/filer/audio.rb +84 -0
  17. data/lib/typingpool/filer/csv.rb +57 -0
  18. data/lib/typingpool/filer/dir.rb +76 -0
  19. data/lib/typingpool/filer/files/audio.rb +63 -0
  20. data/lib/typingpool/filer/files.rb +55 -0
  21. data/lib/typingpool/filer.rb +4 -313
  22. data/lib/typingpool/project/local.rb +117 -0
  23. data/lib/typingpool/project/remote/s3.rb +135 -0
  24. data/lib/typingpool/project/remote/sftp.rb +100 -0
  25. data/lib/typingpool/project/remote.rb +65 -0
  26. data/lib/typingpool/project.rb +2 -396
  27. data/lib/typingpool/template/assignment.rb +17 -0
  28. data/lib/typingpool/template/env.rb +77 -0
  29. data/lib/typingpool/template.rb +2 -87
  30. data/lib/typingpool/test/script.rb +310 -0
  31. data/lib/typingpool/test.rb +1 -306
  32. data/lib/typingpool/transcript/chunk.rb +129 -0
  33. data/lib/typingpool/transcript.rb +1 -125
  34. data/lib/typingpool/utility/castable.rb +65 -0
  35. data/lib/typingpool/utility.rb +1 -61
  36. data/test/test_integration_script_6_tp_finish.rb +1 -0
  37. metadata +135 -81
@@ -0,0 +1,458 @@
1
+ module Typingpool
2
+ class Amazon
3
+ #Class representing an Amazon Mechanical Turk Human Intelligence
4
+ #Task (HIT).
5
+ #
6
+ #We go above and beyond RTurk::Hit for several practical reasons:
7
+ # * To allow easy serialization. Caching is a very useful way of
8
+ # reducing network calls to Amazon, and thus of speeding up
9
+ # Typingpool. RTurk::Hit objects cannot be dumped via Marshal,
10
+ # apparently due to some Nokogiri objects they
11
+ # contain. Typingpool::Amazon::HIT objects, in contrast, are
12
+ # designed to be easily and compactly serialized. They store the
13
+ # minimal subset of information we need via simple
14
+ # attributes. (Presently we serialize via PStore.)
15
+ # * To attach convenience methods. RTurk does not make it easy,
16
+ # for example, to get HITs beyond the first "page" returned by
17
+ # Amazon. This class provides methods that make it easy to get
18
+ # ALL HITs returned by various operations.
19
+ # * To attach methods specific to Typingpool. For example, the url
20
+ # and project_id methods read params we've embedded in the
21
+ # annotation or in hidden fields on an external question, while
22
+ # the underlying stashed_params method optimizes its lookup of
23
+ # these variables based on how the app is most likely to be
24
+ # used. See also the ours? and cacheable? methods.
25
+ # * To simplify. Typingpool HITs are constrained such that we can
26
+ # assume they all contain only one assignment and thus only a
27
+ # maximum of one answer. Also, once we've determined that a HIT
28
+ # does not belong to Typingpool, it is safe to cache it forever
29
+ # and never download it again from Amazon.
30
+ # * To clearly partition methods that result in network
31
+ # calls. When you access an attribute under hit.full, like
32
+ # hit.full.status, it is clear you are doing something
33
+ # potentially expensive to obtain your hit status. Same thing
34
+ # with accessing an attribute under hit.assignment, like
35
+ # hit.assignment.worker_id -- it is clear an assignment object
36
+ # will need to be created, implying a network call. Calling
37
+ # hit.id, in contrast, is always fast. (Caveat: Accessing
38
+ # partitioned attributes often, but not always, results in a
39
+ # network call. In some cases, hit.full is generated at the same
40
+ # time we create the hit, since we've obtained a full HIT
41
+ # serialization from Amazon. In other cases, we only have a HIT
42
+ # id, so accessing anything under hit.full generates a network
43
+ # call.)
44
+ class HIT
45
+ require 'set'
46
+ require 'uri'
47
+ require 'typingpool/amazon/hit/full'
48
+ require 'typingpool/amazon/hit/assignment'
49
+
50
+ class << self
51
+
52
#Constructor. Creates an Amazon Mechanical Turk HIT and wraps it
#in a Typingpool::Amazon::HIT.
#** Warning: This method can spend your money! **
# ==== Params
# [question]      Typingpool::Amazon::Question instance. Its url
#                 becomes the (external) question and its
#                 annotation becomes the HIT note; the annotation
#                 must be non-nil. Its title and description are
#                 used only when config_assign does not supply
#                 them.
# [config_assign] The 'assign' attribute of a Typingpool::Config
#                 instance (that is, a
#                 Typingpool::Config::Root::Assign instance). Must
#                 respond with non-nil values to reward, lifetime,
#                 deadline (sent to Amazon as the HIT duration),
#                 and approval. May supply keywords and qualify
#                 (qualifications). Preferred source for HIT title
#                 and description. See Typingpool::Config
#                 documentation for further details.
# ==== Returns
# Typingpool::Amazon::HIT instance corresponding to the new
# Mechanical Turk HIT.
# ==== Raises
# Error when the annotation, reward, lifetime, deadline or
# approval value is missing.
def create(question, config_assign)
  title = config_assign.title || question.title
  rturk_hit = RTurk::Hit.create(:title => title) do |hit|
    hit.description = config_assign.description || question.description
    hit.question(question.url)
    hit.note = question.annotation or raise Error, "Missing annotation from question"
    hit.reward = config_assign.reward or raise Error, "Missing reward config"
    #Typingpool HITs always carry exactly one assignment.
    hit.assignments = 1
    hit.lifetime = config_assign.lifetime or raise Error, "Missing lifetime config"
    hit.duration = config_assign.deadline or raise Error, "Missing deadline config"
    hit.auto_approval = config_assign.approval or raise Error, "Missing approval config"
    if config_assign.keywords
      hit.keywords = config_assign.keywords
    end
    if config_assign.qualify
      config_assign.qualify.each do |qualification|
        hit.qualifications.add(*qualification.to_arg)
      end
    end
  end
  new(rturk_hit)
end
88
+
89
#Name of the hidden HTML form field (and of the matching key in
#the form-encoded annotation) that carries the project_id.
#Defaults to 'typingpool_project_id'; the value is kept in a
#class variable once first read. A subclass may override this
#method.
def id_at
  @@id_at = 'typingpool_project_id' unless defined?(@@id_at) && @@id_at
  @@id_at
end
96
+
97
#Name of the hidden HTML form field (and of the matching key in
#the form-encoded annotation) that carries the (audio) url.
#Defaults to 'typingpool_url'; the value is kept in a class
#variable once first read. A subclass may override this method.
def url_at
  @@url_at = 'typingpool_url' unless defined?(@@url_at) && @@url_at
  @@url_at
end
104
+
105
#Takes an array of HIT ids and returns an array of
#Typingpool::Amazon::HIT instances, one per id and in the same
#order. Each id is wrapped in an RTurk::Hit and resolved through
#cached_or_new, so cached HITs are reused.
def with_ids(ids)
  ids.map do |hit_id|
    rturk_hit = RTurk::Hit.new(hit_id)
    cached_or_new(rturk_hit)
  end
end
110
+
111
#Returns all Typingpool HITs that have been approved, as an
#array of Typingpool::Amazon::HIT instances. Works by passing a
#filter to all_reviewable. A HIT whose check raises
#RestClient::ServiceUnavailable is skipped with a warning
#rather than aborting the whole listing.
def all_approved
  all_reviewable do |hit|
    begin
      #optimization: test approved? first, on the assumption that
      #an unapproved HIT is more common than an approved HIT that
      #does not belong to this app
      hit.approved? && hit.ours?
    rescue RestClient::ServiceUnavailable => error
      warn "Warning: Service unavailable error, skipped HIT #{hit.id}. (Error: #{error})"
      false
    end
  end
end
127
+
128
#Returns, as an array of Typingpool::Amazon::HIT instances, the
#HITs reported by Amazon's GetReviewableHITs operation (HIT
#status == 'Reviewable'), walking every result page. Takes an
#optional filter block, which should return true for HITs to
#keep. Without a block, only Typingpool HITs (hit.ours? ==
#true) are returned.
def all_reviewable(&filter)
  reviewable = each_page do |page_number|
    response = RTurk.GetReviewableHITs(:page_number => page_number)
    response.hit_ids.map{|hit_id| cached_or_new(RTurk::Hit.new(hit_id)) }
  end
  filter_ours(reviewable, &filter)
end
140
+
141
#Takes a Typingpool::Project::Local#id and returns, as an array
#of Typingpool::Amazon::HIT instances, every HIT that both
#belongs to Typingpool and carries that project id.
def all_for_project(id)
  all do |hit|
    hit.ours? && hit.project_id == id
  end
end
147
+
148
#Returns all HITs associated with your AWS account as an array
#of Typingpool::Amazon::HIT instances, walking every page of
#RTurk::SearchHITs results. Takes an optional filter block,
#which should return true for HITs to keep. Without a block,
#only Typingpool HITs (hit.ours? == true) are returned.
def all(&filter)
  hits = each_page do |page_number|
    page = RTurk::SearchHITs.create(:page_number => page_number)
    #RTurk does not extract the annotation from SearchHITs
    #results, so it is read from the raw XML. The raw HIT
    #elements are consumed in step with page.hits, which assumes
    #both lists are in the same order.
    raw_hits = page.xml.xpath('//HIT')
    page.hits.map do |rturk_hit|
      annotation = raw_hits.shift.xpath('RequesterAnnotation').inner_text.strip
      #The Full::FromSearchHITs object is built inside
      #cached_or_new_from_searchhits, and only for HITs that are
      #not already cached, so none is built here.
      cached_or_new_from_searchhits(rturk_hit, annotation)
    end
  end
  filter_ours(hits, &filter)
end
166
+
167
+ #protected
168
+
169
#Constructor. Takes an RTurk::Hit instance and returns the
#Typingpool::Amazon::HIT cached under its id if there is one;
#otherwise builds a new instance from the RTurk::Hit.
def cached_or_new(rturk_hit)
  cached = from_cache(rturk_hit.id)
  cached || new(rturk_hit)
end
174
+
175
#Constructor. Like cached_or_new, but for the objects returned
#by RTurk::SearchHITs. Per the original author's notes, such
#objects map two Amazon HIT fields to different names than
#other RTurk HIT instances and do not extract the annotation, so
#the annotation must be obtained by the caller and passed in
#here. On a cache miss the new HIT is pre-populated with a
#Full::FromSearchHITs built from the search result, which can
#save a later network call to flesh out hit.full. A cached HIT
#is returned untouched.
def cached_or_new_from_searchhits(rturk_hit, annotation)
  cached = from_cache(rturk_hit.id)
  return cached if cached
  fresh = new(rturk_hit)
  fresh.full(Amazon::HIT::Full::FromSearchHITs.new(rturk_hit, annotation))
  fresh
end
192
+
193
#Looks up a HIT in Amazon.cache inside a cache transaction.
#Returns whatever is stored under the key built by cache_key
#from hit_id, id_at and url_at, or nil when nothing is stored.
def from_cache(hit_id, id_at=self.id_at, url_at=self.url_at)
  key = cache_key(hit_id, id_at, url_at)
  store = Amazon.cache
  store.transaction { store[key] }
end
198
+
199
#Removes a HIT from Amazon.cache inside a cache transaction.
#The key is built by cache_key from hit_id, id_at and url_at.
#Nothing is deleted when no entry is stored under that key.
def delete_cache(hit_id, id_at=self.id_at, url_at=self.url_at)
  Amazon.cache.transaction do
    key = cache_key(hit_id, id_at, url_at)
    Amazon.cache.delete(key) unless Amazon.cache[key].nil?
  end
end
206
+
207
#Builds the string key under which a HIT is stored in the
#cache: the literal 'RESULT' followed by hit_id, url_at and
#id_at (in that order), separated by '///'.
def cache_key(hit_id, id_at=self.id_at, url_at=self.url_at)
  ['RESULT', hit_id, url_at, id_at].join('///')
end
210
+
211
#Pagination helper. Yields page numbers starting at 1 and
#collects the results the block returns for each page into one
#flat array. Stops after the first page for which the block
#returns an empty collection. Returns the collected results.
def each_page
  collected = []
  page_number = 0
  more = true
  while more
    page_number += 1
    batch = yield(page_number)
    collected.push(*batch)
    more = batch.count > 0
  end
  collected
end
221
+
222
#Selects the hits for which the filter block returns true. When
#no block is given, keeps the hits for which hit.ours? is true.
#After each hit has been tested, to_cache is called on it
#whether or not it was selected.
def filter_ours(hits, &filter)
  keep = filter || lambda{|hit| hit.ours? }
  hits.select do |hit|
    #Test first, then cache: tap returns the test result.
    keep.call(hit).tap { hit.to_cache }
  end
end
230
+ end #class << self
231
+
232
+ #Corresponds to the Amazon Mechanical Turk HIT#HITId
233
+ attr_reader :id
234
+
235
#Constructor. Takes an RTurk::Hit instance and records its id;
#nothing else from the RTurk object is stored here.
def initialize(rturk_hit)
  @id = rturk_hit.id
end
239
+
240
#URL of the audio file associated with this HIT (the audio file
#to be transcribed). Read via stashed_param using the field name
#from self.class.url_at: from the annotation when the HIT was
#assigned via Typingpool, or from a hidden field in the HTML
#form on the external question when the HIT was assigned via
#the Amazon Mechanical Turk RUI. A found value is memoized.
def url
  return @url if @url
  @url = stashed_param(self.class.url_at)
end
248
+
249
#The Typingpool::Project::Local#id associated with this HIT.
#Read via stashed_param using the field name from
#self.class.id_at. A found value is memoized.
def project_id
  return @project_id if @project_id
  @project_id = stashed_param(self.class.id_at)
end
254
+
255
#Returns the Typingpool::Project#name associated with this HIT
#by parsing the #url (or the url passed in). The name is the
#second capture group of Project.url_regex, percent-decoded.
#May be dropped in a future release.
#
#Raises Error::Argument::Format when the url does not match
#Project.url_regex.
def project_title_from_url(url=self.url)
  matches = Project.url_regex.match(url) or raise Error::Argument::Format, "Unexpected format to url '#{url}'"
  #URI.unescape was removed in Ruby 3.0. It was a thin wrapper
  #around URI::DEFAULT_PARSER.unescape, which is still available
  #and decodes identically ('+' is left as-is).
  URI::DEFAULT_PARSER.unescape(matches[2])
end
261
+
262
#Returns true if the assignment associated with this HIT has
#status 'Approved'. (Lives on Typingpool::Amazon::HIT rather
#than Typingpool::Amazon::HIT::Assignment because the answer
#can sometimes be given from hit.full alone, without fetching
#the assignment. That shortcut is only taken when hit.full has
#already been loaded; see assignment_status_match?.)
def approved?
  assignment_status_match?('Approved')
end

#Returns true if the assignment associated with this HIT has
#status 'Rejected'. Same shortcut as described for approved?.
def rejected?
  assignment_status_match?('Rejected')
end

#Returns true if the assignment associated with this HIT has
#status 'Submitted'. Same shortcut as described for approved?.
def submitted?
  assignment_status_match?('Submitted')
end
287
+
288
+
289
#Returns true if this HIT is associated with Typingpool. One
#Amazon account can be used for many tasks, so it's important to
#check whether the HIT belongs to this software. Presently a HIT
#is ours when a non-empty url can be found for it.
#
#The result is memoized in @ours, including a false result. A
#plain ||= would recompute on every call for HITs that are not
#ours, and cacheable? relies on @ours being exactly false for
#such HITs.
def ours?
  @ours = !url.to_s.empty? if @ours.nil?
  @ours
end
297
+
298
#Returns a new Typingpool::Transcript::Chunk built from this
#HIT's assignment body, with its url, project, worker and hit
#attributes filled in from this HIT and its assignment.
def transcript
  Transcript::Chunk.new(assignment.body).tap do |chunk|
    chunk.url = url
    chunk.project = project_id
    chunk.worker = assignment.worker_id
    chunk.hit = @id
  end
end
308
+
309
#If this HIT is cacheable (see cacheable?), stores it in
#Amazon.cache under self.class.cache_key(id), inside a cache
#transaction. Does nothing otherwise. Per the original author's
#notes, the cache file is the one specified in the config passed
#to Amazon.setup, or in the default config file.
#
#In short, a HIT is cacheable if it is known not to belong to
#Typingpool, if its assignment is approved or rejected, or if it
#is expired (full.expired_and_overdue?).
#
#The class methods that retrieve HITs (Typingpool::Amazon::HIT.all,
#Typingpool::Amazon::HIT.all_approved, and so on) call to_cache
#for you after downloading and filtering, when the HIT is most
#fleshed out, so you should not normally need to call it
#yourself. If you have an operation that makes network calls to
#further flesh out the HIT, calling to_cache may be worthwhile.
def to_cache
  #Reminder from the original author: any object containing a
  #Nokogiri object cannot be stored in PStore.
  return unless cacheable?
  Amazon.cache.transaction do
    Amazon.cache[self.class.cache_key(@id)] = self
  end
end
334
+
335
#Returns the RTurk::Hit instance corresponding to this HIT, as
#obtained from Amazon.rturk_hit_full for this HIT's id. The
#result is not memoized here.
def at_amazon
  Amazon.rturk_hit_full(@id)
end
339
+
340
#Deletes the HIT from Amazon's servers. A HIT whose full.status
#is not 'Reviewable' is disabled (disable!). A 'Reviewable' HIT
#is disposed of (dispose!), unless its assignment is still
#'Submitted' (neither approved nor rejected). In that case an
#exception of type Typingpool::Error::Amazon::UnreviewedContent
#is raised and nothing is removed. Catch this exception in your
#own code if you'd like to automatically approve such HITs
#before removing them.
def remove_from_amazon
  unless full.status == 'Reviewable'
    return at_amazon.disable!
  end
  if assignment.status == 'Submitted'
    raise Error::Amazon::UnreviewedContent, "There is an unreviewed submission for #{url}"
  end
  at_amazon.dispose!
end
358
+
359
#Returns "the full hit" - the Typingpool::Amazon::HIT::Full
#instance associated with this HIT. The instance is created on
#first access from at_amazon, which per the original author's
#notes triggers an HTTP request to Amazon's servers. "Full" hit
#fields are segregated because accessing any one of them is
#expensive if we only have a hit id (but after fetching one all
#are cheap).
#
#Accepts an optional Typingpool::Amazon::HIT::Full (or subclass)
#to use instead of creating one. This is useful when extensive
#HIT data was already returned by an Amazon operation (for
#example, SearchHITs). The argument is only used when no full
#hit has been set yet; afterwards it is ignored.
def full(full_hit=nil)
  @full = (full_hit || Full.new(at_amazon)) if @full.nil?
  @full
end
375
+
376
#Returns the assignment associated with this HIT - a
#Typingpool::Amazon::HIT::Assignment instance, memoized after
#the first call. The first call typically (but not always)
#results in an Amazon HTTP request: when hit.full is already
#loaded and reports zero completed assignments, an
#Assignment::Empty is returned instead.
def assignment
  return @assignment unless @assignment.nil?
  @assignment =
    if @full && full.assignments_completed == 0
      #It would be dangerous to do this if the HIT were to be
      #cached, since we would then never check for the
      #assignment again. But we know this HIT won't be cached
      #while it is active, since we only cache approved and
      #rejected HITs.
      Assignment::Empty.new
    else
      Assignment.new(at_amazon) #expensive
    end
end
395
+
396
+
397
+ #private
398
+
399
#Looks up one of the params Typingpool stashes with a HIT (for
#example the url or project id). Sources are tried in an order
#chosen to avoid unnecessary network calls:
# 1. the answers of an assignment that is already loaded;
# 2. the HIT annotation (hit.full);
# 3. the assignment's answers, when hit.full reports at least
#    one completed assignment;
# 4. the external question itself.
#Returns the value found, or nil.
def stashed_param(param)
  if @assignment
    loaded_answer = assignment.answers[param]
    return loaded_answer if loaded_answer
  end

  #A question assigned through this software carries the param
  #in its annotation. May be expensive: May result in HTTP
  #request to fetch HIT fields. We choose to fetch (sometimes)
  #the HIT rather than the assignment on the assumption it will
  #be MORE common to encounter HITs with no answers and LESS
  #common to encounter HITs assigned through the RUI (and thus
  #lacking in an annotation from this software and thus
  #rendering the HTTP request to fetch the HIT fields pointless).
  annotated = full.annotation[param]
  return annotated if annotated

  if full.assignments_completed.to_i >= 1
    #A question assigned through Amazon's RUI, with an answer
    #submitted. If the HIT belongs to this software, this
    #assignment's answers will include our param. We prefer
    #fetching the assignment to fetching the external question
    #(as below) because fetching the assignment will potentially
    #save us an HTTP request down the line -- for example, if we
    #need other assignment data (e.g. assignment status).
    #Fetching the external question only serves to give us
    #access to params. If the answers do not include our param,
    #we know the HIT does not belong to this software, since we
    #know the param was also not in the annotation. So we are
    #safe returning nil in that case.
    assignment.answers[param]
  else
    #A question assigned via Amazon's RUI, with no answer
    #submitted. Expensive: Results in HTTP request to fetch
    #external question.
    full.external_question_param(param)
  end
end
433
+
434
#Returns true if this HIT's assignment has the given status.
#When hit.full is already loaded, returns false without
#touching the assignment if full reports zero completed
#assignments or a HIT status other than 'Reviewable'.
def assignment_status_match?(status)
  if @full && (full.assignments_completed == 0 || full.status != 'Reviewable')
    return false
  end
  assignment.status == status
end
441
+
442
+
443
#Assignment statuses that are final, so a HIT whose assignment
#has one of them may be cached. Held in a frozen constant
#rather than a class variable so it cannot be mutated or
#clobbered from elsewhere in the class hierarchy.
CACHEABLE_ASSIGNMENT_STATUSES = Set.new(%w(Approved Rejected)).freeze

#Returns true if this HIT may be written to the cache, judging
#only from data that is already loaded (no full hit or
#assignment is fetched on behalf of this check). A HIT is
#cacheable when:
# * it is known not to belong to Typingpool (@ours is exactly
#   false; nil means "not determined yet" and does not count),
# * its already-loaded full hit is expired and overdue, or
# * its already-loaded assignment is approved or rejected.
def cacheable?
  return true if @ours == false
  return true if @full && full.expired_and_overdue?
  return true if @assignment && CACHEABLE_ASSIGNMENT_STATUSES.include?(assignment.status)
  false
end
456
+ end #HIT
457
+ end #Amazon
458
+ end #Typingpool
@@ -0,0 +1,45 @@
1
+ module Typingpool
2
+ class Amazon
3
+
4
+ #Class encapsulating the HTML form presented to Mechanical Turk workers
5
+ #transcribing a Typingpool audio chunk.
6
+ class Question
7
+ require 'nokogiri'
8
+ require 'uri'
9
+ require 'cgi'
10
+ attr_reader :url, :html
11
+
12
#Constructor. Takes the URL where the question HTML has been
#uploaded, followed by the question HTML itself. Both are
#stored as given.
def initialize(url, html)
  @url, @html = url, html
end
18
+
19
#Returns URL-encoded key-value pairs, HTML-escaped, that can be
#used as the text for a HIT#annotation. The pairs come from the
#hidden HTML form fields in the question HTML whose name starts
#with 'typingpool_'. Hidden fields with no name attribute, or
#with any other name, are skipped. When a name occurs more than
#once, the last value wins.
def annotation
  fields = noko.css('input[type="hidden"]').map{|input| [input['name'], input['value']] }
  #to_s guards against hidden inputs that lack a name attribute;
  #start_with? anchors at the very start of the name.
  ours = fields.select{|name, _value| name.to_s.start_with?('typingpool_') }
  CGI.escapeHTML(URI.encode_www_form(Hash[ours]))
end
25
+
26
#Returns the title: the content of the first title element in
#the question HTML.
def title
  title_element = noko.css('title').first
  title_element.content
end
31
+
32
#Returns the description: the content of the first element
#with the id 'description' in the question HTML.
def description
  description_element = noko.css('#description').first
  description_element.content
end
37
+
38
+ protected
39
+
40
#Parses HTML (by default this question's HTML) with Nokogiri,
#as UTF-8, and returns the parsed document. A fresh document is
#built on every call.
def noko(html=@html)
  Nokogiri::HTML.parse(html, nil, 'UTF-8')
end
43
+ end #Question
44
+ end #Amazon
45
+ end #Typingpool