typingpool 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (90) hide show
  1. data/Rakefile +23 -0
  2. data/bin/tp-assign +240 -0
  3. data/bin/tp-collect +50 -0
  4. data/bin/tp-config +114 -0
  5. data/bin/tp-finish +101 -0
  6. data/bin/tp-make +169 -0
  7. data/bin/tp-review +175 -0
  8. data/lib/typingpool/amazon.rb +732 -0
  9. data/lib/typingpool/app.rb +634 -0
  10. data/lib/typingpool/config.rb +344 -0
  11. data/lib/typingpool/error.rb +22 -0
  12. data/lib/typingpool/filer.rb +396 -0
  13. data/lib/typingpool/project.rb +593 -0
  14. data/lib/typingpool/template.rb +175 -0
  15. data/lib/typingpool/templates/assignment/amazon-init.js +38 -0
  16. data/lib/typingpool/templates/assignment/interview/nameless.html.erb +13 -0
  17. data/lib/typingpool/templates/assignment/interview/noisy.html.erb +12 -0
  18. data/lib/typingpool/templates/assignment/interview/partials/voices.html.erb +10 -0
  19. data/lib/typingpool/templates/assignment/interview/phone.html.erb +12 -0
  20. data/lib/typingpool/templates/assignment/interview.html.erb +11 -0
  21. data/lib/typingpool/templates/assignment/main.css +20 -0
  22. data/lib/typingpool/templates/assignment/partials/entry.html.erb +19 -0
  23. data/lib/typingpool/templates/assignment/partials/footer.html.erb +3 -0
  24. data/lib/typingpool/templates/assignment/partials/header.html.erb +11 -0
  25. data/lib/typingpool/templates/assignment/partials/labeling-example.html.erb +4 -0
  26. data/lib/typingpool/templates/assignment/partials/labeling.html.erb +5 -0
  27. data/lib/typingpool/templates/assignment/partials/length-description.html.erb +6 -0
  28. data/lib/typingpool/templates/assignment/partials/voices.html.erb +10 -0
  29. data/lib/typingpool/templates/assignment/speech.html.erb +11 -0
  30. data/lib/typingpool/templates/config.yml +21 -0
  31. data/lib/typingpool/templates/project/audio/chunks/.empty_directory +0 -0
  32. data/lib/typingpool/templates/project/audio/originals/.empty_directory +0 -0
  33. data/lib/typingpool/templates/project/data/.empty_directory +0 -0
  34. data/lib/typingpool/templates/project/etc/ About these files - read me.txt +8 -0
  35. data/lib/typingpool/templates/project/etc/audio-compat.js +25 -0
  36. data/lib/typingpool/templates/project/etc/player/audio-player.js +4 -0
  37. data/lib/typingpool/templates/project/etc/player/license.txt +19 -0
  38. data/lib/typingpool/templates/project/etc/player/player.swf +0 -0
  39. data/lib/typingpool/templates/project/etc/transcript.css +49 -0
  40. data/lib/typingpool/templates/transcript.html.erb +23 -0
  41. data/lib/typingpool/test/fixtures/amazon-question-html.html +95 -0
  42. data/lib/typingpool/test/fixtures/amazon-question-url.txt +1 -0
  43. data/lib/typingpool/test/fixtures/audio/mp3/interview.1.mp3 +0 -0
  44. data/lib/typingpool/test/fixtures/audio/mp3/interview.2.mp3 +0 -0
  45. data/lib/typingpool/test/fixtures/audio/wma/VN620007.WMA +0 -0
  46. data/lib/typingpool/test/fixtures/audio/wma/VN620052.WMA +0 -0
  47. data/lib/typingpool/test/fixtures/config-1 +20 -0
  48. data/lib/typingpool/test/fixtures/config-2 +25 -0
  49. data/lib/typingpool/test/fixtures/not_yaml.txt +4 -0
  50. data/lib/typingpool/test/fixtures/template-2.html.erb +10 -0
  51. data/lib/typingpool/test/fixtures/template-3.html.erb +22 -0
  52. data/lib/typingpool/test/fixtures/template.html.erb +10 -0
  53. data/lib/typingpool/test/fixtures/tp_collect_id.txt +1 -0
  54. data/lib/typingpool/test/fixtures/tp_collect_sandbox-assignment.csv +8 -0
  55. data/lib/typingpool/test/fixtures/tp_review_id.txt +1 -0
  56. data/lib/typingpool/test/fixtures/tp_review_sandbox-assignment.csv +8 -0
  57. data/lib/typingpool/test/fixtures/transcript-chunks.csv +226 -0
  58. data/lib/typingpool/test/fixtures/utf8_transcript.txt +7 -0
  59. data/lib/typingpool/test/fixtures/vcr/tp-collect-1.yml +2712 -0
  60. data/lib/typingpool/test/fixtures/vcr/tp-collect-2.yml +2718 -0
  61. data/lib/typingpool/test/fixtures/vcr/tp-collect-3.yml +2768 -0
  62. data/lib/typingpool/test/fixtures/vcr/tp-review-1.yml +570 -0
  63. data/lib/typingpool/test/fixtures/vcr/tp-review-2.yml +351 -0
  64. data/lib/typingpool/test.rb +418 -0
  65. data/lib/typingpool/transcript.rb +181 -0
  66. data/lib/typingpool/utility.rb +272 -0
  67. data/lib/typingpool.rb +500 -0
  68. data/test/make_amazon_question_fixture.rb +24 -0
  69. data/test/make_tp_collect_fixture_1.rb +26 -0
  70. data/test/make_tp_collect_fixture_2.rb +16 -0
  71. data/test/make_tp_collect_fixture_3.rb +15 -0
  72. data/test/make_tp_collect_fixture_4.rb +17 -0
  73. data/test/make_tp_review_fixture_1.rb +26 -0
  74. data/test/make_tp_review_fixture_2.rb +30 -0
  75. data/test/make_transcript_chunks_fixture.rb +53 -0
  76. data/test/test_integration_script_1_tp_config.rb +108 -0
  77. data/test/test_integration_script_2_tp_make.rb +119 -0
  78. data/test/test_integration_script_3_tp_assign.rb +152 -0
  79. data/test/test_integration_script_4_tp_review.rb +72 -0
  80. data/test/test_integration_script_5_tp_collect.rb +44 -0
  81. data/test/test_integration_script_6_tp_finish.rb +123 -0
  82. data/test/test_unit_amazon.rb +153 -0
  83. data/test/test_unit_config.rb +94 -0
  84. data/test/test_unit_filer.rb +202 -0
  85. data/test/test_unit_project.rb +168 -0
  86. data/test/test_unit_project_local.rb +68 -0
  87. data/test/test_unit_project_remote.rb +157 -0
  88. data/test/test_unit_template.rb +111 -0
  89. data/test/test_unit_transcript.rb +77 -0
  90. metadata +234 -0
@@ -0,0 +1,732 @@
1
+ module Typingpool
2
+ class Amazon
3
+ require 'rturk'
4
+ require 'pstore'
5
+ @@cache_file = '~/.typingpool.cache'
6
+
7
+ class << self
8
+
9
+ #You must call Amazon.setup before using any subclass methods
10
+ #that rely on Amazon servers.
11
+ # ==== Params
12
+ # Takes params as a hash of named arguments.
13
+ #[:key] Your Amazon Web Services Access Key ID. Required
14
+ # param. If not passed, will be read from :config.
15
+ #[:secret] Your Amazon Web Services Secret Access Key. Required
16
+ # param. If not passed, will be read from :config.
17
+ #[:config] A Typingpool::Config instance. If not passed, will
18
+ # use the default Config.file (usually
19
+ # ~/.typingpool). Supplies the default values for :key
20
+ # and :secret and can override the default cache file
21
+ # location (usually ~/.typingpool.cache) via the
22
+ # 'cache' param.
23
+ #[:sandbox] Boolean specifying whether to perform all operations
24
+ # in the Amazon Mechanical Turk sandbox. Default is
25
+ # false.
26
+ # ==== Returns
27
+ # Result of call to RTurk.setup with security credentials and sandbox param.
28
def setup(args={})
  #Resolve every option into a local so the caller's hash is never
  #mutated (the original wrote the defaults back into args).
  config = args[:config] || Config.file
  key = args[:key] || config.amazon.key
  secret = args[:secret] || config.amazon.secret
  #Only nil means "not supplied"; an explicit false/true is kept.
  sandbox = args[:sandbox].nil? ? false : args[:sandbox]
  if config.cache
    #A configured cache path replaces the default one; drop any
    #PStore memoized by Amazon.cache so it is reopened at the new path.
    @@cache = nil
    @@cache_file = config.cache
  end
  RTurk.setup(key, secret, :sandbox => sandbox)
end
39
+
40
+ #Convenience wrapper that calls RTurk::Hit.new with
41
+ #:include_assignment_summary set to true. Takes a HIT id and
42
+ #returns an RTurk::Hit instance.
43
def rturk_hit_full(id)
  #No HIT type is known at this point, hence the nil second argument.
  hit_type = nil
  RTurk::Hit.new(id, hit_type, :include_assignment_summary => true)
end
46
+
47
+ #Returns a PStore instance tied to the cache file specified in
48
+ #Amazon.setup (or the default).
49
def cache
  #Reuse the memoized store; Amazon.setup resets @@cache to nil when
  #the config supplies a different cache file.
  return @@cache if defined?(@@cache) && @@cache
  @@cache = PStore.new(File.expand_path(@@cache_file))
end
52
+
53
+ end #class << self
54
+
55
+ #Class representing an Amazon Mechanical Turk Human Intelligence
56
+ #Task (HIT).
57
+ #
58
+ #We go above and beyond RTurk::Hit for several practical reasons:
59
+ # * To allow easy serialization. Caching is a very useful way of
60
+ # reducing network calls to Amazon, and thus of speeding up
61
+ # Typingpool. RTurk::Hit objects cannot be dumped via Marshal,
62
+ # apparently due to some Nokogiri objects they
63
+ # contain. Typingpool::Amazon::HIT objects, in contrast, are
64
+ # designed to be easily and compactly serialized. They store the
65
+ # minimal subset of information we need via simple
66
+ # attributes. (Presently we serialize via PStore.)
67
+ # * To attach convenience methods. RTurk does not make it easy,
68
+ # for example, to get HITs beyond the first "page" returned by
69
+ # Amazon. This class provides methods that make it easy to get
70
+ # ALL HITs returned by various operations.
71
+ # * To attach methods specific to Typingpool. For example, the url
72
+ # and project_id methods read params we've embedded in the
73
+ # annotation or in hidden fields on an external question, while
74
+ # the underlying stashed_params method optimizes its lookup of
75
+ # these variables based on how the app is most likely to be
76
+ # used. See also the ours? and cacheable? methods.
77
+ # * To simplify. Typingpool HITs are constrained such that we can
78
+ # assume they all contain only one assignment and thus only a
79
+ # maximum of one answer. Also, once we've determined that a HIT
80
+ # does not belong to Typingpool, it is safe to cache it forever
81
+ # and never download it again from Amazon.
82
+ # * To clearly partition methods that result in network
83
+ # calls. When you access an attribute under hit.full, like
84
+ # hit.full.status, it is clear you are doing something
85
+ # potentially expensive to obtain your hit status. Same thing
86
+ # with accessing an attribute under hit.assignment, like
87
+ # hit.assignment.worker_id -- it is clear an assignment object
88
+ # will need to be created, implying a network call. Calling
89
+ # hit.id, in contrast, is always fast. (Caveat: Accessing
90
+ # partitioned attributes often, but not always, results in a
91
+ # network call. In some cases, hit.full is generated at the same
92
+ # time we create the hit, since we've obtained a full HIT
93
+ # serialization from Amazon. In other cases, we only have a HIT
94
+ # id, so accessing anything under hit.full generates a network
95
+ # call.)
96
+ class HIT
97
+ require 'set'
98
+ require 'uri'
99
+
100
+ class << self
101
+
102
+ #Constructor. Creates an Amazon Mechanical Turk HIT.
103
+ #** Warning: This method can spend your money! **
104
+ # ==== Params
105
+ # [question] Typingpool::Amazon::Question instance, used not
106
+ # only to generate the (external) question but
107
+ # also parsed to provide one or more core HIT
108
+ # attributes. Must include a non-nil
109
+ # annotation attribute. Provides fallback
110
+ # values for HIT title and description.
111
+ # [config_assign] The 'assign' attribute of a
112
+ # Typingpool::Config instance (that is, a
113
+ # Typingpool::Config::Root::Assign
114
+ # instance). Must include values for reward,
115
+ # lifetime, deadline, and approval. May
116
+ # include values for keywords and
117
+ # qualifications. Preferred source for HIT
118
+ # title and description. See
119
+ # Typingpool::Config documentation for further
120
+ # details.
121
+ # ==== Returns
122
+ # Typingpool::Amazon::HIT instance corresponding to the new
123
+ # Mechanical Turk HIT.
124
def create(question, config_assign)
  #Returns value, or raises Typingpool's Error naming what is missing
  #when value is nil/false. Validating before assignment means a bad
  #value is never written onto the pending HIT, and avoids relying on
  #the low precedence of `or` in `x = y or raise`.
  required = lambda do |value, missing|
    raise Error, "Missing #{missing}" unless value
    value
  end
  rturk_hit = RTurk::Hit.create(:title => config_assign.title || question.title) do |hit|
    hit.description = config_assign.description || question.description
    hit.question(question.url)
    hit.note = required.call(question.annotation, 'annotation from question')
    hit.reward = required.call(config_assign.reward, 'reward config')
    #Typingpool HITs always carry exactly one assignment.
    hit.assignments = 1
    hit.lifetime = required.call(config_assign.lifetime, 'lifetime config')
    hit.duration = required.call(config_assign.deadline, 'deadline config')
    hit.auto_approval = required.call(config_assign.approval, 'approval config')
    #Keywords and qualifications are optional.
    keywords = config_assign.keywords
    hit.keywords = keywords if keywords
    qualifications = config_assign.qualify
    qualifications.each{|q| hit.qualifications.add(*q.to_arg) } if qualifications
  end
  new(rturk_hit)
end
138
+
139
+ #Name of the hidden HTML form field used to provide the
140
+ #project_id in an external question or (form-encoded)
141
+ #annotation. Hard coded to typingpool_project_id but
142
+ #overridable in a subclass.
143
def id_at
  #Stored on the class object itself rather than in a class variable:
  #a class variable is shared by the whole inheritance tree, so a
  #subclass could not hold its own field name.
  @id_at ||= 'typingpool_project_id'
end
146
+
147
+ #Name of the hidden HTML form field used to provide the
148
+ #(audio) url in an external question or (form-encoded)
149
+ #annotation. Hard coded to typingpool_url but overridable in a
150
+ #subclass.
151
def url_at
  #Stored on the class object itself rather than in a class variable:
  #a class variable is shared by the whole inheritance tree, so a
  #subclass could not hold its own field name.
  @url_at ||= 'typingpool_url'
end
154
+
155
+ #Takes an array of HIT ids, returns Typingpool::Amazon::HIT
156
+ #instances corresponding to those ids.
157
def with_ids(ids)
  ids.map do |hit_id|
    #Wrap the bare id so cached_or_new can look it up or build it.
    rturk_hit = RTurk::Hit.new(hit_id)
    cached_or_new(rturk_hit)
  end
end
160
+
161
+ #Returns all Typingpool HITs that have been approved, as an
162
+ #array of Typingpool::Amazon::HIT instances.
163
def all_approved
  all_reviewable do |candidate|
    begin
      #Check approval first: an unapproved HIT is assumed to be more
      #common than an approved HIT that belongs to another app.
      candidate.approved? && candidate.ours?
    rescue RestClient::ServiceUnavailable => error
      #Skip the HIT with a warning rather than aborting the listing.
      warn "Warning: Service unavailable error, skipped HIT #{candidate.id}. (Error: #{error})"
      false
    end
  end
end
177
+
178
+ #Returns as an array of Typingpool::Amazon::HIT instances all
179
+ #HITs returned by Amazon's GetReviewableHITs operation (which
180
+ #have HIT status == 'Reviewable'). Takes an optional filter
181
+ #block (which should return true for HITs to be included in
182
+ #the final results). If not supplied, will filter so the
183
+ #returned hits are all Typingpool HITs (hit.ours? == true).
184
def all_reviewable(&filter)
  reviewable = each_page do |page_number|
    #One GetReviewableHITs call per page; each id is resolved from
    #the cache when possible.
    hit_ids = RTurk.GetReviewableHITs(:page_number => page_number).hit_ids
    hit_ids.map{|hit_id| cached_or_new(RTurk::Hit.new(hit_id)) }
  end
  filter_ours(reviewable, &filter)
end
190
+
191
+ #Takes a Typingpool::Project::Local#id and returns all HITs
192
+ #associated with that project, as an array of
193
+ #Typingpool::Amazon::HIT instances.
194
def all_for_project(id)
  all do |hit|
    #ours? is checked first so project_id is only read on Typingpool HITs.
    hit.ours? && hit.project_id == id
  end
end
197
+
198
+ #Returns all HITs associated with your AWS account as an array
199
+ #of Typingpool::Amazon::HIT instances. Takes an optional
200
+ #filter block (which should return true for HITs to be
201
+ #included in the final results). If not supplied, will filter
202
+ #so the returned hits are all Typingpool HITs (hit.ours? ==
203
+ #true).
204
def all(&filter)
  hits = each_page do |page_number|
    page = RTurk::SearchHITs.create(:page_number => page_number)
    #SearchHITs results do not expose the annotation, so it is read
    #from the raw XML node at the same position as each parsed hit.
    raw_hits = page.xml.xpath('//HIT')
    page.hits.each_with_index.map do |rturk_hit, index|
      annotation = raw_hits[index].xpath('RequesterAnnotation').inner_text.strip
      #cached_or_new_from_searchhits builds the Full object itself when
      #the HIT is not cached, so none is constructed here.
      cached_or_new_from_searchhits(rturk_hit, annotation)
    end
  end
  filter_ours(hits, &filter)
end
216
+
217
+ #protected
218
+
219
+ #Constructor. Takes an RTurk::Hit instance. Returns a
220
+ #Typingpool::Amazon::HIT instance, preferably from the cache.
221
def cached_or_new(rturk_hit)
  #Prefer the cached copy; build a fresh instance only on a miss.
  cached = from_cache(rturk_hit.id)
  cached || new(rturk_hit)
end
224
+
225
+ #Constructor. Same as cached_or_new, but handles peculiarities
226
+ #of objects returned by RTurk::SearchHITs. Such objects map
227
+ #two Amazon HIT fields to different names than those used by
228
+ #other RTurk HIT instances. They also do not bother to extract
229
+ #the annotation from the Amazon HIT, so we have to do that
230
+ #ourselves (elsewhere) and take it as a param here. Finally,
231
+ #on the bright side, RTurk::SearchHITs already contain a big
232
+ #chunk of hit.full attributes, potentially obviating the need
233
+ #for an additional network call to flesh out the HIT, so this
234
+ #method pre-fleshes-out the HIT.
235
def cached_or_new_from_searchhits(rturk_hit, annotation)
  cached = from_cache(rturk_hit.id)
  return cached if cached
  #Cache miss: build the HIT and hand it the data SearchHITs already
  #returned, so hit.full does not need to be built separately later.
  fresh = new(rturk_hit)
  fresh.full(Amazon::HIT::Full::FromSearchHITs.new(rturk_hit, annotation))
  fresh
end
242
+
243
def from_cache(hit_id, id_at=self.id_at, url_at=self.url_at)
  #Returns the cached HIT stored under the derived key, or nil.
  store = Amazon.cache
  key = cache_key(hit_id, id_at, url_at)
  store.transaction{ store[key] }
end
248
+
249
def delete_cache(hit_id, id_at=self.id_at, url_at=self.url_at)
  store = Amazon.cache
  store.transaction do
    key = cache_key(hit_id, id_at, url_at)
    #Only call delete when an entry is actually stored under the key.
    store.delete(key) unless store[key].nil?
  end
end
256
+
257
def cache_key(hit_id, id_at=self.id_at, url_at=self.url_at)
  #The field names are part of the key, so a key built with one pair
  #of field names never matches a key built with another.
  format('RESULT///%s///%s///%s', hit_id, url_at, id_at)
end
260
+
261
def each_page
  #Yields 1, 2, 3... and concatenates what the block returns, stopping
  #after the first page that comes back empty.
  collected = []
  page_number = 0
  more = true
  while more
    page_number += 1
    batch = yield(page_number)
    collected.push(*batch)
    more = batch.count > 0
  end
  collected
end
271
+
272
def filter_ours(hits, &filter)
  keep = filter || lambda{|hit| hit.ours? }
  hits.select do |hit|
    #Cache after filtering: the filter is what fleshes the HIT out, and
    #every HIT is offered to the cache, kept or not.
    keep.call(hit).tap{ hit.to_cache }
  end
end
280
+ end #class << self
281
+
282
+ #Corresponds to the Amazon Mechanical Turk HIT#HITId
283
+ attr_reader :id
284
+
285
+ #Constructor. Takes an RTurk::Hit instance.
286
def initialize(rturk_hit)
  #Only the id is copied here; everything else is loaded lazily
  #through #full and #assignment.
  @id = rturk_hit.id
end
289
+
290
+ #URL of the audio file associated with this HIT (the audio file
291
+ #to be transcribed). Extracted from the annotation (when the HIT
292
+ #was assigned via Typingpool) or from a hidden field in the HTML
293
+ #form on the external question (when the HIT was assigned via
294
+ #the Amazon Mechanical Turk RUI).
295
def url
  #Memoized once a non-nil value has been found.
  return @url if @url
  @url = stashed_param(self.class.url_at)
end
298
+
299
+ #The Typingpool::Project::Local#id associated with this
300
+ #HIT. Extracted as described for the url method.
301
def project_id
  #Memoized once a non-nil value has been found.
  return @project_id if @project_id
  @project_id = stashed_param(self.class.id_at)
end
304
+
305
+ #Returns the Typingpool::Project#name associated with this HIT
306
+ #by parsing the #url. May be dropped in a future release.
307
def project_title_from_url(url=self.url)
  #Raises Error::Argument::Format when url does not match
  #Project.url_regex; otherwise returns the second capture group with
  #its percent-escapes decoded.
  matches = Project.url_regex.match(url)
  raise Error::Argument::Format, "Unexpected format to url '#{url}'" unless matches
  #URI.unescape was removed in Ruby 3.0. The parser-level unescape
  #decodes %XX sequences the same way and, unlike form decoding,
  #leaves '+' alone.
  URI::DEFAULT_PARSER.unescape(matches[2])
end
311
+
312
+ #Returns true if this HIT has an approved assignment associated
313
+ #with it. (Attached to Typingpool::Amazon::HIT rather than
314
+ #Typingpool::Amazon::HIT::Assignment because sometimes we can
315
+ #tell simply from looking at hit.full that there are no approved
316
+ #assignments -- hit.full.assignments_completed == 0. This check
317
+ #is only performed when hit.full has already been loaded.)
318
def approved?
  #Delegates to the shared status check.
  wanted_status = 'Approved'
  assignment_status_match?(wanted_status)
end
321
+
322
+ #Returns true if this HIT has a rejected assignment associated
323
+ #with it. (For an explanation of why this is not attached to
324
+ #Typingpool::Amazon::HIT::Assignment, see the documentation for
325
+ #approved?.)
326
def rejected?
  #Delegates to the shared status check.
  wanted_status = 'Rejected'
  assignment_status_match?(wanted_status)
end
329
+
330
+ #Returns true if this HIT has a submitted assignment associated
331
+ #with it. (For an explanation of why this is not attached to
332
+ #Typingpool::Amazon::HIT::Assignment, see the documentation for
333
+ #approved?.)
334
def submitted?
  #Delegates to the shared status check.
  wanted_status = 'Submitted'
  assignment_status_match?(wanted_status)
end
337
+
338
+
339
+ #Returns true if this HIT is associated with Typingpool. One
340
+ #Amazon account can be used for many tasks, so it's important to
341
+ #check whether the HIT belongs to this software. (Presently,
342
+ #this is determined by looking for a stashed param like url or
343
+ #project_id).
344
def ours?
  #Memoize on nil, not on falsiness: `@ours ||= ...` recomputed the
  #answer on every call for HITs that are NOT ours, repeating a lookup
  #that may hit the network. cacheable? already treats
  #@ours == false as final, so remembering false is consistent.
  @ours = !url.to_s.empty? if @ours.nil?
  @ours
end
347
+
348
+ #Returns a Typingpool::Transcript::Chunk instance built using
349
+ #this HIT and its associated assignment.
350
def transcript
  #Build the chunk from the submitted text, then stamp it with the
  #HIT's audio url, project id, worker id and HIT id.
  Transcript::Chunk.new(assignment.body).tap do |chunk|
    chunk.url = url
    chunk.project = project_id
    chunk.worker = assignment.worker_id
    chunk.hit = @id
  end
end
358
+
359
+ #If this HIT is cacheable, serializes it to the cache file
360
+ #specified in the config passed to Amazon.setup, or specified in
361
+ #the default config file. In short, a HIT is cacheable if it
362
+ #does not belong to Typingpool (ours? == false), if it is
363
+ #approved or rejected (approved? || rejected?), or if it is
364
+ #expired (full.expired_and_overdue?). See also cacheable? code.
365
+ #
366
+ # When available, cached HITs are used by
367
+ # Typingpool::Amazon::HIT.all,
368
+ # Typingpool::Amazon::HIT.all_approved, and all the other class
369
+ # methods that retrieve HITs. These methods call to_cache for
370
+ # you at logical times (after downloading and filtering, when
371
+ # the HIT is most fleshed out), so you should not need to call
372
+ # this yourself. But if you have an operation that makes network
373
+ # calls to further flesh out the HIT, calling to_cache may be
374
+ # worthwhile.
375
def to_cache
  #Reminder: an object holding a Nokogiri object cannot be stored in
  #PStore, so nothing reachable from self may keep one.
  return unless cacheable?
  store = Amazon.cache
  store.transaction{ store[self.class.cache_key(@id)] = self }
end
384
+
385
+ #Returns an RTurk::Hit instance corresponding to this HIT.
386
def at_amazon
  #Returns a new RTurk object on every call; nothing is memoized here.
  hit_id = @id
  Amazon.rturk_hit_full(hit_id)
end
389
+
390
+ #Deletes the HIT from Amazon's servers. Examines the HIT and
391
+ #assignment status to determine whether calling the DisposeHIT
392
+ #or DisableHIT operation is most appropriate. If the HIT has
393
+ #been submitted but not approved or rejected, will raise an
394
+ #exception of type
395
+ #Typingpool::Error::Amazon::UnreviewedContent. Catch this
396
+ #exception in your own code if you'd like to automatically
397
+ #approve such HITs before removing them.
398
def remove_from_amazon
  #A HIT that is not yet Reviewable is disabled instead of disposed.
  return at_amazon.disable! unless full.status == 'Reviewable'
  #Refuse to dispose of a HIT whose submission is still unreviewed.
  if assignment.status == 'Submitted'
    raise Error::Amazon::UnreviewedContent, "There is an unreviewed submission for #{url}"
  end
  at_amazon.dispose!
end
408
+
409
+ #Returns "the full hit" - a Typingpool::Amazon::HIT::Full
410
+ #instance associated with this HIT. If the instance is being
411
+ #created for the first time, this will trigger an HTTP request
412
+ #to Amazon's servers. "Full" hit fields segregated because
413
+ #accessing any one of them is expensive if we only have a hit id
414
+ #(but after fetching one all are cheap). Accepts an optional
415
+ #Typingpool::Amazon::HIT::Full (or subclass) to set for this
416
+ #attribute, preventing the need to create one. This is useful in
417
+ #cases in which extensive HIT data was returned by an Amazon
418
+ #operation (for example, SearchHITs returns lots of HIT data)
419
def full(full_hit=nil)
  #The first non-nil value wins; a full_hit passed later is ignored.
  return @full unless @full.nil?
  @full = full_hit || Full.new(at_amazon)
end
425
+
426
+ #Returns the assignment associated with this HIT - a
427
+ #Typingpool::Amazon::HIT::Assignment instance. The first time
428
+ #this is called, an Amazon HTTP request is typically (but not
429
+ #always) sent.
430
def assignment
  return @assignment unless @assignment.nil?
  @assignment =
    if @full && full.assignments_completed == 0
      #The already-loaded full HIT reports no completed assignments,
      #so skip the fetch. NOTE(review): an Empty memoized here would
      #travel with the HIT if it were cached; see cacheable? for which
      #HITs qualify.
      Assignment::Empty.new
    else
      Assignment.new(at_amazon) #expensive
    end
end
445
+
446
+
447
+ #private
448
+
449
def stashed_param(param)
  #1. An assignment that is already loaded is consulted first.
  if @assignment
    answered = assignment.answers[param]
    return answered if answered
  end
  #2. The annotation, set when the HIT was assigned through this
  #software. May trigger an HTTP request to load the HIT fields. The
  #HIT is fetched in preference to the assignment on the assumption
  #that HITs with no answers are more common than HITs assigned
  #through the RUI (which lack our annotation).
  annotated = full.annotation[param]
  return annotated if annotated
  #3. Assigned through Amazon's RUI with an answer submitted: the
  #answers will carry our param if the HIT is ours. The assignment is
  #preferred over the external question because it may save a later
  #request (e.g. for assignment status). A nil result here means the
  #HIT is not ours, since the annotation lacked the param too.
  return assignment.answers[param] if full.assignments_completed.to_i >= 1
  #4. Assigned through the RUI with no answer yet. Expensive: fetches
  #the external question over HTTP.
  full.external_question_param(param)
end
483
+
484
def assignment_status_match?(status)
  if @full
    #With the full HIT already loaded, rule out a match without
    #touching the assignment.
    nothing_to_match = full.assignments_completed == 0 || full.status != 'Reviewable'
    return false if nothing_to_match
  end
  assignment.status == status
end
491
+
492
+
493
+ @@cacheable_assignment_status = Set.new %w(Approved Rejected)
494
def cacheable?
  #Only already-loaded state (@ours, @full, @assignment) is inspected,
  #so this check never triggers a fetch.
  return true if @ours == false
  return true if @full && full.expired_and_overdue?
  return true if @assignment && assignment.status && @@cacheable_assignment_status.include?(assignment.status)
  false
end
506
+
507
+ class Full
508
+ require 'uri'
509
+ require 'open-uri'
510
+ require 'nokogiri'
511
+
512
+ #See the RTurk documentation and Amazon Mechanical Turk API
513
+ #documentation for more on these fields.
514
+ attr_reader :id, :type_id, :status, :external_question_url, :assignments_completed, :assignments_pending, :expires_at, :assignments_duration
515
+
516
+ #Constructor. Takes an RTurk::Hit instance.
517
def initialize(rturk_hit)
  import_standard_attrs_from_rturk_hit(rturk_hit)
  #These two counts are read from differently named accessors than
  #the ones FromSearchHITs uses.
  @assignments_pending = rturk_hit.assignments_pending_count
  @assignments_completed = rturk_hit.assignments_completed_count
  #Both setters parse their input: the annotation into a hash, the
  #XML into the external question URL.
  self.annotation = rturk_hit.annotation
  self.external_question_url = rturk_hit.xml
end
524
+
525
+ #Returns the HIT annotation as a hash. If the annotation
526
+ #contained URL-encoded form key-value pairs, it decodes them
527
+ #and returns them as a hash. Otherwise, returns an empty hash
528
+ #(throwing away any annotation text that is not URL-encoded
529
+ #key-value pairs, for example the tags attached by the Amazon
530
+ #Mechanical Turk RUI).
531
def annotation
  #Falls back to (and remembers) an empty hash when nothing was set.
  @annotation = {} unless @annotation
  @annotation
end
534
+
535
+ #Returns boolean indicating whether the HIT is
536
+ #expired. Determined by comparing the HIT's expires_at
537
+ #attribute with the current time.
538
def expired?
  #Compared against the wall clock at call time.
  expiry = expires_at
  expiry < Time.now
end
541
+
542
+ #Returns boolean indicating whether the HIT is expired and
543
+ #overdue, at which point it is totally safe to prune. This is
544
+ #determined by adding the assignment duration (how long a
545
+ #worker has to complete the HIT) to the HIT's expires_at time
546
+ #(when the HIT is removed from the Mechanical Turk
547
+ #marketplace).
548
def expired_and_overdue?
  #Expiry plus the assignment duration must already be in the past.
  safe_to_prune_at = expires_at + assignments_duration
  safe_to_prune_at < Time.now
end
551
+
552
+ #Returns the HTML of the external question associated with the
553
+ #HIT. All Typingpool HITs use external questions (as opposed
554
+ #to "internal" HIT QuestionForms), so this should always
555
+ #return something. In first use, must make an HTTP request to
556
+ #obtain the HTML.
557
def external_question
  if @external_question.nil?
    url = external_question_url
    #Anchor with \A rather than ^: ^ matches at any line start, so a
    #multi-line value such as "|cmd\nhttp://..." passed the old check
    #and reached Kernel#open, which runs "|cmd" as a subprocess.
    if url && url =~ %r{\Ahttps?://}
      #expensive, obviously. URI#open (from open-uri) only ever does a
      #network fetch, and also works on Ruby 3, where Kernel#open no
      #longer accepts URLs.
      @external_question = URI.parse(url).open{|io| io.read }
    end
  end
  @external_question
end
566
+
567
+ #Takes the name of an HTML form param and returns the value
568
+ #associated with that param in the external question
569
+ #HTML. Triggers an HTTP request on first use (unless
570
+ #external_question has already been called).
571
def external_question_param(param)
  html = external_question
  return unless html
  #First <input> whose name attribute equals param, if any.
  input = Nokogiri::HTML::Document.parse(html).css("input[name=#{param}]")[0]
  input['value'] if input
end
578
+
579
+ protected
580
+
581
def import_standard_attrs_from_rturk_hit(hit)
  #Copies each listed attribute into the same-named instance variable.
  %w(id type_id status expires_at assignments_duration).each do |name|
    value = hit.send(name)
    instance_variable_set("@#{name}", value)
  end
end
586
+
587
def annotation=(encoded)
  #The annotation is HTML-escaped form data: unescape, then decode.
  unescaped = CGI.unescapeHTML(encoded.to_s)
  @annotation =
    begin
      Hash[URI.decode_www_form(unescaped)]
    rescue ArgumentError
      #Annotations like Department:Transcription (from the Amazon RUI)
      #can make URI.decode_www_form raise; treat those as empty.
      {}
    end
end
598
+
599
def external_question_url=(noko_xml)
  namespaces = {'eq' => 'http://mechanicalturk.amazonaws.com/AWSMechanicalTurkDataSchemas/2006-07-14/ExternalQuestion.xsd'}
  node = noko_xml.css('HIT Question eq|ExternalQuestion eq|ExternalURL', namespaces)[0]
  #A HIT with no ExternalQuestion node yields no match; leave the URL
  #unset instead of raising NoMethodError on nil.
  @external_question_url = node.inner_text if node
end
604
+
605
+ #For more on why this subclass is necessary, see the
606
+ #documentation for
607
+ #Typingpool::Amazon::HIT.cached_or_new_from_searchhits. In
608
+ #short, RTurk::HITParser objects returned by RTurk::SearchHITs
609
+ #are pointlessly and subtly different from
610
+ #RTurk::GetHITResponse objects. (I need to submit a patch to
611
+ #RTurk.)
612
+ class FromSearchHITs < Full
613
+ #Constructor. Takes an RTurk::Hit instance and the text of
614
+ #the HIT's annotation. The text of the annotation must be
615
+ #submitted as a separate param because RTurk::Hit instances
616
+ #returned by RTurk::SearchHITs do not bother to extract the
617
+ #annotation into an attribute, so we have to do that
618
+ #ourselves (elsewhere) using the raw xml.
619
def initialize(rturk_hit, annotation)
  import_standard_attrs_from_rturk_hit(rturk_hit)
  #SearchHITs results name these two counts differently from the
  #accessors Full#initialize reads.
  @assignments_pending = rturk_hit.pending_assignments
  @assignments_completed = rturk_hit.completed_assignments
  #The external question URL is not set here; it is looked up on
  #first use by external_question_url.
  self.annotation = annotation
end
625
+
626
+ protected
627
+
628
def external_question_url
  #Look the URL up at most once, even when the lookup finds nothing.
  return @external_question_url if @checked_question
  self.external_question_url = at_amazon.xml
  @checked_question = true
  @external_question_url
end
635
+
636
def at_amazon
  #Full RTurk HIT for this id; used to obtain the XML that carries the
  #external question URL.
  hit_id = @id
  Amazon.rturk_hit_full(hit_id)
end
639
+ end #Amazon::HIT::Full::FromSearchHITs
640
+ end #Amazon::HIT::Full
641
+
642
class Assignment

  #See the RTurk documentation and Amazon Mechanical Turk API
  #documentation for more on these fields.
  attr_reader :id, :status, :worker_id, :submitted_at

  #Constructor. Takes an RTurk::Hit instance and copies the fields of
  #its first assignment. If the HIT has no assignment, every field is
  #left unset.
  def initialize(rturk_hit)
    first = rturk_hit.assignments[0] #expensive!
    return unless first
    @id = first.id
    @status = first.status
    @worker_id = first.worker_id
    @submitted_at = first.submitted_at
    raw_answers = first.answers
    @answers = raw_answers.to_hash if raw_answers
  end

  #Returns the answers associated with this assignment as a
  #hash. If there are no answers, returns an empty hash.
  def answers
    @answers || (@answers = {})
  end

  #Returns the transcription submitted by the user as raw text: the
  #'transcription' answer, else the answer named '1', else ''.
  def body
    transcription = answers['transcription'] || answers['1']
    transcription.to_s
  end

  #Returns an RTurk::Assignment object corresponding to this
  #assignment.
  def at_amazon
    RTurk::Assignment.new(@id)
  end

  #Subclass used in cases where we know Amazon's servers have no
  #assignments for us (because hit.full.assignments_completed ==
  #0), so we don't want to bother doing an HTTP request to
  #check.
  class Empty < Assignment
    def initialize
      @answers = {}
    end

  end #Empty
end #Assignment
689
+ end #HIT
690
+
691
+ #Class encapsulating the HTML form presented to Mechanical Turk workers
692
+ #transcribing a Typingpool audio chunk.
693
+ class Question
694
+ require 'nokogiri'
695
+ require 'uri'
696
+ require 'cgi'
697
+ attr_reader :url, :html
698
+
699
+ #Constructor. Takes the URL of where the question HTML has been
700
+ #uploaded, followed by the question HTML itself.
701
def initialize(url, html)
  #Both values are stored as given; the HTML is parsed later, by noko.
  @url, @html = url, html
end
705
+
706
+ #Returns URL-encoded key-value pairs that can be used as the
707
+ #text for a HIT#annotation. The key-value pairs correspond to
708
+ #all hidden HTML form fields in the question HTML.
709
def annotation
  pairs = noko.css('input[type="hidden"]').map{|input| [input['name'], input['value']] }
  #Keep only Typingpool's own fields. to_s tolerates a hidden input
  #with no name attribute (the old e['name'].match raised
  #NoMethodError on it); \A anchors at the start of the whole name.
  ours = pairs.select{|name, _value| name.to_s =~ /\Atypingpool_/ }
  #Form-encode the pairs, then HTML-escape the result.
  CGI.escapeHTML(URI.encode_www_form(Hash[ours]))
end
712
+
713
+ #Returns the title, extracted from the title element of the
714
+ #HTML.
715
def title
  #Uses the first <title> element in the parsed HTML.
  title_node = noko.css('title')[0]
  title_node.content
end
718
+
719
+ #Returns the description, extracted from the element with the id
720
+ #'description' in the HTML.
721
def description
  #Uses the first element matching the #description selector.
  description_node = noko.css('#description')[0]
  description_node.content
end
724
+
725
+ protected
726
+
727
def noko(html=@html)
  #Parses on every call (no memoization); the HTML is read as UTF-8.
  encoding = 'UTF-8'
  Nokogiri::HTML(html, nil, encoding)
end
730
+ end #Question
731
+ end #Amazon
732
+ end #Typingpool