typingpool 0.7.0 → 0.7.1
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +20 -0
- data/README.markdown +452 -0
- data/lib/typingpool/amazon/hit/assignment/empty.rb +19 -0
- data/lib/typingpool/amazon/hit/assignment.rb +43 -0
- data/lib/typingpool/amazon/hit/full/fromsearchhits.rb +44 -0
- data/lib/typingpool/amazon/hit/full.rb +105 -0
- data/lib/typingpool/amazon/hit.rb +458 -0
- data/lib/typingpool/amazon/question.rb +45 -0
- data/lib/typingpool/amazon.rb +3 -677
- data/lib/typingpool/app/cli/formatter.rb +16 -0
- data/lib/typingpool/app/cli.rb +64 -0
- data/lib/typingpool/app/friendlyexceptions.rb +34 -0
- data/lib/typingpool/app.rb +2 -97
- data/lib/typingpool/config/root.rb +114 -0
- data/lib/typingpool/config.rb +13 -119
- data/lib/typingpool/filer/audio.rb +84 -0
- data/lib/typingpool/filer/csv.rb +57 -0
- data/lib/typingpool/filer/dir.rb +76 -0
- data/lib/typingpool/filer/files/audio.rb +63 -0
- data/lib/typingpool/filer/files.rb +55 -0
- data/lib/typingpool/filer.rb +4 -313
- data/lib/typingpool/project/local.rb +117 -0
- data/lib/typingpool/project/remote/s3.rb +135 -0
- data/lib/typingpool/project/remote/sftp.rb +100 -0
- data/lib/typingpool/project/remote.rb +65 -0
- data/lib/typingpool/project.rb +2 -396
- data/lib/typingpool/template/assignment.rb +17 -0
- data/lib/typingpool/template/env.rb +77 -0
- data/lib/typingpool/template.rb +2 -87
- data/lib/typingpool/test/script.rb +310 -0
- data/lib/typingpool/test.rb +1 -306
- data/lib/typingpool/transcript/chunk.rb +129 -0
- data/lib/typingpool/transcript.rb +1 -125
- data/lib/typingpool/utility/castable.rb +65 -0
- data/lib/typingpool/utility.rb +1 -61
- data/test/test_integration_script_6_tp_finish.rb +1 -0
- metadata +135 -81
data/lib/typingpool/amazon.rb
CHANGED
@@ -2,6 +2,9 @@ module Typingpool
|
|
2
2
|
class Amazon
|
3
3
|
require 'rturk'
|
4
4
|
require 'pstore'
|
5
|
+
require 'typingpool/amazon/hit'
|
6
|
+
require 'typingpool/amazon/question'
|
7
|
+
|
5
8
|
@@cache_file = '~/.typingpool.cache'
|
6
9
|
|
7
10
|
class << self
|
@@ -51,682 +54,5 @@ module Typingpool
|
|
51
54
|
end
|
52
55
|
|
53
56
|
end #class << self
|
54
|
-
|
55
|
-
#Class representing an Amazon Mechanical Turk Human Intelligence
|
56
|
-
#Task (HIT).
|
57
|
-
#
|
58
|
-
#We go above and beyond RTurk::Hit for several practical reasons:
|
59
|
-
# * To allow easy serialization. Caching is a very useful way of
|
60
|
-
# reducing network calls to Amazon, and thus of speeding up
|
61
|
-
# Typingpool. RTurk::Hit objects cannot be dumped via Marshal,
|
62
|
-
# apparently due to some Nokogiri objects they
|
63
|
-
# contain. Typingpool::Amazon::HIT objects, in contrast, are
|
64
|
-
# designed to be easily and compactly serialized. They store the
|
65
|
-
# minimal subset of information we need via simple
|
66
|
-
# attributes. (Presently we serialize via PStore.)
|
67
|
-
# * To attach convenience methods. RTurk does not make it easy,
|
68
|
-
# for example, to get HITs beyond the first "page" returned by
|
69
|
-
# Amazon. This class provides methods that make it easy to get
|
70
|
-
# ALL HITs returned by various operations.
|
71
|
-
# * To attach methods specific to Typingpool. For example, the url
|
72
|
-
# and project_id methods read params we've embedded in the
|
73
|
-
# annotation or in hidden fields on an external question, while
|
74
|
-
# the underlying stashed_params method optimizes its lookup of
|
75
|
-
# these variables based on how the app is most likely to be
|
76
|
-
# used. See also the ours? and cacheable? methods.
|
77
|
-
# * To simplify. Typingpool HITs are constrained such that we can
|
78
|
-
# assume they all contain only one assignment and thus only a
|
79
|
-
# maximum of one answer. Also, once we've determined that a HIT
|
80
|
-
# does not belong to Typingpool, it is safe to cache it forever
|
81
|
-
# and never download it again from Amazon.
|
82
|
-
# * To clearly partition methods that result in network
|
83
|
-
# calls. When you access an attribute under hit.full, like
|
84
|
-
# hit.full.status, it is clear you are doing something
|
85
|
-
# potentially expensive to obtain your hit status. Same thing
|
86
|
-
# with accessing an attribute under hit.assignment, like
|
87
|
-
# hit.assignment.worker_id -- it is clear an assignment object
|
88
|
-
# will need to be created, implying a network call. Calling
|
89
|
-
# hit.id, in contrast, is always fast. (Caveat: Accessing
|
90
|
-
# partitioned attributes often, but not always, results in a
|
91
|
-
# network call. In some cases, hit.full is generated at the same
|
92
|
-
# time we create the hit, since we've obtained a full HIT
|
93
|
-
# serialization from Amazon. In other cases, we only have a HIT
|
94
|
-
# id, so accessing anything under hit.full generates a network
|
95
|
-
# call.)
|
96
|
-
class HIT
|
97
|
-
require 'set'
|
98
|
-
require 'uri'
|
99
|
-
|
100
|
-
class << self
|
101
|
-
|
102
|
-
#Constructor. Creates an Amazon Mechanical Turk HIT.
|
103
|
-
#** Warning: This method can spend your money! **
|
104
|
-
# ==== Params
|
105
|
-
# [question] Typingpool::Amazon::Question instance, used not
|
106
|
-
# only to generate the (external) question but
|
107
|
-
# also parsed to provide one or more core HIT
|
108
|
-
# attributes. Must include a non-nil
|
109
|
-
# annotation attribute. Provides fallback
|
110
|
-
# values for HIT title and description.
|
111
|
-
# [config_assign] The 'assign' attribute of a
|
112
|
-
# Typingpool::Config instance (that is, a
|
113
|
-
# Typingpool::Config::Root::Assign
|
114
|
-
# instance). Must include values for reward,
|
115
|
-
# lifetime, duration, and approval. May
|
116
|
-
# include values for keywords and
|
117
|
-
# qualifications. Preferred source for HIT
|
118
|
-
# title and description. See
|
119
|
-
# Typingpool::Config documentation for further
|
120
|
-
# details.
|
121
|
-
# ==== Returns
|
122
|
-
# Typingpool::Amazon::HIT instance corresponding to the new
|
123
|
-
# Mechanical Turk HIT.
|
124
|
-
def create(question, config_assign)
|
125
|
-
new(RTurk::Hit.create(:title => config_assign.title || question.title) do |hit|
|
126
|
-
hit.description = config_assign.description || question.description
|
127
|
-
hit.question(question.url)
|
128
|
-
hit.note = question.annotation or raise Error, "Missing annotation from question"
|
129
|
-
hit.reward = config_assign.reward or raise Error, "Missing reward config"
|
130
|
-
hit.assignments = 1
|
131
|
-
hit.lifetime = config_assign.lifetime or raise Error, "Missing lifetime config"
|
132
|
-
hit.duration = config_assign.deadline or raise Error, "Missing deadline config"
|
133
|
-
hit.auto_approval = config_assign.approval or raise Error, "Missing approval config"
|
134
|
-
hit.keywords = config_assign.keywords if config_assign.keywords
|
135
|
-
config_assign.qualify.each{|q| hit.qualifications.add(*q.to_arg)} if config_assign.qualify
|
136
|
-
end)
|
137
|
-
end
|
138
|
-
|
139
|
-
#Name of the hidden HTML form field used to provide the
|
140
|
-
#project_id in an external question or (form-encoded)
|
141
|
-
#annotation. Hard coded to typingpool_project_id but
|
142
|
-
#overridable in a subclass.
|
143
|
-
def id_at
|
144
|
-
@@id_at ||= 'typingpool_project_id'
|
145
|
-
end
|
146
|
-
|
147
|
-
#Name of the hidden HTML form field used to provide the
|
148
|
-
#(audio) url in an external question or (form-encoded)
|
149
|
-
#annotation. Hard coded to typingpool_url but overridable in a
|
150
|
-
#subclass.
|
151
|
-
def url_at
|
152
|
-
@@url_at ||= 'typingpool_url'
|
153
|
-
end
|
154
|
-
|
155
|
-
#Takes an array of HIT ids, returns Typingpool::Amazon::HIT
|
156
|
-
#instances corresponding to those ids.
|
157
|
-
def with_ids(ids)
|
158
|
-
ids.map{|id| cached_or_new(RTurk::Hit.new(id)) }
|
159
|
-
end
|
160
|
-
|
161
|
-
#Returns all Typingpool HITs that have been approved, as an
|
162
|
-
#array of Typingpool::Amazon::HIT instances.
|
163
|
-
def all_approved
|
164
|
-
hits = all_reviewable do |hit|
|
165
|
-
begin
|
166
|
-
#optimization: we assume it is more common to have an
|
167
|
-
#unapproved HIT than an approved HIT that does not
|
168
|
-
#belong to this app
|
169
|
-
hit.approved? && hit.ours?
|
170
|
-
rescue RestClient::ServiceUnavailable => e
|
171
|
-
warn "Warning: Service unavailable error, skipped HIT #{hit.id}. (Error: #{e})"
|
172
|
-
false
|
173
|
-
end
|
174
|
-
end
|
175
|
-
hits
|
176
|
-
end
|
177
|
-
|
178
|
-
#Returns as an array of Typingpool::Amazon::HIT instances all
|
179
|
-
#HITs returned by Amazon's GetReviewableHITs operation (which
|
180
|
-
#have HIT status == 'Reviewable'). Takes an optional filter
|
181
|
-
#block (which should return true for HITs to be included in
|
182
|
-
#the final results). If not supplied, will filter so the
|
183
|
-
#returned hits are all Typingpool HITs (hit.ours? == true).
|
184
|
-
def all_reviewable(&filter)
|
185
|
-
hits = each_page do |page_number|
|
186
|
-
RTurk.GetReviewableHITs(:page_number => page_number).hit_ids.map{|id| RTurk::Hit.new(id) }.map{|hit| cached_or_new(hit) }
|
187
|
-
end
|
188
|
-
filter_ours(hits, &filter)
|
189
|
-
end
|
190
|
-
|
191
|
-
#Takes a Typingpool::Project::Local#id and returns all HITs
|
192
|
-
#associated with that project, as an array of
|
193
|
-
#Typingpool::Amazon::HIT instances.
|
194
|
-
def all_for_project(id)
|
195
|
-
all{|hit| hit.ours? && hit.project_id == id}
|
196
|
-
end
|
197
|
-
|
198
|
-
#Returns all HITs associated with your AWS account as an array
|
199
|
-
#of Typingpool::Amazon::HIT instances. Takes an optional
|
200
|
-
#filter block (which should return true for HITs to be
|
201
|
-
#included in the final results). If not supplied, will filter
|
202
|
-
#so the returned hits are all Typingpool HITs (hit.ours? ==
|
203
|
-
#true).
|
204
|
-
def all(&filter)
|
205
|
-
hits = each_page do |page_number|
|
206
|
-
page = RTurk::SearchHITs.create(:page_number => page_number)
|
207
|
-
raw_hits = page.xml.xpath('//HIT')
|
208
|
-
page.hits.map do |rturk_hit|
|
209
|
-
annotation = raw_hits.shift.xpath('RequesterAnnotation').inner_text.strip
|
210
|
-
full = Amazon::HIT::Full::FromSearchHITs.new(rturk_hit, annotation)
|
211
|
-
cached_or_new_from_searchhits(rturk_hit, annotation)
|
212
|
-
end
|
213
|
-
end
|
214
|
-
filter_ours(hits, &filter)
|
215
|
-
end
|
216
|
-
|
217
|
-
#protected
|
218
|
-
|
219
|
-
#Constructor. Takes an RTurk::Hit instance. Returns a
|
220
|
-
#Typingpool::Amazon::HIT instance, preferably from the cache.
|
221
|
-
def cached_or_new(rturk_hit)
|
222
|
-
from_cache(rturk_hit.id) || new(rturk_hit)
|
223
|
-
end
|
224
|
-
|
225
|
-
#Constructor. Same as cached_or_new, but handles peculiarities
|
226
|
-
#of objects returned by RTurk::SearchHITs. Such objects map
|
227
|
-
#two Amazon HIT fields to different names than those used by
|
228
|
-
#other RTurk HIT instances. They also do not bother to extract
|
229
|
-
#the annotation from the Amazon HIT, so we have to do that
|
230
|
-
#ourselves (elsewhere) and take it as a param here. Finally,
|
231
|
-
#on the bright side, RTurk::SearchHITs already contain a big
|
232
|
-
#chunk of hit.full attributes, potentially obviating the need
|
233
|
-
#for an additional network call to flesh out the HIT, so this
|
234
|
-
#method pre-fleshes-out the HIT.
|
235
|
-
def cached_or_new_from_searchhits(rturk_hit, annotation)
|
236
|
-
if not (typingpool_hit = from_cache(rturk_hit.id))
|
237
|
-
typingpool_hit = new(rturk_hit)
|
238
|
-
typingpool_hit.full(Amazon::HIT::Full::FromSearchHITs.new(rturk_hit, annotation))
|
239
|
-
end
|
240
|
-
typingpool_hit
|
241
|
-
end
|
242
|
-
|
243
|
-
def from_cache(hit_id, id_at=self.id_at, url_at=self.url_at)
|
244
|
-
Amazon.cache.transaction do
|
245
|
-
Amazon.cache[cache_key(hit_id, id_at, url_at)]
|
246
|
-
end
|
247
|
-
end
|
248
|
-
|
249
|
-
def delete_cache(hit_id, id_at=self.id_at, url_at=self.url_at)
|
250
|
-
Amazon.cache.transaction do
|
251
|
-
key = cache_key(hit_id, id_at, url_at)
|
252
|
-
cached = Amazon.cache[key]
|
253
|
-
Amazon.cache.delete(key) unless cached.nil?
|
254
|
-
end
|
255
|
-
end
|
256
|
-
|
257
|
-
def cache_key(hit_id, id_at=self.id_at, url_at=self.url_at)
|
258
|
-
"RESULT///#{hit_id}///#{url_at}///#{id_at}"
|
259
|
-
end
|
260
|
-
|
261
|
-
def each_page
|
262
|
-
results = []
|
263
|
-
page = 0
|
264
|
-
begin
|
265
|
-
page += 1
|
266
|
-
new_results = yield(page)
|
267
|
-
results.push(*new_results)
|
268
|
-
end while new_results.count > 0
|
269
|
-
results
|
270
|
-
end
|
271
|
-
|
272
|
-
def filter_ours(hits, &filter)
|
273
|
-
filter ||= lambda{|hit| hit.ours? }
|
274
|
-
hits.select do |hit|
|
275
|
-
selected = filter.call(hit)
|
276
|
-
hit.to_cache
|
277
|
-
selected
|
278
|
-
end
|
279
|
-
end
|
280
|
-
end #class << self
|
281
|
-
|
282
|
-
#Corresponds to the Amazon Mechanical Turk HIT#HITId
|
283
|
-
attr_reader :id
|
284
|
-
|
285
|
-
#Constructor. Takes an RTurk::Hit instance.
|
286
|
-
def initialize(rturk_hit)
|
287
|
-
@id = rturk_hit.id
|
288
|
-
end
|
289
|
-
|
290
|
-
#URL of the audio file associated with this HIT (the audio file
|
291
|
-
#to be transcribed). Extracted from the annotation (when the HIT
|
292
|
-
#was assigned via Typingpool) or from a hidden field in the HTML
|
293
|
-
#form on the external question (when the HIT was assigned via
|
294
|
-
#the Amazon Mechanical Turk RUI).
|
295
|
-
def url
|
296
|
-
@url ||= stashed_param(self.class.url_at)
|
297
|
-
end
|
298
|
-
|
299
|
-
#The Typingpool::Project::Local#id associated with this
|
300
|
-
#HIT. Extracted as described for the url method.
|
301
|
-
def project_id
|
302
|
-
@project_id ||= stashed_param(self.class.id_at)
|
303
|
-
end
|
304
|
-
|
305
|
-
#Returns the Typingpool::Project#name associated with this HIT
|
306
|
-
#by parsing the #url. May be dropped in a future release.
|
307
|
-
def project_title_from_url(url=self.url)
|
308
|
-
matches = Project.url_regex.match(url) or raise Error::Argument::Format, "Unexpected format to url '#{url}'"
|
309
|
-
URI.unescape(matches[2])
|
310
|
-
end
|
311
|
-
|
312
|
-
#Returns true if this HIT has an approved assignment associated
|
313
|
-
#with it. (Attached to Typingpool::Amazon::HIT rather than
|
314
|
-
#Typingpool::Amazon::HIT::Assignment because sometimes we can
|
315
|
-
#tell simply from looking at hit.full that there are no approved
|
316
|
-
#assignments -- hit.full.assignments_completed == 0. This check
|
317
|
-
#is only performed when hit.full has already been loaded.)
|
318
|
-
def approved?
|
319
|
-
assignment_status_match?('Approved')
|
320
|
-
end
|
321
|
-
|
322
|
-
#Returns true if this HIT has a rejected assignment associated
|
323
|
-
#with it. (For an explanation of why this is not attached to
|
324
|
-
#Typingpool::Amazon::HIT::Assignment, see the documentation for
|
325
|
-
#approved?.)
|
326
|
-
def rejected?
|
327
|
-
assignment_status_match?('Rejected')
|
328
|
-
end
|
329
|
-
|
330
|
-
#Returns true if this HIT has a submitted assignment associated
|
331
|
-
#with it. (For an explanation of why this is not attached to
|
332
|
-
#Typingpool::Amazon::HIT::Assignment, see the documentation for
|
333
|
-
#approved?.)
|
334
|
-
def submitted?
|
335
|
-
assignment_status_match?('Submitted')
|
336
|
-
end
|
337
|
-
|
338
|
-
|
339
|
-
#Returns true if this HIT is associated with Typingpool. One
|
340
|
-
#Amazon account can be used for many tasks, so it's important to
|
341
|
-
#check whether the HIT belongs to this software. (Presently,
|
342
|
-
#this is determined by looking for a stashed param like url or
|
343
|
-
#project_id).
|
344
|
-
def ours?
|
345
|
-
@ours ||= not(url.to_s.empty?)
|
346
|
-
end
|
347
|
-
|
348
|
-
#Returns a Typingpool::Transcript::Chunk instance built using
|
349
|
-
#this HIT and its associated assignment.
|
350
|
-
def transcript
|
351
|
-
transcript = Transcript::Chunk.new(assignment.body)
|
352
|
-
transcript.url = url
|
353
|
-
transcript.project = project_id
|
354
|
-
transcript.worker = assignment.worker_id
|
355
|
-
transcript.hit = @id
|
356
|
-
transcript
|
357
|
-
end
|
358
|
-
|
359
|
-
#If this HIT is cacheable, serializes it to the cache file
|
360
|
-
#specified in the config passed to Amazon.setup, or specified in
|
361
|
-
#the default config file. In short, a HIT is cacheable if it
|
362
|
-
#does not belong to Typingpool (ours? == false), if it is
|
363
|
-
#approved or rejected (approved? || rejected?), or if it is
|
364
|
-
#expired (full.expired_and_overdue?). See also cacheable? code.
|
365
|
-
#
|
366
|
-
# When available, cached HITs are used by
|
367
|
-
# Typingpool::Amazon::HIT.all,
|
368
|
-
# Typingpool::Amazon::HIT.all_approved, and all the other class
|
369
|
-
# methods that retrieve HITs. These methods call to_cache for
|
370
|
-
# you at logical times (after downloading and filtering, when
|
371
|
-
# the HIT is most fleshed out), so you should not need to call
|
372
|
-
# this yourself. But if you have an operation that makes network
|
373
|
-
# calls to further flesh out the HIT, calling to_cache may be
|
374
|
-
# worthwhile.
|
375
|
-
def to_cache
|
376
|
-
#any obj containing a Nokogiri object cannot be stored in pstore - do
|
377
|
-
#not forget this (again)
|
378
|
-
if cacheable?
|
379
|
-
Amazon.cache.transaction do
|
380
|
-
Amazon.cache[self.class.cache_key(@id)] = self
|
381
|
-
end
|
382
|
-
end
|
383
|
-
end
|
384
|
-
|
385
|
-
#Returns an RTurk::Hit instance corresponding to this HIT.
|
386
|
-
def at_amazon
|
387
|
-
Amazon.rturk_hit_full(@id)
|
388
|
-
end
|
389
|
-
|
390
|
-
#Deletes the HIT from Amazon's servers. Examines the HIT and
|
391
|
-
#assignment status to determine whether calling the DisposeHIT
|
392
|
-
#or DisableHIT operation is most appropriate. If the HIT has
|
393
|
-
#been submitted but not approved or rejected, will raise an
|
394
|
-
#exception of type
|
395
|
-
#Typingpool::Error::Amazon::UnreviewedContent. Catch this
|
396
|
-
#exception in your own code if you'd like to automatically
|
397
|
-
#approve such HITs before removing them.
|
398
|
-
def remove_from_amazon
|
399
|
-
if full.status == 'Reviewable'
|
400
|
-
if assignment.status == 'Submitted'
|
401
|
-
raise Error::Amazon::UnreviewedContent, "There is an unreviewed submission for #{url}"
|
402
|
-
end
|
403
|
-
at_amazon.dispose!
|
404
|
-
else
|
405
|
-
at_amazon.disable!
|
406
|
-
end
|
407
|
-
end
|
408
|
-
|
409
|
-
#Returns "the full hit" - a Typingpool::Amazon::HIT::Full
|
410
|
-
#instance associated with this HIT. If the instance is being
|
411
|
-
#created for the first time, this will trigger an HTTP request
|
412
|
-
#to Amazon's servers. "Full" hit fields segregated because
|
413
|
-
#accessing any one of them is expensive if we only have a hit id
|
414
|
-
#(but after fetching one all are cheap). Accepts an optional
|
415
|
-
#Typingpool::Amazon::HIT::Full (or subclass) to set for this
|
416
|
-
#attribute, preventing the need to create one. This is useful in
|
417
|
-
#cases in which extensive HIT data was returned by an Amazon
|
418
|
-
#operation (for example, SearchHITs returns lots of HIT data)
|
419
|
-
def full(full_hit=nil)
|
420
|
-
if @full.nil?
|
421
|
-
@full = full_hit || Full.new(at_amazon)
|
422
|
-
end
|
423
|
-
@full
|
424
|
-
end
|
425
|
-
|
426
|
-
#Returns the assignment associated with this HIT - a
|
427
|
-
#Typingpool::Amazon::HIT::Assignment instance. The first time
|
428
|
-
#this is called, an Amazon HTTP request is typically (but not
|
429
|
-
#always) sent.
|
430
|
-
def assignment
|
431
|
-
if @assignment.nil?
|
432
|
-
if @full && full.assignments_completed == 0
|
433
|
-
#It would be dangerous to do this if the HIT were to be
|
434
|
-
#cached, since we would then never check for the
|
435
|
-
#assignment again. But we know this HIT won't be cached
|
436
|
-
#while it is active, since we only cache approved and
|
437
|
-
#rejected HITs.
|
438
|
-
@assignment = Assignment::Empty.new
|
439
|
-
else
|
440
|
-
@assignment = Assignment.new(at_amazon) #expensive
|
441
|
-
end
|
442
|
-
end
|
443
|
-
@assignment
|
444
|
-
end
|
445
|
-
|
446
|
-
|
447
|
-
#private
|
448
|
-
|
449
|
-
def stashed_param(param)
|
450
|
-
if @assignment && assignment.answers[param]
|
451
|
-
return assignment.answers[param]
|
452
|
-
elsif full.annotation[param]
|
453
|
-
#A question assigned through this software. May be
|
454
|
-
#expensive: May result in HTTP request to fetch HIT
|
455
|
-
#fields. We choose to fetch (sometimes) the HIT rather than
|
456
|
-
#the assignment on the assumption it will be MORE common to
|
457
|
-
#encounter HITs with no answers and LESS common to encounter
|
458
|
-
#HITs assigned through the RUI (and thus lacking in an
|
459
|
-
#annotation from this software and thus rendering the HTTP
|
460
|
-
#request to fetch the HIT fields pointless).
|
461
|
-
return full.annotation[param]
|
462
|
-
elsif full.assignments_completed.to_i >= 1
|
463
|
-
#A question assigned through Amazon's RUI, with an answer
|
464
|
-
#submitted. If the HIT belongs to this software, this
|
465
|
-
#assignment's answers will include our param. We prefer
|
466
|
-
#fetching the assignment to fetching the external question
|
467
|
-
#(as below) because fetching the assignment will potentially
|
468
|
-
#save us an HTTP request down the line -- for example, if we
|
469
|
-
#need other assignment data (e.g. assignment status).
|
470
|
-
#Fetching the external question only serves to give us
|
471
|
-
#access to params. If the answers do not include our param,
|
472
|
-
#we know the HIT does not belong to this software, since we
|
473
|
-
#know the param was also not in the annotation. So we are
|
474
|
-
#safe returning nil in that case.
|
475
|
-
return assignment.answers[param]
|
476
|
-
else
|
477
|
-
#A question assigned via Amazon's RUI, with no answer
|
478
|
-
#submitted. Expensive: Results in HTTP request to fetch
|
479
|
-
#external question.
|
480
|
-
return full.external_question_param(param)
|
481
|
-
end
|
482
|
-
end
|
483
|
-
|
484
|
-
def assignment_status_match?(status)
|
485
|
-
if @full
|
486
|
-
return false if full.assignments_completed == 0
|
487
|
-
return false if full.status != 'Reviewable'
|
488
|
-
end
|
489
|
-
assignment.status == status
|
490
|
-
end
|
491
|
-
|
492
|
-
|
493
|
-
@@cacheable_assignment_status = Set.new %w(Approved Rejected)
|
494
|
-
def cacheable?
|
495
|
-
if @ours == false
|
496
|
-
return true
|
497
|
-
end
|
498
|
-
if @full
|
499
|
-
return true if full.expired_and_overdue?
|
500
|
-
end
|
501
|
-
if @assignment && assignment.status
|
502
|
-
return true if @@cacheable_assignment_status.include?(assignment.status)
|
503
|
-
end
|
504
|
-
return false
|
505
|
-
end
|
506
|
-
|
507
|
-
class Full
|
508
|
-
require 'uri'
|
509
|
-
require 'open-uri'
|
510
|
-
require 'nokogiri'
|
511
|
-
|
512
|
-
#See the RTurk documentation and Amazon Mechanical Turk API
|
513
|
-
#documentation for more on these fields.
|
514
|
-
attr_reader :id, :type_id, :status, :external_question_url, :assignments_completed, :assignments_pending, :expires_at, :assignments_duration
|
515
|
-
|
516
|
-
#Constructor. Takes an RTurk::Hit instance.
|
517
|
-
def initialize(rturk_hit)
|
518
|
-
import_standard_attrs_from_rturk_hit(rturk_hit)
|
519
|
-
@assignments_completed = rturk_hit.assignments_completed_count
|
520
|
-
@assignments_pending = rturk_hit.assignments_pending_count
|
521
|
-
self.annotation = rturk_hit.annotation
|
522
|
-
self.external_question_url = rturk_hit.xml
|
523
|
-
end
|
524
|
-
|
525
|
-
#Returns the HIT annotation as a hash. If the annotation
|
526
|
-
#contained URL-encoded form key-value pairs, it decodes them
|
527
|
-
#and returns them as a hash. Otherwise, returns an empty hash
|
528
|
-
#(throwing away any annotation text that is not URL-encoded
|
529
|
-
#key-value pairs, for example the tags attached by the Amazon
|
530
|
-
#Mechanical Turk RUI).
|
531
|
-
def annotation
|
532
|
-
@annotation ||= {}
|
533
|
-
end
|
534
|
-
|
535
|
-
#Returns boolean indicating whether the HIT is
|
536
|
-
#expired. Determined by comparing the HIT's expires_at
|
537
|
-
#attribute with the current time.
|
538
|
-
def expired?
|
539
|
-
expires_at < Time.now
|
540
|
-
end
|
541
|
-
|
542
|
-
#Returns boolean indicating whether the HIT is expired and
|
543
|
-
#overdue, at which point it is totally safe to prune. This is
|
544
|
-
#determined by adding the assignment duration (how long a
|
545
|
-
#worker has to complete the HIT) to the HIT's expires_at time
|
546
|
-
#(when the HIT is removed from the Mechanical Turk
|
547
|
-
#marketplace).
|
548
|
-
def expired_and_overdue?
|
549
|
-
(expires_at + assignments_duration) < Time.now
|
550
|
-
end
|
551
|
-
|
552
|
-
#Returns the HTML of the external question associated with the
|
553
|
-
#HIT. All Typingpool HITs use external questions (as opposed
|
554
|
-
#to "internal" HIT QuestionForms), so this should always
|
555
|
-
#return something. In first use, must make an HTTP request to
|
556
|
-
#obtain the HTML.
|
557
|
-
def external_question
|
558
|
-
if @external_question.nil?
|
559
|
-
if external_question_url && external_question_url.match(/^http/)
|
560
|
-
#expensive, obviously:
|
561
|
-
@external_question = open(external_question_url).read
|
562
|
-
end
|
563
|
-
end
|
564
|
-
@external_question
|
565
|
-
end
|
566
|
-
|
567
|
-
#Takes the name of an HTML form param and returns the value
|
568
|
-
#associated with that param in the external question
|
569
|
-
#HTML. Triggers an HTTP request on first use (unless
|
570
|
-
#external_question has already been called).
|
571
|
-
def external_question_param(param)
|
572
|
-
if external_question
|
573
|
-
if input = Nokogiri::HTML::Document.parse(external_question).css("input[name=#{param}]")[0]
|
574
|
-
return input['value']
|
575
|
-
end
|
576
|
-
end
|
577
|
-
end
|
578
|
-
|
579
|
-
protected
|
580
|
-
|
581
|
-
def import_standard_attrs_from_rturk_hit(hit)
|
582
|
-
%w(id type_id status expires_at assignments_duration).each do |attr|
|
583
|
-
instance_variable_set("@#{attr}", hit.send(attr))
|
584
|
-
end
|
585
|
-
end
|
586
|
-
|
587
|
-
def annotation=(encoded)
|
588
|
-
@annotation = CGI.unescapeHTML(encoded.to_s)
|
589
|
-
begin
|
590
|
-
@annotation = URI.decode_www_form(@annotation)
|
591
|
-
@annotation = Hash[*@annotation.flatten]
|
592
|
-
rescue ArgumentError
|
593
|
-
#Handle annotations like Department:Transcription (from
|
594
|
-
#the Amazon RUI), which make URI.decode_www_form barf
|
595
|
-
@annotation = {}
|
596
|
-
end
|
597
|
-
end
|
598
|
-
|
599
|
-
def external_question_url=(noko_xml)
|
600
|
-
if url = noko_xml.css('HIT Question eq|ExternalQuestion eq|ExternalURL', {'eq' => 'http://mechanicalturk.amazonaws.com/AWSMechanicalTurkDataSchemas/2006-07-14/ExternalQuestion.xsd'})[0].inner_text
|
601
|
-
@external_question_url = url
|
602
|
-
end
|
603
|
-
end
|
604
|
-
|
605
|
-
#For more on why this subclass is necessary, see the
|
606
|
-
#documentation for
|
607
|
-
#Typingpool::Amazon::HIT.cached_or_new_from_searchhits. In
|
608
|
-
#short, RTurk::HITParser objects returned by RTurk::SearchHITs
|
609
|
-
#are pointlessly and subtly different from
|
610
|
-
#RTurk::GetHITResponse objects. (I need to submit a patch to
|
611
|
-
#RTurk.)
|
612
|
-
class FromSearchHITs < Full
|
613
|
-
#Constructor. Takes an RTurk::Hit instance and the text of
|
614
|
-
#the HIT's annotation. The text of the annotation must be
|
615
|
-
#submitted as a separate param because RTurk::Hit instances
|
616
|
-
#returned by RTurk::SearchHITs do not bother to extract the
|
617
|
-
#annotation into an attribute, so we have to do that
|
618
|
-
#ourselves (elsewhere) using the raw xml.
|
619
|
-
def initialize(rturk_hit, annotation)
|
620
|
-
import_standard_attrs_from_rturk_hit(rturk_hit)
|
621
|
-
@assignments_completed = rturk_hit.completed_assignments
|
622
|
-
@assignments_pending = rturk_hit.pending_assignments
|
623
|
-
self.annotation = annotation
|
624
|
-
end
|
625
|
-
|
626
|
-
protected
|
627
|
-
|
628
|
-
def external_question_url
|
629
|
-
unless @checked_question
|
630
|
-
self.external_question_url = at_amazon.xml
|
631
|
-
@checked_question = true
|
632
|
-
end
|
633
|
-
@external_question_url
|
634
|
-
end
|
635
|
-
|
636
|
-
def at_amazon
|
637
|
-
Amazon.rturk_hit_full(@id)
|
638
|
-
end
|
639
|
-
end #Amazon::HIT::Full::FromSearchHITs
|
640
|
-
end #Amazon::HIT::Full
|
641
|
-
|
642
|
-
class Assignment
|
643
|
-
|
644
|
-
#See the RTurk documentation and Amazon Mechanical Turk API
|
645
|
-
#documentation for more on these fields.
|
646
|
-
attr_reader :id, :status, :worker_id, :submitted_at
|
647
|
-
|
648
|
-
#Constructor. Takes an RTurk::Hit instance.
|
649
|
-
def initialize(rturk_hit)
|
650
|
-
if assignment = rturk_hit.assignments[0] #expensive!
|
651
|
-
@id = assignment.id
|
652
|
-
@status = assignment.status
|
653
|
-
@worker_id = assignment.worker_id
|
654
|
-
@submitted_at = assignment.submitted_at
|
655
|
-
if answers = assignment.answers
|
656
|
-
@answers = answers.to_hash
|
657
|
-
end
|
658
|
-
end
|
659
|
-
end
|
660
|
-
|
661
|
-
#Returns the answers associated with this assignment as a
|
662
|
-
#hash. If there are no answers, returns an empty hash.
|
663
|
-
def answers
|
664
|
-
@answers ||= {}
|
665
|
-
end
|
666
|
-
|
667
|
-
#Returns the transcription submitted by the user as raw text.
|
668
|
-
def body
|
669
|
-
(answers['transcription'] || answers['1']).to_s
|
670
|
-
end
|
671
|
-
|
672
|
-
#Returns an RTurk::Assignment object corresponding to this
|
673
|
-
#assignment.
|
674
|
-
def at_amazon
|
675
|
-
RTurk::Assignment.new(@id)
|
676
|
-
end
|
677
|
-
|
678
|
-
#Subclass used in cases where we know Amazon's servers have no
|
679
|
-
#assignments for us (because hit.full.assignments_completed ==
|
680
|
-
#0), so we don't want to bother doing an HTTP request to
|
681
|
-
#check.
|
682
|
-
class Empty < Assignment
|
683
|
-
def initialize
|
684
|
-
@answers = {}
|
685
|
-
end
|
686
|
-
|
687
|
-
end #Empty
|
688
|
-
end #Assignment
|
689
|
-
end #HIT
|
690
|
-
|
691
|
-
#Class encapsulating the HTML form presented to Mechanical Turk workers
|
692
|
-
#transcribing a Typingpool audio chunk.
|
693
|
-
class Question
|
694
|
-
require 'nokogiri'
|
695
|
-
require 'uri'
|
696
|
-
require 'cgi'
|
697
|
-
attr_reader :url, :html
|
698
|
-
|
699
|
-
#Constructor. Takes the URL of where the question HTML has been
|
700
|
-
#uploaded, followed by the question HTML itself.
|
701
|
-
def initialize(url, html)
|
702
|
-
@url = url
|
703
|
-
@html = html
|
704
|
-
end
|
705
|
-
|
706
|
-
#Returns URL-encoded key-value pairs that can be used as the
|
707
|
-
#text for a HIT#annotation. The key-value pairs correspond to
|
708
|
-
#all hidden HTML form fields in the question HTML.
|
709
|
-
def annotation
|
710
|
-
CGI.escapeHTML(URI.encode_www_form(Hash[*noko.css('input[type="hidden"]').select{|e| e['name'].match(/^typingpool_/) }.map{|e| [e['name'], e['value']]}.flatten]))
|
711
|
-
end
|
712
|
-
|
713
|
-
#Returns the title, extracted from the title element of the
|
714
|
-
#HTML.
|
715
|
-
def title
|
716
|
-
noko.css('title')[0].content
|
717
|
-
end
|
718
|
-
|
719
|
-
#Returns the description, extracted from the element with the id
|
720
|
-
#'description' in the HTML.
|
721
|
-
def description
|
722
|
-
noko.css('#description')[0].content
|
723
|
-
end
|
724
|
-
|
725
|
-
protected
|
726
|
-
|
727
|
-
def noko(html=@html)
|
728
|
-
Nokogiri::HTML(html, nil, 'UTF-8')
|
729
|
-
end
|
730
|
-
end #Question
|
731
57
|
end #Amazon
|
732
58
|
end #Typingpool
|