typingpool 0.7.0 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +20 -0
- data/README.markdown +452 -0
- data/lib/typingpool/amazon/hit/assignment/empty.rb +19 -0
- data/lib/typingpool/amazon/hit/assignment.rb +43 -0
- data/lib/typingpool/amazon/hit/full/fromsearchhits.rb +44 -0
- data/lib/typingpool/amazon/hit/full.rb +105 -0
- data/lib/typingpool/amazon/hit.rb +458 -0
- data/lib/typingpool/amazon/question.rb +45 -0
- data/lib/typingpool/amazon.rb +3 -677
- data/lib/typingpool/app/cli/formatter.rb +16 -0
- data/lib/typingpool/app/cli.rb +64 -0
- data/lib/typingpool/app/friendlyexceptions.rb +34 -0
- data/lib/typingpool/app.rb +2 -97
- data/lib/typingpool/config/root.rb +114 -0
- data/lib/typingpool/config.rb +13 -119
- data/lib/typingpool/filer/audio.rb +84 -0
- data/lib/typingpool/filer/csv.rb +57 -0
- data/lib/typingpool/filer/dir.rb +76 -0
- data/lib/typingpool/filer/files/audio.rb +63 -0
- data/lib/typingpool/filer/files.rb +55 -0
- data/lib/typingpool/filer.rb +4 -313
- data/lib/typingpool/project/local.rb +117 -0
- data/lib/typingpool/project/remote/s3.rb +135 -0
- data/lib/typingpool/project/remote/sftp.rb +100 -0
- data/lib/typingpool/project/remote.rb +65 -0
- data/lib/typingpool/project.rb +2 -396
- data/lib/typingpool/template/assignment.rb +17 -0
- data/lib/typingpool/template/env.rb +77 -0
- data/lib/typingpool/template.rb +2 -87
- data/lib/typingpool/test/script.rb +310 -0
- data/lib/typingpool/test.rb +1 -306
- data/lib/typingpool/transcript/chunk.rb +129 -0
- data/lib/typingpool/transcript.rb +1 -125
- data/lib/typingpool/utility/castable.rb +65 -0
- data/lib/typingpool/utility.rb +1 -61
- data/test/test_integration_script_6_tp_finish.rb +1 -0
- metadata +135 -81
data/lib/typingpool/amazon.rb
CHANGED
@@ -2,6 +2,9 @@ module Typingpool
|
|
2
2
|
class Amazon
|
3
3
|
require 'rturk'
|
4
4
|
require 'pstore'
|
5
|
+
require 'typingpool/amazon/hit'
|
6
|
+
require 'typingpool/amazon/question'
|
7
|
+
|
5
8
|
@@cache_file = '~/.typingpool.cache'
|
6
9
|
|
7
10
|
class << self
|
@@ -51,682 +54,5 @@ module Typingpool
|
|
51
54
|
end
|
52
55
|
|
53
56
|
end #class << self
|
54
|
-
|
55
|
-
#Class representing an Amazon Mechanical Turk Human Intelligence
|
56
|
-
#Task (HIT).
|
57
|
-
#
|
58
|
-
#We go above and beyond RTurk::Hit for several practical reasons:
|
59
|
-
# * To allow easy serialization. Caching is a very useful way of
|
60
|
-
# reducing network calls to Amazon, and thus of speeding up
|
61
|
-
# Typingpool. RTurk::Hit objects cannot be dumped via Marshal,
|
62
|
-
# apparently due to some Nokogiri objects they
|
63
|
-
# contain. Typingpool::Amazon::HIT objects, in contrast, are
|
64
|
-
# designed to be easily and compactly serialized. They store the
|
65
|
-
# minimal subset of information we need via simple
|
66
|
-
# attributes. (Presently we serialize via PStore.)
|
67
|
-
# * To attach convenience methods. RTurk does not make it easy,
|
68
|
-
# for example, to get HITs beyond the first "page" returned by
|
69
|
-
# Amazon. This class provides methods that make it easy to get
|
70
|
-
# ALL HITs returned by various operations.
|
71
|
-
# * To attach methods specific to Typingpool. For example, the url
|
72
|
-
# and project_id methods read params we've embedded in the
|
73
|
-
# annotation or in hidden fields on an external question, while
|
74
|
-
# the underlying stashed_params method optimizes its lookup of
|
75
|
-
# these variables based on how the app is most likely to be
|
76
|
-
# used. See also the ours? and cacheable? methods.
|
77
|
-
# * To simplify. Typingpool HITs are constrained such that we can
|
78
|
-
# assume they all contain only one assignment and thus only a
|
79
|
-
# maximum of one answer. Also, once we've determined that a HIT
|
80
|
-
# does not belong to Typingpool, it is safe to cache it forever
|
81
|
-
# and never download it again from Amazon.
|
82
|
-
# * To clearly partition methods that result in network
|
83
|
-
# calls. When you access an attribute under hit.full, like
|
84
|
-
# hit.full.status, it is clear you are doing something
|
85
|
-
# potentially expensive to obtain your hit status. Same thing
|
86
|
-
# with accessing an attribute under hit.assignment, like
|
87
|
-
# hit.assignment.worker_id -- it is clear an assignment object
|
88
|
-
# will need to be created, implying a network call. Calling
|
89
|
-
# hit.id, in contrast, is always fast. (Caveat: Accessing
|
90
|
-
# partitioned attributes often, but not always, results in a
|
91
|
-
# network call. In some cases, hit.full is generated at the same
|
92
|
-
# time we create the hit, since we've obtained a full HIT
|
93
|
-
# serialization from Amazon. In other cases, we only have a HIT
|
94
|
-
# id, so accessing anything under hit.full generates a network
|
95
|
-
# call.)
|
96
|
-
class HIT
|
97
|
-
require 'set'
|
98
|
-
require 'uri'
|
99
|
-
|
100
|
-
class << self
|
101
|
-
|
102
|
-
#Constructor. Creates an Amazon Mechanical Turk HIT.
|
103
|
-
#** Warning: This method can spend your money! **
|
104
|
-
# ==== Params
|
105
|
-
# [question] Typingpool::Amazon::Question instance, used not
|
106
|
-
# only to generate the (external) question but
|
107
|
-
# also parsed to provide one or more core HIT
|
108
|
-
# attributes. Must include a non-nil
|
109
|
-
# annotation attribute. Provides fallback
|
110
|
-
# values for HIT title and description.
|
111
|
-
# [config_assign] The 'assign' attribute of a
|
112
|
-
# Typingpool::Config instance (that is, a
|
113
|
-
# Typingpool::Config::Root::Assign
|
114
|
-
# instance). Must include values for reward,
|
115
|
-
# lifetime, duration, and approval. May
|
116
|
-
# include values for keywords and
|
117
|
-
# qualifications. Preferred source for HIT
|
118
|
-
# title and description. See
|
119
|
-
# Typingpool::Config documentation for further
|
120
|
-
# details.
|
121
|
-
# ==== Returns
|
122
|
-
# Typingpool::Amazon::HIT instance corresponding to the new
|
123
|
-
# Mechanical Turk HIT.
|
124
|
-
def create(question, config_assign)
  #Creates a real HIT at Amazon via RTurk::Hit.create and wraps the
  #result in a Typingpool::Amazon::HIT. Raises Error when the question
  #annotation or the reward/lifetime/deadline/approval config is missing.

  #Returns value untouched, or raises Error with message when value is
  #nil/false. Replaces the `x = value or raise ...` idiom, whose
  #low-precedence `or` assigned the missing value before raising.
  required = lambda do |value, message|
    raise Error, message unless value
    value
  end
  rturk_hit = RTurk::Hit.create(:title => config_assign.title || question.title) do |hit|
    hit.description = config_assign.description || question.description
    hit.question(question.url)
    hit.note = required.call(question.annotation, "Missing annotation from question")
    hit.reward = required.call(config_assign.reward, "Missing reward config")
    #Typingpool HITs always carry exactly one assignment.
    hit.assignments = 1
    hit.lifetime = required.call(config_assign.lifetime, "Missing lifetime config")
    hit.duration = required.call(config_assign.deadline, "Missing deadline config")
    hit.auto_approval = required.call(config_assign.approval, "Missing approval config")
    #Keywords and qualifications are optional.
    keywords = config_assign.keywords
    hit.keywords = keywords if keywords
    qualifications = config_assign.qualify
    qualifications.each { |q| hit.qualifications.add(*q.to_arg) } if qualifications
  end
  new(rturk_hit)
end
|
138
|
-
|
139
|
-
#Name of the hidden HTML form field used to provide the
|
140
|
-
#project_id in an external question or (form-encoded)
|
141
|
-
#annotation. Hard coded to typingpool_project_id but
|
142
|
-
#overridable in a subclass.
|
143
|
-
def id_at
  #Field name under which the project id is stashed; set once, then reused.
  return @@id_at if defined?(@@id_at) && @@id_at
  @@id_at = 'typingpool_project_id'
end

#Name of the hidden HTML form field used to provide the (audio) url in
#an external question or (form-encoded) annotation.
def url_at
  #Field name under which the audio url is stashed; set once, then reused.
  return @@url_at if defined?(@@url_at) && @@url_at
  @@url_at = 'typingpool_url'
end
|
154
|
-
|
155
|
-
#Takes an array of HIT ids, returns Typingpool::Amazon::HIT
|
156
|
-
#instances corresponding to those ids.
|
157
|
-
def with_ids(ids)
  #One Typingpool HIT per id, taken from the cache when present.
  ids.map do |hit_id|
    rturk_hit = RTurk::Hit.new(hit_id)
    cached_or_new(rturk_hit)
  end
end
|
160
|
-
|
161
|
-
#Returns all Typingpool HITs that have been approved, as an
|
162
|
-
#array of Typingpool::Amazon::HIT instances.
|
163
|
-
def all_approved
  #Reviewable HITs narrowed to those that are approved and ours.
  all_reviewable do |hit|
    begin
      #approved? is tested first on the assumption that an unapproved
      #HIT is more common than an approved HIT belonging to another app.
      hit.approved? && hit.ours?
    rescue RestClient::ServiceUnavailable => error
      #Skip the HIT rather than aborting the whole listing.
      warn "Warning: Service unavailable error, skipped HIT #{hit.id}. (Error: #{error})"
      false
    end
  end
end
|
177
|
-
|
178
|
-
#Returns as an array of Typingpool::Amazon::HIT instances all
|
179
|
-
#HITs returned by Amazon's GetReviewableHITs operation (which
|
180
|
-
#have HIT status == 'Reviewable'). Takes an optional filter
|
181
|
-
#block (which should return true for HITs to be included in
|
182
|
-
#the final results). If not supplied, will filter so the
|
183
|
-
#returned hits are all Typingpool HITs (hit.ours? == true).
|
184
|
-
def all_reviewable(&filter)
  #Walk every page of GetReviewableHITs, wrapping each id in a HIT.
  hits = each_page do |page_number|
    hit_ids = RTurk.GetReviewableHITs(:page_number => page_number).hit_ids
    hit_ids.map { |id| cached_or_new(RTurk::Hit.new(id)) }
  end
  #filter_ours applies the caller's block, or hit.ours? when none given.
  filter_ours(hits, &filter)
end
|
190
|
-
|
191
|
-
#Takes a Typingpool::Project::Local#id and returns all HITs
|
192
|
-
#associated with that project, as an array of
|
193
|
-
#Typingpool::Amazon::HIT instances.
|
194
|
-
def all_for_project(id)
  #Keep only Typingpool HITs whose stashed project id equals id.
  belongs_to_project = lambda { |hit| hit.ours? && hit.project_id == id }
  all(&belongs_to_project)
end
|
197
|
-
|
198
|
-
#Returns all HITs associated with your AWS account as an array
|
199
|
-
#of Typingpool::Amazon::HIT instances. Takes an optional
|
200
|
-
#filter block (which should return true for HITs to be
|
201
|
-
#included in the final results). If not supplied, will filter
|
202
|
-
#so the returned hits are all Typingpool HITs (hit.ours? ==
|
203
|
-
#true).
|
204
|
-
def all(&filter)
  #Walks every page of SearchHITs and returns the HITs selected by the
  #filter block (or by hit.ours? when no block is given).
  hits = each_page do |page_number|
    page = RTurk::SearchHITs.create(:page_number => page_number)
    #RTurk does not expose the annotation on SearchHITs results, so it
    #is read from the raw XML; nodes are consumed in step with page.hits.
    raw_hits = page.xml.xpath('//HIT')
    page.hits.map do |rturk_hit|
      annotation = raw_hits.shift.xpath('RequesterAnnotation').inner_text.strip
      #cached_or_new_from_searchhits builds the Full object itself when
      #one is needed; the extra instance formerly created here was never used.
      cached_or_new_from_searchhits(rturk_hit, annotation)
    end
  end
  filter_ours(hits, &filter)
end
|
216
|
-
|
217
|
-
#protected
|
218
|
-
|
219
|
-
#Constructor. Takes an RTurk::Hit instance. Returns a
|
220
|
-
#Typingpool::Amazon::HIT instance, preferably from the cache.
|
221
|
-
def cached_or_new(rturk_hit)
  #Prefer the cached copy; build a fresh HIT only on a cache miss.
  cached = from_cache(rturk_hit.id)
  cached ? cached : new(rturk_hit)
end
|
224
|
-
|
225
|
-
#Constructor. Same as cached_or_new, but handles peculiarities
|
226
|
-
#of objects returned by RTurk::SearchHITs. Such objects map
|
227
|
-
#two Amazon HIT fields to different names than those used by
|
228
|
-
#other RTurk HIT instances. They also do not bother to extract
|
229
|
-
#the annotation from the Amazon HIT, so we have to do that
|
230
|
-
#ourselves (elsewhere) and take it as a param here. Finally,
|
231
|
-
#on the bright side, RTurk::SearchHITs already contain a big
|
232
|
-
#chunk of hit.full attributes, potentially obviating the need
|
233
|
-
#for an additional network call to flesh out the HIT, so this
|
234
|
-
#method pre-fleshes-out the HIT.
|
235
|
-
def cached_or_new_from_searchhits(rturk_hit, annotation)
  cached = from_cache(rturk_hit.id)
  return cached if cached
  #Cache miss: build the HIT and pre-populate hit.full from the
  #SearchHITs data plus the separately extracted annotation.
  fresh = new(rturk_hit)
  fresh.full(Amazon::HIT::Full::FromSearchHITs.new(rturk_hit, annotation))
  fresh
end
|
242
|
-
|
243
|
-
def from_cache(hit_id, id_at=self.id_at, url_at=self.url_at)
  #Returns the object stored under this HIT's cache key, or nil.
  key = cache_key(hit_id, id_at, url_at)
  Amazon.cache.transaction { Amazon.cache[key] }
end

def delete_cache(hit_id, id_at=self.id_at, url_at=self.url_at)
  #Removes the cached entry for this HIT; does nothing when absent.
  key = cache_key(hit_id, id_at, url_at)
  Amazon.cache.transaction do
    Amazon.cache.delete(key) unless Amazon.cache[key].nil?
  end
end
|
256
|
-
|
257
|
-
def cache_key(hit_id, id_at=self.id_at, url_at=self.url_at)
  #Key layout: RESULT///<hit id>///<url field name>///<project id field name>
  format('RESULT///%s///%s///%s', hit_id, url_at, id_at)
end
|
260
|
-
|
261
|
-
def each_page
  #Yields successive page numbers (1, 2, 3, ...) to the block and
  #collects everything the block returns into one flat array. Stops at
  #the first page for which the block returns nothing (nil or empty).
  results = []
  page = 0
  loop do
    page += 1
    #Array() accepts nil as well as any list-like result, so a block
    #that returns nil ends the walk instead of raising NoMethodError.
    batch = Array(yield(page))
    break if batch.empty?
    #concat rather than push(*batch): no argument splat.
    results.concat(batch)
  end
  results
end
|
271
|
-
|
272
|
-
def filter_ours(hits, &filter)
  #Without a block, keep only HITs that belong to Typingpool.
  keep = filter || lambda { |hit| hit.ours? }
  hits.select do |hit|
    #Every HIT is offered to the cache after filtering, whether or not
    #it is selected.
    keep.call(hit).tap { hit.to_cache }
  end
end
|
280
|
-
end #class << self
|
281
|
-
|
282
|
-
#Corresponds to the Amazon Mechanical Turk HIT#HITId
|
283
|
-
attr_reader :id
|
284
|
-
|
285
|
-
#Constructor. Takes an RTurk::Hit instance.
|
286
|
-
def initialize(rturk_hit)
  #Only the id is copied from the RTurk hit; nothing else is retained.
  hit_id = rturk_hit.id
  @id = hit_id
end
|
289
|
-
|
290
|
-
#URL of the audio file associated with this HIT (the audio file
|
291
|
-
#to be transcribed). Extracted from the annotation (when the HIT
|
292
|
-
#was assigned via Typingpool) or from a hidden field in the HTML
|
293
|
-
#form on the external question (when the HIT was assigned via
|
294
|
-
#the Amazon Mechanical Turk RUI).
|
295
|
-
def url
  #Looked up once via stashed_param, then remembered.
  return @url if @url
  @url = stashed_param(self.class.url_at)
end

#The Typingpool::Project::Local#id associated with this HIT. Extracted
#the same way as the url.
def project_id
  #Looked up once via stashed_param, then remembered.
  return @project_id if @project_id
  @project_id = stashed_param(self.class.id_at)
end
|
304
|
-
|
305
|
-
#Returns the Typingpool::Project#name associated with this HIT
|
306
|
-
#by parsing the #url. May be dropped in a future release.
|
307
|
-
def project_title_from_url(url=self.url)
  #Extracts the project name from a Typingpool audio url using
  #Project.url_regex (second capture group) and percent-decodes it.
  #Raises Error::Argument::Format when the url does not match.
  matches = Project.url_regex.match(url)
  raise Error::Argument::Format, "Unexpected format to url '#{url}'" unless matches
  #URI.unescape was removed in Ruby 3.0. The default parser's unescape
  #decodes %XX sequences the same way and leaves '+' alone.
  URI::DEFAULT_PARSER.unescape(matches[2])
end
|
311
|
-
|
312
|
-
#Returns true if this HIT has an approved assignment associated
|
313
|
-
#with it. (Attached to Typingpool::Amazon::HIT rather than
|
314
|
-
#Typingpool::Amazon::HIT::Assignment because sometimes we can
|
315
|
-
#tell simply from looking at hit.full that there are no approved
|
316
|
-
#assignments -- hit.full.assignments_completed == 0. This check
|
317
|
-
#is only performed when hit.full has already been loaded.)
|
318
|
-
def approved?
  wanted = 'Approved'
  assignment_status_match?(wanted)
end

#Returns true if this HIT has a rejected assignment associated with it.
#(Lives on Typingpool::Amazon::HIT rather than on the assignment for
#the same reason as approved?.)
def rejected?
  wanted = 'Rejected'
  assignment_status_match?(wanted)
end

#Returns true if this HIT has a submitted assignment associated with
#it. (Lives on Typingpool::Amazon::HIT rather than on the assignment
#for the same reason as approved?.)
def submitted?
  wanted = 'Submitted'
  assignment_status_match?(wanted)
end
|
337
|
-
|
338
|
-
|
339
|
-
#Returns true if this HIT is associated with Typingpool. One
|
340
|
-
#Amazon account can be used for many tasks, so it's important to
|
341
|
-
#check whether the HIT belongs to this software. (Presently,
|
342
|
-
#this is determined by looking for a stashed param like url or
|
343
|
-
#project_id).
|
344
|
-
def ours?
  #True when a non-empty url is stashed on this HIT. The answer is
  #computed once. `||=` would recompute it on every call whenever the
  #answer is false, because false is never kept by `||=`, so the nil
  #check below is deliberate.
  @ours = !url.to_s.empty? if @ours.nil?
  @ours
end
|
347
|
-
|
348
|
-
#Returns a Typingpool::Transcript::Chunk instance built using
|
349
|
-
#this HIT and its associated assignment.
|
350
|
-
def transcript
  #Build the chunk from the assignment body, then tag it with this
  #HIT's url, project id, worker id and HIT id.
  Transcript::Chunk.new(assignment.body).tap do |chunk|
    chunk.url = url
    chunk.project = project_id
    chunk.worker = assignment.worker_id
    chunk.hit = @id
  end
end
|
358
|
-
|
359
|
-
#If this HIT is cacheable, serializes it to the cache file
|
360
|
-
#specified in the config passed to Amazon.setup, or specified in
|
361
|
-
#the default config file. In short, a HIT is cacheable if it
|
362
|
-
#does not belong to Typingpool (ours? == false), if it is
|
363
|
-
#approved or rejected (approved? || rejected?), or if it is
|
364
|
-
#expired (full.expired_and_overdue?). See also cacheable? code.
|
365
|
-
#
|
366
|
-
# When available, cached HITs are used by
|
367
|
-
# Typingpool::Amazon::HIT.all,
|
368
|
-
# Typingpool::Amazon::HIT.all_approved, and all the other class
|
369
|
-
# methods that retrieve HITs. These methods call to_cache for
|
370
|
-
# you at logical times (after downloading and filtering, when
|
371
|
-
# the HIT is most fleshed out), so you should not need to call
|
372
|
-
# this yourself. But if you have an operation that makes network
|
373
|
-
# calls to further flesh out the HIT, calling to_cache may be
|
374
|
-
# worthwhile.
|
375
|
-
def to_cache
  #Reminder: an object holding a Nokogiri object cannot be stored in
  #pstore, so nothing of that kind may be attached to a HIT.
  return unless cacheable?
  key = self.class.cache_key(@id)
  Amazon.cache.transaction { Amazon.cache[key] = self }
end
|
384
|
-
|
385
|
-
#Returns an RTurk::Hit instance corresponding to this HIT.
|
386
|
-
def at_amazon
  #Delegates to Amazon.rturk_hit_full with this HIT's id.
  hit_id = @id
  Amazon.rturk_hit_full(hit_id)
end
|
389
|
-
|
390
|
-
#Deletes the HIT from Amazon's servers. Examines the HIT and
|
391
|
-
#assignment status to determine whether calling the DisposeHIT
|
392
|
-
#or DisableHIT operation is most appropriate. If the HIT has
|
393
|
-
#been submitted but not approved or rejected, will raise an
|
394
|
-
#exception of type
|
395
|
-
#Typingpool::Error::Amazon::UnreviewedContent. Catch this
|
396
|
-
#exception in your own code if you'd like to automatically
|
397
|
-
#approve such HITs before removing them.
|
398
|
-
def remove_from_amazon
  #Anything not yet Reviewable is disabled rather than disposed.
  return at_amazon.disable! unless full.status == 'Reviewable'
  #A submitted but unreviewed assignment blocks removal.
  if assignment.status == 'Submitted'
    raise Error::Amazon::UnreviewedContent, "There is an unreviewed submission for #{url}"
  end
  at_amazon.dispose!
end
|
408
|
-
|
409
|
-
#Returns "the full hit" - a Typingpool::Amazon::HIT::Full
|
410
|
-
#instance associated with this HIT. If the instance is being
|
411
|
-
#created for the first time, this will trigger an HTTP request
|
412
|
-
#to Amazon's servers. "Full" hit fields segregated because
|
413
|
-
#accessing any one of them is expensive if we only have a hit id
|
414
|
-
#(but after fetching one all are cheap). Accepts an optional
|
415
|
-
#Typingpool::Amazon::HIT::Full (or subclass) to set for this
|
416
|
-
#attribute, preventing the need to create one. This is useful in
|
417
|
-
#cases in which extensive HIT data was returned by an Amazon
|
418
|
-
#operation (for example, SearchHITs returns lots of HIT data)
|
419
|
-
def full(full_hit=nil)
  #An already-loaded full hit always wins; the argument is only used
  #the first time, and Amazon is only contacted when neither exists.
  return @full unless @full.nil?
  @full = full_hit || Full.new(at_amazon)
end
|
425
|
-
|
426
|
-
#Returns the assignment associated with this HIT - a
|
427
|
-
#Typingpool::Amazon::HIT::Assignment instance. The first time
|
428
|
-
#this is called, an Amazon HTTP request is typically (but not
|
429
|
-
#always) sent.
|
430
|
-
def assignment
  return @assignment unless @assignment.nil?
  #When hit.full is already loaded and reports zero completed
  #assignments, an Empty placeholder is used and the request to Amazon
  #is skipped. The original author's note: this is only safe because a
  #HIT is not cached while it is still active.
  known_empty = @full && full.assignments_completed == 0
  @assignment = known_empty ? Assignment::Empty.new : Assignment.new(at_amazon) #expensive
end
|
445
|
-
|
446
|
-
|
447
|
-
#private
|
448
|
-
|
449
|
-
def stashed_param(param)
  #1. An assignment that is already loaded is consulted first.
  if @assignment
    answered = assignment.answers[param]
    return answered if answered
  end

  #2. The annotation, present on questions assigned through this
  #software. May load hit.full. The original author fetches the HIT
  #rather than the assignment here, expecting HITs without answers to
  #be more common than HITs assigned through the Amazon RUI.
  annotated = full.annotation[param]
  return annotated if annotated

  #3. Assigned via the RUI with an answer submitted: load the assignment.
  #Its answers contain the param if the HIT is ours. Otherwise nil is
  #correct, since the annotation did not contain it either.
  return assignment.answers[param] if full.assignments_completed.to_i >= 1

  #4. Assigned via the RUI with no answer yet: fall back to the external
  #question's form fields.
  full.external_question_param(param)
end
|
483
|
-
|
484
|
-
def assignment_status_match?(status)
  #Shortcut available only when hit.full is already loaded: no
  #completed assignments, or a HIT that is not Reviewable, cannot match.
  if @full && (full.assignments_completed == 0 || full.status != 'Reviewable')
    return false
  end
  assignment.status == status
end
|
491
|
-
|
492
|
-
|
493
|
-
#Assignment statuses that make a HIT safe to cache.
@@cacheable_assignment_status = Set.new(%w(Approved Rejected))
def cacheable?
  #A HIT known not to be ours. @ours is compared to false explicitly,
  #so a HIT whose ownership has not been determined does not qualify.
  return true if @ours == false
  #Only consulted when hit.full is already loaded.
  return true if @full && full.expired_and_overdue?
  #Only consulted when the assignment is already loaded.
  if @assignment && assignment.status && @@cacheable_assignment_status.include?(assignment.status)
    return true
  end
  false
end
|
506
|
-
|
507
|
-
class Full
|
508
|
-
require 'uri'
|
509
|
-
require 'open-uri'
|
510
|
-
require 'nokogiri'
|
511
|
-
|
512
|
-
#See the RTurk documentation and Amazon Mechanical Turk API
|
513
|
-
#documentation for more on these fields.
|
514
|
-
attr_reader :id, :type_id, :status, :external_question_url, :assignments_completed, :assignments_pending, :expires_at, :assignments_duration
|
515
|
-
|
516
|
-
#Constructor. Takes an RTurk::HIT instance.
|
517
|
-
def initialize(rturk_hit)
  #Copy the attributes that share a name with the RTurk hit, then the
  #two counters that RTurk exposes under *_count names.
  import_standard_attrs_from_rturk_hit(rturk_hit)
  @assignments_completed, @assignments_pending =
    rturk_hit.assignments_completed_count, rturk_hit.assignments_pending_count
  #Both setters parse their argument before storing it.
  self.annotation = rturk_hit.annotation
  self.external_question_url = rturk_hit.xml
end
|
524
|
-
|
525
|
-
#Returns the HIT annotation as a hash. If the annotation
|
526
|
-
#contained URL-encoded form key-value pairs, it decodes them
|
527
|
-
#and returns them as a hash. Otherwise, returns an empty hash
|
528
|
-
#(throwing away any annotation text that is not URL-encoded
|
529
|
-
#key-value pairs, for example the tags attached by the Amazon
|
530
|
-
#Mechanical Turk RUI).
|
531
|
-
def annotation
  #Falls back to (and remembers) an empty hash when nothing was set.
  @annotation = {} unless @annotation
  @annotation
end
|
534
|
-
|
535
|
-
#Returns boolean indicating whether the HIT is
|
536
|
-
#expired. Determined by comparing the HIT's expires_at
|
537
|
-
#attribute with the current time.
|
538
|
-
def expired?
  deadline = expires_at
  deadline < Time.now
end

#Returns boolean indicating whether the HIT is expired and overdue, at
#which point it is totally safe to prune. Determined by adding the
#assignment duration (how long a worker has to complete the HIT) to the
#HIT's expires_at time (when the HIT is removed from the Mechanical
#Turk marketplace).
def expired_and_overdue?
  last_possible_submission = expires_at + assignments_duration
  last_possible_submission < Time.now
end
|
551
|
-
|
552
|
-
#Returns the HTML of the external question associated with the
|
553
|
-
#HIT. All Typingpool HITs use external questions (as opposed
|
554
|
-
#to "internal" HIT QuestionForms), so this should always
|
555
|
-
#return something. In first use, must make an HTTP request to
|
556
|
-
#obtain the HTML.
|
557
|
-
def external_question
  #Returns the body fetched from external_question_url, or nil when
  #there is no url or it does not start with "http". A successful fetch
  #is remembered, so the HTTP request happens at most once.
  if @external_question.nil?
    url = external_question_url
    #\A anchors at the start of the string. The former ^ anchored at
    #the start of any line, so a multi-line value could pass the check.
    if url && url.match(/\Ahttp/)
      #expensive, obviously. The URI object's own open (from open-uri)
      #is used instead of Kernel#open, which stopped accepting URLs in
      #Ruby 3.0 and treats a leading "|" as a command to run.
      @external_question = URI.parse(url).open { |io| io.read }
    end
  end
  @external_question
end
|
566
|
-
|
567
|
-
#Takes the name of an HTML form param and returns the value
|
568
|
-
#associated with that param in the external question
|
569
|
-
#HTML. Triggers an HTTP request on first use (unless
|
570
|
-
#external_question has already been called).
|
571
|
-
def external_question_param(param)
  #nil when the external question could not be obtained.
  html = external_question
  return unless html
  #First <input> whose name attribute equals param; nil when absent.
  input = Nokogiri::HTML::Document.parse(html).css("input[name=#{param}]")[0]
  input['value'] if input
end
|
578
|
-
|
579
|
-
protected
|
580
|
-
|
581
|
-
def import_standard_attrs_from_rturk_hit(hit)
  #Attributes whose reader on the RTurk hit has the same name as our
  #instance variable.
  fields = %w(id type_id status expires_at assignments_duration)
  fields.each do |field|
    value = hit.send(field)
    instance_variable_set("@#{field}", value)
  end
end
|
586
|
-
|
587
|
-
def annotation=(encoded)
  #The annotation arrives HTML-escaped and form-encoded.
  decoded = CGI.unescapeHTML(encoded.to_s)
  @annotation = begin
    Hash[URI.decode_www_form(decoded)]
  rescue ArgumentError
    #Per the original author, annotations such as
    #Department:Transcription (from the Amazon RUI) make
    #URI.decode_www_form raise; those are discarded.
    {}
  end
end
|
598
|
-
|
599
|
-
def external_question_url=(noko_xml)
  #Stores the text of the first ExternalURL element found under
  #HIT > Question > ExternalQuestion. Leaves @external_question_url
  #untouched when there is no such element. Previously a missing
  #element raised NoMethodError, because inner_text was called on nil.
  node = noko_xml.css('HIT Question eq|ExternalQuestion eq|ExternalURL', {'eq' => 'http://mechanicalturk.amazonaws.com/AWSMechanicalTurkDataSchemas/2006-07-14/ExternalQuestion.xsd'})[0]
  @external_question_url = node.inner_text if node
end
|
604
|
-
|
605
|
-
#For more on why this subclass is necessary, see the
|
606
|
-
#documentation for
|
607
|
-
#Typingpool::Amazon::HIT.cached_or_new_from_searchhits. In
|
608
|
-
#short, RTurk::HITParser objects returned by RTurk::SearchHITs
|
609
|
-
#are pointlessly and subtly different from
|
610
|
-
#RTurk::GetHITResponse objects. (I need to submit a patch to
|
611
|
-
#RTurk.)
|
612
|
-
class FromSearchHITs < Full
  #Constructor. Takes an RTurk::Hit instance and the text of the HIT's
  #annotation. The annotation is a separate param because RTurk::Hit
  #instances returned by RTurk::SearchHITs do not extract it into an
  #attribute, so we have to do that ourselves (elsewhere) using the
  #raw xml.
  def initialize(rturk_hit, annotation)
    import_standard_attrs_from_rturk_hit(rturk_hit)
    #SearchHITs results name the two counters differently from other
    #RTurk hits.
    @assignments_completed, @assignments_pending =
      rturk_hit.completed_assignments, rturk_hit.pending_assignments
    self.annotation = annotation
  end

  protected

  #The url is not taken from the SearchHITs result; it is looked up
  #from the full HIT on first access, and the lookup is attempted only
  #once.
  def external_question_url
    @checked_question ||= begin
      self.external_question_url = at_amazon.xml
      true
    end
    @external_question_url
  end

  #Delegates to Amazon.rturk_hit_full with this HIT's id.
  def at_amazon
    Amazon.rturk_hit_full(@id)
  end
end #Amazon::HIT::Full::FromSearchHITs
|
640
|
-
end #Amazon::HIT::Full
|
641
|
-
|
642
|
-
class Assignment

  #See the RTurk documentation and Amazon Mechanical Turk API
  #documentation for more on these fields.
  attr_reader :id, :status, :worker_id, :submitted_at

  #Constructor. Takes an RTurk::Hit instance. Only the first assignment
  #is read; when there is none, every field stays nil.
  def initialize(rturk_hit)
    submission = rturk_hit.assignments[0] #expensive!
    return unless submission
    @id, @status = submission.id, submission.status
    @worker_id, @submitted_at = submission.worker_id, submission.submitted_at
    raw_answers = submission.answers
    @answers = raw_answers.to_hash if raw_answers
  end

  #Returns the answers associated with this assignment as a hash. If
  #there are no answers, returns an empty hash.
  def answers
    @answers = {} unless @answers
    @answers
  end

  #Returns the transcription submitted by the user as raw text, read
  #from the 'transcription' answer or, failing that, the answer keyed '1'.
  def body
    text = answers['transcription']
    text = answers['1'] unless text
    text.to_s
  end

  #Returns an RTurk::Assignment object corresponding to this
  #assignment.
  def at_amazon
    RTurk::Assignment.new(@id)
  end

  #Subclass used in cases where we know Amazon's servers have no
  #assignments for us (because hit.full.assignments_completed == 0),
  #so we don't want to bother doing an HTTP request to check.
  class Empty < Assignment
    def initialize
      @answers = Hash.new
    end

  end #Empty
end #Assignment
|
689
|
-
end #HIT
|
690
|
-
|
691
|
-
#Class encapsulating the HTML form presented to Mechanical Turk workers
|
692
|
-
#transcribing a Typingpool audio chunk.
|
693
|
-
class Question
  require 'nokogiri'
  require 'uri'
  require 'cgi'
  attr_reader :url, :html

  #Constructor. Takes the URL of where the question HTML has been
  #uploaded, followed by the question HTML itself.
  def initialize(url, html)
    @url = url
    @html = html
  end

  #Returns URL-encoded (then HTML-escaped) key-value pairs that can be
  #used as the text for a HIT#annotation. The pairs correspond to the
  #hidden HTML form fields in the question HTML whose name starts with
  #"typingpool_".
  def annotation
    pairs = noko.css('input[type="hidden"]').map { |input| [input['name'], input['value']] }
    #to_s guards against a hidden input with no name attribute, which
    #previously raised NoMethodError on nil. \A anchors the prefix test
    #at the start of the name rather than at the start of any line.
    stashed = pairs.select { |name, _value| name.to_s.match(/\Atypingpool_/) }
    CGI.escapeHTML(URI.encode_www_form(Hash[stashed]))
  end

  #Returns the title, extracted from the title element of the HTML.
  #Raises NoMethodError when the HTML has no title element.
  def title
    noko.css('title')[0].content
  end

  #Returns the description, extracted from the element with the id
  #'description' in the HTML. Raises NoMethodError when no such
  #element exists.
  def description
    noko.css('#description')[0].content
  end

  protected

  #Parses html (by default the question HTML) as UTF-8. The document is
  #re-parsed on every call and is never stored on the instance.
  def noko(html=@html)
    Nokogiri::HTML(html, nil, 'UTF-8')
  end
end #Question
|
731
57
|
end #Amazon
|
732
58
|
end #Typingpool
|