typingpool 0.7.0 → 0.7.1
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +20 -0
- data/README.markdown +452 -0
- data/lib/typingpool/amazon/hit/assignment/empty.rb +19 -0
- data/lib/typingpool/amazon/hit/assignment.rb +43 -0
- data/lib/typingpool/amazon/hit/full/fromsearchhits.rb +44 -0
- data/lib/typingpool/amazon/hit/full.rb +105 -0
- data/lib/typingpool/amazon/hit.rb +458 -0
- data/lib/typingpool/amazon/question.rb +45 -0
- data/lib/typingpool/amazon.rb +3 -677
- data/lib/typingpool/app/cli/formatter.rb +16 -0
- data/lib/typingpool/app/cli.rb +64 -0
- data/lib/typingpool/app/friendlyexceptions.rb +34 -0
- data/lib/typingpool/app.rb +2 -97
- data/lib/typingpool/config/root.rb +114 -0
- data/lib/typingpool/config.rb +13 -119
- data/lib/typingpool/filer/audio.rb +84 -0
- data/lib/typingpool/filer/csv.rb +57 -0
- data/lib/typingpool/filer/dir.rb +76 -0
- data/lib/typingpool/filer/files/audio.rb +63 -0
- data/lib/typingpool/filer/files.rb +55 -0
- data/lib/typingpool/filer.rb +4 -313
- data/lib/typingpool/project/local.rb +117 -0
- data/lib/typingpool/project/remote/s3.rb +135 -0
- data/lib/typingpool/project/remote/sftp.rb +100 -0
- data/lib/typingpool/project/remote.rb +65 -0
- data/lib/typingpool/project.rb +2 -396
- data/lib/typingpool/template/assignment.rb +17 -0
- data/lib/typingpool/template/env.rb +77 -0
- data/lib/typingpool/template.rb +2 -87
- data/lib/typingpool/test/script.rb +310 -0
- data/lib/typingpool/test.rb +1 -306
- data/lib/typingpool/transcript/chunk.rb +129 -0
- data/lib/typingpool/transcript.rb +1 -125
- data/lib/typingpool/utility/castable.rb +65 -0
- data/lib/typingpool/utility.rb +1 -61
- data/test/test_integration_script_6_tp_finish.rb +1 -0
- metadata +135 -81
@@ -0,0 +1,458 @@
|
|
1
|
+
module Typingpool
|
2
|
+
class Amazon
|
3
|
+
#Class representing an Amazon Mechanical Turk Human Intelligence
|
4
|
+
#Task (HIT).
|
5
|
+
#
|
6
|
+
#We go above and beyond RTurk::Hit for several practical reasons:
|
7
|
+
# * To allow easy serialization. Caching is a very useful way of
|
8
|
+
# reducing network calls to Amazon, and thus of speeding up
|
9
|
+
# Typingpool. RTurk::Hit objects cannot be dumped via Marshal,
|
10
|
+
# apparently due to some Nokogiri objects they
|
11
|
+
# contain. Typingpool::Amazon::HIT objects, in contrast, are
|
12
|
+
# designed to be easily and compactly serialized. They store the
|
13
|
+
# minimal subset of information we need via simple
|
14
|
+
# attributes. (Presently we serialize via PStore.)
|
15
|
+
# * To attach convenience methods. RTurk does not make it easy,
|
16
|
+
# for example, to get HITs beyond the first "page" returned by
|
17
|
+
# Amazon. This class provides methods that make it easy to get
|
18
|
+
# ALL HITs returned by various operations.
|
19
|
+
# * To attach methods specific to Typingpool. For example, the url
|
20
|
+
# and project_id methods read params we've embedded in the
|
21
|
+
# annotation or in hidden fields on an external question, while
|
22
|
+
# the underlying stashed_params method optimizes its lookup of
|
23
|
+
# these variables based on how the app is most likely to be
|
24
|
+
# used. See also the ours? and cacheable? methods.
|
25
|
+
# * To simplify. Typingpool HITs are constrained such that we can
|
26
|
+
# assume they all contain only one assignment and thus only a
|
27
|
+
# maximum of one answer. Also, once we've determined that a HIT
|
28
|
+
# does not belong to Typingpool, it is safe to cache it forever
|
29
|
+
# and never download it again from Amazon.
|
30
|
+
# * To clearly partition methods that result in network
|
31
|
+
# calls. When you access an attribute under hit.full, like
|
32
|
+
# hit.full.status, it is clear you are doing something
|
33
|
+
# potentially expensive to obtain your hit status. Same thing
|
34
|
+
# with accessing an attribute under hit.assignment, like
|
35
|
+
# hit.assignment.worker_id -- it is clear an assignment object
|
36
|
+
# will need to be created, implying a network call. Calling
|
37
|
+
# hit.id, in contrast, is always fast. (Caveat: Accessing
|
38
|
+
# partitioned attributes often, but not always, results in a
|
39
|
+
# network call. In some cases, hit.full is generated at the same
|
40
|
+
# time we create the hit, since we've obtained a full HIT
|
41
|
+
# serialization from Amazon. In other cases, we only have a HIT
|
42
|
+
# id, so accessing anything under hit.full generates a network
|
43
|
+
# call.)
|
44
|
+
class HIT
|
45
|
+
require 'set'
|
46
|
+
require 'uri'
|
47
|
+
require 'typingpool/amazon/hit/full'
|
48
|
+
require 'typingpool/amazon/hit/assignment'
|
49
|
+
|
50
|
+
class << self
|
51
|
+
|
52
|
+
#Constructor. Creates an Amazon Mechanical Turk HIT.
#** Warning: This method can spend your money! **
# ==== Params
# [question]      Typingpool::Amazon::Question instance, used not
#                 only to generate the (external) question but also
#                 parsed to provide one or more core HIT
#                 attributes. Must include a non-nil annotation
#                 attribute. Provides fallback values for HIT title
#                 and description.
# [config_assign] The 'assign' attribute of a Typingpool::Config
#                 instance (that is, a
#                 Typingpool::Config::Root::Assign instance). Must
#                 include values for reward, lifetime, deadline, and
#                 approval. May include values for keywords and
#                 qualify. Preferred source for HIT title and
#                 description. See Typingpool::Config documentation
#                 for further details.
# ==== Returns
# Typingpool::Amazon::HIT instance corresponding to the new
# Mechanical Turk HIT.
# ==== Raises
# Error when the annotation or any required config value is missing.
def create(question, config_assign)
  hit_title = config_assign.title || question.title
  rturk_hit = RTurk::Hit.create(:title => hit_title) do |hit|
    hit.description = config_assign.description || question.description
    hit.question(question.url)
    #Each required value is assigned first, then checked; a nil or
    #false value raises.
    (hit.note = question.annotation) || raise(Error, "Missing annotation from question")
    (hit.reward = config_assign.reward) || raise(Error, "Missing reward config")
    hit.assignments = 1
    (hit.lifetime = config_assign.lifetime) || raise(Error, "Missing lifetime config")
    (hit.duration = config_assign.deadline) || raise(Error, "Missing deadline config")
    (hit.auto_approval = config_assign.approval) || raise(Error, "Missing approval config")
    if config_assign.keywords
      hit.keywords = config_assign.keywords
    end
    if config_assign.qualify
      config_assign.qualify.each do |qualification|
        hit.qualifications.add(*qualification.to_arg)
      end
    end
  end
  new(rturk_hit)
end
|
88
|
+
|
89
|
+
#Name of the hidden HTML form field used to provide the project_id
#in an external question or (form-encoded) annotation. Hard coded
#to typingpool_project_id but overridable in a subclass.
def id_at
  return @@id_at if defined?(@@id_at) && @@id_at
  @@id_at = 'typingpool_project_id'
end
|
96
|
+
|
97
|
+
#Name of the hidden HTML form field used to provide the (audio) url
#in an external question or (form-encoded) annotation. Hard coded
#to typingpool_url but overridable in a subclass.
def url_at
  return @@url_at if defined?(@@url_at) && @@url_at
  @@url_at = 'typingpool_url'
end
|
104
|
+
|
105
|
+
#Takes an array of HIT ids, returns Typingpool::Amazon::HIT
#instances corresponding to those ids (via cached_or_new).
def with_ids(ids)
  ids.map do |hit_id|
    rturk_hit = RTurk::Hit.new(hit_id)
    cached_or_new(rturk_hit)
  end
end
|
110
|
+
|
111
|
+
#Returns all Typingpool HITs that have been approved, as an array
#of Typingpool::Amazon::HIT instances. A HIT whose check raises
#RestClient::ServiceUnavailable is skipped with a warning.
def all_approved
  all_reviewable do |hit|
    begin
      #optimization: we assume it is more common to have an
      #unapproved HIT than an approved HIT that does not belong to
      #this app, so approved? is checked first
      hit.approved? && hit.ours?
    rescue RestClient::ServiceUnavailable => e
      warn "Warning: Service unavailable error, skipped HIT #{hit.id}. (Error: #{e})"
      false
    end
  end
end
|
127
|
+
|
128
|
+
#Returns as an array of Typingpool::Amazon::HIT instances all HITs
#returned by Amazon's GetReviewableHITs operation (which have HIT
#status == 'Reviewable'). Takes an optional filter block (which
#should return true for HITs to be included in the final
#results). If not supplied, will filter so the returned hits are
#all Typingpool HITs (hit.ours? == true).
def all_reviewable(&filter)
  reviewable = each_page do |page_number|
    hit_ids = RTurk.GetReviewableHITs(:page_number => page_number).hit_ids
    rturk_hits = hit_ids.map { |hit_id| RTurk::Hit.new(hit_id) }
    rturk_hits.map { |rturk_hit| cached_or_new(rturk_hit) }
  end
  filter_ours(reviewable, &filter)
end
|
140
|
+
|
141
|
+
#Takes a Typingpool::Project::Local#id and returns all HITs
#associated with that project, as an array of
#Typingpool::Amazon::HIT instances.
def all_for_project(id)
  all do |hit|
    #project_id is only consulted for HITs that belong to Typingpool
    hit.ours? && id == hit.project_id
  end
end
|
147
|
+
|
148
|
+
#Returns all HITs associated with your AWS account as an array of
#Typingpool::Amazon::HIT instances. Takes an optional filter block
#(which should return true for HITs to be included in the final
#results). If not supplied, will filter so the returned hits are
#all Typingpool HITs (hit.ours? == true).
def all(&filter)
  hits = each_page do |page_number|
    page = RTurk::SearchHITs.create(:page_number => page_number)
    #The annotation is read from the raw XML of each HIT and passed
    #along separately. page.hits and the //HIT nodes are paired
    #positionally.
    raw_hits = page.xml.xpath('//HIT')
    page.hits.map do |rturk_hit|
      annotation = raw_hits.shift.xpath('RequesterAnnotation').inner_text.strip
      #The Full::FromSearchHITs object is built inside
      #cached_or_new_from_searchhits, and only on a cache miss.
      cached_or_new_from_searchhits(rturk_hit, annotation)
    end
  end
  filter_ours(hits, &filter)
end
|
166
|
+
|
167
|
+
#protected
|
168
|
+
|
169
|
+
#Constructor. Takes an RTurk::Hit instance. Returns a
#Typingpool::Amazon::HIT instance, preferably from the cache; a new
#instance is built only on a cache miss.
def cached_or_new(rturk_hit)
  cached = from_cache(rturk_hit.id)
  return cached if cached
  new(rturk_hit)
end
|
174
|
+
|
175
|
+
#Constructor. Same as cached_or_new, but handles peculiarities of
#objects returned by RTurk::SearchHITs. Such objects map two Amazon
#HIT fields to different names than those used by other RTurk HIT
#instances. They also do not bother to extract the annotation from
#the Amazon HIT, so we have to do that ourselves (elsewhere) and
#take it as a param here. Finally, on the bright side,
#RTurk::SearchHITs already contain a big chunk of hit.full
#attributes, potentially obviating the need for an additional
#network call to flesh out the HIT, so this method pre-fleshes-out
#the HIT.
def cached_or_new_from_searchhits(rturk_hit, annotation)
  cached = from_cache(rturk_hit.id)
  return cached if cached
  fresh = new(rturk_hit)
  #Seed hit.full from the search result
  fresh.full(Amazon::HIT::Full::FromSearchHITs.new(rturk_hit, annotation))
  fresh
end
|
192
|
+
|
193
|
+
#Looks up a HIT in Amazon.cache by HIT id (plus the id_at and
#url_at field names, which are part of the cache key). Returns the
#cached value, or nil when nothing is stored under that key.
def from_cache(hit_id, id_at=self.id_at, url_at=self.url_at)
  key = cache_key(hit_id, id_at, url_at)
  Amazon.cache.transaction { Amazon.cache[key] }
end
|
198
|
+
|
199
|
+
#Removes the cache entry for the given HIT id, if one exists. The
#lookup and delete happen inside a single Amazon.cache transaction.
#Nothing is deleted when no entry is stored under the key.
def delete_cache(hit_id, id_at=self.id_at, url_at=self.url_at)
  Amazon.cache.transaction do
    key = cache_key(hit_id, id_at, url_at)
    next if Amazon.cache[key].nil?
    Amazon.cache.delete(key)
  end
end
|
206
|
+
|
207
|
+
#Builds the string key under which a HIT is stored in the cache.
#The key embeds the HIT id followed by the url_at and id_at field
#names, in that order.
def cache_key(hit_id, id_at=self.id_at, url_at=self.url_at)
  format('RESULT///%s///%s///%s', hit_id, url_at, id_at)
end
|
210
|
+
|
211
|
+
#Pagination helper. Yields successive page numbers, starting at 1,
#to the block and collects the items the block returns for each
#page into one flat array. Stops after the first page that yields
#no items; a nil page is treated the same as an empty one.
# ==== Returns
# Array of all items from all non-empty pages, in page order.
def each_page
  results = []
  page_number = 0
  loop do
    page_number += 1
    #to_a turns a nil page into [] instead of failing
    page_results = yield(page_number).to_a
    break if page_results.empty?
    #concat rather than push(*splat) avoids spreading a whole page
    #into method arguments
    results.concat(page_results)
  end
  results
end
|
221
|
+
|
222
|
+
#Selects the hits for which the filter block returns true. Without
#a block, keeps the hits for which hit.ours? is true. Every hit,
#selected or not, has to_cache called on it after the filter has
#run on it.
def filter_ours(hits, &filter)
  keep = filter || lambda { |candidate| candidate.ours? }
  hits.select do |hit|
    #tap hands back the filter's verdict after to_cache has run
    keep.call(hit).tap { hit.to_cache }
  end
end
|
230
|
+
end #class << self
|
231
|
+
|
232
|
+
#Corresponds to the Amazon Mechanical Turk HIT#HITId
|
233
|
+
attr_reader :id
|
234
|
+
|
235
|
+
#Constructor. Takes an RTurk::Hit instance and keeps only its id.
def initialize(rturk_hit)
  hit_id = rturk_hit.id
  @id = hit_id
end
|
239
|
+
|
240
|
+
#URL of the audio file associated with this HIT (the audio file to
#be transcribed). Extracted from the annotation (when the HIT was
#assigned via Typingpool) or from a hidden field in the HTML form
#on the external question (when the HIT was assigned via the Amazon
#Mechanical Turk RUI). A truthy result is memoized.
def url
  return @url if @url
  @url = stashed_param(self.class.url_at)
end
|
248
|
+
|
249
|
+
#The Typingpool::Project::Local#id associated with this HIT.
#Extracted the same way as the url (through stashed_param), using
#the id_at field name. A truthy result is memoized.
def project_id
  return @project_id if @project_id
  @project_id = stashed_param(self.class.id_at)
end
|
254
|
+
|
255
|
+
#Returns the Typingpool::Project#name associated with this HIT by
#parsing the #url. May be dropped in a future release.
# ==== Params
# [url] URL to parse. Defaults to this HIT's url.
# ==== Returns
# The second capture group of Project.url_regex, percent-decoded.
# ==== Raises
# Error::Argument::Format when the url does not match
# Project.url_regex.
def project_title_from_url(url=self.url)
  matches = Project.url_regex.match(url) or raise Error::Argument::Format, "Unexpected format to url '#{url}'"
  #URI.unescape was removed in Ruby 3.0. The RFC 2396 parser offers
  #the same decoding (it leaves '+' untouched); older Rubies expose
  #it as URI::DEFAULT_PARSER.
  parser = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER
  parser.unescape(matches[2])
end
|
261
|
+
|
262
|
+
#Returns true if this HIT has an approved assignment associated
#with it. (Attached to Typingpool::Amazon::HIT rather than
#Typingpool::Amazon::HIT::Assignment because sometimes we can tell
#simply from looking at hit.full that there are no approved
#assignments -- hit.full.assignments_completed == 0. This check is
#only performed when hit.full has already been loaded.)
def approved?
  wanted_status = 'Approved'
  assignment_status_match?(wanted_status)
end
|
271
|
+
|
272
|
+
#Returns true if this HIT has a rejected assignment associated with
#it. (For an explanation of why this is not attached to
#Typingpool::Amazon::HIT::Assignment, see the documentation for
#approved?.)
def rejected?
  wanted_status = 'Rejected'
  assignment_status_match?(wanted_status)
end
|
279
|
+
|
280
|
+
#Returns true if this HIT has a submitted assignment associated
#with it. (For an explanation of why this is not attached to
#Typingpool::Amazon::HIT::Assignment, see the documentation for
#approved?.)
def submitted?
  wanted_status = 'Submitted'
  assignment_status_match?(wanted_status)
end
|
287
|
+
|
288
|
+
|
289
|
+
#Returns true if this HIT is associated with Typingpool. One Amazon
#account can be used for many tasks, so it's important to check
#whether the HIT belongs to this software. (Presently, this is
#determined by looking for a non-empty stashed url param.)
#
#The answer is memoized in @ours, including a false answer, so the
#url lookup runs at most once per instance. cacheable? reads @ours
#directly.
def ours?
  #A nil check (rather than ||=) so that false is remembered too
  @ours = !url.to_s.empty? if @ours.nil?
  @ours
end
|
297
|
+
|
298
|
+
#Returns a Typingpool::Transcript::Chunk instance built using this
#HIT and its associated assignment: the chunk is created from the
#assignment body, then tagged with this HIT's url, project id,
#worker id and HIT id.
def transcript
  Transcript::Chunk.new(assignment.body).tap do |chunk|
    chunk.url = url
    chunk.project = project_id
    chunk.worker = assignment.worker_id
    chunk.hit = @id
  end
end
|
308
|
+
|
309
|
+
#If this HIT is cacheable, serializes it to the cache file
#specified in the config passed to Amazon.setup, or specified in
#the default config file. In short, a HIT is cacheable if it does
#not belong to Typingpool (ours? == false), if it is approved or
#rejected (approved? || rejected?), or if it is expired
#(full.expired_and_overdue?). See also cacheable? code.
#
# When available, cached HITs are used by
# Typingpool::Amazon::HIT.all,
# Typingpool::Amazon::HIT.all_approved, and all the other class
# methods that retrieve HITs. These methods call to_cache for you
# at logical times (after downloading and filtering, when the HIT
# is most fleshed out), so you should not need to call this
# yourself. But if you have an operation that makes network calls
# to further flesh out the HIT, calling to_cache may be worthwhile.
def to_cache
  #any obj containing a Nokogiri object cannot be stored in pstore -
  #do not forget this (again)
  return unless cacheable?
  Amazon.cache.transaction do
    Amazon.cache[self.class.cache_key(@id)] = self
  end
end
|
334
|
+
|
335
|
+
#Returns an RTurk::Hit instance corresponding to this HIT, obtained
#from Amazon.rturk_hit_full using this HIT's id.
def at_amazon
  hit_id = @id
  Amazon.rturk_hit_full(hit_id)
end
|
339
|
+
|
340
|
+
#Deletes the HIT from Amazon's servers. Examines the HIT and
#assignment status to determine whether calling the DisposeHIT or
#DisableHIT operation is most appropriate. If the HIT has been
#submitted but not approved or rejected, will raise an exception of
#type Typingpool::Error::Amazon::UnreviewedContent. Catch this
#exception in your own code if you'd like to automatically approve
#such HITs before removing them.
def remove_from_amazon
  #Anything that is not Reviewable gets disabled
  return at_amazon.disable! unless full.status == 'Reviewable'
  if assignment.status == 'Submitted'
    raise Error::Amazon::UnreviewedContent, "There is an unreviewed submission for #{url}"
  end
  at_amazon.dispose!
end
|
358
|
+
|
359
|
+
#Returns "the full hit" - a Typingpool::Amazon::HIT::Full instance
#associated with this HIT. If the instance is being created for
#the first time, this will trigger an HTTP request to Amazon's
#servers. "Full" hit fields segregated because accessing any one
#of them is expensive if we only have a hit id (but after fetching
#one all are cheap). Accepts an optional
#Typingpool::Amazon::HIT::Full (or subclass) to set for this
#attribute, preventing the need to create one. This is useful in
#cases in which extensive HIT data was returned by an Amazon
#operation (for example, SearchHITs returns lots of HIT data).
#The argument is ignored once a full hit has already been set.
def full(full_hit=nil)
  @full = full_hit || Full.new(at_amazon) if @full.nil?
  @full
end
|
375
|
+
|
376
|
+
#Returns the assignment associated with this HIT - a
#Typingpool::Amazon::HIT::Assignment instance. The first time this
#is called, an Amazon HTTP request is typically (but not always)
#sent. The result is memoized.
def assignment
  return @assignment unless @assignment.nil?
  @assignment =
    if @full && full.assignments_completed == 0
      #It would be dangerous to do this if the HIT were to be
      #cached, since we would then never check for the assignment
      #again. But we know this HIT won't be cached while it is
      #active, since we only cache approved and rejected HITs.
      #NOTE(review): cacheable? also returns true for expired and
      #non-Typingpool HITs -- confirm that is safe here.
      Assignment::Empty.new
    else
      Assignment.new(at_amazon) #expensive
    end
end
|
395
|
+
|
396
|
+
|
397
|
+
#private
|
398
|
+
|
399
|
+
#Looks up a param stashed with the HIT, trying sources in this
#order: the answers of an already-loaded assignment, the HIT
#annotation, the assignment answers (when at least one assignment
#has been completed), and finally the external question.
def stashed_param(param)
  #An assignment is consulted first only if it has already been loaded
  if @assignment
    answer = assignment.answers[param]
    return answer if answer
  end

  #A question assigned through this software. May be expensive: May
  #result in HTTP request to fetch HIT fields. We choose to fetch
  #(sometimes) the HIT rather than the assignment on the assumption
  #it will be MORE common to encounter HITs with no answers and LESS
  #common to encounter HITs assigned through the RUI (and thus
  #lacking in an annotation from this software and thus rendering
  #the HTTP request to fetch the HIT fields pointless).
  annotated = full.annotation[param]
  return annotated if annotated

  #A question assigned through Amazon's RUI, with an answer
  #submitted. If the HIT belongs to this software, this assignment's
  #answers will include our param. We prefer fetching the assignment
  #to fetching the external question (as below) because fetching the
  #assignment will potentially save us an HTTP request down the line
  #-- for example, if we need other assignment data (e.g. assignment
  #status). Fetching the external question only serves to give us
  #access to params. If the answers do not include our param, we
  #know the HIT does not belong to this software, since we know the
  #param was also not in the annotation. So we are safe returning
  #nil in that case.
  return assignment.answers[param] if full.assignments_completed.to_i >= 1

  #A question assigned via Amazon's RUI, with no answer submitted.
  #Expensive: Results in HTTP request to fetch external question.
  full.external_question_param(param)
end
|
433
|
+
|
434
|
+
#Returns true if this HIT's assignment has the given status. When
#hit.full has already been loaded, short-circuits to false (without
#touching the assignment) if no assignments have been completed or
#the HIT status is not 'Reviewable'.
def assignment_status_match?(status)
  if @full && (full.assignments_completed == 0 || full.status != 'Reviewable')
    return false
  end
  assignment.status == status
end
|
441
|
+
|
442
|
+
|
443
|
+
#Assignment statuses that make a HIT safe to cache.
@@cacheable_assignment_status = Set.new(%w(Approved Rejected))

#Returns true if this HIT may be written to the cache: it is known
#not to belong to Typingpool (@ours == false), or hit.full is
#loaded and expired_and_overdue?, or the assignment is loaded and
#its status is Approved or Rejected. Only already-loaded data is
#consulted.
def cacheable?
  return true if @ours == false
  return true if @full && full.expired_and_overdue?
  if @assignment && assignment.status
    return true if @@cacheable_assignment_status.include?(assignment.status)
  end
  false
end
|
456
|
+
end #HIT
|
457
|
+
end #Amazon
|
458
|
+
end #Typingpool
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module Typingpool
|
2
|
+
class Amazon
|
3
|
+
|
4
|
+
#Class encapsulating the HTML form presented to Mechanical Turk workers
#transcribing a Typingpool audio chunk.
class Question
  require 'nokogiri'
  require 'uri'
  require 'cgi'

  #url:: URL where the question HTML has been uploaded.
  #html:: The question HTML itself.
  attr_reader :url, :html

  #Constructor. Takes the URL of where the question HTML has been
  #uploaded, followed by the question HTML itself.
  def initialize(url, html)
    @url = url
    @html = html
  end

  #Returns URL-encoded key-value pairs that can be used as the
  #text for a HIT#annotation. The key-value pairs correspond to
  #the hidden HTML form fields in the question HTML whose name
  #starts with typingpool_. The encoded string is additionally
  #HTML-escaped. Hidden fields without a name attribute are
  #skipped.
  def annotation
    fields = noko.css('input[type="hidden"]').map { |input| [input['name'], input['value']] }
    #to_s guards against inputs lacking a name attribute
    stashed = fields.select { |name, _value| name.to_s.start_with?('typingpool_') }
    CGI.escapeHTML(URI.encode_www_form(Hash[stashed]))
  end

  #Returns the title, extracted from the title element of the
  #HTML.
  def title
    noko.css('title')[0].content
  end

  #Returns the description, extracted from the element with the id
  #'description' in the HTML.
  def description
    noko.css('#description')[0].content
  end

  protected

  #Parses the given HTML (by default this question's HTML) as
  #UTF-8 and returns the Nokogiri document. Parses afresh on every
  #call.
  def noko(html=@html)
    Nokogiri::HTML(html, nil, 'UTF-8')
  end
end #Question
|
44
|
+
end #Amazon
|
45
|
+
end #Typingpool
|