typingpool 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +23 -0
- data/bin/tp-assign +240 -0
- data/bin/tp-collect +50 -0
- data/bin/tp-config +114 -0
- data/bin/tp-finish +101 -0
- data/bin/tp-make +169 -0
- data/bin/tp-review +175 -0
- data/lib/typingpool/amazon.rb +732 -0
- data/lib/typingpool/app.rb +634 -0
- data/lib/typingpool/config.rb +344 -0
- data/lib/typingpool/error.rb +22 -0
- data/lib/typingpool/filer.rb +396 -0
- data/lib/typingpool/project.rb +593 -0
- data/lib/typingpool/template.rb +175 -0
- data/lib/typingpool/templates/assignment/amazon-init.js +38 -0
- data/lib/typingpool/templates/assignment/interview/nameless.html.erb +13 -0
- data/lib/typingpool/templates/assignment/interview/noisy.html.erb +12 -0
- data/lib/typingpool/templates/assignment/interview/partials/voices.html.erb +10 -0
- data/lib/typingpool/templates/assignment/interview/phone.html.erb +12 -0
- data/lib/typingpool/templates/assignment/interview.html.erb +11 -0
- data/lib/typingpool/templates/assignment/main.css +20 -0
- data/lib/typingpool/templates/assignment/partials/entry.html.erb +19 -0
- data/lib/typingpool/templates/assignment/partials/footer.html.erb +3 -0
- data/lib/typingpool/templates/assignment/partials/header.html.erb +11 -0
- data/lib/typingpool/templates/assignment/partials/labeling-example.html.erb +4 -0
- data/lib/typingpool/templates/assignment/partials/labeling.html.erb +5 -0
- data/lib/typingpool/templates/assignment/partials/length-description.html.erb +6 -0
- data/lib/typingpool/templates/assignment/partials/voices.html.erb +10 -0
- data/lib/typingpool/templates/assignment/speech.html.erb +11 -0
- data/lib/typingpool/templates/config.yml +21 -0
- data/lib/typingpool/templates/project/audio/chunks/.empty_directory +0 -0
- data/lib/typingpool/templates/project/audio/originals/.empty_directory +0 -0
- data/lib/typingpool/templates/project/data/.empty_directory +0 -0
- data/lib/typingpool/templates/project/etc/ About these files - read me.txt +8 -0
- data/lib/typingpool/templates/project/etc/audio-compat.js +25 -0
- data/lib/typingpool/templates/project/etc/player/audio-player.js +4 -0
- data/lib/typingpool/templates/project/etc/player/license.txt +19 -0
- data/lib/typingpool/templates/project/etc/player/player.swf +0 -0
- data/lib/typingpool/templates/project/etc/transcript.css +49 -0
- data/lib/typingpool/templates/transcript.html.erb +23 -0
- data/lib/typingpool/test/fixtures/amazon-question-html.html +95 -0
- data/lib/typingpool/test/fixtures/amazon-question-url.txt +1 -0
- data/lib/typingpool/test/fixtures/audio/mp3/interview.1.mp3 +0 -0
- data/lib/typingpool/test/fixtures/audio/mp3/interview.2.mp3 +0 -0
- data/lib/typingpool/test/fixtures/audio/wma/VN620007.WMA +0 -0
- data/lib/typingpool/test/fixtures/audio/wma/VN620052.WMA +0 -0
- data/lib/typingpool/test/fixtures/config-1 +20 -0
- data/lib/typingpool/test/fixtures/config-2 +25 -0
- data/lib/typingpool/test/fixtures/not_yaml.txt +4 -0
- data/lib/typingpool/test/fixtures/template-2.html.erb +10 -0
- data/lib/typingpool/test/fixtures/template-3.html.erb +22 -0
- data/lib/typingpool/test/fixtures/template.html.erb +10 -0
- data/lib/typingpool/test/fixtures/tp_collect_id.txt +1 -0
- data/lib/typingpool/test/fixtures/tp_collect_sandbox-assignment.csv +8 -0
- data/lib/typingpool/test/fixtures/tp_review_id.txt +1 -0
- data/lib/typingpool/test/fixtures/tp_review_sandbox-assignment.csv +8 -0
- data/lib/typingpool/test/fixtures/transcript-chunks.csv +226 -0
- data/lib/typingpool/test/fixtures/utf8_transcript.txt +7 -0
- data/lib/typingpool/test/fixtures/vcr/tp-collect-1.yml +2712 -0
- data/lib/typingpool/test/fixtures/vcr/tp-collect-2.yml +2718 -0
- data/lib/typingpool/test/fixtures/vcr/tp-collect-3.yml +2768 -0
- data/lib/typingpool/test/fixtures/vcr/tp-review-1.yml +570 -0
- data/lib/typingpool/test/fixtures/vcr/tp-review-2.yml +351 -0
- data/lib/typingpool/test.rb +418 -0
- data/lib/typingpool/transcript.rb +181 -0
- data/lib/typingpool/utility.rb +272 -0
- data/lib/typingpool.rb +500 -0
- data/test/make_amazon_question_fixture.rb +24 -0
- data/test/make_tp_collect_fixture_1.rb +26 -0
- data/test/make_tp_collect_fixture_2.rb +16 -0
- data/test/make_tp_collect_fixture_3.rb +15 -0
- data/test/make_tp_collect_fixture_4.rb +17 -0
- data/test/make_tp_review_fixture_1.rb +26 -0
- data/test/make_tp_review_fixture_2.rb +30 -0
- data/test/make_transcript_chunks_fixture.rb +53 -0
- data/test/test_integration_script_1_tp_config.rb +108 -0
- data/test/test_integration_script_2_tp_make.rb +119 -0
- data/test/test_integration_script_3_tp_assign.rb +152 -0
- data/test/test_integration_script_4_tp_review.rb +72 -0
- data/test/test_integration_script_5_tp_collect.rb +44 -0
- data/test/test_integration_script_6_tp_finish.rb +123 -0
- data/test/test_unit_amazon.rb +153 -0
- data/test/test_unit_config.rb +94 -0
- data/test/test_unit_filer.rb +202 -0
- data/test/test_unit_project.rb +168 -0
- data/test/test_unit_project_local.rb +68 -0
- data/test/test_unit_project_remote.rb +157 -0
- data/test/test_unit_template.rb +111 -0
- data/test/test_unit_transcript.rb +77 -0
- metadata +234 -0
@@ -0,0 +1,732 @@
|
|
1
|
+
module Typingpool
|
2
|
+
class Amazon
|
3
|
+
require 'rturk'
|
4
|
+
require 'pstore'
|
5
|
+
@@cache_file = '~/.typingpool.cache'
|
6
|
+
|
7
|
+
class << self
|
8
|
+
|
9
|
+
#You must call Amazon.setup before using any subclass methods
|
10
|
+
#that rely on Amazon servers.
|
11
|
+
# ==== Params
|
12
|
+
# Takes params as a hash of named arguments.
|
13
|
+
#[:key] Your Amazon Web Services Access Key ID. Required
|
14
|
+
# param. If not passed, will be read from :config.
|
15
|
+
#[:secret] Your Amazon Web Services Secret Access Key. Required
|
16
|
+
# param. If not passed, will be read from :config.
|
17
|
+
#[:config] A Typingpool::Config instance. If not passed, will
|
18
|
+
# use the default Config.file (usually
|
19
|
+
# ~/.typingpool). Supplies the default values for :key
|
20
|
+
# and :secret and can override the default cache file
|
21
|
+
# location (usually ~/.typingpool.cache) via the
|
22
|
+
# 'cache' param.
|
23
|
+
#[:sandbox] Boolean specifying whether to perform all operations
|
24
|
+
# in the Amazon Mechanical Turk sandbox. Default is
|
25
|
+
# false.
|
26
|
+
# ==== Returns
|
27
|
+
# Result of call to RTurk.setup with security credentials and sandbox param.
|
28
|
+
def setup(args={})
  #Work on a copy so that filling in the defaults below does not
  #modify the hash the caller passed in (and so frozen hashes work).
  args = args.dup
  args[:config] ||= Config.file
  args[:key] ||= args[:config].amazon.key
  args[:secret] ||= args[:config].amazon.secret
  args[:sandbox] = false if args[:sandbox].nil?
  if args[:config].cache
    #The config names its own cache file: forget any PStore handle
    #memoized by Amazon.cache so it is rebuilt for the new path.
    @@cache = nil
    @@cache_file = args[:config].cache
  end
  RTurk.setup(args[:key], args[:secret], :sandbox => args[:sandbox])
end
|
39
|
+
|
40
|
+
#Convenience wrapper that calls RTurk::Hit.new with
|
41
|
+
#:include_assignment_summary set to true. Takes a HIT id and
|
42
|
+
#returns an RTurk::Hit instance.
|
43
|
+
def rturk_hit_full(id)
|
44
|
+
RTurk::Hit.new(id, nil, :include_assignment_summary => true)
|
45
|
+
end
|
46
|
+
|
47
|
+
#Returns a PStore instance tied to the cache file specified in
|
48
|
+
#Amazon.setup (or the default).
|
49
|
+
def cache
|
50
|
+
@@cache ||= PStore.new(File.expand_path(@@cache_file))
|
51
|
+
end
|
52
|
+
|
53
|
+
end #class << self
|
54
|
+
|
55
|
+
#Class representing an Amazon Mechanical Turk Human Intelligence
|
56
|
+
#Task (HIT).
|
57
|
+
#
|
58
|
+
#We go above and beyond RTurk::Hit for several practical reasons:
|
59
|
+
# * To allow easy serialization. Caching is a very useful way of
|
60
|
+
# reducing network calls to Amazon, and thus of speeding up
|
61
|
+
# Typingpool. RTurk::Hit objects cannot be dumped via Marshal,
|
62
|
+
# apparently due to some Nokogiri objects they
|
63
|
+
# contain. Typingpool::Amazon::HIT objects, in contrast, are
|
64
|
+
# designed to be easily and compactly serialized. They store the
|
65
|
+
# minimal subset of information we need via simple
|
66
|
+
# attributes. (Presently we serialize via PStore.)
|
67
|
+
# * To attach convenience methods. RTurk does not make it easy,
|
68
|
+
# for example, to get HITs beyond the first "page" returned by
|
69
|
+
# Amazon. This class provides methods that make it easy to get
|
70
|
+
# ALL HITs returned by various operations.
|
71
|
+
# * To attach methods specific to Typingpool. For example, the url
|
72
|
+
# and project_id methods read params we've embedded in the
|
73
|
+
# annotation or in hidden fields on an external question, while
|
74
|
+
# the underlying stashed_params method optimizes its lookup of
|
75
|
+
# these variables based on how the app is most likely to be
|
76
|
+
# used. See also the ours? and cacheable? methods.
|
77
|
+
# * To simplify. Typingpool HITs are constrained such that we can
|
78
|
+
# assume they all contain only one assignment and thus only a
|
79
|
+
# maximum of one answer. Also, once we've determined that a HIT
|
80
|
+
# does not belong to Typingpool, it is safe to cache it forever
|
81
|
+
# and never download it again from Amazon.
|
82
|
+
# * To clearly partition methods that result in network
|
83
|
+
# calls. When you access an attribute under hit.full, like
|
84
|
+
# hit.full.status, it is clear you are doing something
|
85
|
+
# potentially expensive to obtain your hit status. Same thing
|
86
|
+
# with accessing an attribute under hit.assignment, like
|
87
|
+
# hit.assignment.worker_id -- it is clear an assignment object
|
88
|
+
# will need to be created, implying a network call. Calling
|
89
|
+
# hit.id, in contrast, is always fast. (Caveat: Accessing
|
90
|
+
# partitioned attributes often, but not always, results in a
|
91
|
+
# network call. In some cases, hit.full is generated at the same
|
92
|
+
# time we create the hit, since we've obtained a full HIT
|
93
|
+
# serialization from Amazon. In other cases, we only have a HIT
|
94
|
+
# id, so accessing anything under hit.full generates a network
|
95
|
+
# call.)
|
96
|
+
class HIT
|
97
|
+
require 'set'
|
98
|
+
require 'uri'
|
99
|
+
|
100
|
+
class << self
|
101
|
+
|
102
|
+
#Constructor. Creates an Amazon Mechanical Turk HIT.
|
103
|
+
#** Warning: This method can spend your money! **
|
104
|
+
# ==== Params
|
105
|
+
# [question] Typingpool::Amazon::Question instance, used not
|
106
|
+
# only to generate the (external) question but
|
107
|
+
# also parsed to provide one or more core HIT
|
108
|
+
# attributes. Must include a non-nil
|
109
|
+
# annotation attribute. Provides fallback
|
110
|
+
# values for HIT title and description.
|
111
|
+
# [config_assign] The 'assign' attribute of a
|
112
|
+
# Typingpool::Config instance (that is, a
|
113
|
+
# Typingpool::Config::Root::Assign
|
114
|
+
# instance). Must include values for reward,
|
115
|
+
# lifetime, deadline, and approval. May
|
116
|
+
# include values for keywords and
|
117
|
+
# qualifications. Preferred source for HIT
|
118
|
+
# title and description. See
|
119
|
+
# Typingpool::Config documentation for further
|
120
|
+
# details.
|
121
|
+
# ==== Returns
|
122
|
+
# Typingpool::Amazon::HIT instance corresponding to the new
|
123
|
+
# Mechanical Turk HIT.
|
124
|
+
def create(question, config_assign)
|
125
|
+
new(RTurk::Hit.create(:title => config_assign.title || question.title) do |hit|
|
126
|
+
hit.description = config_assign.description || question.description
|
127
|
+
hit.question(question.url)
|
128
|
+
hit.note = question.annotation or raise Error, "Missing annotation from question"
|
129
|
+
hit.reward = config_assign.reward or raise Error, "Missing reward config"
|
130
|
+
hit.assignments = 1
|
131
|
+
hit.lifetime = config_assign.lifetime or raise Error, "Missing lifetime config"
|
132
|
+
hit.duration = config_assign.deadline or raise Error, "Missing deadline config"
|
133
|
+
hit.auto_approval = config_assign.approval or raise Error, "Missing approval config"
|
134
|
+
hit.keywords = config_assign.keywords if config_assign.keywords
|
135
|
+
config_assign.qualify.each{|q| hit.qualifications.add(*q.to_arg)} if config_assign.qualify
|
136
|
+
end)
|
137
|
+
end
|
138
|
+
|
139
|
+
#Name of the hidden HTML form field used to provide the
|
140
|
+
#project_id in an external question or (form-encoded)
|
141
|
+
#annotation. Hard coded to typingpool_project_id but
|
142
|
+
#overridable in a subclass.
|
143
|
+
def id_at
|
144
|
+
@@id_at ||= 'typingpool_project_id'
|
145
|
+
end
|
146
|
+
|
147
|
+
#Name of the hidden HTML form field used to provide the
|
148
|
+
#(audio) url in an external question or (form-encoded)
|
149
|
+
#annotation. Hard coded to typingpool_url but overridable in a
|
150
|
+
#subclass.
|
151
|
+
def url_at
|
152
|
+
@@url_at ||= 'typingpool_url'
|
153
|
+
end
|
154
|
+
|
155
|
+
#Takes an array of HIT ids, returns Typingpool::Amazon::HIT
|
156
|
+
#instances corresponding to those ids.
|
157
|
+
def with_ids(ids)
|
158
|
+
ids.map{|id| cached_or_new(RTurk::Hit.new(id)) }
|
159
|
+
end
|
160
|
+
|
161
|
+
#Returns all Typingpool HITs that have been approved, as an
|
162
|
+
#array of Typingpool::Amazon::HIT instances.
|
163
|
+
def all_approved
|
164
|
+
hits = all_reviewable do |hit|
|
165
|
+
begin
|
166
|
+
#optimization: we assume it is more common to have an
|
167
|
+
#unapproved HIT than an approved HIT that does not
|
168
|
+
#belong to this app
|
169
|
+
hit.approved? && hit.ours?
|
170
|
+
rescue RestClient::ServiceUnavailable => e
|
171
|
+
warn "Warning: Service unavailable error, skipped HIT #{hit.id}. (Error: #{e})"
|
172
|
+
false
|
173
|
+
end
|
174
|
+
end
|
175
|
+
hits
|
176
|
+
end
|
177
|
+
|
178
|
+
#Returns as an array of Typingpool::Amazon::HIT instances all
|
179
|
+
#HITs returned by Amazon's GetReviewableHITs operation (which
|
180
|
+
#have HIT status == 'Reviewable'). Takes an optional filter
|
181
|
+
#block (which should return true for HITs to be included in
|
182
|
+
#the final results). If not supplied, will filter so the
|
183
|
+
#returned hits are all Typingpool HITs (hit.ours? == true).
|
184
|
+
def all_reviewable(&filter)
|
185
|
+
hits = each_page do |page_number|
|
186
|
+
RTurk.GetReviewableHITs(:page_number => page_number).hit_ids.map{|id| RTurk::Hit.new(id) }.map{|hit| cached_or_new(hit) }
|
187
|
+
end
|
188
|
+
filter_ours(hits, &filter)
|
189
|
+
end
|
190
|
+
|
191
|
+
#Takes a Typingpool::Project::Local#id and returns all HITs
|
192
|
+
#associated with that project, as an array of
|
193
|
+
#Typingpool::Amazon::HIT instances.
|
194
|
+
def all_for_project(id)
|
195
|
+
all{|hit| hit.ours? && hit.project_id == id}
|
196
|
+
end
|
197
|
+
|
198
|
+
#Returns all HITs associated with your AWS account as an array
|
199
|
+
#of Typingpool::Amazon::HIT instances. Takes an optional
|
200
|
+
#filter block (which should return true for HITs to be
|
201
|
+
#included in the final results). If not supplied, will filter
|
202
|
+
#so the returned hits are all Typingpool HITs (hit.ours? ==
|
203
|
+
#true).
|
204
|
+
def all(&filter)
  hits = each_page do |page_number|
    page = RTurk::SearchHITs.create(:page_number => page_number)
    #The annotation is read from the raw XML of the page; the HIT
    #nodes are consumed one per entry of page.hits, in order.
    raw_hits = page.xml.xpath('//HIT')
    page.hits.map do |rturk_hit|
      annotation = raw_hits.shift.xpath('RequesterAnnotation').inner_text.strip
      #cached_or_new_from_searchhits builds the Full::FromSearchHITs
      #object itself (and only on a cache miss), so none is built here.
      cached_or_new_from_searchhits(rturk_hit, annotation)
    end
  end
  filter_ours(hits, &filter)
end
|
216
|
+
|
217
|
+
#protected
|
218
|
+
|
219
|
+
#Constructor. Takes an RTurk::Hit instance. Returns a
|
220
|
+
#Typingpool::Amazon::HIT instance, preferably from the cache.
|
221
|
+
def cached_or_new(rturk_hit)
|
222
|
+
from_cache(rturk_hit.id) || new(rturk_hit)
|
223
|
+
end
|
224
|
+
|
225
|
+
#Constructor. Same as cached_or_new, but handles peculiarities
|
226
|
+
#of objects returned by RTurk::SearchHITs. Such objects map
|
227
|
+
#two Amazon HIT fields to different names than those used by
|
228
|
+
#other RTurk HIT instances. They also do not bother to extract
|
229
|
+
#the annotation from the Amazon HIT, so we have to do that
|
230
|
+
#ourselves (elsewhere) and take it as a param here. Finally,
|
231
|
+
#on the bright side, RTurk::SearchHITs already contain a big
|
232
|
+
#chunk of hit.full attributes, potentially obviating the need
|
233
|
+
#for an additional network call to flesh out the HIT, so this
|
234
|
+
#method pre-fleshes-out the HIT.
|
235
|
+
def cached_or_new_from_searchhits(rturk_hit, annotation)
|
236
|
+
if not (typingpool_hit = from_cache(rturk_hit.id))
|
237
|
+
typingpool_hit = new(rturk_hit)
|
238
|
+
typingpool_hit.full(Amazon::HIT::Full::FromSearchHITs.new(rturk_hit, annotation))
|
239
|
+
end
|
240
|
+
typingpool_hit
|
241
|
+
end
|
242
|
+
|
243
|
+
def from_cache(hit_id, id_at=self.id_at, url_at=self.url_at)
  #A lookup never writes, so open the PStore transaction read-only
  #(the argument true). The value of the block is returned: the
  #cached object, or nil when the key is absent.
  Amazon.cache.transaction(true) do
    Amazon.cache[cache_key(hit_id, id_at, url_at)]
  end
end
|
248
|
+
|
249
|
+
def delete_cache(hit_id, id_at=self.id_at, url_at=self.url_at)
|
250
|
+
Amazon.cache.transaction do
|
251
|
+
key = cache_key(hit_id, id_at, url_at)
|
252
|
+
cached = Amazon.cache[key]
|
253
|
+
Amazon.cache.delete(key) unless cached.nil?
|
254
|
+
end
|
255
|
+
end
|
256
|
+
|
257
|
+
def cache_key(hit_id, id_at=self.id_at, url_at=self.url_at)
|
258
|
+
"RESULT///#{hit_id}///#{url_at}///#{id_at}"
|
259
|
+
end
|
260
|
+
|
261
|
+
def each_page
  #Yields 1, 2, 3... to the block, collecting what each call
  #returns, and stops at the first page that comes back empty.
  results = []
  page = 1
  until (batch = yield(page)).count.zero?
    #concat instead of push(*batch): no argument splat of a whole page
    results.concat(batch.to_a)
    page += 1
  end
  results
end
|
271
|
+
|
272
|
+
def filter_ours(hits, &filter)
  #Without a block, keep only the HITs that report ours? as true
  keep = filter || lambda{|candidate| candidate.ours? }
  hits.select do |candidate|
    #Run the filter first, then offer the HIT to the cache; the
    #filter's verdict is what select sees.
    keep.call(candidate).tap { candidate.to_cache }
  end
end
|
280
|
+
end #class << self
|
281
|
+
|
282
|
+
#Corresponds to the Amazon Mechanical Turk HIT#HITId
|
283
|
+
attr_reader :id
|
284
|
+
|
285
|
+
#Constructor. Takes an RTurk::Hit instance.
|
286
|
+
def initialize(rturk_hit)
|
287
|
+
@id = rturk_hit.id
|
288
|
+
end
|
289
|
+
|
290
|
+
#URL of the audio file associated with this HIT (the audio file
|
291
|
+
#to be transcribed). Extracted from the annotation (when the HIT
|
292
|
+
#was assigned via Typingpool) or from a hidden field in the HTML
|
293
|
+
#form on the external question (when the HIT was assigned via
|
294
|
+
#the Amazon Mechanical Turk RUI).
|
295
|
+
def url
|
296
|
+
@url ||= stashed_param(self.class.url_at)
|
297
|
+
end
|
298
|
+
|
299
|
+
#The Typingpool::Project::Local#id associated with this
|
300
|
+
#HIT. Extracted as described for the url method.
|
301
|
+
def project_id
|
302
|
+
@project_id ||= stashed_param(self.class.id_at)
|
303
|
+
end
|
304
|
+
|
305
|
+
#Returns the Typingpool::Project#name associated with this HIT
|
306
|
+
#by parsing the #url. May be dropped in a future release.
|
307
|
+
def project_title_from_url(url=self.url)
  matches = Project.url_regex.match(url) or raise Error::Argument::Format, "Unexpected format to url '#{url}'"
  #URI.unescape no longer exists as of Ruby 3.0. The default
  #parser's unescape does the same percent-decoding of the second
  #capture group (and, like URI.unescape, leaves '+' untouched).
  URI::DEFAULT_PARSER.unescape(matches[2])
end
|
311
|
+
|
312
|
+
#Returns true if this HIT has an approved assignment associated
|
313
|
+
#with it. (Attached to Typingpool::Amazon::HIT rather than
|
314
|
+
#Typingpool::Amazon::HIT::Assignment because sometimes we can
|
315
|
+
#tell simply from looking at hit.full that there are no approved
|
316
|
+
#assignments -- hit.full.assignments_completed == 0. This check
|
317
|
+
#is only performed when hit.full has already been loaded.)
|
318
|
+
def approved?
|
319
|
+
assignment_status_match?('Approved')
|
320
|
+
end
|
321
|
+
|
322
|
+
#Returns true if this HIT has a rejected assignment associated
|
323
|
+
#with it. (For an explanation of why this is not attached to
|
324
|
+
#Typingpool::Amazon::HIT::Assignment, see the documentation for
|
325
|
+
#approved?.)
|
326
|
+
def rejected?
|
327
|
+
assignment_status_match?('Rejected')
|
328
|
+
end
|
329
|
+
|
330
|
+
#Returns true if this HIT has a submitted assignment associated
|
331
|
+
#with it. (For an explanation of why this is not attached to
|
332
|
+
#Typingpool::Amazon::HIT::Assignment, see the documentation for
|
333
|
+
#approved?.)
|
334
|
+
def submitted?
|
335
|
+
assignment_status_match?('Submitted')
|
336
|
+
end
|
337
|
+
|
338
|
+
|
339
|
+
#Returns true if this HIT is associated with Typingpool. One
|
340
|
+
#Amazon account can be used for many tasks, so it's important to
|
341
|
+
#check whether the HIT belongs to this software. (Presently,
|
342
|
+
#this is determined by looking for a stashed param like url or
|
343
|
+
#project_id).
|
344
|
+
def ours?
  #Memoize on nil instead of with ||= so that a false answer is
  #remembered too and the url lookup is not repeated on every call.
  @ours = !url.to_s.empty? if @ours.nil?
  @ours
end
|
347
|
+
|
348
|
+
#Returns a Typingpool::Transcript::Chunk instance built using
|
349
|
+
#this HIT and its associated assignment.
|
350
|
+
def transcript
|
351
|
+
transcript = Transcript::Chunk.new(assignment.body)
|
352
|
+
transcript.url = url
|
353
|
+
transcript.project = project_id
|
354
|
+
transcript.worker = assignment.worker_id
|
355
|
+
transcript.hit = @id
|
356
|
+
transcript
|
357
|
+
end
|
358
|
+
|
359
|
+
#If this HIT is cacheable, serializes it to the cache file
|
360
|
+
#specified in the config passed to Amazon.setup, or specified in
|
361
|
+
#the default config file. In short, a HIT is cacheable if it
|
362
|
+
#does not belong to Typingpool (ours? == false), if it is
|
363
|
+
#approved or rejected (approved? || rejected?), or if it is
|
364
|
+
#expired (full.expired_and_overdue?). See also cacheable? code.
|
365
|
+
#
|
366
|
+
# When available, cached HITs are used by
|
367
|
+
# Typingpool::Amazon::HIT.all,
|
368
|
+
# Typingpool::Amazon::HIT.all_approved, and all the other class
|
369
|
+
# methods that retrieve HITs. These methods call to_cache for
|
370
|
+
# you at logical times (after downloading and filtering, when
|
371
|
+
# the HIT is most fleshed out), so you should not need to call
|
372
|
+
# this yourself. But if you have an operation that makes network
|
373
|
+
# calls to further flesh out the HIT, calling to_cache may be
|
374
|
+
# worthwhile.
|
375
|
+
def to_cache
|
376
|
+
#any obj containing a Nokogiri object cannot be stored in pstore - do
|
377
|
+
#not forget this (again)
|
378
|
+
if cacheable?
|
379
|
+
Amazon.cache.transaction do
|
380
|
+
Amazon.cache[self.class.cache_key(@id)] = self
|
381
|
+
end
|
382
|
+
end
|
383
|
+
end
|
384
|
+
|
385
|
+
#Returns an RTurk::Hit instance corresponding to this HIT.
|
386
|
+
def at_amazon
|
387
|
+
Amazon.rturk_hit_full(@id)
|
388
|
+
end
|
389
|
+
|
390
|
+
#Deletes the HIT from Amazon's servers. Examines the HIT and
|
391
|
+
#assignment status to determine whether calling the DisposeHIT
|
392
|
+
#or DisableHIT operation is most appropriate. If the HIT has
|
393
|
+
#been submitted but not approved or rejected, will raise an
|
394
|
+
#exception of type
|
395
|
+
#Typingpool::Error::Amazon::UnreviewedContent. Catch this
|
396
|
+
#exception in your own code if you'd like to automatically
|
397
|
+
#approve such HITs before removing them.
|
398
|
+
def remove_from_amazon
|
399
|
+
if full.status == 'Reviewable'
|
400
|
+
if assignment.status == 'Submitted'
|
401
|
+
raise Error::Amazon::UnreviewedContent, "There is an unreviewed submission for #{url}"
|
402
|
+
end
|
403
|
+
at_amazon.dispose!
|
404
|
+
else
|
405
|
+
at_amazon.disable!
|
406
|
+
end
|
407
|
+
end
|
408
|
+
|
409
|
+
#Returns "the full hit" - a Typingpool::Amazon::HIT::Full
|
410
|
+
#instance associated with this HIT. If the instance is being
|
411
|
+
#created for the first time, this will trigger an HTTP request
|
412
|
+
#to Amazon's servers. "Full" hit fields segregated because
|
413
|
+
#accessing any one of them is expensive if we only have a hit id
|
414
|
+
#(but after fetching one all are cheap). Accepts an optional
|
415
|
+
#Typingpool::Amazon::HIT::Full (or subclass) to set for this
|
416
|
+
#attribute, preventing the need to create one. This is useful in
|
417
|
+
#cases in which extensive HIT data was returned by an Amazon
|
418
|
+
#operation (for example, SearchHITs returns lots of HIT data)
|
419
|
+
def full(full_hit=nil)
|
420
|
+
if @full.nil?
|
421
|
+
@full = full_hit || Full.new(at_amazon)
|
422
|
+
end
|
423
|
+
@full
|
424
|
+
end
|
425
|
+
|
426
|
+
#Returns the assignment associated with this HIT - a
|
427
|
+
#Typingpool::Amazon::HIT::Assignment instance. The first time
|
428
|
+
#this is called, an Amazon HTTP request is typically (but not
|
429
|
+
#always) sent.
|
430
|
+
def assignment
|
431
|
+
if @assignment.nil?
|
432
|
+
if @full && full.assignments_completed == 0
|
433
|
+
#It would be dangerous to do this if the HIT were to be
|
434
|
+
#cached, since we would then never check for the
|
435
|
+
#assignment again. But we know this HIT won't be cached
|
436
|
+
#while it is active, since we only cache approved and
|
437
|
+
#rejected HITs.
|
438
|
+
@assignment = Assignment::Empty.new
|
439
|
+
else
|
440
|
+
@assignment = Assignment.new(at_amazon) #expensive
|
441
|
+
end
|
442
|
+
end
|
443
|
+
@assignment
|
444
|
+
end
|
445
|
+
|
446
|
+
|
447
|
+
#private
|
448
|
+
|
449
|
+
def stashed_param(param)
|
450
|
+
if @assignment && assignment.answers[param]
|
451
|
+
return assignment.answers[param]
|
452
|
+
elsif full.annotation[param]
|
453
|
+
#A question assigned through this software. May be
|
454
|
+
#expensive: May result in HTTP request to fetch HIT
|
455
|
+
#fields. We choose to fetch (sometimes) the HIT rather than
|
456
|
+
#the assignment on the assumption it will be MORE common to
|
457
|
+
#encounter HITs with no answers and LESS common to encounter
|
458
|
+
#HITs assigned through the RUI (and thus lacking in an
|
459
|
+
#annotation from this software and thus rendering the HTTP
|
460
|
+
#request to fetch the HIT fields pointless).
|
461
|
+
return full.annotation[param]
|
462
|
+
elsif full.assignments_completed.to_i >= 1
|
463
|
+
#A question assigned through Amazon's RUI, with an answer
|
464
|
+
#submitted. If the HIT belongs to this software, this
|
465
|
+
#assignment's answers will include our param. We prefer
|
466
|
+
#fetching the assignment to fetching the external question
|
467
|
+
#(as below) because fetching the assignment will potentially
|
468
|
+
#save us an HTTP request down the line -- for example, if we
|
469
|
+
#need other assignment data (e.g. assignment status).
|
470
|
+
#Fetching the external question only serves to give us
|
471
|
+
#access to params. If the answers do not include our param,
|
472
|
+
#we know the HIT does not belong to this software, since we
|
473
|
+
#know the param was also not in the annotation. So we are
|
474
|
+
#safe returning nil in that case.
|
475
|
+
return assignment.answers[param]
|
476
|
+
else
|
477
|
+
#A question assigned via Amazon's RUI, with no answer
|
478
|
+
#submitted. Expensive: Results in HTTP request to fetch
|
479
|
+
#external question.
|
480
|
+
return full.external_question_param(param)
|
481
|
+
end
|
482
|
+
end
|
483
|
+
|
484
|
+
def assignment_status_match?(status)
|
485
|
+
if @full
|
486
|
+
return false if full.assignments_completed == 0
|
487
|
+
return false if full.status != 'Reviewable'
|
488
|
+
end
|
489
|
+
assignment.status == status
|
490
|
+
end
|
491
|
+
|
492
|
+
|
493
|
+
@@cacheable_assignment_status = Set.new %w(Approved Rejected)
|
494
|
+
def cacheable?
|
495
|
+
if @ours == false
|
496
|
+
return true
|
497
|
+
end
|
498
|
+
if @full
|
499
|
+
return true if full.expired_and_overdue?
|
500
|
+
end
|
501
|
+
if @assignment && assignment.status
|
502
|
+
return true if @@cacheable_assignment_status.include?(assignment.status)
|
503
|
+
end
|
504
|
+
return false
|
505
|
+
end
|
506
|
+
|
507
|
+
class Full
|
508
|
+
require 'uri'
|
509
|
+
require 'open-uri'
|
510
|
+
require 'nokogiri'
|
511
|
+
|
512
|
+
#See the RTurk documentation and Amazon Mechanical Turk API
|
513
|
+
#documentation for more on these fields.
|
514
|
+
attr_reader :id, :type_id, :status, :external_question_url, :assignments_completed, :assignments_pending, :expires_at, :assignments_duration
|
515
|
+
|
516
|
+
#Constructor. Takes an RTurk::HIT instance.
|
517
|
+
def initialize(rturk_hit)
|
518
|
+
import_standard_attrs_from_rturk_hit(rturk_hit)
|
519
|
+
@assignments_completed = rturk_hit.assignments_completed_count
|
520
|
+
@assignments_pending = rturk_hit.assignments_pending_count
|
521
|
+
self.annotation = rturk_hit.annotation
|
522
|
+
self.external_question_url = rturk_hit.xml
|
523
|
+
end
|
524
|
+
|
525
|
+
#Returns the HIT annotation as a hash. If the annotation
|
526
|
+
#contained URL-encoded form key-value pairs, it decodes them
|
527
|
+
#and returns them as a hash. Otherwise, returns an empty hash
|
528
|
+
#(throwing away any annotation text that is not URL-encoded
|
529
|
+
#key-value pairs, for example the tags attached by the Amazon
|
530
|
+
#Mechanical Turk RUI).
|
531
|
+
def annotation
|
532
|
+
@annotation ||= {}
|
533
|
+
end
|
534
|
+
|
535
|
+
#Returns boolean indicating whether the HIT is
|
536
|
+
#expired. Determined by comparing the HIT's expires_at
|
537
|
+
#attribute with the current time.
|
538
|
+
def expired?
|
539
|
+
expires_at < Time.now
|
540
|
+
end
|
541
|
+
|
542
|
+
#Returns boolean indicating whether the HIT is expired and
|
543
|
+
#overdue, at which point it is totally safe to prune. This is
|
544
|
+
#determined by adding the assignment duration (how long a
|
545
|
+
#worker has to complete the HIT) to the HIT's expires_at time
|
546
|
+
#(when the HIT is removed from the Mechanical Turk
|
547
|
+
#marketplace).
|
548
|
+
def expired_and_overdue?
|
549
|
+
(expires_at + assignments_duration) < Time.now
|
550
|
+
end
|
551
|
+
|
552
|
+
#Returns the HTML of the external question associated with the
|
553
|
+
#HIT. All Typingpool HITs use external questions (as opposed
|
554
|
+
#to "internal" HIT QuestionForms), so this should always
|
555
|
+
#return something. In first use, must make an HTTP request to
|
556
|
+
#obtain the HTML.
|
557
|
+
def external_question
  if @external_question.nil?
    question_url = external_question_url
    #\A anchors at the start of the string; ^ would also match
    #after an embedded newline.
    if question_url && question_url.match(/\Ahttp/)
      #expensive, obviously. Fetched through open-uri's URI#read
      #rather than Kernel#open, which no longer opens URLs as of
      #Ruby 3.0 and treats a leading '|' as a command to run.
      @external_question = URI.parse(question_url).read
    end
  end
  @external_question
end
|
566
|
+
|
567
|
+
#Takes the name of an HTML form param and returns the value
|
568
|
+
#associated with that param in the external question
|
569
|
+
#HTML. Triggers an HTTP request on first use (unless
|
570
|
+
#external_question has already been called).
|
571
|
+
def external_question_param(param)
|
572
|
+
if external_question
|
573
|
+
if input = Nokogiri::HTML::Document.parse(external_question).css("input[name=#{param}]")[0]
|
574
|
+
return input['value']
|
575
|
+
end
|
576
|
+
end
|
577
|
+
end
|
578
|
+
|
579
|
+
protected
|
580
|
+
|
581
|
+
def import_standard_attrs_from_rturk_hit(hit)
|
582
|
+
%w(id type_id status expires_at assignments_duration).each do |attr|
|
583
|
+
instance_variable_set("@#{attr}", hit.send(attr))
|
584
|
+
end
|
585
|
+
end
|
586
|
+
|
587
|
+
def annotation=(encoded)
|
588
|
+
@annotation = CGI.unescapeHTML(encoded.to_s)
|
589
|
+
begin
|
590
|
+
@annotation = URI.decode_www_form(@annotation)
|
591
|
+
@annotation = Hash[*@annotation.flatten]
|
592
|
+
rescue ArgumentError
|
593
|
+
#Handle annotations like Department:Transcription (from
|
594
|
+
#the Amazon RUI), which make URI.decode_www_form barf
|
595
|
+
@annotation = {}
|
596
|
+
end
|
597
|
+
end
|
598
|
+
|
599
|
+
def external_question_url=(noko_xml)
  node = noko_xml.css('HIT Question eq|ExternalQuestion eq|ExternalURL', {'eq' => 'http://mechanicalturk.amazonaws.com/AWSMechanicalTurkDataSchemas/2006-07-14/ExternalQuestion.xsd'})[0]
  #When the selector matches nothing (presumably a HIT that does not
  #use an external question) there is no node to read: leave the
  #attribute unset instead of calling inner_text on nil.
  @external_question_url = node.inner_text if node
end
|
604
|
+
|
605
|
+
#For more on why this subclass is necessary, see the
|
606
|
+
#documentation for
|
607
|
+
#Typingpool::Amazon::HIT.cached_or_new_from_searchhits. In
|
608
|
+
#short, RTurk::HITParser objects returned by RTurk::SearchHITs
|
609
|
+
#are pointlessly and subtly different from
|
610
|
+
#RTurk::GetHITResponse objects. (I need to submit a patch to
|
611
|
+
#RTurk.)
|
612
|
+
class FromSearchHITs < Full
  #Constructor. Takes an RTurk::Hit instance and the text of
  #the HIT's annotation. The annotation text is passed separately
  #because RTurk::Hit instances returned by RTurk::SearchHITs do
  #not extract the annotation into an attribute, so we have to do
  #that ourselves (elsewhere) using the raw xml.
  def initialize(rturk_hit, annotation)
    import_standard_attrs_from_rturk_hit(rturk_hit)
    @assignments_completed = rturk_hit.completed_assignments
    @assignments_pending = rturk_hit.pending_assignments
    self.annotation = annotation
  end

  protected

  #Returns the stored external question URL. On first use, calls
  #at_amazon and passes its xml to the external_question_url=
  #setter; once that has completed, later calls reuse the stored
  #value.
  def external_question_url
    return @external_question_url if @checked_question
    self.external_question_url = at_amazon.xml
    @checked_question = true
    @external_question_url
  end

  #Returns whatever Amazon.rturk_hit_full gives back for this
  #HIT's id.
  def at_amazon
    Amazon.rturk_hit_full(@id)
  end
end #Amazon::HIT::Full::FromSearchHITs
|
640
|
+
end #Amazon::HIT::Full
|
641
|
+
|
642
|
+
class Assignment

  #See the RTurk documentation and Amazon Mechanical Turk API
  #documentation for more on these fields.
  attr_reader :id, :status, :worker_id, :submitted_at

  #Constructor. Takes an RTurk::Hit instance. Only the first
  #assignment of the HIT is read; when the HIT has none, every
  #field is left unset.
  def initialize(rturk_hit)
    first = rturk_hit.assignments[0] #expensive!
    return unless first
    @id = first.id
    @status = first.status
    @worker_id = first.worker_id
    @submitted_at = first.submitted_at
    given = first.answers
    @answers = given.to_hash if given
  end

  #Returns the answers associated with this assignment as a
  #hash. If there are no answers, returns an empty hash.
  def answers
    @answers ||= {}
  end

  #Returns the transcription submitted by the user as raw text:
  #the 'transcription' answer, or else the answer keyed '1', or
  #else an empty string.
  def body
    text = answers['transcription']
    text ||= answers['1']
    text.to_s
  end

  #Returns an RTurk::Assignment object corresponding to this
  #assignment.
  def at_amazon
    RTurk::Assignment.new(@id)
  end

  #Subclass used in cases where we know Amazon's servers have no
  #assignments for us (because hit.full.assignments_completed ==
  #0), so we don't want to bother doing an HTTP request to
  #check.
  class Empty < Assignment
    #Takes no arguments and starts with an empty answers hash.
    def initialize
      @answers = {}
    end

  end #Empty
end #Assignment
|
689
|
+
end #HIT
|
690
|
+
|
691
|
+
#Class encapsulating the HTML form presented to Mechanical Turk workers
|
692
|
+
#transcribing a Typingpool audio chunk.
|
693
|
+
class Question
  require 'nokogiri'
  require 'uri'
  require 'cgi'
  attr_reader :url, :html

  #Constructor. Takes the URL of where the question HTML has been
  #uploaded, followed by the question HTML itself.
  def initialize(url, html)
    @url = url
    @html = html
  end

  #Returns URL-encoded (and then HTML-escaped) key-value pairs
  #that can be used as the text for a HIT#annotation. The pairs
  #come from the hidden HTML form fields in the question HTML
  #whose name starts with 'typingpool_'.
  def annotation
    #to_s guards against a hidden input that has no name attribute
    #at all; such inputs are skipped rather than raising.
    fields = noko.css('input[type="hidden"]').select{|e| e['name'].to_s =~ /^typingpool_/ }
    pairs = fields.map{|e| [e['name'], e['value']] }
    #Going through a hash keeps one entry per field name (the last
    #value wins).
    CGI.escapeHTML(URI.encode_www_form(Hash[pairs]))
  end

  #Returns the title, extracted from the first title element of
  #the HTML.
  #NOTE(review): presumably raises NoMethodError when the HTML has
  #no title element, since [0] would then be nil - confirm.
  def title
    noko.css('title')[0].content
  end

  #Returns the description, extracted from the first element with
  #the id 'description' in the HTML.
  #NOTE(review): presumably raises NoMethodError when no such
  #element exists, since [0] would then be nil - confirm.
  def description
    noko.css('#description')[0].content
  end

  protected

  #Parses the given HTML (the question HTML by default) as UTF-8
  #and returns the resulting Nokogiri document. Every call parses
  #afresh.
  def noko(html=@html)
    Nokogiri::HTML(html, nil, 'UTF-8')
  end
end #Question
|
731
|
+
end #Amazon
|
732
|
+
end #Typingpool
|