typingpool 0.7.0 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. data/LICENSE +20 -0
  2. data/README.markdown +452 -0
  3. data/lib/typingpool/amazon/hit/assignment/empty.rb +19 -0
  4. data/lib/typingpool/amazon/hit/assignment.rb +43 -0
  5. data/lib/typingpool/amazon/hit/full/fromsearchhits.rb +44 -0
  6. data/lib/typingpool/amazon/hit/full.rb +105 -0
  7. data/lib/typingpool/amazon/hit.rb +458 -0
  8. data/lib/typingpool/amazon/question.rb +45 -0
  9. data/lib/typingpool/amazon.rb +3 -677
  10. data/lib/typingpool/app/cli/formatter.rb +16 -0
  11. data/lib/typingpool/app/cli.rb +64 -0
  12. data/lib/typingpool/app/friendlyexceptions.rb +34 -0
  13. data/lib/typingpool/app.rb +2 -97
  14. data/lib/typingpool/config/root.rb +114 -0
  15. data/lib/typingpool/config.rb +13 -119
  16. data/lib/typingpool/filer/audio.rb +84 -0
  17. data/lib/typingpool/filer/csv.rb +57 -0
  18. data/lib/typingpool/filer/dir.rb +76 -0
  19. data/lib/typingpool/filer/files/audio.rb +63 -0
  20. data/lib/typingpool/filer/files.rb +55 -0
  21. data/lib/typingpool/filer.rb +4 -313
  22. data/lib/typingpool/project/local.rb +117 -0
  23. data/lib/typingpool/project/remote/s3.rb +135 -0
  24. data/lib/typingpool/project/remote/sftp.rb +100 -0
  25. data/lib/typingpool/project/remote.rb +65 -0
  26. data/lib/typingpool/project.rb +2 -396
  27. data/lib/typingpool/template/assignment.rb +17 -0
  28. data/lib/typingpool/template/env.rb +77 -0
  29. data/lib/typingpool/template.rb +2 -87
  30. data/lib/typingpool/test/script.rb +310 -0
  31. data/lib/typingpool/test.rb +1 -306
  32. data/lib/typingpool/transcript/chunk.rb +129 -0
  33. data/lib/typingpool/transcript.rb +1 -125
  34. data/lib/typingpool/utility/castable.rb +65 -0
  35. data/lib/typingpool/utility.rb +1 -61
  36. data/test/test_integration_script_6_tp_finish.rb +1 -0
  37. metadata +135 -81
@@ -2,6 +2,9 @@ module Typingpool
2
2
  class Amazon
3
3
  require 'rturk'
4
4
  require 'pstore'
5
+ require 'typingpool/amazon/hit'
6
+ require 'typingpool/amazon/question'
7
+
5
8
  @@cache_file = '~/.typingpool.cache'
6
9
 
7
10
  class << self
@@ -51,682 +54,5 @@ module Typingpool
51
54
  end
52
55
 
53
56
  end #class << self
54
-
55
- #Class representing an Amazon Mechanical Turk Human Intelligence
56
- #Task (HIT).
57
- #
58
- #We go above and beyond RTurk::Hit for several practical reasons:
59
- # * To allow easy serialization. Caching is a very useful way of
60
- # reducing network calls to Amazon, and thus of speeding up
61
- # Typingpool. RTurk::Hit objects cannot be dumped via Marshal,
62
- # apparently due to some Nokogiri objects they
63
- # contain. Typingpool::Amazon::HIT objects, in contrast, are
64
- # designed to be easily and compactly serialized. They store the
65
- # minimal subset of information we need via simple
66
- # attributes. (Presently we serialize via PStore.)
67
- # * To attach convenience methods. RTurk does not make it easy,
68
- # for example, to get HITs beyond the first "page" returned by
69
- # Amazon. This class provides methods that make it easy to get
70
- # ALL HITs returned by various operations.
71
- # * To attach methods specific to Typingpool. For example, the url
72
- # and project_id methods read params we've embedded in the
73
- # annotation or in hidden fields on an external question, while
74
- # the underlying stashed_params method optimizes its lookup of
75
- # these variables based on how the app is most likely to be
76
- # used. See also the ours? and cacheable? methods.
77
- # * To simplify. Typingpool HITs are constrained such that we can
78
- # assume they all contain only one assignment and thus only a
79
- # maximum of one answer. Also, once we've determined that a HIT
80
- # does not belong to Typingpool, it is safe to cache it forever
81
- # and never download it again from Amazon.
82
- # * To clearly partition methods that result in network
83
- # calls. When you access an attribute under hit.full, like
84
- # hit.full.status, it is clear you are doing something
85
- # potentially expensive to obtain your hit status. Same thing
86
- # with accessing an attribute under hit.assignment, like
87
- # hit.assignment.worker_id -- it is clear an assignment object
88
- # will need to be created, implying a network call. Calling
89
- # hit.id, in contrast, is always fast. (Caveat: Accessing
90
- # partitioned attributes often, but not always, results in a
91
- # network call. In some cases, hit.full is generated at the same
92
- # time we create the hit, since we've obtained a full HIT
93
- # serialization from Amazon. In other cases, we only have a HIT
94
- # id, so accessing anything under hit.full generates a network
95
- # call.)
96
- class HIT
97
- require 'set'
98
- require 'uri'
99
-
100
- class << self
101
-
102
- #Constructor. Creates an Amazon Mechanical Turk HIT.
103
- #** Warning: This method can spend your money! **
104
- # ==== Params
105
- # [question] Typingpool::Amazon::Question instance, used not
106
- # only to generate the (external) question but
107
- # also parsed to provide one or more core HIT
108
- # attributes. Must include a non-nil
109
- # annotation attribute. Provides fallback
110
- # values for HIT title and description.
111
- # [config_assign] The 'assign' attribute of a
112
- # Typingpool::Config instance (that is, a
113
- # Typingpool::Config::Root::Assign
114
- # instance). Must include values for reward,
115
- # lifetime, duration, and approval. May
116
- # include values for keywords and
117
- # qualifications. Preferred source for HIT
118
- # title and description. See
119
- # Typingpool::Config documentation for further
120
- # details.
121
- # ==== Returns
122
- # Typingpool::Amazon::HIT instance corresponding to the new
123
- # Mechanical Turk HIT.
124
- def create(question, config_assign)
125
- new(RTurk::Hit.create(:title => config_assign.title || question.title) do |hit|
126
- hit.description = config_assign.description || question.description
127
- hit.question(question.url)
128
- hit.note = question.annotation or raise Error, "Missing annotation from question"
129
- hit.reward = config_assign.reward or raise Error, "Missing reward config"
130
- hit.assignments = 1
131
- hit.lifetime = config_assign.lifetime or raise Error, "Missing lifetime config"
132
- hit.duration = config_assign.deadline or raise Error, "Missing deadline config"
133
- hit.auto_approval = config_assign.approval or raise Error, "Missing approval config"
134
- hit.keywords = config_assign.keywords if config_assign.keywords
135
- config_assign.qualify.each{|q| hit.qualifications.add(*q.to_arg)} if config_assign.qualify
136
- end)
137
- end
138
-
139
- #Name of the hidden HTML form field used to provide the
140
- #project_id in an external question or (form-encoded)
141
- #annotation. Hard coded to typingpool_project_id but
142
- #overridable in a subclass.
143
- def id_at
144
- @@id_at ||= 'typingpool_project_id'
145
- end
146
-
147
- #Name of the hidden HTML form field used to provide the
148
- #(audio) url in an external question or (form-encoded)
149
- #annotation. Hard coded to typingpool_url but overridable in a
150
- #subclass.
151
- def url_at
152
- @@url_at ||= 'typingpool_url'
153
- end
154
-
155
- #Takes an array of HIT ids, returns Typingpool::Amazon::HIT
156
- #instances corresponding to those ids.
157
- def with_ids(ids)
158
- ids.map{|id| cached_or_new(RTurk::Hit.new(id)) }
159
- end
160
-
161
- #Returns all Typingpool HITs that have been approved, as an
162
- #array of Typingpool::Amazon::HIT instances.
163
- def all_approved
164
- hits = all_reviewable do |hit|
165
- begin
166
- #optimization: we assume it is more common to have an
167
- #unapproved HIT than an approved HIT that does not
168
- #belong to this app
169
- hit.approved? && hit.ours?
170
- rescue RestClient::ServiceUnavailable => e
171
- warn "Warning: Service unavailable error, skipped HIT #{hit.id}. (Error: #{e})"
172
- false
173
- end
174
- end
175
- hits
176
- end
177
-
178
- #Returns as an array of Typingpool::Amazon::HIT instances all
179
- #HITs returned by Amazon's GetReviewableHITs operation (which
180
- #have HIT status == 'Reviewable'). Takes an optional filter
181
- #block (which should return true for HITs to be included in
182
- #the final results). If not supplied, will filter so the
183
- #returned hits are all Typingpool HITs (hit.ours? == true).
184
- def all_reviewable(&filter)
185
- hits = each_page do |page_number|
186
- RTurk.GetReviewableHITs(:page_number => page_number).hit_ids.map{|id| RTurk::Hit.new(id) }.map{|hit| cached_or_new(hit) }
187
- end
188
- filter_ours(hits, &filter)
189
- end
190
-
191
- #Takes a Typingpool::Project::Local#id and returns all HITs
192
- #associated with that project, as an array of
193
- #Typingpool::Amazon::HIT instances.
194
- def all_for_project(id)
195
- all{|hit| hit.ours? && hit.project_id == id}
196
- end
197
-
198
- #Returns all HITs associated with your AWS account as an array
199
- #of Typingpool::Amazon::HIT instances. Takes an optional
200
- #filter block (which should return true for HITs to be
201
- #included in the final results). If not supplied, will filter
202
- #so the returned hits are all Typingpool HITs (hit.ours? ==
203
- #true).
204
- def all(&filter)
205
- hits = each_page do |page_number|
206
- page = RTurk::SearchHITs.create(:page_number => page_number)
207
- raw_hits = page.xml.xpath('//HIT')
208
- page.hits.map do |rturk_hit|
209
- annotation = raw_hits.shift.xpath('RequesterAnnotation').inner_text.strip
210
- full = Amazon::HIT::Full::FromSearchHITs.new(rturk_hit, annotation)
211
- cached_or_new_from_searchhits(rturk_hit, annotation)
212
- end
213
- end
214
- filter_ours(hits, &filter)
215
- end
216
-
217
- #protected
218
-
219
- #Constructor. Takes an RTurk::Hit instance. Returns a
220
- #Typingpool::Amazon::HIT instance, preferably from the cache.
221
- def cached_or_new(rturk_hit)
222
- from_cache(rturk_hit.id) || new(rturk_hit)
223
- end
224
-
225
- #Constructor. Same as cached_or_new, but handles peculiarities
226
- #of objects returned by RTurk::SearchHITs. Such objects map
227
- #two Amazon HIT fields to different names than those used by
228
- #other RTurk HIT instances. They also do not bother to extract
229
- #the annotation from the Amazon HIT, so we have to do that
230
- #ourselves (elsewhere) and take it as a param here. Finally,
231
- #on the bright side, RTurk::SearchHITs already contain a big
232
- #chunk of hit.full attributes, potentially obviating the need
233
- #for an additional network call to flesh out the HIT, so this
234
- #method pre-fleshes-out the HIT.
235
- def cached_or_new_from_searchhits(rturk_hit, annotation)
236
- if not (typingpool_hit = from_cache(rturk_hit.id))
237
- typingpool_hit = new(rturk_hit)
238
- typingpool_hit.full(Amazon::HIT::Full::FromSearchHITs.new(rturk_hit, annotation))
239
- end
240
- typingpool_hit
241
- end
242
-
243
- def from_cache(hit_id, id_at=self.id_at, url_at=self.url_at)
244
- Amazon.cache.transaction do
245
- Amazon.cache[cache_key(hit_id, id_at, url_at)]
246
- end
247
- end
248
-
249
- def delete_cache(hit_id, id_at=self.id_at, url_at=self.url_at)
250
- Amazon.cache.transaction do
251
- key = cache_key(hit_id, id_at, url_at)
252
- cached = Amazon.cache[key]
253
- Amazon.cache.delete(key) unless cached.nil?
254
- end
255
- end
256
-
257
- def cache_key(hit_id, id_at=self.id_at, url_at=self.url_at)
258
- "RESULT///#{hit_id}///#{url_at}///#{id_at}"
259
- end
260
-
261
- def each_page
262
- results = []
263
- page = 0
264
- begin
265
- page += 1
266
- new_results = yield(page)
267
- results.push(*new_results)
268
- end while new_results.count > 0
269
- results
270
- end
271
-
272
- def filter_ours(hits, &filter)
273
- filter ||= lambda{|hit| hit.ours? }
274
- hits.select do |hit|
275
- selected = filter.call(hit)
276
- hit.to_cache
277
- selected
278
- end
279
- end
280
- end #class << self
281
-
282
- #Corresponds to the Amazon Mechanical Turk HIT#HITId
283
- attr_reader :id
284
-
285
- #Constructor. Takes an RTurk::Hit instance.
286
- def initialize(rturk_hit)
287
- @id = rturk_hit.id
288
- end
289
-
290
- #URL of the audio file associated with this HIT (the audio file
291
- #to be transcribed). Extracted from the annotation (when the HIT
292
- #was assigned via Typingpool) or from a hidden field in the HTML
293
- #form on the external question (when the HIT was assigned via
294
- #the Amazon Mechanical Turk RUI).
295
- def url
296
- @url ||= stashed_param(self.class.url_at)
297
- end
298
-
299
- #The Typingpool::Project::Local#id associated with this
300
- #HIT. Extracted as described for the url method.
301
- def project_id
302
- @project_id ||= stashed_param(self.class.id_at)
303
- end
304
-
305
- #Returns the Typingpool::Project#name associated with this HIT
306
- #by parsing the #url. May be dropped in a future release.
307
- def project_title_from_url(url=self.url)
308
- matches = Project.url_regex.match(url) or raise Error::Argument::Format, "Unexpected format to url '#{url}'"
309
- URI.unescape(matches[2])
310
- end
311
-
312
- #Returns true if this HIT has an approved assignment associated
313
- #with it. (Attached to Typingpool::Amazon::HIT rather than
314
- #Typingpool::Amazon::HIT::Assignment because sometimes we can
315
- #tell simply from looking at hit.full that there are no approved
316
- #assignments -- hit.full.assignments_completed == 0. This check
317
- #is only performed when hit.full has already been loaded.)
318
- def approved?
319
- assignment_status_match?('Approved')
320
- end
321
-
322
- #Returns true if this HIT has a rejected assignment associated
323
- #with it. (For an explanation of why this is not attached to
324
- #Typingpool::Amazon::HIT::Assignment, see the documentation for
325
- #approved?.)
326
- def rejected?
327
- assignment_status_match?('Rejected')
328
- end
329
-
330
- #Returns true if this HIT has a submitted assignment associated
331
- #with it. (For an explanation of why this is not attached to
332
- #Typingpool::Amazon::HIT::Assignment, see the documentation for
333
- #approved?.)
334
- def submitted?
335
- assignment_status_match?('Submitted')
336
- end
337
-
338
-
339
- #Returns true if this HIT is associated with Typingpool. One
340
- #Amazon account can be used for many tasks, so it's important to
341
- #check whether the HIT belongs to this software. (Presently,
342
- #this is determined by looking for a stashed param like url or
343
- #project_id).
344
- def ours?
345
- @ours ||= not(url.to_s.empty?)
346
- end
347
-
348
- #Returns a Typingpool::Transcript::Chunk instance built using
349
- #this HIT and its associated assignment.
350
- def transcript
351
- transcript = Transcript::Chunk.new(assignment.body)
352
- transcript.url = url
353
- transcript.project = project_id
354
- transcript.worker = assignment.worker_id
355
- transcript.hit = @id
356
- transcript
357
- end
358
-
359
- #If this HIT is cacheable, serializes it to the cache file
360
- #specified in the config passed to Amazon.setup, or specified in
361
- #the default config file. In short, a HIT is cacheable if it
362
- #does not belong to Typingpool (ours? == false), if it is
363
- #approved or rejected (approved? || rejected?), or if it is
364
- #expired (full.expired_and_overdue?). See also cacheable? code.
365
- #
366
- # When available, cached HITs are used by
367
- # Typingpool::Amazon::HIT.all,
368
- # Typingpool::Amazon::HIT.all_approved, and all the other class
369
- # methods that retrieve HITs. These methods call to_cache for
370
- # you at logical times (after downloading and filtering, when
371
- # the HIT is most fleshed out), so you should not need to call
372
- # this yourself. But if you have an operation that makes network
373
- # calls to further flesh out the HIT, calling to_cache may be
374
- # worthwhile.
375
- def to_cache
376
- #any obj containing a Nokogiri object cannot be stored in pstore - do
377
- #not forget this (again)
378
- if cacheable?
379
- Amazon.cache.transaction do
380
- Amazon.cache[self.class.cache_key(@id)] = self
381
- end
382
- end
383
- end
384
-
385
- #Returns an RTurk::Hit instance corresponding to this HIT.
386
- def at_amazon
387
- Amazon.rturk_hit_full(@id)
388
- end
389
-
390
- #Deletes the HIT from Amazon's servers. Examines the HIT and
391
- #assignment status to determine whether calling the DisposeHIT
392
- #or DisableHIT operation is most appropriate. If the HIT has
393
- #been submitted but not approved or rejected, will raise an
394
- #exception of type
395
- #Typingpool::Error::Amazon::UnreviewedContent. Catch this
396
- #exception in your own code if you'd like to automatically
397
- #approve such HITs before removing them.
398
- def remove_from_amazon
399
- if full.status == 'Reviewable'
400
- if assignment.status == 'Submitted'
401
- raise Error::Amazon::UnreviewedContent, "There is an unreviewed submission for #{url}"
402
- end
403
- at_amazon.dispose!
404
- else
405
- at_amazon.disable!
406
- end
407
- end
408
-
409
- #Returns "the full hit" - a Typingpool::Amazon::HIT::Full
410
- #instance associated with this HIT. If the instance is being
411
- #created for the first time, this will trigger an HTTP request
412
- #to Amazon's servers. "Full" hit fields segregated because
413
- #accessing any one of them is expensive if we only have a hit id
414
- #(but after fetching one all are cheap). Accepts an optional
415
- #Typingpool::Amazon::HIT::Full (or subclass) to set for this
416
- #attribute, preventing the need to create one. This is useful in
417
- #cases in which extensive HIT data was returned by an Amazon
418
- #operation (for example, SearchHITs returns lots of HIT data)
419
- def full(full_hit=nil)
420
- if @full.nil?
421
- @full = full_hit || Full.new(at_amazon)
422
- end
423
- @full
424
- end
425
-
426
- #Returns the assignment associated with this HIT - a
427
- #Typingpool::Amazon::HIT::Assignment instance. The first time
428
- #this is called, an Amazon HTTP request is typically (but not
429
- #always) sent.
430
- def assignment
431
- if @assignment.nil?
432
- if @full && full.assignments_completed == 0
433
- #It would be dangerous to do this if the HIT were to be
434
- #cached, since we would then never check for the
435
- #assignment again. But we know this HIT won't be cached
436
- #while it is active, since we only cache approved and
437
- #rejected HITs.
438
- @assignment = Assignment::Empty.new
439
- else
440
- @assignment = Assignment.new(at_amazon) #expensive
441
- end
442
- end
443
- @assignment
444
- end
445
-
446
-
447
- #private
448
-
449
- def stashed_param(param)
450
- if @assignment && assignment.answers[param]
451
- return assignment.answers[param]
452
- elsif full.annotation[param]
453
- #A question assigned through this software. May be
454
- #expensive: May result in HTTP request to fetch HIT
455
- #fields. We choose to fetch (sometimes) the HIT rather than
456
- #the assignment on the assumption it will be MORE common to
457
- #encounter HITs with no answers and LESS common to encounter
458
- #HITs assigned through the RUI (and thus lacking in an
459
- #annotation from this software and thus rendering the HTTP
460
- #request to fetch the HIT fields pointless).
461
- return full.annotation[param]
462
- elsif full.assignments_completed.to_i >= 1
463
- #A question assigned through Amazon's RUI, with an answer
464
- #submitted. If the HIT belongs to this software, this
465
- #assignment's answers will include our param. We prefer
466
- #fetching the assignment to fetching the external question
467
- #(as below) because fetching the assignment will potentially
468
- #save us an HTTP request down the line -- for example, if we
469
- #need other assignment data (e.g. assignment status).
470
- #Fetching the external question only serves to give us
471
- #access to params. If the answers do not include our param,
472
- #we know the HIT does not belong to this software, since we
473
- #know the param was also not in the annotation. So we are
474
- #safe returning nil in that case.
475
- return assignment.answers[param]
476
- else
477
- #A question assigned via Amazon's RUI, with no answer
478
- #submitted. Expensive: Results in HTTP request to fetch
479
- #external question.
480
- return full.external_question_param(param)
481
- end
482
- end
483
-
484
- def assignment_status_match?(status)
485
- if @full
486
- return false if full.assignments_completed == 0
487
- return false if full.status != 'Reviewable'
488
- end
489
- assignment.status == status
490
- end
491
-
492
-
493
- @@cacheable_assignment_status = Set.new %w(Approved Rejected)
494
- def cacheable?
495
- if @ours == false
496
- return true
497
- end
498
- if @full
499
- return true if full.expired_and_overdue?
500
- end
501
- if @assignment && assignment.status
502
- return true if @@cacheable_assignment_status.include?(assignment.status)
503
- end
504
- return false
505
- end
506
-
507
- class Full
508
- require 'uri'
509
- require 'open-uri'
510
- require 'nokogiri'
511
-
512
- #See the RTurk documentation and Amazon Mechanical Turk API
513
- #documentation for more on these fields.
514
- attr_reader :id, :type_id, :status, :external_question_url, :assignments_completed, :assignments_pending, :expires_at, :assignments_duration
515
-
516
- #Constructor. Takes an RTurk::HIT instance.
517
- def initialize(rturk_hit)
518
- import_standard_attrs_from_rturk_hit(rturk_hit)
519
- @assignments_completed = rturk_hit.assignments_completed_count
520
- @assignments_pending = rturk_hit.assignments_pending_count
521
- self.annotation = rturk_hit.annotation
522
- self.external_question_url = rturk_hit.xml
523
- end
524
-
525
- #Returns the HIT annotation as a hash. If the annotation
526
- #contained URL-encoded form key-value pairs, it decodes them
527
- #and returns them as a hash. Otherwise, returns an empty hash
528
- #(throwing away any annotation text that is not URL-encoded
529
- #key-value pairs, for example the tags attached by the Amazon
530
- #Mechanical Turk RUI).
531
- def annotation
532
- @annotation ||= {}
533
- end
534
-
535
- #Returns boolean indicating whether the HIT is
536
- #expired. Determined by comparing the HIT's expires_at
537
- #attribute with the current time.
538
- def expired?
539
- expires_at < Time.now
540
- end
541
-
542
- #Returns boolean indicating whether the HIT is expired and
543
- #overdue, at which point it is totally safe to prune. This is
544
- #determined by adding the assignment duration (how long a
545
- #worker has to complete the HIT) to the HIT's expires_at time
546
- #(when the HIT is removed from the Mechanical Turk
547
- #marketplace).
548
- def expired_and_overdue?
549
- (expires_at + assignments_duration) < Time.now
550
- end
551
-
552
- #Returns the HTML of the external question associated with the
553
- #HIT. All Typingpool HITs use external questions (as opposed
554
- #to "internal" HIT QuestionForms), so this should always
555
- #return something. In first use, must make an HTTP request to
556
- #obtain the HTML.
557
- def external_question
558
- if @external_question.nil?
559
- if external_question_url && external_question_url.match(/^http/)
560
- #expensive, obviously:
561
- @external_question = open(external_question_url).read
562
- end
563
- end
564
- @external_question
565
- end
566
-
567
- #Takes the name of an HTML form param and returns the value
568
- #associated with that param in the external question
569
- #HTML. Triggers an HTTP request on first use (unless
570
- #external_question has already been called).
571
- def external_question_param(param)
572
- if external_question
573
- if input = Nokogiri::HTML::Document.parse(external_question).css("input[name=#{param}]")[0]
574
- return input['value']
575
- end
576
- end
577
- end
578
-
579
- protected
580
-
581
- def import_standard_attrs_from_rturk_hit(hit)
582
- %w(id type_id status expires_at assignments_duration).each do |attr|
583
- instance_variable_set("@#{attr}", hit.send(attr))
584
- end
585
- end
586
-
587
- def annotation=(encoded)
588
- @annotation = CGI.unescapeHTML(encoded.to_s)
589
- begin
590
- @annotation = URI.decode_www_form(@annotation)
591
- @annotation = Hash[*@annotation.flatten]
592
- rescue ArgumentError
593
- #Handle annotations like Department:Transcription (from
594
- #the Amazon RUI), which make URI.decode_www_form barf
595
- @annotation = {}
596
- end
597
- end
598
-
599
- def external_question_url=(noko_xml)
600
- if url = noko_xml.css('HIT Question eq|ExternalQuestion eq|ExternalURL', {'eq' => 'http://mechanicalturk.amazonaws.com/AWSMechanicalTurkDataSchemas/2006-07-14/ExternalQuestion.xsd'})[0].inner_text
601
- @external_question_url = url
602
- end
603
- end
604
-
605
- #For more on why this subclass is necessary, see the
606
- #documentation for
607
- #Typingpool::Amazon::HIT.cached_or_new_from_searchhits. In
608
- #short, RTurk::HITParser objects returned by RTurk::SearchHITs
609
- #are pointlessly and subtly different from
610
- #RTurk::GetHITResponse objects. (I need to submit a patch to
611
- #RTurk.)
612
- class FromSearchHITs < Full
613
- #Constructor. Takes an RTurk::Hit instance and the text of
614
- #the HIT's annotation. The text of the annotation must be
615
- #submitted as a separate param because RTurk::Hit instances
616
- #returned by RTurk::SearchHITs do not bother to extract the
617
- #annotation into an attribute, so we have to do that
618
- #ourselves (elsewhere) using the raw xml.
619
- def initialize(rturk_hit, annotation)
620
- import_standard_attrs_from_rturk_hit(rturk_hit)
621
- @assignments_completed = rturk_hit.completed_assignments
622
- @assignments_pending = rturk_hit.pending_assignments
623
- self.annotation = annotation
624
- end
625
-
626
- protected
627
-
628
- def external_question_url
629
- unless @checked_question
630
- self.external_question_url = at_amazon.xml
631
- @checked_question = true
632
- end
633
- @external_question_url
634
- end
635
-
636
- def at_amazon
637
- Amazon.rturk_hit_full(@id)
638
- end
639
- end #Amazon::HIT::Full::FromSearchHITs
640
- end #Amazon::HIT::Full
641
-
642
- class Assignment
643
-
644
- #See the RTurk documentation and Amazon Mechanical Turk API
645
- #documentation for more on these fields.
646
- attr_reader :id, :status, :worker_id, :submitted_at
647
-
648
- #Constructor. Takes an RTurk::Hit instance.
649
- def initialize(rturk_hit)
650
- if assignment = rturk_hit.assignments[0] #expensive!
651
- @id = assignment.id
652
- @status = assignment.status
653
- @worker_id = assignment.worker_id
654
- @submitted_at = assignment.submitted_at
655
- if answers = assignment.answers
656
- @answers = answers.to_hash
657
- end
658
- end
659
- end
660
-
661
- #Returns the answers associated with this assignment as a
662
- #hash. If there are no answers, returns an empty hash.
663
- def answers
664
- @answers ||= {}
665
- end
666
-
667
- #Returns the transcription submitted by the user as raw text.
668
- def body
669
- (answers['transcription'] || answers['1']).to_s
670
- end
671
-
672
- #Returns an RTurk::Assignment object corresponding to this
673
- #assignment.
674
- def at_amazon
675
- RTurk::Assignment.new(@id)
676
- end
677
-
678
- #Subclass used in cases where we know Amazon's servers have no
679
- #assignments for us (because hit.full.assignments_completed ==
680
- #0), so we don't want to bother doing an HTTP request to
681
- #check.
682
- class Empty < Assignment
683
- def initialize
684
- @answers = {}
685
- end
686
-
687
- end #Empty
688
- end #Assignment
689
- end #HIT
690
-
691
- #Class encapsulating the HTML form presented to Mechanical Turk workers
692
- #transcribing a Typingpool audio chunk.
693
- class Question
694
- require 'nokogiri'
695
- require 'uri'
696
- require 'cgi'
697
- attr_reader :url, :html
698
-
699
- #Constructor. Takes the URL of where the question HTML has been
700
- #uploaded, followed by the question HTML itself.
701
- def initialize(url, html)
702
- @url = url
703
- @html = html
704
- end
705
-
706
- #Returns URL-encoded key-value pairs that can be used as the
707
- #text for a HIT#annotation. The key-value pairs correspond to
708
- #all hidden HTML form fields in the question HTML.
709
- def annotation
710
- CGI.escapeHTML(URI.encode_www_form(Hash[*noko.css('input[type="hidden"]').select{|e| e['name'].match(/^typingpool_/) }.map{|e| [e['name'], e['value']]}.flatten]))
711
- end
712
-
713
- #Returns the title, extracted from the title element of the
714
- #HTML.
715
- def title
716
- noko.css('title')[0].content
717
- end
718
-
719
- #Returns the description, extracted from the element with the id
720
- #'description' in the HTML.
721
- def description
722
- noko.css('#description')[0].content
723
- end
724
-
725
- protected
726
-
727
- def noko(html=@html)
728
- Nokogiri::HTML(html, nil, 'UTF-8')
729
- end
730
- end #Question
731
57
  end #Amazon
732
58
  end #Typingpool