typingpool 0.7.0 → 0.7.1
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +20 -0
- data/README.markdown +452 -0
- data/lib/typingpool/amazon/hit/assignment/empty.rb +19 -0
- data/lib/typingpool/amazon/hit/assignment.rb +43 -0
- data/lib/typingpool/amazon/hit/full/fromsearchhits.rb +44 -0
- data/lib/typingpool/amazon/hit/full.rb +105 -0
- data/lib/typingpool/amazon/hit.rb +458 -0
- data/lib/typingpool/amazon/question.rb +45 -0
- data/lib/typingpool/amazon.rb +3 -677
- data/lib/typingpool/app/cli/formatter.rb +16 -0
- data/lib/typingpool/app/cli.rb +64 -0
- data/lib/typingpool/app/friendlyexceptions.rb +34 -0
- data/lib/typingpool/app.rb +2 -97
- data/lib/typingpool/config/root.rb +114 -0
- data/lib/typingpool/config.rb +13 -119
- data/lib/typingpool/filer/audio.rb +84 -0
- data/lib/typingpool/filer/csv.rb +57 -0
- data/lib/typingpool/filer/dir.rb +76 -0
- data/lib/typingpool/filer/files/audio.rb +63 -0
- data/lib/typingpool/filer/files.rb +55 -0
- data/lib/typingpool/filer.rb +4 -313
- data/lib/typingpool/project/local.rb +117 -0
- data/lib/typingpool/project/remote/s3.rb +135 -0
- data/lib/typingpool/project/remote/sftp.rb +100 -0
- data/lib/typingpool/project/remote.rb +65 -0
- data/lib/typingpool/project.rb +2 -396
- data/lib/typingpool/template/assignment.rb +17 -0
- data/lib/typingpool/template/env.rb +77 -0
- data/lib/typingpool/template.rb +2 -87
- data/lib/typingpool/test/script.rb +310 -0
- data/lib/typingpool/test.rb +1 -306
- data/lib/typingpool/transcript/chunk.rb +129 -0
- data/lib/typingpool/transcript.rb +1 -125
- data/lib/typingpool/utility/castable.rb +65 -0
- data/lib/typingpool/utility.rb +1 -61
- data/test/test_integration_script_6_tp_finish.rb +1 -0
- metadata +135 -81
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2011-2012 Ryan Tate
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.markdown
ADDED
@@ -0,0 +1,452 @@
|
|
1
|
+
# Typingpool
|
2
|
+
|
3
|
+
Typingpool is an app for easily making transcripts of audio using
|
4
|
+
Amazon's labor marketplace, Mechanical Turk.
|
5
|
+
|
6
|
+
Typingpool is distributed as a Ruby gem. It is made up of a handful
|
7
|
+
of scripts for users and a collection of library files for
|
8
|
+
developers.
|
9
|
+
|
10
|
+
Typingpool also includes a collection of ERB templates for
|
11
|
+
generating Mechanical Turk assignments and the final transcript HTML
|
12
|
+
file.
|
13
|
+
|
14
|
+
## Dependencies
|
15
|
+
|
16
|
+
Typingpool depends on these command-line tools, which are not
|
17
|
+
included in the gem since they are external to Ruby:
|
18
|
+
|
19
|
+
* [ffmpeg] A powerhouse audio/video converter.
|
20
|
+
* [libmp3lame] An mp3 encoder/decoder, used by ffmpeg.
|
21
|
+
* [mp3splt] An audio file-splitting utility.
|
22
|
+
* [mp3wrap] An audio file-merging utility.
|
23
|
+
|
24
|
+
## User overview
|
25
|
+
|
26
|
+
### Setup
|
27
|
+
|
28
|
+
After installing the gem and its dependencies, run tp-config from the
|
29
|
+
command line to create your config file (~/.typingpool). At the
|
30
|
+
prompts, you will need to supply your Amazon Web Services Access Key
|
31
|
+
ID and your Amazon Web Services Secret Access key.
|
32
|
+
|
33
|
+
The config file is in YAML format and may be customized using any
|
34
|
+
text editor. For more details on configuration options, see the
|
35
|
+
documentation for Typingpool::Config.
|
36
|
+
|
37
|
+
### Workflow
|
38
|
+
|
39
|
+
A typical workflow will use the bundled scripts in this order:
|
40
|
+
|
41
|
+
tp-make -> tp-assign -> [wait] -> tp-review -> tp-finish
|
42
|
+
|
43
|
+
tp-review may be called repeatedly, until transcripts for all audio
|
44
|
+
chunks have been processed. Similarly, tp-assign may be called
|
45
|
+
repeatedly, for example to re-assign chunks rejected using tp-review,
|
46
|
+
or to re-assign chunks that have expired.
|
47
|
+
|
48
|
+
An alternate workflow would go like this:
|
49
|
+
|
50
|
+
tp-make -> [manually upload assignments.csv to Amazon RUI] ->
|
51
|
+
[wait] -> [approve/reject assignments via RUI] -> tp-collect ->
|
52
|
+
tp-finish
|
53
|
+
|
54
|
+
### Examples
|
55
|
+
|
56
|
+
Typical usage scenario:
|
57
|
+
|
58
|
+
tp-make 'Chad Interview' chad1.WMA chad2.WMA --unusual 'Hack Day,
|
59
|
+
Yahoo' --subtitle 'Phone interview re Yahoo Hack Day'
|
60
|
+
|
61
|
+
# => Converting chad1.WMA to mp3
|
62
|
+
# => Converting chad2.WMA to mp3
|
63
|
+
# => Merging audio
|
64
|
+
# => Splitting audio into uniform bits
|
65
|
+
# => Uploading Chad Interview.00.00.mp3 to
|
66
|
+
ryantate42.s3.amazonaws.com as Chad
|
67
|
+
Interview.00.00.33ca7f2cceba9f8031bf4fb7c3f819f4.LHFJEM.mp3
|
68
|
+
# => Uploading Chad Interview.01.00.mp3 to
|
69
|
+
ryantate42.s3.amazonaws.com as Chad #
|
70
|
+
Interview.01.00.33ca7f2cceba9f8031bf4fb7c3f819f4.XMWNYW.mp3
|
71
|
+
# => Uploading Chad Interview.02.00.mp3 to
|
72
|
+
ryantate42.s3.amazonaws.com as Chad #
|
73
|
+
Interview.02.00.33ca7f2cceba9f8031bf4fb7c3f819f4.FNEIWN.mp3
|
74
|
+
# => ... [snip]
|
75
|
+
# => Done. Project at:
|
76
|
+
# => /Users/ryantate/Desktop/Transcripts/Chad Interview
|
77
|
+
|
78
|
+
|
79
|
+
tp-assign 'Chad Interview' interview/nameless --reward 1.00
|
80
|
+
--deadline 90m --approval 6h --lifetime 2d
|
81
|
+
|
82
|
+
# => Figuring out what needs to be assigned
|
83
|
+
# => 85 assignments total
|
84
|
+
# => 85 assignments to assign
|
85
|
+
# => Deleting old assignment HTML from ryantate42.s3.amazonaws.com
|
86
|
+
# => Uploading assignment HTML to ryantate42.s3.amazonaws.com
|
87
|
+
# => Assigning
|
88
|
+
# => Assigned 85 transcription jobs for $85
|
89
|
+
# => Remaining balance: $115.00
|
90
|
+
|
91
|
+
[Wait...]
|
92
|
+
|
93
|
+
|
94
|
+
tp-review 'Chad Interview'
|
95
|
+
|
96
|
+
# => Gathering submissions from Amazon
|
97
|
+
# => Matching submissions with local projects
|
98
|
+
# =>
|
99
|
+
# => Transcript for: https://ryantate42.s3.amazonaws.com/
|
100
|
+
Chad%20Interview.29.00.263d492275a81afb005c8231d8d8afdb.
|
101
|
+
UEMOCN.mp3
|
102
|
+
# => Project: Chad Interview: Phone interview re Yahoo Hack Day
|
103
|
+
# => Submitted at: 2012-08-11 17:00:36 -0700 by A9S0AOAI8HO9P
|
104
|
+
# =>
|
105
|
+
# => Chad: ... so it had sort of some geek history. And the
|
106
|
+
# => weather was really bad. But it was an indoor event,
|
107
|
+
# => right? So people were staying indoors. And like very
|
108
|
+
# => early... And there was all this really expensive gear
|
109
|
+
# => that the BBC had. Like these cameras that guys were like
|
110
|
+
# => riding around on and stuff, huge sound stage, bigger than
|
111
|
+
# => the one we had in Sunnyvale.
|
112
|
+
# =>
|
113
|
+
# => Two hours into the event, we heard this big lightning
|
114
|
+
# => strike, because we were up on a hill in London. And all
|
115
|
+
# => the lights went out and the roof opened up in the
|
116
|
+
# => building. What we didn't know is the fire supression
|
117
|
+
# => system in that building which got blown up by the
|
118
|
+
# => lightning during a fire would cause the roof to open
|
119
|
+
# => up. So we had all these geeks with equipment and all this
|
120
|
+
# => BBC equipment and it was literally raining on them.
|
121
|
+
# =>
|
122
|
+
# => (A)pprove, (R)eject, (Q)uit, [(S)kip]? (1/20)
|
123
|
+
|
124
|
+
a
|
125
|
+
|
126
|
+
# => Approved. Chad Interview transcript updated.
|
127
|
+
# =>
|
128
|
+
# => Transcript for: https://ryantate42.s3.amazonaws.com/
|
129
|
+
Chad%20Interview.30.00.263d492275a81afb005c8231d8d8afdb.
|
130
|
+
RXNKRN.mp3
|
131
|
+
# => Project: Chad Interview: Phone interview re Yahoo Hack Day
|
132
|
+
# => Submitted at: 2012-08-11 17:00:58 -0700 by A9S0AOAI8HO9P
|
133
|
+
# =>
|
134
|
+
# => Blah blah blah blah okay I am done typing byeeeeeeee
|
135
|
+
# =>
|
136
|
+
# => (A)pprove, (R)eject, (Q)uit, [(S)kip]? (2/20)
|
137
|
+
|
138
|
+
r
|
139
|
+
|
140
|
+
# => Rejection reason, for worker:
|
141
|
+
|
142
|
+
There's no transcription at all, just nonsense
|
143
|
+
|
144
|
+
# => Rejected
|
145
|
+
# =>
|
146
|
+
# => Transcript for...
|
147
|
+
# => ... [snip]
|
148
|
+
|
149
|
+
tp-finish 'Chad Interview'
|
150
|
+
|
151
|
+
# => Removing from Amazon
|
152
|
+
# => Collecting all results
|
153
|
+
# => Removing HIT 2GKMIKMN9U8PNHKK58NXL3SU4TCBSN (Reviewable)
|
154
|
+
# => Removing from data/assignment.csv
|
155
|
+
# => Removing from local cache
|
156
|
+
# => Removing HIT 2CFX2Q45UUKQ2HXZU8SNV8OG6CQBTC (Assignable)
|
157
|
+
# => Removing from data/assignment.csv
|
158
|
+
# => Removing from local cache
|
159
|
+
# => Removing HIT 294EZZ2MIKMNNDP1LAU8WWWXOEI7O0...
|
160
|
+
# => ... [snip]
|
161
|
+
# => Removing Chad Interview.00.00.
|
162
|
+
263d492275a81afb005c8231d8d8afdb.ORSENE.html from
|
163
|
+
ryantate42.s3.amazonaws.com
|
164
|
+
# => Removing Chad Interview.01.00...
|
165
|
+
# => ... [snip]
|
166
|
+
# => Removing Chad Interview.00.00.
|
167
|
+
263d492275a81afb005c8231d8d8afdb.RNTVLN.mp3 from
|
168
|
+
ryantate42.s3.amazonaws.com
|
169
|
+
# => Removing Chad Interview.01.00....
|
170
|
+
# => ... [snip]
|
171
|
+
|
172
|
+
### Output
|
173
|
+
|
174
|
+
The final output of Typingpool is a project directory containing a
|
175
|
+
transcript file.
|
176
|
+
|
177
|
+
The transcript file is HTML with audio chunks embedded alongside each
|
178
|
+
associated transcript chunk.
|
179
|
+
|
180
|
+
The transcript file is called transcript.html when complete. A
|
181
|
+
partial transcript file is called transcript_in_progress.html.
|
182
|
+
|
183
|
+
The project directory also includes supporting files, including a CSV
|
184
|
+
data file used to store raw transcript chunks, Amazon Mechanical Turk
|
185
|
+
HIT information, and other metadata; JavaScript code that swaps in
|
186
|
+
Flash players on browsers that don't support mp3 files in audio tags;
|
187
|
+
the original audio files and the audio chunks generated from them;
|
188
|
+
and a CSS file.
|
189
|
+
|
190
|
+
The directory is laid out like so:
|
191
|
+
|
192
|
+
Chad Interview/
|
193
|
+
-> transcript.html | transcript_in_progress.html
|
194
|
+
-> audio/
|
195
|
+
-> chunks/
|
196
|
+
-> Chad Interview.00.00.mp3
|
197
|
+
-> Chad Interview.01.00.mp3
|
198
|
+
-> ... [snip]
|
199
|
+
-> originals/
|
200
|
+
-> chad1.WMA
|
201
|
+
-> chad2.WMA
|
202
|
+
-> data/
|
203
|
+
-> assignment.csv
|
204
|
+
-> id.txt
|
205
|
+
-> subtitle.txt
|
206
|
+
-> etc/
|
207
|
+
-> audio-compat.js
|
208
|
+
-> transcript.css
|
209
|
+
-> About these files - readme.txt
|
210
|
+
-> player/
|
211
|
+
-> audio-player.js
|
212
|
+
-> license.txt
|
213
|
+
-> player.swf
|
214
|
+
|
215
|
+
You may safely edit the files transcript.html, etc/transcript.css,
|
216
|
+
and data/subtitle.txt, and you may safely delete the files in
|
217
|
+
audio/originals and any .txt files in etc/. Editing or deleting other
|
218
|
+
files may interfere with the operation of Typingpool or render the
|
219
|
+
transcript inoperative. Do not edit transcript_in_progress.html as
|
220
|
+
your changes will be overwritten if/when the transcript is next
|
221
|
+
updated.
|
222
|
+
|
223
|
+
|
224
|
+
### Workflow (additional)
|
225
|
+
|
226
|
+
When you've rejected some submissions in tp-review and need to
|
227
|
+
re-assign these chunks to be transcribed, simply re-run tp-assign
|
228
|
+
with the name of your project. You may select the same template,
|
229
|
+
reward, deadlines, etc., or pick new ones. tp-assign will be careful
|
230
|
+
not to re-assign chunks for which you have approved a transcript, or
|
231
|
+
which are pending on Mechanical Turk.
|
232
|
+
|
233
|
+
When some chunks previously assigned via tp-assign have expired
|
234
|
+
without attracting submissions, simply re-run tp-assign as described
|
235
|
+
above to re-assign these chunks. Consider increasing the dollar
|
236
|
+
amount specified in your --reward argument.
|
237
|
+
|
238
|
+
When some chunks previously assigned via tp-assign have been
|
239
|
+
submitted by workers but not approved or rejected in time for the
|
240
|
+
approval deadline (assign/approval in your config file or --approval
|
241
|
+
as passed to tp-assign), Mechanical Turk has automatically approved
|
242
|
+
these submissions for you and you'll need to run tp-collect to
|
243
|
+
collect them.
|
244
|
+
|
245
|
+
When you want to cancel outstanding assignments, for example because
|
246
|
+
you realize you supplied the wrong parameter to tp-assign, simply run
|
247
|
+
tp-finish with the name of your project. If your assignments have
|
248
|
+
already attracted submissions, you may be prompted to run tp-review
|
249
|
+
first.
|
250
|
+
|
251
|
+
When tp-make, tp-assign, or tp-finish unsuccessfully attempts an
|
252
|
+
upload, deletion, or Amazon command, simply re-run the script with
|
253
|
+
the same arguments to re-attempt the upload, deletion or Amazon
|
254
|
+
command. Typingpool carefully records which network operations it is
|
255
|
+
attempting and which network operations have completed. It can
|
256
|
+
robustly handle network errors, including uncaught exceptions.
|
257
|
+
|
258
|
+
When you want to preview your assignments, run tp-assign with the
|
259
|
+
--sandbox option and with --qualify 'rejection_rate < 100' (to make
|
260
|
+
sure you qualify to view your own HITs). Then visit
|
261
|
+
http://workersandbox.mturk.com and find your assignments (a search for
|
262
|
+
"mp3" works if you left mp3 set as a keyword in your config
|
263
|
+
file). When you are done previewing, run tp-finish with the name/path
|
264
|
+
of your project and the --sandbox option.
|
265
|
+
|
266
|
+
|
267
|
+
### Maintenance
|
268
|
+
|
269
|
+
* [cache] If the cache file grows too large, you'll need to delete
|
270
|
+
it manually. It may be safely deleted as long as no
|
271
|
+
Typingpool scripts are running. Its location is
|
272
|
+
specified in the 'cache' param in the config
|
273
|
+
file. (The config file is at ~/.typingpool and the
|
274
|
+
cache, by default, is at ~/.typingpool.cache.)
|
275
|
+
|
276
|
+
Typingpool takes no steps to limit the size of the
|
277
|
+
cache file. It prunes the cache of project-specific
|
278
|
+
entries when you run tp-finish on a project, but the
|
279
|
+
cache may grow large if you work on many active
|
280
|
+
projects in parallel, or if you fail to run tp-finish
|
281
|
+
on projects when you are done with them.
|
282
|
+
|
283
|
+
* [tp-finish] You should run tp-finish PROJECT each time you finish
|
284
|
+
a project, where PROJECT may be either the project
|
285
|
+
name or path. Assuming you have no submissions pending
|
286
|
+
or awaiting approval, this clears all traces of the
|
287
|
+
project from Amazon Mechanical Turk, from Amazon S3 or
|
288
|
+
your SFTP server, and from the local cache. This will
|
289
|
+
keep your local cache from ballooning in size and will
|
290
|
+
minimize your S3 charges or SFTP disk usage. It will
|
291
|
+
also help Typingpool scripts run faster by reducing
|
292
|
+
the number of HITs you have on Amazon Mechanical Turk;
|
293
|
+
many Typingpool operations involve iterating through
|
294
|
+
all of your HITs.
|
295
|
+
|
296
|
+
|
297
|
+
### See also
|
298
|
+
|
299
|
+
* Run any script with the --help option for further details on how
|
300
|
+
to run the script.
|
301
|
+
|
302
|
+
* See the docs for Typingpool::Config for details of the config
|
303
|
+
file format.
|
304
|
+
|
305
|
+
* See Amazon's Mechanical Turk documentation for guides and
|
306
|
+
overviews on how Mechanical Turk works.
|
307
|
+
|
308
|
+
* See the documentation on ffmpeg and related libraries for clues
|
309
|
+
as to how to make Typingpool support additional file
|
310
|
+
formats. Typingpool can work with any file format that ffmpeg can
|
311
|
+
convert to mp3 (libmp3lame).
|
312
|
+
|
313
|
+
|
314
|
+
## Developer overview
|
315
|
+
|
316
|
+
Views, used for the final transcript and for rendering HTML
|
317
|
+
assignments for Amazon Mechanical Turk workers, are contained in a
|
318
|
+
series of templates in lib/typingpool/templates, particularly
|
319
|
+
transcript.html.erb and assignment/*. The control layer lives in the
|
320
|
+
App class (lib/typingpool/app.rb) and within the individual
|
321
|
+
scripts. The models constitute the other Typingpool classes,
|
322
|
+
including most importantly and in rough order of importance the
|
323
|
+
Project, Transcript, Amazon, Config and Filer classes (the latter of
|
324
|
+
interest mainly because of Filer::Audio, which handles splitting,
|
325
|
+
merging, and conversion).
|
326
|
+
|
327
|
+
The models in particular, along with the App class, are underdeveloped
|
328
|
+
and not particularly clear or fully thought through. The Transcript
|
329
|
+
model, for example, should almost certainly be folded into the Project
|
330
|
+
model. Dividing Project into Project::Local and Project::Remote only
|
331
|
+
makes sense on a superficial level; Project::Remote could probably be
|
332
|
+
its own class or even part of Utility. Amazon will probably be simpler
|
333
|
+
if I can get some patches into RTurk, and Amazon::HIT should probably
|
334
|
+
be integrated more closely with Project.
|
335
|
+
|
336
|
+
One of the most frustrating things about the code is that there are so
|
337
|
+
many subtly different ways a "chunk" of a transcript/project is
|
338
|
+
represented: As a simple hash derived from a row in
|
339
|
+
data/assignment.csv within a project folder, as an Amazon::HIT, as a
|
340
|
+
Transcript::Chunk, as an audio file on a remote server, and as a
|
341
|
+
local audio file (which has a different name from the remote file). So
|
342
|
+
in future versions I'll probably reduce the number of different ways
|
343
|
+
to represent a chunk.
|
344
|
+
|
345
|
+
Also in the future, it's very likely that App will evolve from a
|
346
|
+
simple collection of class methods into a real class with a simple
|
347
|
+
set of instance methods called in a particular order by a "run"
|
348
|
+
method or similar. Subclasses for particular scripts/commands will
|
349
|
+
then override these methods.
|
350
|
+
|
351
|
+
|
352
|
+
### Examples
|
353
|
+
|
354
|
+
The most comprehensive examples of how the Typingpool classes
|
355
|
+
actually work and interact are the tp-* scripts themselves, in
|
356
|
+
particular tp-make, tp-assign, tp-review, and tp-finish.
|
357
|
+
|
358
|
+
More concise examples follow below, to give you a sense of what the
|
359
|
+
various classes actually do:
|
360
|
+
|
361
|
+
```ruby
|
362
|
+
require 'typingpool'
|
363
|
+
|
364
|
+
#new Project instance
|
365
|
+
project = Typingpool::Project.new('Chad Interview')
|
366
|
+
|
367
|
+
#check if project exists on disk
|
368
|
+
unless project.local
|
369
|
+
#make a skeleton project folder in Config#transcripts dir
|
370
|
+
project.create_local
|
371
|
+
#make subtitle record in project folder
|
372
|
+
project.local.subtitle = 'Interview about Hack Day Jan 21'
|
373
|
+
end
|
374
|
+
|
375
|
+
id = project.local.id
|
376
|
+
|
377
|
+
#Wrap file in Typingpool::Filer
|
378
|
+
wma = Typingpool::Filer::Audio.new('/foo/bar.wma')
|
379
|
+
|
380
|
+
#convert file to mp3
|
381
|
+
mp3 = wma.to_mp3
|
382
|
+
other_mp3 = Typingpool::Filer::Audio.new('/foo/bar2.wma').to_mp3
|
383
|
+
|
384
|
+
#merge audio
|
385
|
+
combined_mp3 = Typingpool::Filer::Files::Audio.new([mp3,
|
386
|
+
other_mp3]).merge(Typingpool::Filer.new('/foo/combined.mp3'))
|
387
|
+
|
388
|
+
#split audio every 1 minute
|
389
|
+
chunks = combined_mp3.split('1.00')
|
390
|
+
|
391
|
+
#upload mp3s
|
392
|
+
urls = project.remote.put(chunks.to_streams,
|
393
|
+
project.create_remote_names(chunks))
|
394
|
+
|
395
|
+
#remove mp3s
|
396
|
+
project.remote.remove_urls(urls)
|
397
|
+
|
398
|
+
#new Template instance
|
399
|
+
template = Typingpool::Template::Assignment.from_config('interview/nameless')
|
400
|
+
html = template.render({
|
401
|
+
'audio_url' => urls[0],
|
402
|
+
'unusual' => ['Hack Day', 'Yahoo', 'Atlassian'],
|
403
|
+
'chunk_minutes' => 1,
|
404
|
+
'project_id' => project.local.id
|
405
|
+
})
|
406
|
+
|
407
|
+
question = Typingpool::Amazon::Question.new(urls[0], html)
|
408
|
+
|
409
|
+
Typingpool::Amazon.setup
|
410
|
+
|
411
|
+
#Assign a transcription job (1 chunk)
|
412
|
+
hit = Typingpool::Amazon::HIT.create(question, Typingpool::Config.file.assign)
|
413
|
+
|
414
|
+
#Find all Typingpool HITs on Amazon Mechanical Turk
|
415
|
+
all = Typingpool::Amazon::HIT.all
|
416
|
+
#Find all reviewable Typingpool HITs
|
417
|
+
reviewable = Typingpool::Amazon::HIT.all_reviewable
|
418
|
+
#Find all approved Typingpool HITs
|
419
|
+
approved = Typingpool::Amazon::HIT.all_approved
|
420
|
+
#Find all HITs for our project
|
421
|
+
project_hits = Typingpool::Amazon::HIT.all_for_project(project.local.id)
|
422
|
+
#Filter all HITs (not just Typingpool HITs) arbitrarily
|
423
|
+
safe_to_delete = Typingpool::Amazon::HIT.all{|hit| hit.ours? && hit.full.expired_and_overdue? }
|
424
|
+
#Filter all reviewable HITs arbitrarily
|
425
|
+
ready_for_judgment = Typingpool::Amazon::HIT.all_reviewable{|hit| hit.submitted? && hit.ours? }
|
426
|
+
|
427
|
+
#Approve a HIT
|
428
|
+
ready_for_judgment[0].at_amazon.approve! #at_amazon is an rturk instance
|
429
|
+
#Reject a HIT
|
430
|
+
ready_for_judgment[1].at_amazon.reject!('Your transcription is just random gibberish')
|
431
|
+
#Delete a HIT from Amazon
|
432
|
+
safe_to_delete[0].remove_from_amazon
|
433
|
+
|
434
|
+
#Get text of transcript chunk (Typingpool::Transcript::Chunk)
|
435
|
+
transcript_chunk = approved[0].transcript
|
436
|
+
puts transcript_chunk.body
|
437
|
+
#Get formatted text of transcript chunk
|
438
|
+
puts transcript_chunk.body_as_text
|
439
|
+
#Get transcript chunk as HTML
|
440
|
+
puts transcript_chunk.body_as_html
|
441
|
+
#Get transcript chunk metadata
|
442
|
+
puts "--#{transcript_chunk.url} (audio at #{transcript_chunk.offset})"
|
443
|
+
```
|
444
|
+
|
445
|
+
## Author
|
446
|
+
|
447
|
+
Ryan Tate - ryantate@ryantate.com
|
448
|
+
|
449
|
+
## License
|
450
|
+
|
451
|
+
Copyright (c) 2011-2012 Ryan Tate. Released under the terms of the MIT
|
452
|
+
license. See LICENSE for details.
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Typingpool
  class Amazon
    class HIT
      class Assignment

        #Subclass used in cases where we know Amazon's servers have no
        #assignments for us (because hit.full.assignments_completed ==
        #0), so we don't want to bother doing an HTTP request to
        #check.
        class Empty < Assignment
          #Constructor. Takes no arguments and does not call super, so
          #no RTurk hit is consulted; the only state it sets is an
          #empty answers hash.
          def initialize
            @answers = {}
          end

        end #Empty
      end #Assignment
    end #HIT
  end #Amazon
end #Typingpool
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module Typingpool
  class Amazon
    class HIT
      #Holds the fields copied from the first assignment of an RTurk
      #HIT, along with the worker's submitted answers.
      class Assignment
        require 'typingpool/amazon/hit/assignment/empty'

        #See the RTurk documentation and Amazon Mechanical Turk API
        #documentation for more on these fields.
        attr_reader :id, :status, :worker_id, :submitted_at

        #Constructor. Takes an RTurk::Hit instance. Copies id, status,
        #worker_id, submitted_at and answers from the HIT's first
        #assignment; when the HIT has no assignment, every field is
        #left unset.
        def initialize(rturk_hit)
          assignment = rturk_hit.assignments[0] #expensive!
          return unless assignment

          @id = assignment.id
          @status = assignment.status
          @worker_id = assignment.worker_id
          @submitted_at = assignment.submitted_at

          answers = assignment.answers
          @answers = answers.to_hash if answers
        end

        #Returns the answers associated with this assignment as a
        #hash. If there are no answers, returns an empty hash.
        def answers
          @answers ||= {}
        end

        #Returns the transcription submitted by the user as raw text.
        #Looks under the 'transcription' answer key first, then under
        #'1'; returns an empty string when neither is present.
        def body
          (answers['transcription'] || answers['1']).to_s
        end

        #Returns an RTurk::Assignment object corresponding to this
        #assignment.
        def at_amazon
          RTurk::Assignment.new(@id)
        end
      end #Assignment
    end #HIT
  end #Amazon
end #Typingpool
|
@@ -0,0 +1,44 @@
|
|
1
|
+
module Typingpool
  class Amazon
    class HIT
      class Full

        #For more on why this subclass is necessary, see the
        #documentation for
        #Typingpool::Amazon::HIT.cached_or_new_from_searchhits. In
        #short, RTurk::HITParser objects returned by RTurk::SearchHITs
        #are pointlessly and subtly different from
        #RTurk::GetHITResponse objects. (I need to submit a patch to
        #RTurk.)
        class FromSearchHITs < Full
          #Constructor. Takes an RTurk::Hit instance and the text of
          #the HIT's annotation. The text of the annotation must be
          #submitted as a separate param because RTurk::Hit instances
          #returned by RTurk::SearchHITs do not bother to extract the
          #annotation into an attribute, so we have to do that
          #ourselves (elsewhere) using the raw xml.
          def initialize(rturk_hit, annotation)
            import_standard_attrs_from_rturk_hit(rturk_hit)
            @assignments_completed = rturk_hit.completed_assignments
            @assignments_pending = rturk_hit.pending_assignments
            self.annotation = annotation
          end

          protected

          #Returns the external question URL, looking it up on first
          #use. The lookup fetches the full HIT via at_amazon and hands
          #its xml to the external_question_url= writer; the
          #@checked_question flag ensures this happens only once, even
          #if no URL was found.
          def external_question_url
            unless @checked_question
              self.external_question_url = at_amazon.xml
              @checked_question = true
            end
            @external_question_url
          end

          #Returns the result of Amazon.rturk_hit_full for this HIT's
          #id.
          def at_amazon
            Amazon.rturk_hit_full(@id)
          end
        end #FromSearchHITs
      end #Full
    end #HIT
  end #Amazon
end #Typingpool
|
@@ -0,0 +1,105 @@
|
|
1
|
+
module Typingpool
  class Amazon
    class HIT
      #Holds the attributes copied from an RTurk HIT object, plus the
      #decoded annotation and the URL and HTML of the HIT's external
      #question.
      class Full
        require 'cgi'
        require 'uri'
        require 'open-uri'
        require 'nokogiri'
        require 'typingpool/amazon/hit/full/fromsearchhits'

        #See the RTurk documentation and Amazon Mechanical Turk API
        #documentation for more on these fields.
        attr_reader :id, :type_id, :status, :external_question_url,
                    :assignments_completed, :assignments_pending,
                    :expires_at, :assignments_duration

        #Constructor. Takes an RTurk::HIT instance.
        def initialize(rturk_hit)
          import_standard_attrs_from_rturk_hit(rturk_hit)
          @assignments_completed = rturk_hit.assignments_completed_count
          @assignments_pending = rturk_hit.assignments_pending_count
          self.annotation = rturk_hit.annotation
          self.external_question_url = rturk_hit.xml
        end

        #Returns the HIT annotation as a hash. If the annotation
        #contained URL-encoded form key-value pairs, it decodes them
        #and returns them as a hash. Otherwise, returns an empty hash
        #(throwing away any annotation text that is not URL-encoded
        #key-value pairs, for example the tags attached by the Amazon
        #Mechanical Turk RUI).
        def annotation
          @annotation ||= {}
        end

        #Returns boolean indicating whether the HIT is
        #expired. Determined by comparing the HIT's expires_at
        #attribute with the current time.
        def expired?
          expires_at < Time.now
        end

        #Returns boolean indicating whether the HIT is expired and
        #overdue, at which point it is totally safe to prune. This is
        #determined by adding the assignment duration (how long a
        #worker has to complete the HIT) to the HIT's expires_at time
        #(when the HIT is removed from the Mechanical Turk
        #marketplace).
        def expired_and_overdue?
          (expires_at + assignments_duration) < Time.now
        end

        #Returns the HTML of the external question associated with the
        #HIT. All Typingpool HITs use external questions (as opposed
        #to "internal" HIT QuestionForms), so this should always
        #return something. In first use, must make an HTTP request to
        #obtain the HTML. Returns nil when there is no URL or the URL
        #does not start with "http".
        def external_question
          if @external_question.nil?
            url = external_question_url
            if url && url.match(/\Ahttp/)
              #expensive, obviously. Going through URI#open (from
              #open-uri) rather than Kernel#open means the string can
              #only ever be treated as a URL, and the block form
              #closes the response IO once it has been read.
              @external_question = URI.parse(url).open { |io| io.read }
            end
          end
          @external_question
        end

        #Takes the name of an HTML form param and returns the value
        #associated with that param in the external question
        #HTML. Triggers an HTTP request on first use (unless
        #external_question has already been called). Returns nil when
        #there is no external question HTML or no input with that
        #name.
        def external_question_param(param)
          html = external_question
          return unless html

          #The attribute value is quoted so that names containing
          #characters such as '.' still form a valid selector.
          input = Nokogiri::HTML::Document.parse(html).css(%(input[name="#{param}"]))[0]
          input['value'] if input
        end

        protected

        #Copies id, type_id, status, expires_at and
        #assignments_duration from the given RTurk hit into instance
        #variables of the same names.
        def import_standard_attrs_from_rturk_hit(hit)
          %w(id type_id status expires_at assignments_duration).each do |attr|
            instance_variable_set("@#{attr}", hit.send(attr))
          end
        end

        #Takes the raw annotation text, HTML-unescapes it and decodes
        #it as URL-encoded form data into a hash of key-value pairs.
        def annotation=(encoded)
          unescaped = CGI.unescapeHTML(encoded.to_s)
          @annotation = begin
            Hash[URI.decode_www_form(unescaped)]
          rescue ArgumentError
            #Handle annotations like Department:Transcription (from
            #the Amazon RUI), which make URI.decode_www_form barf
            {}
          end
        end

        #Takes the Nokogiri XML for a HIT and stores the text of its
        #ExternalURL element. Leaves the URL unset when the XML has no
        #such element.
        def external_question_url=(noko_xml)
          namespaces = {'eq' => 'http://mechanicalturk.amazonaws.com/AWSMechanicalTurkDataSchemas/2006-07-14/ExternalQuestion.xsd'}
          node = noko_xml.css('HIT Question eq|ExternalQuestion eq|ExternalURL', namespaces)[0]
          @external_question_url = node.inner_text if node
        end
      end #Full
    end #HIT
  end #Amazon
end #Typingpool
|