typingpool 0.7.0 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +20 -0
- data/README.markdown +452 -0
- data/lib/typingpool/amazon/hit/assignment/empty.rb +19 -0
- data/lib/typingpool/amazon/hit/assignment.rb +43 -0
- data/lib/typingpool/amazon/hit/full/fromsearchhits.rb +44 -0
- data/lib/typingpool/amazon/hit/full.rb +105 -0
- data/lib/typingpool/amazon/hit.rb +458 -0
- data/lib/typingpool/amazon/question.rb +45 -0
- data/lib/typingpool/amazon.rb +3 -677
- data/lib/typingpool/app/cli/formatter.rb +16 -0
- data/lib/typingpool/app/cli.rb +64 -0
- data/lib/typingpool/app/friendlyexceptions.rb +34 -0
- data/lib/typingpool/app.rb +2 -97
- data/lib/typingpool/config/root.rb +114 -0
- data/lib/typingpool/config.rb +13 -119
- data/lib/typingpool/filer/audio.rb +84 -0
- data/lib/typingpool/filer/csv.rb +57 -0
- data/lib/typingpool/filer/dir.rb +76 -0
- data/lib/typingpool/filer/files/audio.rb +63 -0
- data/lib/typingpool/filer/files.rb +55 -0
- data/lib/typingpool/filer.rb +4 -313
- data/lib/typingpool/project/local.rb +117 -0
- data/lib/typingpool/project/remote/s3.rb +135 -0
- data/lib/typingpool/project/remote/sftp.rb +100 -0
- data/lib/typingpool/project/remote.rb +65 -0
- data/lib/typingpool/project.rb +2 -396
- data/lib/typingpool/template/assignment.rb +17 -0
- data/lib/typingpool/template/env.rb +77 -0
- data/lib/typingpool/template.rb +2 -87
- data/lib/typingpool/test/script.rb +310 -0
- data/lib/typingpool/test.rb +1 -306
- data/lib/typingpool/transcript/chunk.rb +129 -0
- data/lib/typingpool/transcript.rb +1 -125
- data/lib/typingpool/utility/castable.rb +65 -0
- data/lib/typingpool/utility.rb +1 -61
- data/test/test_integration_script_6_tp_finish.rb +1 -0
- metadata +135 -81
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2011-2012 Ryan Tate
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.markdown
ADDED
@@ -0,0 +1,452 @@
|
|
1
|
+
# Typingpool
|
2
|
+
|
3
|
+
Typingpool is an app for easily making transcripts of audio using
|
4
|
+
Amazon's labor marketplace, Mechanical Turk.
|
5
|
+
|
6
|
+
Typingpool is distributed as a Ruby gem. It is made up of a handful
|
7
|
+
of scripts for users and a collection of library files for
|
8
|
+
developers.
|
9
|
+
|
10
|
+
Typingpool also includes a collection of ERB templates for
|
11
|
+
generating Mechanical Turk assignments and the final transcript HTML
|
12
|
+
file.
|
13
|
+
|
14
|
+
## Dependencies
|
15
|
+
|
16
|
+
Typingpool depends on these command-line tools, which are not
|
17
|
+
included in the gem since they are external to Ruby:
|
18
|
+
|
19
|
+
* [ffmpeg] A powerhouse audio/video converter.
|
20
|
+
* [libmp3lame] An mp3 encoder/decoder, used by ffmpeg.
|
21
|
+
* [mp3splt] An audio file-splitting utility.
|
22
|
+
* [mp3wrap] An audio file-merging utility.
|
23
|
+
|
24
|
+
## User overview
|
25
|
+
|
26
|
+
### Setup
|
27
|
+
|
28
|
+
After installing the gem and its dependencies, run tp-config from the
|
29
|
+
command line to create your config file (~/.typingpool). At the
|
30
|
+
prompts, you will need to supply your Amazon Web Services Access Key
|
31
|
+
ID and your Amazon Web Services Secret Access Key.
|
32
|
+
|
33
|
+
The config file is in YAML format and may be customized using any
|
34
|
+
text editor. For more details on configuration options, see the
|
35
|
+
documentation for Typingpool::Config.
|
36
|
+
|
37
|
+
### Workflow
|
38
|
+
|
39
|
+
A typical workflow will use the bundled scripts in this order:
|
40
|
+
|
41
|
+
tp-make -> tp-assign -> [wait] -> tp-review -> tp-finish
|
42
|
+
|
43
|
+
tp-review may be called repeatedly, until transcripts for all audio
|
44
|
+
chunks have been processed. Similarly, tp-assign may be called
|
45
|
+
repeatedly, for example to re-assign chunks rejected using tp-review,
|
46
|
+
or to re-assign chunks that have expired.
|
47
|
+
|
48
|
+
An alternate workflow would go like this:
|
49
|
+
|
50
|
+
tp-make -> [manually upload assignments.csv to Amazon RUI] ->
|
51
|
+
[wait] -> [approve/reject assignments via RUI] -> tp-collect ->
|
52
|
+
tp-finish
|
53
|
+
|
54
|
+
### Examples
|
55
|
+
|
56
|
+
Typical usage scenario:
|
57
|
+
|
58
|
+
tp-make 'Chad Interview' chad1.WMA chad2.WMA --unusual 'Hack Day,
|
59
|
+
Yahoo' --subtitle 'Phone interview re Yahoo Hack Day'
|
60
|
+
|
61
|
+
# => Converting chad1.WMA to mp3
|
62
|
+
# => Converting chad2.WMA to mp3
|
63
|
+
# => Merging audio
|
64
|
+
# => Splitting audio into uniform bits
|
65
|
+
# => Uploading Chad Interview.00.00.mp3 to
|
66
|
+
ryantate42.s3.amazonaws.com as Chad
|
67
|
+
Interview.00.00.33ca7f2cceba9f8031bf4fb7c3f819f4.LHFJEM.mp3
|
68
|
+
# => Uploading Chad Interview.01.00.mp3 to
|
69
|
+
ryantate42.s3.amazonaws.com as Chad
|
70
|
+
Interview.01.00.33ca7f2cceba9f8031bf4fb7c3f819f4.XMWNYW.mp3
|
71
|
+
# => Uploading Chad Interview.02.00.mp3 to
|
72
|
+
ryantate42.s3.amazonaws.com as Chad
|
73
|
+
Interview.02.00.33ca7f2cceba9f8031bf4fb7c3f819f4.FNEIWN.mp3
|
74
|
+
# => ... [snip]
|
75
|
+
# => Done. Project at:
|
76
|
+
# => /Users/ryantate/Desktop/Transcripts/Chad Interview
|
77
|
+
|
78
|
+
|
79
|
+
tp-assign 'Chad Interview' interview/nameless --reward 1.00
|
80
|
+
--deadline 90m --approval 6h --lifetime 2d
|
81
|
+
|
82
|
+
# => Figuring out what needs to be assigned
|
83
|
+
# => 85 assignments total
|
84
|
+
# => 85 assignments to assign
|
85
|
+
# => Deleting old assignment HTML from ryantate42.s3.amazonaws.com
|
86
|
+
# => Uploading assignment HTML to ryantate42.s3.amazonaws.com
|
87
|
+
# => Assigning
|
88
|
+
# => Assigned 85 transcription jobs for $85
|
89
|
+
# => Remaining balance: $115.00
|
90
|
+
|
91
|
+
[Wait...]
|
92
|
+
|
93
|
+
|
94
|
+
tp-review 'Chad Interview'
|
95
|
+
|
96
|
+
# => Gathering submissions from Amazon
|
97
|
+
# => Matching submissions with local projects
|
98
|
+
# =>
|
99
|
+
# => Transcript for: https://ryantate42.s3.amazonaws.com/
|
100
|
+
Chad%20Interview.29.00.263d492275a81afb005c8231d8d8afdb.
|
101
|
+
UEMOCN.mp3
|
102
|
+
# => Project: Chad Interview: Phone interview re Yahoo Hack Day
|
103
|
+
# => Submitted at: 2012-08-11 17:00:36 -0700 by A9S0AOAI8HO9P
|
104
|
+
# =>
|
105
|
+
# => Chad: ... so it had sort of some geek history. And the
|
106
|
+
# => weather was really bad. But it was an indoor event,
|
107
|
+
# => right? So people were staying indoors. And like very
|
108
|
+
# => early... And there was all this really expensive gear
|
109
|
+
# => that the BBC had. Like these cameras that guys were like
|
110
|
+
# => riding around on and stuff, huge sound stage, bigger than
|
111
|
+
# => the one we had in Sunnyvale.
|
112
|
+
# =>
|
113
|
+
# => Two hours into the event, we heard this big lightning
|
114
|
+
# => strike, because we were up on a hill in London. And all
|
115
|
+
# => the lights went out and the roof opened up in the
|
116
|
+
# => building. What we didn't know is the fire suppression
|
117
|
+
# => system in that building which got blown up by the
|
118
|
+
# => lightning during a fire would cause the roof to open
|
119
|
+
# => up. So we had all these geeks with equipment and all this
|
120
|
+
# => BBC equipment and it was literally raining on them.
|
121
|
+
# =>
|
122
|
+
# => (A)pprove, (R)eject, (Q)uit, [(S)kip]? (1/20)
|
123
|
+
|
124
|
+
a
|
125
|
+
|
126
|
+
# => Approved. Chad Interview transcript updated.
|
127
|
+
# =>
|
128
|
+
# => Transcript for: https://ryantate42.s3.amazonaws.com/
|
129
|
+
Chad%20Interview.30.00.263d492275a81afb005c8231d8d8afdb.
|
130
|
+
RXNKRN.mp3
|
131
|
+
# => Project: Chad Interview: Phone interview re Yahoo Hack Day
|
132
|
+
# => Submitted at: 2012-08-11 17:00:58 -0700 by A9S0AOAI8HO9P
|
133
|
+
# =>
|
134
|
+
# => Blah blah blah blah okay I am done typing byeeeeeeee
|
135
|
+
# =>
|
136
|
+
# => (A)pprove, (R)eject, (Q)uit, [(S)kip]? (2/20)
|
137
|
+
|
138
|
+
r
|
139
|
+
|
140
|
+
# => Rejection reason, for worker:
|
141
|
+
|
142
|
+
There's no transcription at all, just nonsense
|
143
|
+
|
144
|
+
# => Rejected
|
145
|
+
# =>
|
146
|
+
# => Transcript for...
|
147
|
+
# => ... [snip]
|
148
|
+
|
149
|
+
tp-finish 'Chad Interview'
|
150
|
+
|
151
|
+
# => Removing from Amazon
|
152
|
+
# => Collecting all results
|
153
|
+
# => Removing HIT 2GKMIKMN9U8PNHKK58NXL3SU4TCBSN (Reviewable)
|
154
|
+
# => Removing from data/assignment.csv
|
155
|
+
# => Removing from local cache
|
156
|
+
# => Removing HIT 2CFX2Q45UUKQ2HXZU8SNV8OG6CQBTC (Assignable)
|
157
|
+
# => Removing from data/assignment.csv
|
158
|
+
# => Removing from local cache
|
159
|
+
# => Removing HIT 294EZZ2MIKMNNDP1LAU8WWWXOEI7O0...
|
160
|
+
# => ... [snip]
|
161
|
+
# => Removing Chad Interview.00.00.
|
162
|
+
263d492275a81afb005c8231d8d8afdb.ORSENE.html from
|
163
|
+
ryantate42.s3.amazonaws.com
|
164
|
+
# => Removing Chad Interview.01.00...
|
165
|
+
# => ... [snip]
|
166
|
+
# => Removing Chad Interview.00.00.
|
167
|
+
263d492275a81afb005c8231d8d8afdb.RNTVLN.mp3 from
|
168
|
+
ryantate42.s3.amazonaws.com
|
169
|
+
# => Removing Chad Interview.01.00....
|
170
|
+
# => ... [snip]
|
171
|
+
|
172
|
+
### Output
|
173
|
+
|
174
|
+
The final output of Typingpool is a project directory containing a
|
175
|
+
transcript file.
|
176
|
+
|
177
|
+
The transcript file is HTML with audio chunks embedded alongside each
|
178
|
+
associated transcript chunk.
|
179
|
+
|
180
|
+
The transcript file is called transcript.html when complete. A
|
181
|
+
partial transcript file is called transcript_in_progress.html.
|
182
|
+
|
183
|
+
The project directory also includes supporting files, including a CSV
|
184
|
+
data file used to store raw transcript chunks, Amazon Mechanical Turk
|
185
|
+
HIT information, and other metadata; JavaScript code that swaps in
|
186
|
+
Flash players on browsers that don't support mp3 files in audio tags;
|
187
|
+
the original audio files and the audio chunks generated from them;
|
188
|
+
and a CSS file.
|
189
|
+
|
190
|
+
The directory is laid out like so:
|
191
|
+
|
192
|
+
Chad Interview/
|
193
|
+
-> transcript.html | transcript_in_progress.html
|
194
|
+
-> audio/
|
195
|
+
-> chunks/
|
196
|
+
-> Chad Interview.00.00.mp3
|
197
|
+
-> Chad Interview.01.00.mp3
|
198
|
+
-> ... [snip]
|
199
|
+
-> originals/
|
200
|
+
-> chad1.WMA
|
201
|
+
-> chad2.WMA
|
202
|
+
-> data/
|
203
|
+
-> assignment.csv
|
204
|
+
-> id.txt
|
205
|
+
-> subtitle.txt
|
206
|
+
-> etc/
|
207
|
+
-> audio-compat.js
|
208
|
+
-> transcript.css
|
209
|
+
-> About these files - readme.txt
|
210
|
+
-> player/
|
211
|
+
-> audio-player.js
|
212
|
+
-> license.txt
|
213
|
+
-> player.swf
|
214
|
+
|
215
|
+
You may safely edit the files transcript.html, etc/transcript.css,
|
216
|
+
and data/subtitle.txt, and you may safely delete the files in
|
217
|
+
audio/originals and any .txt files in etc/. Editing or deleting other
|
218
|
+
files may interfere with the operation of Typingpool or render the
|
219
|
+
transcript inoperative. Do not edit transcript_in_progress.html as
|
220
|
+
your changes will be overwritten if/when the transcript is next
|
221
|
+
updated.
|
222
|
+
|
223
|
+
|
224
|
+
### Workflow (additional)
|
225
|
+
|
226
|
+
When you've rejected some submissions in tp-review and need to
|
227
|
+
re-assign these chunks to be transcribed, simply re-run tp-assign
|
228
|
+
with the name of your project. You may select the same template,
|
229
|
+
reward, deadlines, etc., or pick new ones. tp-assign will be careful
|
230
|
+
not to re-assign chunks for which you have approved a transcript, or
|
231
|
+
which are pending on Mechanical Turk.
|
232
|
+
|
233
|
+
When some chunks previously assigned via tp-assign have expired
|
234
|
+
without attracting submissions, simply re-run tp-assign as described
|
235
|
+
above to re-assign these chunks. Consider increasing the dollar
|
236
|
+
amount specified in your --reward argument.
|
237
|
+
|
238
|
+
When some chunks previously assigned via tp-assign have been
|
239
|
+
submitted by workers but not approved or rejected in time for the
|
240
|
+
approval deadline (assign/approval in your config file or --approval
|
241
|
+
as passed to tp-assign), Mechanical Turk has automatically approved
|
242
|
+
these submissions for you and you'll need to run tp-collect to
|
243
|
+
collect them.
|
244
|
+
|
245
|
+
When you want to cancel outstanding assignments, for example because
|
246
|
+
you realize you supplied the wrong parameter to tp-assign, simply run
|
247
|
+
tp-finish with the name of your project. If your assignments have
|
248
|
+
already attracted submissions, you may be prompted to run tp-review
|
249
|
+
first.
|
250
|
+
|
251
|
+
When tp-make, tp-assign, or tp-finish unsuccessfully attempts an
|
252
|
+
upload, deletion, or Amazon command, simply re-run the script with
|
253
|
+
the same arguments to re-attempt the upload, deletion or Amazon
|
254
|
+
command. Typingpool carefully records which network operations it is
|
255
|
+
attempting and which network operations have completed. It can
|
256
|
+
robustly handle network errors, including uncaught exceptions.
|
257
|
+
|
258
|
+
When you want to preview your assignments, run tp-assign with the
|
259
|
+
--sandbox option and with --qualify 'rejection_rate < 100' (to make
|
260
|
+
sure you qualify to view your own HITs). Then visit
|
261
|
+
http://workersandbox.mturk.com and find your assignments (a search for
|
262
|
+
"mp3" works if you left mp3 set as a keyword in your config
|
263
|
+
file). When you are done previewing, run tp-finish with the name/path
|
264
|
+
of your project and the --sandbox option.
|
265
|
+
|
266
|
+
|
267
|
+
### Maintenance
|
268
|
+
|
269
|
+
* [cache] If the cache file grows too large, you'll need to delete
|
270
|
+
it manually. It may be safely deleted as long as no
|
271
|
+
Typingpool scripts are running. Its location is
|
272
|
+
specified in the 'cache' param in the config
|
273
|
+
file. (The config file is at ~/.typingpool and the
|
274
|
+
cache, by default, is at ~/.typingpool.cache.)
|
275
|
+
|
276
|
+
Typingpool takes no steps to limit the size of the
|
277
|
+
cache file. It prunes the cache of project-specific
|
278
|
+
entries when you run tp-finish on a project, but the
|
279
|
+
cache may grow large if you work on many active
|
280
|
+
projects in parallel, or if you fail to run tp-finish
|
281
|
+
on projects when you are done with them.
|
282
|
+
|
283
|
+
* [tp-finish] You should run tp-finish PROJECT each time you finish
|
284
|
+
a project, where PROJECT may be either the project
|
285
|
+
name or path. Assuming you have no submissions pending
|
286
|
+
or awaiting approval, this clears all traces of the
|
287
|
+
project from Amazon Mechanical Turk, from Amazon S3 or
|
288
|
+
your SFTP server, and from the local cache. This will
|
289
|
+
keep your local cache from ballooning in size and will
|
290
|
+
minimize your S3 charges or SFTP disk usage. It will
|
291
|
+
also help Typingpool scripts run faster by reducing
|
292
|
+
the number of HITs you have on Amazon Mechanical Turk;
|
293
|
+
many Typingpool operations involve iterating through
|
294
|
+
all of your HITs.
|
295
|
+
|
296
|
+
|
297
|
+
### See also
|
298
|
+
|
299
|
+
* Run any script with the --help option for further details on how
|
300
|
+
to run the script.
|
301
|
+
|
302
|
+
* See the docs for Typingpool::Config for details of the config
|
303
|
+
file format.
|
304
|
+
|
305
|
+
* See Amazon's Mechanical Turk documentation for guides and
|
306
|
+
overviews on how Mechanical Turk works.
|
307
|
+
|
308
|
+
* See the documentation on ffmpeg and related libraries for clues
|
309
|
+
as to how to make Typingpool support additional file
|
310
|
+
formats. Typingpool can work with any file format that ffmpeg can
|
311
|
+
convert to mp3 (libmp3lame).
|
312
|
+
|
313
|
+
|
314
|
+
## Developer overview
|
315
|
+
|
316
|
+
Views, used for the final transcript and for rendering HTML
|
317
|
+
assignments for Amazon Mechanical Turk workers, are contained in a
|
318
|
+
series of templates in lib/typingpool/templates, particularly
|
319
|
+
transcript.html.erb and assignment/*. The control layer lives in the
|
320
|
+
App class (lib/typingpool/app.rb) and within the individual
|
321
|
+
scripts. The models constitute the other Typingpool classes,
|
322
|
+
including most importantly and in rough order of importance the
|
323
|
+
Project, Transcript, Amazon, Config and Filer classes (the latter of
|
324
|
+
interest mainly because of Filer::Audio, which handles splitting,
|
325
|
+
merging, and conversion).
|
326
|
+
|
327
|
+
The models in particular, along with the App class, are underdeveloped
|
328
|
+
and not particularly clear or fully thought through. The Transcript
|
329
|
+
model, for example, should almost certainly be folded into the Project
|
330
|
+
model. Dividing Project into Project::Local and Project::Remote only
|
331
|
+
makes sense on a superficial level; Project::Remote could probably be
|
332
|
+
its own class or even part of Utility. Amazon will probably be simpler
|
333
|
+
if I can get some patches into RTurk, and Amazon::HIT should probably
|
334
|
+
be integrated more closely with Project.
|
335
|
+
|
336
|
+
One of the most frustrating things about the code is that there are so
|
337
|
+
many subtly different ways a "chunk" of a transcript/project is
|
338
|
+
represented: As a simple hash derived from a row in
|
339
|
+
data/assignment.csv within a project folder, as an Amazon::HIT, as a
|
340
|
+
Transcript::Chunk, as an audio file on a remote server, and as a
|
341
|
+
local audio file (which has a different name from the remote file). So
|
342
|
+
in future versions I'll probably reduce the number of different ways
|
343
|
+
to represent a chunk.
|
344
|
+
|
345
|
+
Also in the future, it's very likely that App will evolve from a
|
346
|
+
simple collection of class methods into a real class with a simple
|
347
|
+
set of instance methods called in a particular order by a "run"
|
348
|
+
method or similar. Subclasses for particular scripts/commands will
|
349
|
+
then override these methods.
|
350
|
+
|
351
|
+
|
352
|
+
### Examples
|
353
|
+
|
354
|
+
The most comprehensive examples of how the Typingpool classes
|
355
|
+
actually work and interact are the tp-* scripts themselves, in
|
356
|
+
particular tp-make, tp-assign, tp-review, and tp-finish.
|
357
|
+
|
358
|
+
More concise examples follow below, to give you a sense of what the
|
359
|
+
various classes actually do:
|
360
|
+
|
361
|
+
```ruby
|
362
|
+
require 'typingpool'
|
363
|
+
|
364
|
+
#new Project instance
|
365
|
+
project = Typingpool::Project.new('Chad Interview')
|
366
|
+
|
367
|
+
#check if project exists on disk
|
368
|
+
unless project.local
|
369
|
+
#make a skeleton project folder in Config#transcripts dir
|
370
|
+
project.create_local
|
371
|
+
#make subtitle record in project folder
|
372
|
+
project.local.subtitle = 'Interview about Hack Day Jan 21'
|
373
|
+
end
|
374
|
+
|
375
|
+
id = project.local.id
|
376
|
+
|
377
|
+
#Wrap file in Typingpool::Filer
|
378
|
+
wma = Typingpool::Filer::Audio.new('/foo/bar.wma')
|
379
|
+
|
380
|
+
#convert file to mp3
|
381
|
+
mp3 = wma.to_mp3
|
382
|
+
other_mp3 = Typingpool::Filer::Audio.new('/foo/bar2.wma').to_mp3
|
383
|
+
|
384
|
+
#merge audio
|
385
|
+
combined_mp3 = Typingpool::Filer::Files::Audio.new([mp3,
|
386
|
+
other_mp3]).merge(Typingpool::Filer.new('/foo/combined.mp3'))
|
387
|
+
|
388
|
+
#split audio every 1 minute
|
389
|
+
chunks = combined_mp3.split('1.00')
|
390
|
+
|
391
|
+
#upload mp3s
|
392
|
+
urls = project.remote.put(chunks.to_streams,
|
393
|
+
project.create_remote_names(chunks))
|
394
|
+
|
395
|
+
#remove mp3s
|
396
|
+
project.remote.remove_urls(urls)
|
397
|
+
|
398
|
+
#new Template instance
|
399
|
+
template = Typingpool::Template::Assignment.from_config('interview/nameless')
|
400
|
+
html = template.render({
|
401
|
+
'audio_url' => urls[0],
|
402
|
+
'unusual' => ['Hack Day', 'Yahoo', 'Atlassian'],
|
403
|
+
'chunk_minutes' => 1,
|
404
|
+
'project_id' => project.local.id
|
405
|
+
})
|
406
|
+
|
407
|
+
question = Typingpool::Amazon::Question.new(urls[0], html)
|
408
|
+
|
409
|
+
Typingpool::Amazon.setup
|
410
|
+
|
411
|
+
#Assign a transcription job (1 chunk)
|
412
|
+
hit = Typingpool::Amazon::HIT.create(question, Typingpool::Config.file.assign)
|
413
|
+
|
414
|
+
#Find all Typingpool HITs on Amazon Mechanical Turk
|
415
|
+
all = Typingpool::Amazon::HIT.all
|
416
|
+
#Find all reviewable Typingpool HITs
|
417
|
+
reviewable = Typingpool::Amazon::HIT.all_reviewable
|
418
|
+
#Find all approved Typingpool HITs
|
419
|
+
approved = Typingpool::Amazon::HIT.all_approved
|
420
|
+
#Find all HITs for our project
|
421
|
+
project_hits = Typingpool::Amazon::HIT.all_for_project(project.local.id)
|
422
|
+
#Filter all HITs (not just Typingpool HITs) arbitrarily
|
423
|
+
safe_to_delete = Typingpool::Amazon::HIT.all{|hit| hit.ours? && hit.full.expired_and_overdue? }
|
424
|
+
#Filter all reviewable HITs arbitrarily
|
425
|
+
ready_for_judgment = Typingpool::Amazon::HIT.all_reviewable{|hit| hit.submitted? && hit.ours? }
|
426
|
+
|
427
|
+
#Approve a HIT
|
428
|
+
ready_for_judgment[0].at_amazon.approve! #at_amazon is an rturk instance
|
429
|
+
#Reject a HIT
|
430
|
+
ready_for_judgment[1].at_amazon.reject!('Your transcription is just random gibberish')
|
431
|
+
#Delete a HIT from Amazon
|
432
|
+
safe_to_delete[0].remove_from_amazon
|
433
|
+
|
434
|
+
#Get text of transcript chunk (Typingpool::Transcript::Chunk)
|
435
|
+
transcript_chunk = approved[0].transcript
|
436
|
+
puts transcript_chunk.body
|
437
|
+
#Get formatted text of transcript chunk
|
438
|
+
puts transcript_chunk.body_as_text
|
439
|
+
#Get transcript chunk as HTML
|
440
|
+
puts transcript_chunk.body_as_html
|
441
|
+
#Get transcript chunk metadata
|
442
|
+
puts "--#{transcript_chunk.url} (audio at #{transcript_chunk.offset})"
|
443
|
+
```
|
444
|
+
|
445
|
+
## Author
|
446
|
+
|
447
|
+
Ryan Tate - ryantate@ryantate.com
|
448
|
+
|
449
|
+
## License
|
450
|
+
|
451
|
+
Copyright (c) 2011-2012 Ryan Tate. Released under the terms of the MIT
|
452
|
+
license. See LICENSE for details.
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Typingpool
|
2
|
+
class Amazon
|
3
|
+
class HIT
|
4
|
+
class Assignment
|
5
|
+
|
6
|
+
#Subclass used in cases where we know Amazon's servers have no
|
7
|
+
#assignments for us (because hit.full.assignments_completed ==
|
8
|
+
#0), so we don't want to bother doing an HTTP request to
|
9
|
+
#check.
|
10
|
+
class Empty < Assignment
|
11
|
+
def initialize
|
12
|
+
@answers = {}
|
13
|
+
end
|
14
|
+
|
15
|
+
end #Empty
|
16
|
+
end #Assignment
|
17
|
+
end #HIT
|
18
|
+
end #Amazon
|
19
|
+
end #Typingpool
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module Typingpool
|
2
|
+
class Amazon
|
3
|
+
class HIT
|
4
|
+
class Assignment
|
5
|
+
require 'typingpool/amazon/hit/assignment/empty'
|
6
|
+
|
7
|
+
#See the RTurk documentation and Amazon Mechanical Turk API
|
8
|
+
#documentation for more on these fields.
|
9
|
+
attr_reader :id, :status, :worker_id, :submitted_at
|
10
|
+
|
11
|
+
#Constructor. Takes an RTurk::Hit instance.
|
12
|
+
def initialize(rturk_hit)
|
13
|
+
if assignment = rturk_hit.assignments[0] #expensive!
|
14
|
+
@id = assignment.id
|
15
|
+
@status = assignment.status
|
16
|
+
@worker_id = assignment.worker_id
|
17
|
+
@submitted_at = assignment.submitted_at
|
18
|
+
if answers = assignment.answers
|
19
|
+
@answers = answers.to_hash
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
#Returns the answers associated with this assignment as a
|
25
|
+
#hash. If there are no answers, returns an empty hash.
|
26
|
+
def answers
|
27
|
+
@answers ||= {}
|
28
|
+
end
|
29
|
+
|
30
|
+
#Returns the transcription submitted by the user as raw text.
|
31
|
+
def body
|
32
|
+
(answers['transcription'] || answers['1']).to_s
|
33
|
+
end
|
34
|
+
|
35
|
+
#Returns an RTurk::Assignment object corresponding to this
|
36
|
+
#assignment.
|
37
|
+
def at_amazon
|
38
|
+
RTurk::Assignment.new(@id)
|
39
|
+
end
|
40
|
+
end #Assignment
|
41
|
+
end #HIT
|
42
|
+
end #Amazon
|
43
|
+
end #Typingpool
|
@@ -0,0 +1,44 @@
|
|
1
|
+
module Typingpool
|
2
|
+
class Amazon
|
3
|
+
class HIT
|
4
|
+
class Full
|
5
|
+
|
6
|
+
#For more on why this subclass is necessary, see the
|
7
|
+
#documentation for
|
8
|
+
#Typingpool::Amazon::HIT.cached_or_new_from_searchhits. In
|
9
|
+
#short, RTurk::HITParser objects returned by RTurk::SearchHITs
|
10
|
+
#are pointlessly and subtly different from
|
11
|
+
#RTurk::GetHITResponse objects. (I need to submit a patch to
|
12
|
+
#RTurk.)
|
13
|
+
class FromSearchHITs < Full
|
14
|
+
#Constructor. Takes an RTurk::Hit instance and the text of
|
15
|
+
#the HIT's annotation. The text of the annotation must be
|
16
|
+
#submitted as a separate param because RTurk::Hit instances
|
17
|
+
#returned by RTurk::SearchHITs do not bother to extract the
|
18
|
+
#annotation into an attribute, so we have to do that
|
19
|
+
#ourselves (elsewhere) using the raw xml.
|
20
|
+
def initialize(rturk_hit, annotation)
|
21
|
+
import_standard_attrs_from_rturk_hit(rturk_hit)
|
22
|
+
@assignments_completed = rturk_hit.completed_assignments
|
23
|
+
@assignments_pending = rturk_hit.pending_assignments
|
24
|
+
self.annotation = annotation
|
25
|
+
end
|
26
|
+
|
27
|
+
protected
|
28
|
+
|
29
|
+
def external_question_url
|
30
|
+
unless @checked_question
|
31
|
+
self.external_question_url = at_amazon.xml
|
32
|
+
@checked_question = true
|
33
|
+
end
|
34
|
+
@external_question_url
|
35
|
+
end
|
36
|
+
|
37
|
+
def at_amazon
|
38
|
+
Amazon.rturk_hit_full(@id)
|
39
|
+
end
|
40
|
+
end #FromSearchHITs
|
41
|
+
end #Full
|
42
|
+
end #HIT
|
43
|
+
end #Amazon
|
44
|
+
end #Typingpool
|
@@ -0,0 +1,105 @@
|
|
1
|
+
module Typingpool
|
2
|
+
class Amazon
|
3
|
+
class HIT
|
4
|
+
class Full
|
5
|
+
require 'uri'
|
6
|
+
require 'open-uri'
|
7
|
+
require 'nokogiri'
|
8
|
+
require 'typingpool/amazon/hit/full/fromsearchhits'
|
9
|
+
|
10
|
+
#See the RTurk documentation and Amazon Mechanical Turk API
|
11
|
+
#documentation for more on these fields.
|
12
|
+
attr_reader :id, :type_id, :status, :external_question_url, :assignments_completed, :assignments_pending, :expires_at, :assignments_duration
|
13
|
+
|
14
|
+
#Constructor. Takes an RTurk::Hit instance.
|
15
|
+
def initialize(rturk_hit)
|
16
|
+
import_standard_attrs_from_rturk_hit(rturk_hit)
|
17
|
+
@assignments_completed = rturk_hit.assignments_completed_count
|
18
|
+
@assignments_pending = rturk_hit.assignments_pending_count
|
19
|
+
self.annotation = rturk_hit.annotation
|
20
|
+
self.external_question_url = rturk_hit.xml
|
21
|
+
end
|
22
|
+
|
23
|
+
#Returns the HIT annotation as a hash. If the annotation
|
24
|
+
#contained URL-encoded form key-value pairs, it decodes them
|
25
|
+
#and returns them as a hash. Otherwise, returns an empty hash
|
26
|
+
#(throwing away any annotation text that is not URL-encoded
|
27
|
+
#key-value pairs, for example the tags attached by the Amazon
|
28
|
+
#Mechanical Turk RUI).
|
29
|
+
def annotation
|
30
|
+
@annotation ||= {}
|
31
|
+
end
|
32
|
+
|
33
|
+
#Returns boolean indicating whether the HIT is
|
34
|
+
#expired. Determined by comparing the HIT's expires_at
|
35
|
+
#attribute with the current time.
|
36
|
+
def expired?
|
37
|
+
expires_at < Time.now
|
38
|
+
end
|
39
|
+
|
40
|
+
#Returns boolean indicating whether the HIT is expired and
|
41
|
+
#overdue, at which point it is totally safe to prune. This is
|
42
|
+
#determined by adding the assignment duration (how long a
|
43
|
+
#worker has to complete the HIT) to the HIT's expires_at time
|
44
|
+
#(when the HIT is removed from the Mechanical Turk
|
45
|
+
#marketplace).
|
46
|
+
def expired_and_overdue?
|
47
|
+
(expires_at + assignments_duration) < Time.now
|
48
|
+
end
|
49
|
+
|
50
|
+
#Returns the HTML of the external question associated with the
|
51
|
+
#HIT. All Typingpool HITs use external questions (as opposed
|
52
|
+
#to "internal" HIT QuestionForms), so this should always
|
53
|
+
#return something. On first use, must make an HTTP request to
|
54
|
+
#obtain the HTML.
|
55
|
+
def external_question
|
56
|
+
if @external_question.nil?
|
57
|
+
if external_question_url && external_question_url.match(/^http/)
|
58
|
+
#expensive, obviously:
|
59
|
+
@external_question = open(external_question_url).read
|
60
|
+
end
|
61
|
+
end
|
62
|
+
@external_question
|
63
|
+
end
|
64
|
+
|
65
|
+
#Takes the name of an HTML form param and returns the value
|
66
|
+
#associated with that param in the external question
|
67
|
+
#HTML. Triggers an HTTP request on first use (unless
|
68
|
+
#external_question has already been called).
|
69
|
+
def external_question_param(param)
|
70
|
+
if external_question
|
71
|
+
if input = Nokogiri::HTML::Document.parse(external_question).css("input[name=#{param}]")[0]
|
72
|
+
return input['value']
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
protected
|
78
|
+
|
79
|
+
def import_standard_attrs_from_rturk_hit(hit)
|
80
|
+
%w(id type_id status expires_at assignments_duration).each do |attr|
|
81
|
+
instance_variable_set("@#{attr}", hit.send(attr))
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
def annotation=(encoded)
|
86
|
+
@annotation = CGI.unescapeHTML(encoded.to_s)
|
87
|
+
begin
|
88
|
+
@annotation = URI.decode_www_form(@annotation)
|
89
|
+
@annotation = Hash[*@annotation.flatten]
|
90
|
+
rescue ArgumentError
|
91
|
+
#Handle annotations like Department:Transcription (from
|
92
|
+
#the Amazon RUI), which make URI.decode_www_form barf
|
93
|
+
@annotation = {}
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
def external_question_url=(noko_xml)
|
98
|
+
if url = noko_xml.css('HIT Question eq|ExternalQuestion eq|ExternalURL', {'eq' => 'http://mechanicalturk.amazonaws.com/AWSMechanicalTurkDataSchemas/2006-07-14/ExternalQuestion.xsd'})[0].inner_text
|
99
|
+
@external_question_url = url
|
100
|
+
end
|
101
|
+
end
|
102
|
+
end #Full
|
103
|
+
end #HIT
|
104
|
+
end #Amazon
|
105
|
+
end #Typingpool
|