typingpool 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +23 -0
- data/bin/tp-assign +240 -0
- data/bin/tp-collect +50 -0
- data/bin/tp-config +114 -0
- data/bin/tp-finish +101 -0
- data/bin/tp-make +169 -0
- data/bin/tp-review +175 -0
- data/lib/typingpool/amazon.rb +732 -0
- data/lib/typingpool/app.rb +634 -0
- data/lib/typingpool/config.rb +344 -0
- data/lib/typingpool/error.rb +22 -0
- data/lib/typingpool/filer.rb +396 -0
- data/lib/typingpool/project.rb +593 -0
- data/lib/typingpool/template.rb +175 -0
- data/lib/typingpool/templates/assignment/amazon-init.js +38 -0
- data/lib/typingpool/templates/assignment/interview/nameless.html.erb +13 -0
- data/lib/typingpool/templates/assignment/interview/noisy.html.erb +12 -0
- data/lib/typingpool/templates/assignment/interview/partials/voices.html.erb +10 -0
- data/lib/typingpool/templates/assignment/interview/phone.html.erb +12 -0
- data/lib/typingpool/templates/assignment/interview.html.erb +11 -0
- data/lib/typingpool/templates/assignment/main.css +20 -0
- data/lib/typingpool/templates/assignment/partials/entry.html.erb +19 -0
- data/lib/typingpool/templates/assignment/partials/footer.html.erb +3 -0
- data/lib/typingpool/templates/assignment/partials/header.html.erb +11 -0
- data/lib/typingpool/templates/assignment/partials/labeling-example.html.erb +4 -0
- data/lib/typingpool/templates/assignment/partials/labeling.html.erb +5 -0
- data/lib/typingpool/templates/assignment/partials/length-description.html.erb +6 -0
- data/lib/typingpool/templates/assignment/partials/voices.html.erb +10 -0
- data/lib/typingpool/templates/assignment/speech.html.erb +11 -0
- data/lib/typingpool/templates/config.yml +21 -0
- data/lib/typingpool/templates/project/audio/chunks/.empty_directory +0 -0
- data/lib/typingpool/templates/project/audio/originals/.empty_directory +0 -0
- data/lib/typingpool/templates/project/data/.empty_directory +0 -0
- data/lib/typingpool/templates/project/etc/ About these files - read me.txt +8 -0
- data/lib/typingpool/templates/project/etc/audio-compat.js +25 -0
- data/lib/typingpool/templates/project/etc/player/audio-player.js +4 -0
- data/lib/typingpool/templates/project/etc/player/license.txt +19 -0
- data/lib/typingpool/templates/project/etc/player/player.swf +0 -0
- data/lib/typingpool/templates/project/etc/transcript.css +49 -0
- data/lib/typingpool/templates/transcript.html.erb +23 -0
- data/lib/typingpool/test/fixtures/amazon-question-html.html +95 -0
- data/lib/typingpool/test/fixtures/amazon-question-url.txt +1 -0
- data/lib/typingpool/test/fixtures/audio/mp3/interview.1.mp3 +0 -0
- data/lib/typingpool/test/fixtures/audio/mp3/interview.2.mp3 +0 -0
- data/lib/typingpool/test/fixtures/audio/wma/VN620007.WMA +0 -0
- data/lib/typingpool/test/fixtures/audio/wma/VN620052.WMA +0 -0
- data/lib/typingpool/test/fixtures/config-1 +20 -0
- data/lib/typingpool/test/fixtures/config-2 +25 -0
- data/lib/typingpool/test/fixtures/not_yaml.txt +4 -0
- data/lib/typingpool/test/fixtures/template-2.html.erb +10 -0
- data/lib/typingpool/test/fixtures/template-3.html.erb +22 -0
- data/lib/typingpool/test/fixtures/template.html.erb +10 -0
- data/lib/typingpool/test/fixtures/tp_collect_id.txt +1 -0
- data/lib/typingpool/test/fixtures/tp_collect_sandbox-assignment.csv +8 -0
- data/lib/typingpool/test/fixtures/tp_review_id.txt +1 -0
- data/lib/typingpool/test/fixtures/tp_review_sandbox-assignment.csv +8 -0
- data/lib/typingpool/test/fixtures/transcript-chunks.csv +226 -0
- data/lib/typingpool/test/fixtures/utf8_transcript.txt +7 -0
- data/lib/typingpool/test/fixtures/vcr/tp-collect-1.yml +2712 -0
- data/lib/typingpool/test/fixtures/vcr/tp-collect-2.yml +2718 -0
- data/lib/typingpool/test/fixtures/vcr/tp-collect-3.yml +2768 -0
- data/lib/typingpool/test/fixtures/vcr/tp-review-1.yml +570 -0
- data/lib/typingpool/test/fixtures/vcr/tp-review-2.yml +351 -0
- data/lib/typingpool/test.rb +418 -0
- data/lib/typingpool/transcript.rb +181 -0
- data/lib/typingpool/utility.rb +272 -0
- data/lib/typingpool.rb +500 -0
- data/test/make_amazon_question_fixture.rb +24 -0
- data/test/make_tp_collect_fixture_1.rb +26 -0
- data/test/make_tp_collect_fixture_2.rb +16 -0
- data/test/make_tp_collect_fixture_3.rb +15 -0
- data/test/make_tp_collect_fixture_4.rb +17 -0
- data/test/make_tp_review_fixture_1.rb +26 -0
- data/test/make_tp_review_fixture_2.rb +30 -0
- data/test/make_transcript_chunks_fixture.rb +53 -0
- data/test/test_integration_script_1_tp_config.rb +108 -0
- data/test/test_integration_script_2_tp_make.rb +119 -0
- data/test/test_integration_script_3_tp_assign.rb +152 -0
- data/test/test_integration_script_4_tp_review.rb +72 -0
- data/test/test_integration_script_5_tp_collect.rb +44 -0
- data/test/test_integration_script_6_tp_finish.rb +123 -0
- data/test/test_unit_amazon.rb +153 -0
- data/test/test_unit_config.rb +94 -0
- data/test/test_unit_filer.rb +202 -0
- data/test/test_unit_project.rb +168 -0
- data/test/test_unit_project_local.rb +68 -0
- data/test/test_unit_project_remote.rb +157 -0
- data/test/test_unit_template.rb +111 -0
- data/test/test_unit_transcript.rb +77 -0
- metadata +234 -0
data/lib/typingpool.rb
ADDED
@@ -0,0 +1,500 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
#= Typingpool
|
3
|
+
#
|
4
|
+
#Typingpool is an app for easily making transcripts of audio using
|
5
|
+
#Amazon's labor marketplace, Mechanical Turk.
|
6
|
+
#
|
7
|
+
#Typingpool is distributed as a Ruby gem. It is made up of a handful
|
8
|
+
#of scripts for users and a collection of library files for
|
9
|
+
#developers.
|
10
|
+
#
|
11
|
+
#Typingpool also includes a collection of ERB templates for
|
12
|
+
#generating Mechanical Turk assignments and the final transcript HTML
|
13
|
+
#file.
|
14
|
+
#
|
15
|
+
#== Dependencies
|
16
|
+
#
|
17
|
+
#Typingpool depends on these command-line tools, which are not
|
18
|
+
#included in the gem since they are external to Ruby:
|
19
|
+
#
|
20
|
+
# [ffmpeg] A powerhouse audio/video converter.
|
21
|
+
# [libmp3lame] An mp3 encoder/decoder, used by ffmpeg.
|
22
|
+
# [mp3splt] An audio file-splitting utility.
|
23
|
+
# [mp3wrap] An audio file-merging utility.
|
24
|
+
#
|
25
|
+
#== User overview
|
26
|
+
#
|
27
|
+
#=== Setup
|
28
|
+
#
|
29
|
+
#After installing the gem and its dependencies, run tp-config from the
|
30
|
+
#command line to create your config file (~/.typingpool). At the
|
31
|
+
#prompts, you will need to supply your Amazon Web Services Access Key
|
32
|
+
#ID and your Amazon Web Services Secret Access key.
|
33
|
+
#
|
34
|
+
#The config file is in YAML format and may be customized using any
|
35
|
+
#text editor. For more details on configuration options, see the
|
36
|
+
#documentation for Typingpool::Config.
|
37
|
+
#
|
38
|
+
#=== Workflow
|
39
|
+
#
|
40
|
+
#A typical workflow will use the bundled scripts in this order:
|
41
|
+
#
|
42
|
+
# tp-make -> tp-assign -> [wait] -> tp-review -> tp-finish
|
43
|
+
#
|
44
|
+
#tp-review may be called repeatedly, until transcripts for all audio
|
45
|
+
#chunks have been processed. Similarly, tp-assign may be called
|
46
|
+
#repeatedly, for example to re-assign chunks rejected using tp-review,
|
47
|
+
#or to re-assign chunks that have expired.
|
48
|
+
#
|
49
|
+
#An alternate workflow would go like this:
|
50
|
+
#
|
51
|
+
# tp-make -> [manually upload assignments.csv to Amazon RUI] ->
|
52
|
+
# [wait] -> [approve/reject assignments via RUI] -> tp-collect ->
|
53
|
+
# tp-finish
|
54
|
+
#
|
55
|
+
#=== Examples
|
56
|
+
#
|
57
|
+
#Typical usage scenario:
|
58
|
+
#
|
59
|
+
# tp-make 'Chad Interview' chad1.WMA chad2.WMA --unusual 'Hack Day,
|
60
|
+
# Yahoo' --subtitle 'Phone interview re Yahoo Hack Day'
|
61
|
+
#
|
62
|
+
# # => Converting chad1.WMA to mp3
|
63
|
+
# # => Converting chad2.WMA to mp3
|
64
|
+
# # => Merging audio
|
65
|
+
# # => Splitting audio into uniform bits
|
66
|
+
# # => Uploading Chad Interview.00.00.mp3 to
|
67
|
+
# ryantate42.s3.amazonaws.com as Chad
|
68
|
+
# Interview.00.00.33ca7f2cceba9f8031bf4fb7c3f819f4.LHFJEM.mp3
|
69
|
+
# # => Uploading Chad Interview.01.00.mp3 to
|
70
|
+
# ryantate42.s3.amazonaws.com as Chad #
|
71
|
+
# Interview.01.00.33ca7f2cceba9f8031bf4fb7c3f819f4.XMWNYW.mp3
|
72
|
+
# # => Uploading Chad Interview.02.00.mp3 to
|
73
|
+
# ryantate42.s3.amazonaws.com as Chad #
|
74
|
+
# Interview.02.00.33ca7f2cceba9f8031bf4fb7c3f819f4.FNEIWN.mp3
|
75
|
+
# # => ... [snip]
|
76
|
+
# # => Done. Project at:
|
77
|
+
# # => /Users/ryantate/Desktop/Transcripts/Chad Interview
|
78
|
+
#
|
79
|
+
#
|
80
|
+
# tp-assign 'Chad Interview' interview/nameless --reward 1.00
|
81
|
+
# --deadline 90m --approval 6h --lifetime 2d
|
82
|
+
#
|
83
|
+
# # => Figuring out what needs to be assigned
|
84
|
+
# # => 85 assignments total
|
85
|
+
# # => 85 assignments to assign
|
86
|
+
# # => Deleting old assignment HTML from ryantate42.s3.amazonaws.com
|
87
|
+
# # => Uploading assignment HTML to ryantate42.s3.amazonaws.com
|
88
|
+
# # => Assigning
|
89
|
+
# # => Assigned 85 transcription jobs for $85
|
90
|
+
# # => Remaining balance: $115.00
|
91
|
+
#
|
92
|
+
# [Wait...]
|
93
|
+
#
|
94
|
+
#
|
95
|
+
# tp-review 'Chad Interview'
|
96
|
+
#
|
97
|
+
# # => Gathering submissions from Amazon
|
98
|
+
# # => Matching submissions with local projects
|
99
|
+
# # =>
|
100
|
+
# # => Transcript for: https://ryantate42.s3.amazonaws.com/
|
101
|
+
# Chad%20Interview.29.00.263d492275a81afb005c8231d8d8afdb.
|
102
|
+
# UEMOCN.mp3
|
103
|
+
# # => Project: Chad Interview: Phone interview re Yahoo Hack Day
|
104
|
+
# # => Submitted at: 2012-08-11 17:00:36 -0700 by A9S0AOAI8HO9P
|
105
|
+
# # =>
|
106
|
+
# # => Chad: ... so it had sort of some geek history. And the
|
107
|
+
# # => weather was really bad. But it was an indoor event,
|
108
|
+
# # => right? So people were staying indoors. And like very
|
109
|
+
# # => early... And there was all this really expensive gear
|
110
|
+
# # => that the BBC had. Like these cameras that guys were like
|
111
|
+
# # => riding around on and stuff, huge sound stage, bigger than
|
112
|
+
# # => the one we had in Sunnyvale.
|
113
|
+
# # =>
|
114
|
+
# # => Two hours into the event, we heard this big lightning
|
115
|
+
# # => strike, because we were up on a hill in London. And all
|
116
|
+
# # => the lights went out and the roof opened up in the
|
117
|
+
# # => building. What we didn't know is the fire suppression
|
118
|
+
# # => system in that building which got blown up by the
|
119
|
+
# # => lightning during a fire would cause the roof to open
|
120
|
+
# # => up. So we had all these geeks with equipment and all this
|
121
|
+
# # => BBC equipment and it was literally raining on them.
|
122
|
+
# # =>
|
123
|
+
# # => (A)pprove, (R)eject, (Q)uit, [(S)kip]? (1/20)
|
124
|
+
#
|
125
|
+
# a
|
126
|
+
#
|
127
|
+
# # => Approved. Chad Interview transcript updated.
|
128
|
+
# # =>
|
129
|
+
# # => Transcript for: https://ryantate42.s3.amazonaws.com/
|
130
|
+
# Chad%20Interview.30.00.263d492275a81afb005c8231d8d8afdb.
|
131
|
+
# RXNKRN.mp3
|
132
|
+
# # => Project: Chad Interview: Phone interview re Yahoo Hack Day
|
133
|
+
# # => Submitted at: 2012-08-11 17:00:58 -0700 by A9S0AOAI8HO9P
|
134
|
+
# # =>
|
135
|
+
# # => Blah blah blah blah okay I am done typing byeeeeeeee
|
136
|
+
# # =>
|
137
|
+
# # => (A)pprove, (R)eject, (Q)uit, [(S)kip]? (2/20)
|
138
|
+
#
|
139
|
+
# r
|
140
|
+
#
|
141
|
+
# # => Rejection reason, for worker:
|
142
|
+
#
|
143
|
+
# There's no transcription at all, just nonsense
|
144
|
+
#
|
145
|
+
# # => Rejected
|
146
|
+
# # =>
|
147
|
+
# # => Transcript for...
|
148
|
+
# # => ... [snip]
|
149
|
+
#
|
150
|
+
#
|
151
|
+
# tp-finish 'Chad Interview'
|
152
|
+
#
|
153
|
+
# # => Removing from Amazon
|
154
|
+
# # => Collecting all results
|
155
|
+
# # => Removing HIT 2GKMIKMN9U8PNHKK58NXL3SU4TCBSN (Reviewable)
|
156
|
+
# # => Removing from data/assignment.csv
|
157
|
+
# # => Removing from local cache
|
158
|
+
# # => Removing HIT 2CFX2Q45UUKQ2HXZU8SNV8OG6CQBTC (Assignable)
|
159
|
+
# # => Removing from data/assignment.csv
|
160
|
+
# # => Removing from local cache
|
161
|
+
# # => Removing HIT 294EZZ2MIKMNNDP1LAU8WWWXOEI7O0...
|
162
|
+
# # => ... [snip]
|
163
|
+
# # => Removing Chad Interview.00.00.
|
164
|
+
# 263d492275a81afb005c8231d8d8afdb.ORSENE.html from
|
165
|
+
# ryantate42.s3.amazonaws.com
|
166
|
+
# # => Removing Chad Interview.01.00...
|
167
|
+
# # => ... [snip]
|
168
|
+
# # => Removing Chad Interview.00.00.
|
169
|
+
# 263d492275a81afb005c8231d8d8afdb.RNTVLN.mp3 from
|
170
|
+
# ryantate42.s3.amazonaws.com
|
171
|
+
# # => Removing Chad Interview.01.00....
|
172
|
+
# # => ... [snip]
|
173
|
+
#
|
174
|
+
#=== Output
|
175
|
+
#
|
176
|
+
#The final output of Typingpool is a project directory containing a
|
177
|
+
#transcript file.
|
178
|
+
#
|
179
|
+
#The transcript file is HTML with audio chunks embedded alongside each
|
180
|
+
#associated transcript chunk.
|
181
|
+
#
|
182
|
+
#The transcript file is called transcript.html when complete. A
|
183
|
+
#partial transcript file is called transcript_in_progress.html.
|
184
|
+
#
|
185
|
+
#The project directory also includes supporting files, including a CSV
|
186
|
+
#data file used to store raw transcript chunks, Amazon Mechanical Turk
|
187
|
+
#HIT information, and other metadata; JavaScript code that swaps in
|
188
|
+
#Flash players on browsers that don't support mp3 files in audio tags;
|
189
|
+
#the original audio files and the audio chunks generated from them;
|
190
|
+
#and a CSS file.
|
191
|
+
#
|
192
|
+
#The directory is laid out like so:
|
193
|
+
#
|
194
|
+
# Chad Interview/
|
195
|
+
# -> transcript.html | transcript_in_progress.html
|
196
|
+
# -> audio/
|
197
|
+
# -> chunks/
|
198
|
+
# -> Chad Interview.00.00.mp3
|
199
|
+
# -> Chad Interview.01.00.mp3
|
200
|
+
# -> ... [snip]
|
201
|
+
# -> originals/
|
202
|
+
# -> chad1.WMA
|
203
|
+
# -> chad2.WMA
|
204
|
+
# -> data/
|
205
|
+
# -> assignment.csv
|
206
|
+
# -> id.txt
|
207
|
+
# -> subtitle.txt
|
208
|
+
# -> etc/
|
209
|
+
# -> audio-compat.js
|
210
|
+
# -> transcript.css
|
211
|
+
# -> About these files - readme.txt
|
212
|
+
# -> player/
|
213
|
+
# -> audio-player.js
|
214
|
+
# -> license.txt
|
215
|
+
# -> player.swf
|
216
|
+
#
|
217
|
+
#You may safely edit the files transcript.html, etc/transcript.css,
|
218
|
+
#and data/subtitle.txt, and you may safely delete the files in
|
219
|
+
#audio/originals and any .txt files in etc/. Editing or deleting other
|
220
|
+
#files may interfere with the operation of Typingpool or render the
|
221
|
+
#transcript inoperative. Do not edit transcript_in_progress.html as
|
222
|
+
#your changes will be overwritten if/when the transcript is next
|
223
|
+
#updated.
|
224
|
+
#
|
225
|
+
#
|
226
|
+
#=== Workflow (additional)
|
227
|
+
# * When you want to preview your assignments, run tp-assign with the
|
228
|
+
# option --sandbox and with --qualify 'rejection_rate < 100' (to
|
229
|
+
# make sure you qualify to view your own HITs). Then visit
|
230
|
+
# http://workersandbox.mturk.com and find your assignments (a search
|
231
|
+
# for "mp3" works if you left mp3 set as a keyword in your config
|
232
|
+
# file). When you are done previewing, run tp-finish with the
|
233
|
+
# name/path of your project and the --sandbox option.
|
234
|
+
#
|
235
|
+
# * When you assign your transcription jobs via tp-assign, you must
|
236
|
+
# supply a template name or relative path as the second
|
237
|
+
# argument. In the example above, the named template is
|
238
|
+
# “interview/nameless.”
|
239
|
+
#
|
240
|
+
# The template “interview/nameless” is a great general purpose
|
241
|
+
# template. It instructs the transcriber not to worry about the
|
242
|
+
# names of the speakers, and instead to use labels like “male 1,”
|
243
|
+
# “male 2,” etc. This allows the transcriber to work quickly and
|
244
|
+
# usually results in a viable transcript, since you can consult
|
245
|
+
# your memory or the original audio to figure out who is who.
|
246
|
+
#
|
247
|
+
# To find what other templates are available, navigate to the
|
248
|
+
# directory where typingpool is installed (`gem which typingpool`)
|
249
|
+
# and then go into typingpool/templates/assignment and its
|
250
|
+
# subdirectories. Anything that ends in ‘.html.erb’ is an available
|
251
|
+
# template. You may also create your own templates in the directory
|
252
|
+
# listed in the “templates” param of your config file.
|
253
|
+
#
|
254
|
+
# The templates interview, interview/phone, and interview/noisy
|
255
|
+
# require you to have passed the names of two voices to tp-make
|
256
|
+
# when you created your project. The first voice should be the name
|
257
|
+
# (and optional title) of the interviewer, and the second the name
|
258
|
+
# (and title) of the interviewee, like so:
|
259
|
+
#
|
260
|
+
# tp-make 'Chad Interview' chad1.WMA chad2.WMA --voice 'Ryan,
|
261
|
+
# hack reporter' --voice 'Chad, a software engineer' --unusual
|
262
|
+
# 'Hack Day, Yahoo' --subtitle 'Phone interview re Yahoo Hack
|
263
|
+
# Day'
|
264
|
+
#
|
265
|
+
#
|
266
|
+
# * When you've rejected some submissions in tp-review and need to
|
267
|
+
# re-assign these chunks to be transcribed, simply re-run tp-assign
|
268
|
+
# with the name (or path) of your project. You may select the same
|
269
|
+
# template, reward, deadlines, etc., or pick new ones. tp-assign
|
270
|
+
# will be careful not to re-assign chunks for which you have
|
271
|
+
# approved a transcript, or which are pending on Mechanical Turk.
|
272
|
+
#
|
273
|
+
# * When some chunks previously assigned via tp-assign have expired
|
274
|
+
# without attracting submissions, simply re-run tp-assign as
|
275
|
+
# described above to re-assign these chunks. Consider increasing
|
276
|
+
# the dollar amount specified in your --reward argument.
|
277
|
+
#
|
278
|
+
# * When some chunks previously assigned via tp-assign have been
|
279
|
+
# submitted by workers but not approved or rejected in time for the
|
280
|
+
# approval deadline (assign/approval in your config file or
|
281
|
+
# --approval as passed to tp-assign), Mechanical Turk has
|
282
|
+
# automatically approved these submissions for you and you'll need
|
283
|
+
# to run tp-collect to collect them. (Yes, it’s silly you need to run
|
284
|
+
# a whole different script instead of just calling tp-review as
|
285
|
+
# usual. I’ll fix this in a future version.)
|
286
|
+
#
|
287
|
+
# * When you want to cancel outstanding assignments, simply run
|
288
|
+
# tp-finish with the name of your project. If your assignments have
|
289
|
+
# already attracted submissions, you may be prompted to run
|
290
|
+
# tp-review first.
|
291
|
+
#
|
292
|
+
# * When tp-make, tp-assign, or tp-finish tells you it failed an
|
293
|
+
# upload, deletion, or Amazon command, simply re-run the script
|
294
|
+
# with the same arguments to re-attempt the upload, deletion or
|
295
|
+
# Amazon command. Typingpool carefully records which network
|
296
|
+
# operations it is attempting and which network operations have
|
297
|
+
# completed. It can robustly handle network errors, including
|
298
|
+
# uncaught exceptions.
|
299
|
+
##
|
300
|
+
#
|
301
|
+
#=== Maintenance
|
302
|
+
#
|
303
|
+
# [cache] If the cache file grows too large, you'll need to delete it
|
304
|
+
# manually. It may be safely deleted as long as no
|
305
|
+
# Typingpool scripts are running. Its location is
|
306
|
+
# specified in the 'cache' param in the config file. (The
|
307
|
+
# config file is at ~/.typingpool and the cache, by
|
308
|
+
# default, is at ~/.typingpool.cache.)
|
309
|
+
#
|
310
|
+
# Typingpool takes no steps to limit the size of the
|
311
|
+
# cache file. It prunes the cache of project-specific
|
312
|
+
# entries when you run tp-finish on a project, but the
|
313
|
+
# cache may grow large if you work on many active
|
314
|
+
# projects in parallel, or if you fail to run tp-finish
|
315
|
+
# on projects when you are done with them.
|
316
|
+
#
|
317
|
+
# [tp-finish] You should run tp-finish PROJECT each time you finish a
|
318
|
+
# project, where PROJECT may be either the project name
|
319
|
+
# or path. Assuming you have no submissions pending or
|
320
|
+
# awaiting approval, this clears all traces of the
|
321
|
+
# project from Amazon Mechanical Turk, from Amazon S3 or
|
322
|
+
# your SFTP server, and from the local cache. This will
|
323
|
+
# keep your local cache from ballooning in size and will
|
324
|
+
# minimize your S3 charges or SFTP disk usage. It will
|
325
|
+
# also help Typingpool scripts run faster by reducing the
|
326
|
+
# number of HITs you have on Amazon Mechanical Turk; many
|
327
|
+
# Typingpool operations involve iterating through all of
|
328
|
+
# your HITs.
|
329
|
+
#
|
330
|
+
#
|
331
|
+
#=== See also
|
332
|
+
#
|
333
|
+
# * Run any script with the --help option for further details on how
|
334
|
+
# to run the script.
|
335
|
+
#
|
336
|
+
# * See the docs for Typingpool::Config for details of the config
|
337
|
+
# file format.
|
338
|
+
#
|
339
|
+
# * See Amazon's Mechanical Turk documentation for guides and
|
340
|
+
# overviews on how Mechanical Turk
|
341
|
+
# works. https://requester.mturk.com/help
|
342
|
+
#
|
343
|
+
# * See the documentation on ffmpeg and related libraries for clues
|
344
|
+
# as to how to make Typingpool support additional file
|
345
|
+
# formats. Typingpool can work with any file format that ffmpeg can
|
346
|
+
# convert to mp3 (libmp3lame).
|
347
|
+
#
|
348
|
+
# * For an overview of the concepts on which Typingpool is built, see
|
349
|
+
# Andy Baio’s guide to using Mechanical Turk for transcription:
|
350
|
+
# http://waxy.org/2008/09/audio_transcription_with_mechanical_turk/
|
351
|
+
#
|
352
|
+
#== Developer overview
|
353
|
+
#
|
354
|
+
#Views, used for the final transcript and for rendering HTML
|
355
|
+
#assignments for Amazon Mechanical Turk workers, are contained in a
|
356
|
+
#series of templates in lib/typingpool/templates, particularly
|
357
|
+
#transcript.html.erb and assignment/*. The control layer lives in the
|
358
|
+
#App class (lib/typingpool/app.rb) and within the individual
|
359
|
+
#scripts. The models constitute the other Typingpool classes,
|
360
|
+
#including most importantly and in rough order of importance the
|
361
|
+
#Project, Transcript, Amazon, Config and Filer classes (the latter of
|
362
|
+
#interest mainly because of Filer::Audio, which handles splitting,
|
363
|
+
#merging, and conversion).
|
364
|
+
#
|
365
|
+
#The models in particular, along with the App class, are
|
366
|
+
#underdeveloped and not particularly clear or fully thought
|
367
|
+
#through. The Transcript model, for example, should almost certainly
|
368
|
+
#be folded into the Project model. Dividing Project into
|
369
|
+
#Project::Local and Project::Remote only makes sense on a superficial
|
370
|
+
#level; Project::Remote could probably be its own class or even part
|
371
|
+
#of Utility. Amazon will probably be simpler if I can get some patches
|
372
|
+
#into RTurk, and Amazon::HIT should probably be integrated more closely
|
373
|
+
#with Project.
|
374
|
+
#
|
375
|
+
#One of the most frustrating things about the code is that there are
|
376
|
+
#so many subtly different ways a "chunk" of a transcript/project is
|
377
|
+
#represented: As a simple hash derived from a row in
|
378
|
+
#data/assignment.csv within a project folder, as an Amazon::HIT, as a
|
379
|
+
#Transcript::Chunk, as an audio file on a remote server, and as a
|
380
|
+
#local audio file (which has a different name from the remote
|
381
|
+
#file). So in future versions I'll probably reduce the number of
|
382
|
+
#different ways to represent a chunk.
|
383
|
+
#
|
384
|
+
#Also in the future, it's very likely that App will evolve from a
|
385
|
+
#simple collection of class methods into a real class with a simple
|
386
|
+
#set of instance methods called in a particular order by a "run"
|
387
|
+
#method or similar. Subclasses for particular scripts/commands will
|
388
|
+
#then override these methods.
|
389
|
+
#
|
390
|
+
#
|
391
|
+
#===Examples
|
392
|
+
#
|
393
|
+
#The most comprehensive examples of how the Typingpool classes
|
394
|
+
#actually work and interact are the tp-* scripts themselves, in
|
395
|
+
#particular tp-make, tp-assign, tp-review, and tp-finish.
|
396
|
+
#
|
397
|
+
#More concise examples follow below, to give you a sense of what the
|
398
|
+
#various classes actually do:
|
399
|
+
#
|
400
|
+
# require 'typingpool'
|
401
|
+
#
|
402
|
+
# #new Project instance
|
403
|
+
# project = Typingpool::Project.new('Chad Interview')
|
404
|
+
#
|
405
|
+
# #check if project exists on disk
|
406
|
+
# unless project.local
|
407
|
+
# #make a skeleton project folder in Config#transcripts dir
|
408
|
+
# project.create_local
|
409
|
+
# #make subtitle record in project folder
|
410
|
+
# project.local.subtitle = 'Interview about Hack Day Jan 21'
|
411
|
+
# end
|
412
|
+
#
|
413
|
+
# id = project.local.id
|
414
|
+
#
|
415
|
+
# #Wrap file in Typingpool::Filer
|
416
|
+
# wma = Typingpool::Filer::Audio.new('/foo/bar.wma')
|
417
|
+
#
|
418
|
+
# #convert file to mp3
|
419
|
+
# mp3 = wma.to_mp3
|
420
|
+
# other_mp3 = Typingpool::Filer::Audio.new('/foo/bar2.wma').to_mp3
|
421
|
+
#
|
422
|
+
# #merge audio
|
423
|
+
# combined_mp3 = Typingpool::Filer::Files::Audio.new([mp3,
|
424
|
+
# other_mp3]).merge(Typingpool::Filer.new('/foo/combined.mp3'))
|
425
|
+
#
|
426
|
+
# #split audio every 1 minute
|
427
|
+
# chunks = combined_mp3.split('1.00')
|
428
|
+
#
|
429
|
+
# #upload mp3s
|
430
|
+
# urls = project.remote.put(chunks.to_streams,
|
431
|
+
# project.create_remote_names(chunks))
|
432
|
+
#
|
433
|
+
# #remove mp3s
|
434
|
+
# project.remote.remove_urls(urls)
|
435
|
+
#
|
436
|
+
# #new Template instance
|
437
|
+
# template = Typingpool::Template::Assignment.from_config('interview/nameless')
|
438
|
+
# html = template.render({
|
439
|
+
# 'audio_url' => urls[0],
|
440
|
+
# 'unusual' => ['Hack Day', 'Yahoo', 'Atlassian'],
|
441
|
+
# 'chunk_minutes' => 1,
|
442
|
+
# 'project_id' => project.local.id
|
443
|
+
# })
|
444
|
+
#
|
445
|
+
# question = Typingpool::Amazon::Question.new(urls[0], html)
|
446
|
+
#
|
447
|
+
# Typingpool::Amazon.setup
|
448
|
+
#
|
449
|
+
# #Assign a transcription job (1 chunk)
|
450
|
+
# hit = Typingpool::Amazon::HIT.create(question, Typingpool::Config.file.assign)
|
451
|
+
#
|
452
|
+
# #Find all Typingpool HITs on Amazon Mechanical Turk
|
453
|
+
# all = Typingpool::Amazon::HIT.all
|
454
|
+
# #Find all reviewable Typingpool HITs
|
455
|
+
# reviewable = Typingpool::Amazon::HIT.all_reviewable
|
456
|
+
# #Find all approved Typingpool HITs
|
457
|
+
# approved = Typingpool::Amazon::HIT.all_approved
|
458
|
+
# #Find all HITs for our project
|
459
|
+
# project_hits = Typingpool::Amazon::HIT.all_for_project(project.local.id)
|
460
|
+
# #Filter all HITs (not just Typingpool HITs) arbitrarily
|
461
|
+
# safe_to_delete = Typingpool::Amazon::HIT.all{|hit| hit.ours? && hit.full.expired_and_overdue? }
|
462
|
+
# #Filter all approved HITs arbitrarily
|
463
|
+
# ready_for_judgment = Typingpool::Amazon::HIT.all_approved{|hit| hit.submitted? && hit.ours? }
|
464
|
+
#
|
465
|
+
# #Approve a HIT
|
466
|
+
# ready_for_judgment[0].at_amazon.approve! #at_amazon is an rturk instance
|
467
|
+
# #Reject a HIT
|
468
|
+
# ready_for_judgment[1].at_amazon.reject!('Your transcription is just random gibberish')
|
469
|
+
# #Delete a HIT from Amazon
|
470
|
+
# safe_to_delete[0].remove_from_amazon
|
471
|
+
#
|
472
|
+
# #Get text of transcript chunk (Typingpool::Transcript::Chunk)
|
473
|
+
# transcript_chunk = approved[0].transcript
|
474
|
+
# puts transcript_chunk.body
|
475
|
+
# #Get formatted text of transcript chunk
|
476
|
+
# puts transcript_chunk.body_as_text
|
477
|
+
# #Get transcript chunk as HTML
|
478
|
+
# puts transcript_chunk.body_as_html
|
479
|
+
# #Get transcript chunk metadata
|
480
|
+
# puts "--#{transcript_chunk.url} (audio at #{transcript_chunk.offset})"
|
481
|
+
#
|
482
|
+
#==Author
|
483
|
+
# Ryan Tate - ryantate@ryantate.com
|
484
|
+
#
|
485
|
+
#==License
|
486
|
+
# Copyright (c) 2011-2012 Ryan Tate. Released under the terms of the MIT
|
487
|
+
# license. See LICENSE for details.
|
488
|
+
|
489
|
+
# Top-level namespace for the Typingpool library. Defines the gem version
# and requires each library component.
module Typingpool
  VERSION = '0.7.0'

  # Components are required one at a time, in the order listed here.
  [
    'typingpool/error',
    'typingpool/utility',
    'typingpool/config',
    'typingpool/filer',
    'typingpool/amazon',
    'typingpool/project',
    'typingpool/transcript',
    'typingpool/template',
    'typingpool/app'
  ].each { |component| require component }
end #Typingpool
|
@@ -0,0 +1,24 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
$:.unshift File.join(File.dirname(File.dirname($0)), 'lib')
|
4
|
+
|
5
|
+
require 'typingpool'
|
6
|
+
require 'typingpool/test'
|
7
|
+
require 'fileutils'
|
8
|
+
|
9
|
+
# Fixture generator: renders an assignment template against the first row
# of a freshly made project's data/assignment.csv and writes the resulting
# HTML, plus a placeholder question URL, into the fixtures directory.
class MakeAmazonQuestion < Typingpool::Test::Script
  # Builds a temp project, renders one question, sanity-checks the output
  # and saves it as amazon-question-html.html / amazon-question-url.txt
  # under fixtures_dir.
  def test_make_amazon_question_fixture
    in_temp_tp_dir do |dir|
      tp_make(dir)
      template = Typingpool::Template::Assignment.from_config(assign_default[:template], config_from_dir(dir))
      assignment = temp_tp_dir_project(dir).local.file('data', 'assignment.csv').as(:csv).read.first
      question_html = template.render(assignment)
      question_url = 'http://example.com/assignments/101.html'
      # assert_match takes the pattern first and the value under test
      # second; the reversed order only passes by accident on some old
      # test-framework versions.
      assert_match(/\S/, question_html)
      assert_match(/http/i, question_url)
      File.open(File.join(fixtures_dir, 'amazon-question-html.html'), 'w'){|f| f << question_html}
      File.open(File.join(fixtures_dir, 'amazon-question-url.txt'), 'w'){|f| f << question_url}
    end #in_temp_tp_dir
    add_goodbye_message("Amazon question fixtures created.")
  end
end #MakeAmazonQuestion
|
@@ -0,0 +1,26 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
$:.unshift File.join(File.dirname(File.dirname($0)), 'lib')
|
4
|
+
|
5
|
+
require 'typingpool'
|
6
|
+
require 'typingpool/test'
|
7
|
+
require 'fileutils'
|
8
|
+
|
9
|
+
# Step 1 of the tp-collect fixture series: makes a temp fixture project,
# runs tp-make and tp-assign against it, then copies key project files
# into the fixture directory.
class MakeAndAssignProject < Typingpool::Test::Script
  # Creates and assigns the 'tp_collect_project_temp' project. If making or
  # assigning fails, the project dir is removed before the error propagates.
  def test_prep_for_fixture
    project_dir = make_fixture_project_dir('tp_collect_project_temp')
    setup_temp_tp_dir(project_dir)
    begin
      tp_make(project_dir)
      tp_assign(project_dir)
    rescue StandardError
      # Clean up the half-built project, then re-raise the original error.
      FileUtils.remove_entry_secure(project_dir)
      raise
    end
    # Copy key files over to permanent locations within the fixture dir.
    with_fixtures_in_temp_tp_dir(project_dir, 'tp_collect_') do |fixture_file, project_file|
      FileUtils.cp(project_file, fixture_file)
    end
    next_steps = "Temp project assigned in Mechanical Turk sandbox. Complete and approve TWO assignments and run make_tp_collect_fixture_2.rb. Check for assignments at\nhttps://workersandbox.mturk.com/mturk/searchbar?minReward=0.00&searchWords=typingpooltest&selectedSearchType=hitgroups\n...and then approve them at\nhttps://requestersandbox.mturk.com/mturk/manageHITs?hitSortType=CREATION_DESCENDING&%2Fsort.x=11&%2Fsort.y=7"
    add_goodbye_message(next_steps)
  end
end #MakeAndAssignProject
|
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
$:.unshift File.join(File.dirname(File.dirname($0)), 'lib')
|
4
|
+
|
5
|
+
|
6
|
+
require 'typingpool/test'
|
7
|
+
require 'fileutils'
|
8
|
+
|
9
|
+
# Step 2 of the tp-collect fixture series: runs tp-collect against the
# temp fixture project with the VCR fixture path 'tp-collect-1' and checks
# that the corresponding .yml file exists afterwards.
class CollectProjectFixtureGen2 < Typingpool::Test::Script
  # Populates fixtures_dir/vcr/tp-collect-1.yml and tells the operator
  # what to do before running the next script in the series.
  def test_populate_fixture
    fixture_path = File.join(fixtures_dir, 'vcr', 'tp-collect-1')
    tp_collect_with_fixture(fixture_project_dir('tp_collect_project_temp'), fixture_path)
    # File.exists? is deprecated and was removed in Ruby 3.2; File.exist?
    # is the supported spelling on every Ruby version.
    assert(File.exist?("#{fixture_path}.yml"))
    add_goodbye_message("Initial tp-collect recorded. Please complete and approve TWO more assignments and run make_tp_collect_fixture_3.rb. Check for assignments at\nhttps://workersandbox.mturk.com/mturk/searchbar?minReward=0.00&searchWords=typingpooltest&selectedSearchType=hitgroups\n...and then approve them at\nhttps://requestersandbox.mturk.com/mturk/manageHITs?hitSortType=CREATION_DESCENDING&%2Fsort.x=11&%2Fsort.y=7")
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
$:.unshift File.join(File.dirname(File.dirname($0)), 'lib')
|
4
|
+
|
5
|
+
require 'typingpool/test'
|
6
|
+
require 'fileutils'
|
7
|
+
|
8
|
+
# Third step of generating the tp-collect test fixtures: runs tp-collect
# against the temporary project while recording to the 'tp-collect-2' fixture.
class CollectProjectFixtureGen3 < Typingpool::Test::Script
  # Records the second tp-collect run and verifies that the fixture file
  # "<fixtures_dir>/vcr/tp-collect-2.yml" was written, then tells the operator
  # which manual step comes next.
  def test_populate_fixture2
    fixture_path = File.join(fixtures_dir, 'vcr', 'tp-collect-2')
    tp_collect_with_fixture(fixture_project_dir('tp_collect_project_temp'), fixture_path)
    # File.exist? is used because the File.exists? alias was deprecated and
    # then removed in Ruby 3.2, where calling it raises NoMethodError.
    assert(File.exist?("#{fixture_path}.yml"))
    add_goodbye_message("Second tp-collect recorded. Please complete and approve THREE more assignments and run make_tp_collect_fixture_4.rb. Check for assignments at\nhttps://workersandbox.mturk.com/mturk/searchbar?minReward=0.00&searchWords=typingpooltest&selectedSearchType=hitgroups\n...and then approve them at\nhttps://requestersandbox.mturk.com/mturk/manageHITs?hitSortType=CREATION_DESCENDING&%2Fsort.x=11&%2Fsort.y=7")
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
$:.unshift File.join(File.dirname(File.dirname($0)), 'lib')
|
4
|
+
|
5
|
+
require 'typingpool/test'
|
6
|
+
require 'fileutils'
|
7
|
+
|
8
|
+
# Final step of generating the tp-collect test fixtures: records the
# 'tp-collect-3' fixture, then finishes and removes the temporary project.
class CollectProjectFixtureGen4 < Typingpool::Test::Script
  # Records the third tp-collect run, verifies that the fixture file
  # "<fixtures_dir>/vcr/tp-collect-3.yml" was written, runs tp_finish on the
  # temporary project and removes its fixture project directory.
  def test_populate_fixture3
    project_name = 'tp_collect_project_temp'
    fixture_path = File.join(fixtures_dir, 'vcr', 'tp-collect-3')
    tp_collect_with_fixture(fixture_project_dir(project_name), fixture_path)
    # File.exist? is used because the File.exists? alias was deprecated and
    # then removed in Ruby 3.2, where calling it raises NoMethodError.
    assert(File.exist?("#{fixture_path}.yml"))
    # Cleanup only happens once the recording has been verified above.
    tp_finish(fixture_project_dir(project_name))
    remove_fixture_project_dir(project_name)
    add_goodbye_message("Third and final tp-collect recorded. Fixtures for tp-collect testing successfully generated in #{File.dirname(fixture_path)}!")
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
$:.unshift File.join(File.dirname(File.dirname($0)), 'lib')
|
4
|
+
|
5
|
+
require 'typingpool'
|
6
|
+
require 'typingpool/test'
|
7
|
+
require 'fileutils'
|
8
|
+
|
9
|
+
# First step of generating the tp-review test fixtures: creates the temporary
# 'tp_review_project_temp' fixture project, runs tp_make and tp_assign on it,
# and copies key project files into the fixture directory.
class ReviewProjectFixtureGen1 < Typingpool::Test::Script
  # Builds and assigns the temporary project, then tells the operator which
  # manual step comes next.
  #
  # If tp_make or tp_assign raises a StandardError, the project directory is
  # removed and the error is re-raised.
  def test_prep_for_fixture
    project_dir = make_fixture_project_dir('tp_review_project_temp')
    setup_temp_tp_dir(project_dir)
    begin
      tp_make(project_dir)
      tp_assign(project_dir)
    rescue StandardError
      # Don't leave a half-built project behind; clean up, then propagate.
      FileUtils.remove_entry_secure(project_dir)
      raise
    end

    # Copy key files over to permanent locations within the fixture dir.
    # The helper yields (destination in fixtures, source in project) --
    # presumably one pair per file; confirm against Typingpool::Test::Script.
    with_fixtures_in_temp_tp_dir(project_dir, 'tp_review_') { |destination, source| FileUtils.cp(source, destination) }

    next_steps = "Temp project assigned in Mechanical Turk sandbox. Complete SIX assignments and run make_tp_review_fixture_2.rb. Check for assignments at\nhttps://workersandbox.mturk.com/mturk/searchbar?minReward=0.00&searchWords=typingpooltest&selectedSearchType=hitgroups\n"
    add_goodbye_message(next_steps)
  end
end # ReviewProjectFixtureGen1
|
@@ -0,0 +1,30 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
$:.unshift File.join(File.dirname(File.dirname($0)), 'lib')
|
4
|
+
|
5
|
+
require 'typingpool/test'
|
6
|
+
require 'fileutils'
|
7
|
+
|
8
|
+
# Final step of generating the tp-review test fixtures: records two tp-review
# sessions ('tp-review-1' and 'tp-review-2'), then finishes and removes the
# temporary project.
class ReviewProjectFixtureGen2 < Typingpool::Test::Script
  # Runs tp-review twice against the temporary project, each time with a
  # scripted list of responses, and checks after each run that the exit status
  # is 0 and that the matching .yml fixture file was written. Afterwards runs
  # tp_finish and removes the fixture project directory.
  def test_populate_fixture
    fixture_path = File.join(fixtures_dir, 'vcr', 'tp-review-1')
    dir = fixture_project_dir('tp_review_project_temp')
    output = nil
    assert_nothing_raised do
      # Scripted responses for the first session -- presumably
      # approve/reject/skip/quit choices; confirm against tp-review.
      output = tp_review_with_fixture(dir, fixture_path, %w(a r a r s q ))
    end
    assert_equal(0, output[:status].to_i, "Bad exit code: #{output[:status]} err: #{output[:err]}")
    # File.exist? is used because the File.exists? alias was deprecated and
    # then removed in Ruby 3.2, where calling it raises NoMethodError.
    assert(File.exist?("#{fixture_path}.yml"))

    fixture_path = File.join(fixtures_dir, 'vcr', 'tp-review-2')
    assert_nothing_raised do
      output = tp_review_with_fixture(dir, fixture_path, %w(a r))
    end
    assert_equal(0, output[:status].to_i, "Bad exit code: #{output[:status]} err: #{output[:err]}")
    assert(File.exist?("#{fixture_path}.yml"))

    # Cleanup only happens once both recordings have been verified above.
    tp_finish(dir)
    remove_fixture_project_dir('tp_review_project_temp')
    add_goodbye_message("All done!")
  end
end
|