typingpool 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +23 -0
- data/bin/tp-assign +240 -0
- data/bin/tp-collect +50 -0
- data/bin/tp-config +114 -0
- data/bin/tp-finish +101 -0
- data/bin/tp-make +169 -0
- data/bin/tp-review +175 -0
- data/lib/typingpool/amazon.rb +732 -0
- data/lib/typingpool/app.rb +634 -0
- data/lib/typingpool/config.rb +344 -0
- data/lib/typingpool/error.rb +22 -0
- data/lib/typingpool/filer.rb +396 -0
- data/lib/typingpool/project.rb +593 -0
- data/lib/typingpool/template.rb +175 -0
- data/lib/typingpool/templates/assignment/amazon-init.js +38 -0
- data/lib/typingpool/templates/assignment/interview/nameless.html.erb +13 -0
- data/lib/typingpool/templates/assignment/interview/noisy.html.erb +12 -0
- data/lib/typingpool/templates/assignment/interview/partials/voices.html.erb +10 -0
- data/lib/typingpool/templates/assignment/interview/phone.html.erb +12 -0
- data/lib/typingpool/templates/assignment/interview.html.erb +11 -0
- data/lib/typingpool/templates/assignment/main.css +20 -0
- data/lib/typingpool/templates/assignment/partials/entry.html.erb +19 -0
- data/lib/typingpool/templates/assignment/partials/footer.html.erb +3 -0
- data/lib/typingpool/templates/assignment/partials/header.html.erb +11 -0
- data/lib/typingpool/templates/assignment/partials/labeling-example.html.erb +4 -0
- data/lib/typingpool/templates/assignment/partials/labeling.html.erb +5 -0
- data/lib/typingpool/templates/assignment/partials/length-description.html.erb +6 -0
- data/lib/typingpool/templates/assignment/partials/voices.html.erb +10 -0
- data/lib/typingpool/templates/assignment/speech.html.erb +11 -0
- data/lib/typingpool/templates/config.yml +21 -0
- data/lib/typingpool/templates/project/audio/chunks/.empty_directory +0 -0
- data/lib/typingpool/templates/project/audio/originals/.empty_directory +0 -0
- data/lib/typingpool/templates/project/data/.empty_directory +0 -0
- data/lib/typingpool/templates/project/etc/ About these files - read me.txt +8 -0
- data/lib/typingpool/templates/project/etc/audio-compat.js +25 -0
- data/lib/typingpool/templates/project/etc/player/audio-player.js +4 -0
- data/lib/typingpool/templates/project/etc/player/license.txt +19 -0
- data/lib/typingpool/templates/project/etc/player/player.swf +0 -0
- data/lib/typingpool/templates/project/etc/transcript.css +49 -0
- data/lib/typingpool/templates/transcript.html.erb +23 -0
- data/lib/typingpool/test/fixtures/amazon-question-html.html +95 -0
- data/lib/typingpool/test/fixtures/amazon-question-url.txt +1 -0
- data/lib/typingpool/test/fixtures/audio/mp3/interview.1.mp3 +0 -0
- data/lib/typingpool/test/fixtures/audio/mp3/interview.2.mp3 +0 -0
- data/lib/typingpool/test/fixtures/audio/wma/VN620007.WMA +0 -0
- data/lib/typingpool/test/fixtures/audio/wma/VN620052.WMA +0 -0
- data/lib/typingpool/test/fixtures/config-1 +20 -0
- data/lib/typingpool/test/fixtures/config-2 +25 -0
- data/lib/typingpool/test/fixtures/not_yaml.txt +4 -0
- data/lib/typingpool/test/fixtures/template-2.html.erb +10 -0
- data/lib/typingpool/test/fixtures/template-3.html.erb +22 -0
- data/lib/typingpool/test/fixtures/template.html.erb +10 -0
- data/lib/typingpool/test/fixtures/tp_collect_id.txt +1 -0
- data/lib/typingpool/test/fixtures/tp_collect_sandbox-assignment.csv +8 -0
- data/lib/typingpool/test/fixtures/tp_review_id.txt +1 -0
- data/lib/typingpool/test/fixtures/tp_review_sandbox-assignment.csv +8 -0
- data/lib/typingpool/test/fixtures/transcript-chunks.csv +226 -0
- data/lib/typingpool/test/fixtures/utf8_transcript.txt +7 -0
- data/lib/typingpool/test/fixtures/vcr/tp-collect-1.yml +2712 -0
- data/lib/typingpool/test/fixtures/vcr/tp-collect-2.yml +2718 -0
- data/lib/typingpool/test/fixtures/vcr/tp-collect-3.yml +2768 -0
- data/lib/typingpool/test/fixtures/vcr/tp-review-1.yml +570 -0
- data/lib/typingpool/test/fixtures/vcr/tp-review-2.yml +351 -0
- data/lib/typingpool/test.rb +418 -0
- data/lib/typingpool/transcript.rb +181 -0
- data/lib/typingpool/utility.rb +272 -0
- data/lib/typingpool.rb +500 -0
- data/test/make_amazon_question_fixture.rb +24 -0
- data/test/make_tp_collect_fixture_1.rb +26 -0
- data/test/make_tp_collect_fixture_2.rb +16 -0
- data/test/make_tp_collect_fixture_3.rb +15 -0
- data/test/make_tp_collect_fixture_4.rb +17 -0
- data/test/make_tp_review_fixture_1.rb +26 -0
- data/test/make_tp_review_fixture_2.rb +30 -0
- data/test/make_transcript_chunks_fixture.rb +53 -0
- data/test/test_integration_script_1_tp_config.rb +108 -0
- data/test/test_integration_script_2_tp_make.rb +119 -0
- data/test/test_integration_script_3_tp_assign.rb +152 -0
- data/test/test_integration_script_4_tp_review.rb +72 -0
- data/test/test_integration_script_5_tp_collect.rb +44 -0
- data/test/test_integration_script_6_tp_finish.rb +123 -0
- data/test/test_unit_amazon.rb +153 -0
- data/test/test_unit_config.rb +94 -0
- data/test/test_unit_filer.rb +202 -0
- data/test/test_unit_project.rb +168 -0
- data/test/test_unit_project_local.rb +68 -0
- data/test/test_unit_project_remote.rb +157 -0
- data/test/test_unit_template.rb +111 -0
- data/test/test_unit_transcript.rb +77 -0
- metadata +234 -0
data/lib/typingpool.rb
ADDED
|
@@ -0,0 +1,500 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
#= Typingpool
|
|
3
|
+
#
|
|
4
|
+
#Typingpool is an app for easily making transcripts of audio using
|
|
5
|
+
#Amazon's labor marketplace, Mechanical Turk.
|
|
6
|
+
#
|
|
7
|
+
#Typingpool is distributed as a Ruby gem. It is made up of a handful
|
|
8
|
+
#of scripts for users and a collection of library files for
|
|
9
|
+
#developers.
|
|
10
|
+
#
|
|
11
|
+
#Typingpool also includes a collection of ERB templates for
|
|
12
|
+
#generating Mechanical Turk assignments and the final transcript HTML
|
|
13
|
+
#file.
|
|
14
|
+
#
|
|
15
|
+
#== Dependencies
|
|
16
|
+
#
|
|
17
|
+
#Typingpool depends on these command-line tools, which are not
|
|
18
|
+
#included in the gem since they are external to Ruby:
|
|
19
|
+
#
|
|
20
|
+
# [ffmpeg] A powerhouse audio/video converter.
|
|
21
|
+
# [libmp3lame] An mp3 encoder/decoder, used by ffmpeg.
|
|
22
|
+
# [mp3splt] An audio file-splitting utility.
|
|
23
|
+
# [mp3wrap] An audio file-merging utility.
|
|
24
|
+
#
|
|
25
|
+
#== User overview
|
|
26
|
+
#
|
|
27
|
+
#=== Setup
|
|
28
|
+
#
|
|
29
|
+
#After installing the gem and its dependencies, run tp-config from the
|
|
30
|
+
#command line to create your config file (~/.typingpool). At the
|
|
31
|
+
#prompts, you will need to supply your Amazon Web Services Access Key
|
|
32
|
+
#ID and your Amazon Web Services Secret Access Key.
|
|
33
|
+
#
|
|
34
|
+
#The config file is in YAML format and may be customized using any
|
|
35
|
+
#text editor. For more details on configuration options, see the
|
|
36
|
+
#documentation for Typingpool::Config.
|
|
37
|
+
#
|
|
38
|
+
#=== Workflow
|
|
39
|
+
#
|
|
40
|
+
#A typical workflow will use the bundled scripts in this order:
|
|
41
|
+
#
|
|
42
|
+
# tp-make -> tp-assign -> [wait] -> tp-review -> tp-finish
|
|
43
|
+
#
|
|
44
|
+
#tp-review may be called repeatedly, until transcripts for all audio
|
|
45
|
+
#chunks have been processed. Similarly, tp-assign may be called
|
|
46
|
+
#repeatedly, for example to re-assign chunks rejected using tp-review,
|
|
47
|
+
#or to re-assign chunks that have expired.
|
|
48
|
+
#
|
|
49
|
+
#An alternate workflow would go like this:
|
|
50
|
+
#
|
|
51
|
+
# tp-make -> [manually upload assignments.csv to Amazon RUI] ->
|
|
52
|
+
# [wait] -> [approve/reject assignments via RUI] -> tp-collect ->
|
|
53
|
+
# tp-finish
|
|
54
|
+
#
|
|
55
|
+
#=== Examples
|
|
56
|
+
#
|
|
57
|
+
#Typical usage scenario:
|
|
58
|
+
#
|
|
59
|
+
# tp-make 'Chad Interview' chad1.WMA chad2.WMA --unusual 'Hack Day,
|
|
60
|
+
# Yahoo' --subtitle 'Phone interview re Yahoo Hack Day'
|
|
61
|
+
#
|
|
62
|
+
# # => Converting chad1.WMA to mp3
|
|
63
|
+
# # => Converting chad2.WMA to mp3
|
|
64
|
+
# # => Merging audio
|
|
65
|
+
# # => Splitting audio into uniform bits
|
|
66
|
+
# # => Uploading Chad Interview.00.00.mp3 to
|
|
67
|
+
# ryantate42.s3.amazonaws.com as Chad
|
|
68
|
+
# Interview.00.00.33ca7f2cceba9f8031bf4fb7c3f819f4.LHFJEM.mp3
|
|
69
|
+
# # => Uploading Chad Interview.01.00.mp3 to
|
|
70
|
+
# ryantate42.s3.amazonaws.com as Chad
|
|
71
|
+
# Interview.01.00.33ca7f2cceba9f8031bf4fb7c3f819f4.XMWNYW.mp3
|
|
72
|
+
# # => Uploading Chad Interview.02.00.mp3 to
|
|
73
|
+
# ryantate42.s3.amazonaws.com as Chad
|
|
74
|
+
# Interview.02.00.33ca7f2cceba9f8031bf4fb7c3f819f4.FNEIWN.mp3
|
|
75
|
+
# # => ... [snip]
|
|
76
|
+
# # => Done. Project at:
|
|
77
|
+
# # => /Users/ryantate/Desktop/Transcripts/Chad Interview
|
|
78
|
+
#
|
|
79
|
+
#
|
|
80
|
+
# tp-assign 'Chad Interview' interview/nameless --reward 1.00
|
|
81
|
+
# --deadline 90m --approval 6h --lifetime 2d
|
|
82
|
+
#
|
|
83
|
+
# # => Figuring out what needs to be assigned
|
|
84
|
+
# # => 85 assignments total
|
|
85
|
+
# # => 85 assignments to assign
|
|
86
|
+
# # => Deleting old assignment HTML from ryantate42.s3.amazonaws.com
|
|
87
|
+
# # => Uploading assignment HTML to ryantate42.s3.amazonaws.com
|
|
88
|
+
# # => Assigning
|
|
89
|
+
# # => Assigned 85 transcription jobs for $85
|
|
90
|
+
# # => Remaining balance: $115.00
|
|
91
|
+
#
|
|
92
|
+
# [Wait...]
|
|
93
|
+
#
|
|
94
|
+
#
|
|
95
|
+
# tp-review 'Chad Interview'
|
|
96
|
+
#
|
|
97
|
+
# # => Gathering submissions from Amazon
|
|
98
|
+
# # => Matching submissions with local projects
|
|
99
|
+
# # =>
|
|
100
|
+
# # => Transcript for: https://ryantate42.s3.amazonaws.com/
|
|
101
|
+
# Chad%20Interview.29.00.263d492275a81afb005c8231d8d8afdb.
|
|
102
|
+
# UEMOCN.mp3
|
|
103
|
+
# # => Project: Chad Interview: Phone interview re Yahoo Hack Day
|
|
104
|
+
# # => Submitted at: 2012-08-11 17:00:36 -0700 by A9S0AOAI8HO9P
|
|
105
|
+
# # =>
|
|
106
|
+
# # => Chad: ... so it had sort of some geek history. And the
|
|
107
|
+
# # => weather was really bad. But it was an indoor event,
|
|
108
|
+
# # => right? So people were staying indoors. And like very
|
|
109
|
+
# # => early... And there was all this really expensive gear
|
|
110
|
+
# # => that the BBC had. Like these cameras that guys were like
|
|
111
|
+
# # => riding around on and stuff, huge sound stage, bigger than
|
|
112
|
+
# # => the one we had in Sunnyvale.
|
|
113
|
+
# # =>
|
|
114
|
+
# # => Two hours into the event, we heard this big lightning
|
|
115
|
+
# # => strike, because we were up on a hill in London. And all
|
|
116
|
+
# # => the lights went out and the roof opened up in the
|
|
117
|
+
# # => building. What we didn't know is the fire suppression
|
|
118
|
+
# # => system in that building which got blown up by the
|
|
119
|
+
# # => lightning during a fire would cause the roof to open
|
|
120
|
+
# # => up. So we had all these geeks with equipment and all this
|
|
121
|
+
# # => BBC equipment and it was literally raining on them.
|
|
122
|
+
# # =>
|
|
123
|
+
# # => (A)pprove, (R)eject, (Q)uit, [(S)kip]? (1/20)
|
|
124
|
+
#
|
|
125
|
+
# a
|
|
126
|
+
#
|
|
127
|
+
# # => Approved. Chad Interview transcript updated.
|
|
128
|
+
# # =>
|
|
129
|
+
# # => Transcript for: https://ryantate42.s3.amazonaws.com/
|
|
130
|
+
# Chad%20Interview.30.00.263d492275a81afb005c8231d8d8afdb.
|
|
131
|
+
# RXNKRN.mp3
|
|
132
|
+
# # => Project: Chad Interview: Phone interview re Yahoo Hack Day
|
|
133
|
+
# # => Submitted at: 2012-08-11 17:00:58 -0700 by A9S0AOAI8HO9P
|
|
134
|
+
# # =>
|
|
135
|
+
# # => Blah blah blah blah okay I am done typing byeeeeeeee
|
|
136
|
+
# # =>
|
|
137
|
+
# # => (A)pprove, (R)eject, (Q)uit, [(S)kip]? (2/20)
|
|
138
|
+
#
|
|
139
|
+
# r
|
|
140
|
+
#
|
|
141
|
+
# # => Rejection reason, for worker:
|
|
142
|
+
#
|
|
143
|
+
# There's no transcription at all, just nonsense
|
|
144
|
+
#
|
|
145
|
+
# # => Rejected
|
|
146
|
+
# # =>
|
|
147
|
+
# # => Transcript for...
|
|
148
|
+
# # => ... [snip]
|
|
149
|
+
#
|
|
150
|
+
#
|
|
151
|
+
# tp-finish 'Chad Interview'
|
|
152
|
+
#
|
|
153
|
+
# # => Removing from Amazon
|
|
154
|
+
# # => Collecting all results
|
|
155
|
+
# # => Removing HIT 2GKMIKMN9U8PNHKK58NXL3SU4TCBSN (Reviewable)
|
|
156
|
+
# # => Removing from data/assignment.csv
|
|
157
|
+
# # => Removing from local cache
|
|
158
|
+
# # => Removing HIT 2CFX2Q45UUKQ2HXZU8SNV8OG6CQBTC (Assignable)
|
|
159
|
+
# # => Removing from data/assignment.csv
|
|
160
|
+
# # => Removing from local cache
|
|
161
|
+
# # => Removing HIT 294EZZ2MIKMNNDP1LAU8WWWXOEI7O0...
|
|
162
|
+
# # => ... [snip]
|
|
163
|
+
# # => Removing Chad Interview.00.00.
|
|
164
|
+
# 263d492275a81afb005c8231d8d8afdb.ORSENE.html from
|
|
165
|
+
# ryantate42.s3.amazonaws.com
|
|
166
|
+
# # => Removing Chad Interview.01.00...
|
|
167
|
+
# # => ... [snip]
|
|
168
|
+
# # => Removing Chad Interview.00.00.
|
|
169
|
+
# 263d492275a81afb005c8231d8d8afdb.RNTVLN.mp3 from
|
|
170
|
+
# ryantate42.s3.amazonaws.com
|
|
171
|
+
# # => Removing Chad Interview.01.00....
|
|
172
|
+
# # => ... [snip]
|
|
173
|
+
#
|
|
174
|
+
#=== Output
|
|
175
|
+
#
|
|
176
|
+
#The final output of Typingpool is a project directory containing a
|
|
177
|
+
#transcript file.
|
|
178
|
+
#
|
|
179
|
+
#The transcript file is HTML with audio chunks embedded alongside each
|
|
180
|
+
#associated transcript chunk.
|
|
181
|
+
#
|
|
182
|
+
#The transcript file is called transcript.html when complete. A
|
|
183
|
+
#partial transcript file is called transcript_in_progress.html.
|
|
184
|
+
#
|
|
185
|
+
#The project directory also includes supporting files, including a CSV
|
|
186
|
+
#data file used to store raw transcript chunks, Amazon Mechanical Turk
|
|
187
|
+
#HIT information, and other metadata; JavaScript code that swaps in
|
|
188
|
+
#Flash players on browsers that don't support mp3 files in audio tags;
|
|
189
|
+
#the original audio files and the audio chunks generated from them;
|
|
190
|
+
#and a CSS file.
|
|
191
|
+
#
|
|
192
|
+
#The directory is laid out like so:
|
|
193
|
+
#
|
|
194
|
+
# Chad Interview/
|
|
195
|
+
# -> transcript.html | transcript_in_progress.html
|
|
196
|
+
# -> audio/
|
|
197
|
+
# -> chunks/
|
|
198
|
+
# -> Chad Interview.00.00.mp3
|
|
199
|
+
# -> Chad Interview.01.00.mp3
|
|
200
|
+
# -> ... [snip]
|
|
201
|
+
# -> originals/
|
|
202
|
+
# -> chad1.WMA
|
|
203
|
+
# -> chad2.WMA
|
|
204
|
+
# -> data/
|
|
205
|
+
# -> assignment.csv
|
|
206
|
+
# -> id.txt
|
|
207
|
+
# -> subtitle.txt
|
|
208
|
+
# -> etc/
|
|
209
|
+
# -> audio-compat.js
|
|
210
|
+
# -> transcript.css
|
|
211
|
+
# -> About these files - read me.txt
|
|
212
|
+
# -> player/
|
|
213
|
+
# -> audio-player.js
|
|
214
|
+
# -> license.txt
|
|
215
|
+
# -> player.swf
|
|
216
|
+
#
|
|
217
|
+
#You may safely edit the files transcript.html, etc/transcript.css,
|
|
218
|
+
#and data/subtitle.txt, and you may safely delete the files in
|
|
219
|
+
#audio/originals and any .txt files in etc/. Editing or deleting other
|
|
220
|
+
#files may interfere with the operation of Typingpool or render the
|
|
221
|
+
#transcript inoperative. Do not edit transcript_in_progress.html as
|
|
222
|
+
#your changes will be overwritten if/when the transcript is next
|
|
223
|
+
#updated.
|
|
224
|
+
#
|
|
225
|
+
#
|
|
226
|
+
#=== Workflow (additional)
|
|
227
|
+
# * When you want to preview your assignments, run tp-assign with the
|
|
228
|
+
# option --sandbox and with --qualify 'rejection_rate < 100' (to
|
|
229
|
+
# make sure you qualify to view your own HITs). Then visit
|
|
230
|
+
# http://workersandbox.mturk.com and find your assignments (a search
|
|
231
|
+
# for "mp3" works if you left mp3 set as a keyword in your config
|
|
232
|
+
# file). When you are done previewing, run tp-finish with the
|
|
233
|
+
# name/path of your project and the --sandbox option.
|
|
234
|
+
#
|
|
235
|
+
# * When you assign your transcription jobs via tp-assign, you must
|
|
236
|
+
# supply a template name or relative path as the second
|
|
237
|
+
# argument. In the example above, the named template is
|
|
238
|
+
# “interview/nameless.”
|
|
239
|
+
#
|
|
240
|
+
# The template “interview/nameless” is a great general purpose
|
|
241
|
+
# template. It instructs the transcriber not to worry about the
|
|
242
|
+
# names of the speakers, and instead to use labels like “male 1,”
|
|
243
|
+
# “male 2,” etc. This allows the transcriber to work quickly and
|
|
244
|
+
# usually results in a viable transcript, since you can consult
|
|
245
|
+
# your memory or the original audio to figure out who is who.
|
|
246
|
+
#
|
|
247
|
+
# To find what other templates are available, navigate to the
|
|
248
|
+
# directory where typingpool is installed (`gem which typingpool`)
|
|
249
|
+
# and then go into typingpool/templates/assignment and its
|
|
250
|
+
# subdirectories. Anything that ends in ‘.html.erb’ is an available
|
|
251
|
+
# template. You may also create your own templates in the directory
|
|
252
|
+
# listed in the “templates” param of your config file.
|
|
253
|
+
#
|
|
254
|
+
# The templates interview, interview/phone, and interview/noisy
|
|
255
|
+
# require you to have passed the names of two voices to tp-make
|
|
256
|
+
# when you created your project. The first voice should be the name
|
|
257
|
+
# (and optional title) of the interviewer, and the second the name
|
|
258
|
+
# (and title) of the interviewee, like so:
|
|
259
|
+
#
|
|
260
|
+
# tp-make 'Chad Interview' chad1.WMA chad2.WMA --voice 'Ryan,
|
|
261
|
+
# hack reporter' --voice 'Chad, a software engineer' --unusual
|
|
262
|
+
# 'Hack Day, Yahoo' --subtitle 'Phone interview re Yahoo Hack
|
|
263
|
+
# Day'
|
|
264
|
+
#
|
|
265
|
+
#
|
|
266
|
+
# * When you've rejected some submissions in tp-review and need to
|
|
267
|
+
# re-assign these chunks to be transcribed, simply re-run tp-assign
|
|
268
|
+
# with the name (or path) of your project. You may select the same
|
|
269
|
+
# template, reward, deadlines, etc., or pick new ones. tp-assign
|
|
270
|
+
# will be careful not to re-assign chunks for which you have
|
|
271
|
+
# approved a transcript, or which are pending on Mechanical Turk.
|
|
272
|
+
#
|
|
273
|
+
# * When some chunks previously assigned via tp-assign have expired
|
|
274
|
+
# without attracting submissions, simply re-run tp-assign as
|
|
275
|
+
# described above to re-assign these chunks. Consider increasing
|
|
276
|
+
# the dollar amount specified in your --reward argument.
|
|
277
|
+
#
|
|
278
|
+
# * When some chunks previously assigned via tp-assign have been
|
|
279
|
+
# submitted by workers but not approved or rejected in time for the
|
|
280
|
+
# approval deadline (assign/approval in your config file or
|
|
281
|
+
# --approval as passed to tp-assign), Mechanical Turk has
|
|
282
|
+
# automatically approved these submissions for you and you'll need
|
|
283
|
+
# to run tp-collect to collect them. (Yes, it’s silly you need to run
|
|
284
|
+
# a whole different script instead of just calling tp-review as
|
|
285
|
+
# usual. I’ll fix this in a future version.)
|
|
286
|
+
#
|
|
287
|
+
# * When you want to cancel outstanding assignments, simply run
|
|
288
|
+
# tp-finish with the name of your project. If your assignments have
|
|
289
|
+
# already attracted submissions, you may be prompted to run
|
|
290
|
+
# tp-review first.
|
|
291
|
+
#
|
|
292
|
+
# * When tp-make, tp-assign, or tp-finish tells you it failed an
|
|
293
|
+
# upload, deletion, or Amazon command, simply re-run the script
|
|
294
|
+
# with the same arguments to re-attempt the upload, deletion or
|
|
295
|
+
# Amazon command. Typingpool carefully records which network
|
|
296
|
+
# operations it is attempting and which network operations have
|
|
297
|
+
# completed. It can robustly handle network errors, including
|
|
298
|
+
# uncaught exceptions.
|
|
299
|
+
#
|
|
300
|
+
#
|
|
301
|
+
#=== Maintenance
|
|
302
|
+
#
|
|
303
|
+
# [cache] If the cache file grows too large, you'll need to delete it
|
|
304
|
+
# manually. It may be safely deleted as long as no
|
|
305
|
+
# Typingpool scripts are running. Its location is
|
|
306
|
+
# specified in the 'cache' param in the config file. (The
|
|
307
|
+
# config file is at ~/.typingpool and the cache, by
|
|
308
|
+
# default, is at ~/.typingpool.cache.)
|
|
309
|
+
#
|
|
310
|
+
# Typingpool takes no steps to limit the size of the
|
|
311
|
+
# cache file. It prunes the cache of project-specific
|
|
312
|
+
# entries when you run tp-finish on a project, but the
|
|
313
|
+
# cache may grow large if you work on many active
|
|
314
|
+
# projects in parallel, or if you fail to run tp-finish
|
|
315
|
+
# on projects when you are done with them.
|
|
316
|
+
#
|
|
317
|
+
# [tp-finish] You should run tp-finish PROJECT each time you finish a
|
|
318
|
+
# project, where PROJECT may be either the project name
|
|
319
|
+
# or path. Assuming you have no submissions pending or
|
|
320
|
+
# awaiting approval, this clears all traces of the
|
|
321
|
+
# project from Amazon Mechanical Turk, from Amazon S3 or
|
|
322
|
+
# your SFTP server, and from the local cache. This will
|
|
323
|
+
# keep your local cache from ballooning in size and will
|
|
324
|
+
# minimize your S3 charges or SFTP disk usage. It will
|
|
325
|
+
# also help Typingpool scripts run faster by reducing the
|
|
326
|
+
# number of HITs you have on Amazon Mechanical Turk; many
|
|
327
|
+
# Typingpool operations involve iterating through all of
|
|
328
|
+
# your HITs.
|
|
329
|
+
#
|
|
330
|
+
#
|
|
331
|
+
#=== See also
|
|
332
|
+
#
|
|
333
|
+
# * Run any script with the --help option for further details on how
|
|
334
|
+
# to run the script.
|
|
335
|
+
#
|
|
336
|
+
# * See the docs for Typingpool::Config for details of the config
|
|
337
|
+
# file format.
|
|
338
|
+
#
|
|
339
|
+
# * See Amazon's Mechanical Turk documentation for guides and
|
|
340
|
+
# overviews on how Mechanical Turk
|
|
341
|
+
# works. https://requester.mturk.com/help
|
|
342
|
+
#
|
|
343
|
+
# * See the documentation on ffmpeg and related libraries for clues
|
|
344
|
+
# as to how to make Typingpool support additional file
|
|
345
|
+
# formats. Typingpool can work with any file format that ffmpeg can
|
|
346
|
+
# convert to mp3 (libmp3lame).
|
|
347
|
+
#
|
|
348
|
+
# * For an overview of the concepts on which Typingpool is built, see
|
|
349
|
+
# Andy Baio’s guide to using Mechanical Turk for transcription:
|
|
350
|
+
# http://waxy.org/2008/09/audio_transcription_with_mechanical_turk/
|
|
351
|
+
#
|
|
352
|
+
#== Developer overview
|
|
353
|
+
#
|
|
354
|
+
#Views, used for the final transcript and for rendering HTML
|
|
355
|
+
#assignments for Amazon Mechanical Turk workers, are contained in a
|
|
356
|
+
#series of templates in lib/typingpool/templates, particularly
|
|
357
|
+
#transcript.html.erb and assignment/*. The control layer lives in the
|
|
358
|
+
#App class (lib/typingpool/app.rb) and within the individual
|
|
359
|
+
#scripts. The models constitute the other Typingpool classes,
|
|
360
|
+
#including most importantly and in rough order of importance the
|
|
361
|
+
#Project, Transcript, Amazon, Config and Filer classes (the latter of
|
|
362
|
+
#interest mainly because of Filer::Audio, which handles splitting,
|
|
363
|
+
#merging, and conversion).
|
|
364
|
+
#
|
|
365
|
+
#The models in particular, along with the App class, are
|
|
366
|
+
#underdeveloped and not particularly clear or fully thought
|
|
367
|
+
#through. The Transcript model, for example, should almost certainly
|
|
368
|
+
#be folded into the Project model. Dividing Project into
|
|
369
|
+
#Project::Local and Project::Remote only makes sense on a superficial
|
|
370
|
+
#level; Project::Remote could probably be its own class or even part
|
|
371
|
+
#of Utility. Amazon will probably be simpler if I can get some patches
|
|
372
|
+
#into RTurk, and Amazon::HIT should probably be integrated more closely
|
|
373
|
+
#with Project.
|
|
374
|
+
#
|
|
375
|
+
#One of the most frustrating things about the code is that there are
|
|
376
|
+
#so many subtly different ways a "chunk" of a transcript/project is
|
|
377
|
+
#represented: As a simple hash derived from a row in
|
|
378
|
+
#data/assignment.csv within a project folder, as an Amazon::HIT, as a
|
|
379
|
+
#Transcript::Chunk, as an audio file on a remote server, and as a
|
|
380
|
+
#local audio file (which has a different name from the remote
|
|
381
|
+
#file). So in future versions I'll probably reduce the number of
|
|
382
|
+
#different ways to represent a chunk.
|
|
383
|
+
#
|
|
384
|
+
#Also in the future, it's very likely that App will evolve from a
|
|
385
|
+
#simple collection of class methods into a real class with a simple
|
|
386
|
+
#set of instance methods called in a particular order by a "run"
|
|
387
|
+
#method or similar. Subclasses for particular scripts/commands will
|
|
388
|
+
#then override these methods.
|
|
389
|
+
#
|
|
390
|
+
#
|
|
391
|
+
#===Examples
|
|
392
|
+
#
|
|
393
|
+
#The most comprehensive examples of how the Typingpool classes
|
|
394
|
+
#actually work and interact are the tp-* scripts themselves, in
|
|
395
|
+
#particular tp-make, tp-assign, tp-review, and tp-finish.
|
|
396
|
+
#
|
|
397
|
+
#More concise examples follow below, to give you a sense of what the
|
|
398
|
+
#various classes actually do:
|
|
399
|
+
#
|
|
400
|
+
# require 'typingpool'
|
|
401
|
+
#
|
|
402
|
+
# #new Project instance
|
|
403
|
+
# project = Typingpool::Project.new('Chad Interview')
|
|
404
|
+
#
|
|
405
|
+
# #check if project exists on disk
|
|
406
|
+
# unless project.local
|
|
407
|
+
# #make a skeleton project folder in Config#transcripts dir
|
|
408
|
+
# project.create_local
|
|
409
|
+
# #make subtitle record in project folder
|
|
410
|
+
# project.local.subtitle = 'Interview about Hack Day Jan 21'
|
|
411
|
+
# end
|
|
412
|
+
#
|
|
413
|
+
# id = project.local.id
|
|
414
|
+
#
|
|
415
|
+
# #Wrap file in Typingpool::Filer
|
|
416
|
+
# wma = Typingpool::Filer::Audio.new('/foo/bar.wma')
|
|
417
|
+
#
|
|
418
|
+
# #convert file to mp3
|
|
419
|
+
# mp3 = wma.to_mp3
|
|
420
|
+
# other_mp3 = Typingpool::Filer::Audio.new('/foo/bar2.wma').to_mp3
|
|
421
|
+
#
|
|
422
|
+
# #merge audio
|
|
423
|
+
# combined_mp3 = Typingpool::Filer::Files::Audio.new([mp3,
|
|
424
|
+
# other_mp3]).merge(Typingpool::Filer.new('/foo/combined.mp3'))
|
|
425
|
+
#
|
|
426
|
+
# #split audio every 1 minute
|
|
427
|
+
# chunks = combined_mp3.split('1.00')
|
|
428
|
+
#
|
|
429
|
+
# #upload mp3s
|
|
430
|
+
# urls = project.remote.put(chunks.to_streams,
|
|
431
|
+
# project.create_remote_names(chunks))
|
|
432
|
+
#
|
|
433
|
+
# #remove mp3s
|
|
434
|
+
# project.remote.remove_urls(urls)
|
|
435
|
+
#
|
|
436
|
+
# #new Template instance
|
|
437
|
+
# template = Typingpool::Template::Assignment.from_config('interview/nameless')
|
|
438
|
+
# html = template.render({
|
|
439
|
+
# 'audio_url' => urls[0],
|
|
440
|
+
# 'unusual' => ['Hack Day', 'Yahoo', 'Atlassian'],
|
|
441
|
+
# 'chunk_minutes' => 1,
|
|
442
|
+
# 'project_id' => project.local.id
|
|
443
|
+
# })
|
|
444
|
+
#
|
|
445
|
+
# question = Typingpool::Amazon::Question.new(urls[0], html)
|
|
446
|
+
#
|
|
447
|
+
# Typingpool::Amazon.setup
|
|
448
|
+
#
|
|
449
|
+
# #Assign a transcription job (1 chunk)
|
|
450
|
+
# hit = Typingpool::Amazon::HIT.create(question, Typingpool::Config.file.assign)
|
|
451
|
+
#
|
|
452
|
+
# #Find all Typingpool HITs on Amazon Mechanical Turk
|
|
453
|
+
# all = Typingpool::Amazon::HIT.all
|
|
454
|
+
# #Find all reviewable Typingpool HITs
|
|
455
|
+
# reviewable = Typingpool::Amazon::HIT.all_reviewable
|
|
456
|
+
# #Find all approved Typingpool HITs
|
|
457
|
+
# approved = Typingpool::Amazon::HIT.all_approved
|
|
458
|
+
# #Find all HITs for our project
|
|
459
|
+
# project_hits = Typingpool::Amazon::HIT.all_for_project(project.local.id)
|
|
460
|
+
# #Filter all HITs (not just Typingpool HITs) arbitrarily
|
|
461
|
+
# safe_to_delete = Typingpool::Amazon::HIT.all{|hit| hit.ours? && hit.full.expired_and_overdue? }
|
|
462
|
+
# #Filter all approved HITs arbitrarily
|
|
463
|
+
# ready_for_judgment = Typingpool::Amazon::HIT.all_approved{|hit| hit.submitted? && hit.ours? }
|
|
464
|
+
#
|
|
465
|
+
# #Approve a HIT
|
|
466
|
+
# ready_for_judgment[0].at_amazon.approve! #at_amazon is an rturk instance
|
|
467
|
+
# #Reject a HIT
|
|
468
|
+
# ready_for_judgment[1].at_amazon.reject!('Your transcription is just random gibberish')
|
|
469
|
+
# #Delete a HIT from Amazon
|
|
470
|
+
# safe_to_delete[0].remove_from_amazon
|
|
471
|
+
#
|
|
472
|
+
# #Get text of transcript chunk (Typingpool::Transcript::Chunk)
|
|
473
|
+
# transcript_chunk = approved[0].transcript
|
|
474
|
+
# puts transcript_chunk.body
|
|
475
|
+
# #Get formatted text of transcript chunk
|
|
476
|
+
# puts transcript_chunk.body_as_text
|
|
477
|
+
# #Get transcript chunk as HTML
|
|
478
|
+
# puts transcript_chunk.body_as_html
|
|
479
|
+
# #Get transcript chunk metadata
|
|
480
|
+
# puts "--#{transcript_chunk.url} (audio at #{transcript_chunk.offset})"
|
|
481
|
+
#
|
|
482
|
+
#==Author
|
|
483
|
+
# Ryan Tate - ryantate@ryantate.com
|
|
484
|
+
#
|
|
485
|
+
#==License
|
|
486
|
+
# Copyright (c) 2011-2012 Ryan Tate. Released under the terms of the MIT
|
|
487
|
+
# license. See LICENSE for details.
|
|
488
|
+
|
|
489
|
+
module Typingpool
  #Version string of the library.
  VERSION = '0.7.0'

  #Load each library file, in this exact order, from inside the
  #namespace. The order is kept as-is in case later files rely on
  #constants defined by earlier ones.
  %w[error utility config filer amazon project transcript template app].each do |component|
    require "typingpool/#{component}"
  end
end #Typingpool
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
$:.unshift File.join(File.dirname(File.dirname($0)), 'lib')
|
|
4
|
+
|
|
5
|
+
require 'typingpool'
|
|
6
|
+
require 'typingpool/test'
|
|
7
|
+
require 'fileutils'
|
|
8
|
+
|
|
9
|
+
#Fixture generator script: renders an assignment template against the
#first row of a freshly made project's data/assignment.csv and writes
#the resulting HTML, plus a sample question URL, into the fixtures
#directory (amazon-question-html.html and amazon-question-url.txt).
class MakeAmazonQuestion < Typingpool::Test::Script
  def test_make_amazon_question_fixture
    in_temp_tp_dir do |dir|
      tp_make(dir)
      template = Typingpool::Template::Assignment.from_config(assign_default[:template], config_from_dir(dir))
      assignment = temp_tp_dir_project(dir).local.file('data', 'assignment.csv').as(:csv).read.first
      question_html = template.render(assignment)
      question_url = 'http://example.com/assignments/101.html'
      #assert_match expects the pattern first and the string under
      #test second; the reverse order only works by accident on some
      #test framework versions and raises TypeError on others.
      assert_match(/\S/, question_html)
      assert_match(/http/i, question_url)
      File.open(File.join(fixtures_dir, 'amazon-question-html.html'), 'w'){|f| f << question_html}
      File.open(File.join(fixtures_dir, 'amazon-question-url.txt'), 'w'){|f| f << question_url}
    end #in_temp_tp_dir
    add_goodbye_message("Amazon question fixtures created.")
  end
end #MakeAmazonQuestion
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
$:.unshift File.join(File.dirname(File.dirname($0)), 'lib')
|
|
4
|
+
|
|
5
|
+
require 'typingpool'
|
|
6
|
+
require 'typingpool/test'
|
|
7
|
+
require 'fileutils'
|
|
8
|
+
|
|
9
|
+
#Fixture generator script, step 1 of the tp-collect fixture series:
#makes and assigns a temp project, then copies key project files into
#the fixtures directory.
class MakeAndAssignProject < Typingpool::Test::Script
  def test_prep_for_fixture
    dir = make_fixture_project_dir('tp_collect_project_temp')
    setup_temp_tp_dir(dir)
    begin
      tp_make(dir)
      tp_assign(dir)
    rescue
      #don't leave a half-built project dir behind; re-raise so the
      #failure is still reported
      FileUtils.remove_entry_secure(dir)
      raise
    end
    #copy key files over to permanent locations within fixture dir
    with_fixtures_in_temp_tp_dir(dir, 'tp_collect_') do |destination, source|
      FileUtils.cp(source, destination)
    end
    #tell the operator which manual steps come next
    next_steps = "Temp project assigned in Mechanical Turk sandbox. Complete and approve TWO assignments and run make_tp_collect_fixture_2.rb. Check for assignments at\nhttps://workersandbox.mturk.com/mturk/searchbar?minReward=0.00&searchWords=typingpooltest&selectedSearchType=hitgroups\n...and then approve them at\nhttps://requestersandbox.mturk.com/mturk/manageHITs?hitSortType=CREATION_DESCENDING&%2Fsort.x=11&%2Fsort.y=7"
    add_goodbye_message(next_steps)
  end
end #MakeAndAssignProject
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
$:.unshift File.join(File.dirname(File.dirname($0)), 'lib')
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
require 'typingpool/test'
|
|
7
|
+
require 'fileutils'
|
|
8
|
+
|
|
9
|
+
# Step 2 of the tp-collect fixture workflow: records the first tp-collect
# run against the temp project into the 'tp-collect-1' fixture.
class CollectProjectFixtureGen2 < Typingpool::Test::Script
  # Runs tp-collect with the fixture path <fixtures_dir>/vcr/tp-collect-1
  # and asserts that the corresponding .yml file exists afterwards.
  def test_populate_fixture
    fixture_path = File.join(fixtures_dir, 'vcr', 'tp-collect-1')
    tp_collect_with_fixture(fixture_project_dir('tp_collect_project_temp'), fixture_path)
    # File.exists? is deprecated (removed in Ruby 3.2); File.exist? is the
    # supported spelling and is available on older Rubies as well.
    assert(File.exist?("#{fixture_path}.yml"))
    add_goodbye_message("Initial tp-collect recorded. Please complete and approve TWO more assignments and run make_tp_collect_fixture_3.rb. Check for assignments at\nhttps://workersandbox.mturk.com/mturk/searchbar?minReward=0.00&searchWords=typingpooltest&selectedSearchType=hitgroups\n...and then approve them at\nhttps://requestersandbox.mturk.com/mturk/manageHITs?hitSortType=CREATION_DESCENDING&%2Fsort.x=11&%2Fsort.y=7")
  end
end
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
$:.unshift File.join(File.dirname(File.dirname($0)), 'lib')
|
|
4
|
+
|
|
5
|
+
require 'typingpool/test'
|
|
6
|
+
require 'fileutils'
|
|
7
|
+
|
|
8
|
+
# Step 3 of the tp-collect fixture workflow: records the second tp-collect
# run against the temp project into the 'tp-collect-2' fixture.
class CollectProjectFixtureGen3 < Typingpool::Test::Script
  # Runs tp-collect with the fixture path <fixtures_dir>/vcr/tp-collect-2
  # and asserts that the corresponding .yml file exists afterwards.
  def test_populate_fixture2
    fixture_path = File.join(fixtures_dir, 'vcr', 'tp-collect-2')
    tp_collect_with_fixture(fixture_project_dir('tp_collect_project_temp'), fixture_path)
    # File.exists? is deprecated (removed in Ruby 3.2); File.exist? is the
    # supported spelling and is available on older Rubies as well.
    assert(File.exist?("#{fixture_path}.yml"))
    add_goodbye_message("Second tp-collect recorded. Please complete and approve THREE more assignments and run make_tp_collect_fixture_4.rb. Check for assignments at\nhttps://workersandbox.mturk.com/mturk/searchbar?minReward=0.00&searchWords=typingpooltest&selectedSearchType=hitgroups\n...and then approve them at\nhttps://requestersandbox.mturk.com/mturk/manageHITs?hitSortType=CREATION_DESCENDING&%2Fsort.x=11&%2Fsort.y=7")
  end
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
$:.unshift File.join(File.dirname(File.dirname($0)), 'lib')
|
|
4
|
+
|
|
5
|
+
require 'typingpool/test'
|
|
6
|
+
require 'fileutils'
|
|
7
|
+
|
|
8
|
+
# Final step of the tp-collect fixture workflow: records the third
# tp-collect run, finishes the temp project, and removes its directory.
class CollectProjectFixtureGen4 < Typingpool::Test::Script
  # Runs tp-collect with the fixture path <fixtures_dir>/vcr/tp-collect-3,
  # asserts that the corresponding .yml file exists, then calls tp_finish on
  # the temp project and removes the 'tp_collect_project_temp' fixture
  # project dir.
  def test_populate_fixture3
    fixture_path = File.join(fixtures_dir, 'vcr', 'tp-collect-3')
    tp_collect_with_fixture(fixture_project_dir('tp_collect_project_temp'), fixture_path)
    # File.exists? is deprecated (removed in Ruby 3.2); File.exist? is the
    # supported spelling and is available on older Rubies as well.
    assert(File.exist?("#{fixture_path}.yml"))
    tp_finish(fixture_project_dir('tp_collect_project_temp'))
    remove_fixture_project_dir('tp_collect_project_temp')
    add_goodbye_message("Third and final tp-collect recorded. Fixtures for tp-collect testing successfully generated in #{File.dirname(fixture_path)}!")
  end
end
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
$:.unshift File.join(File.dirname(File.dirname($0)), 'lib')
|
|
4
|
+
|
|
5
|
+
require 'typingpool'
|
|
6
|
+
require 'typingpool/test'
|
|
7
|
+
require 'fileutils'
|
|
8
|
+
|
|
9
|
+
# Step 1 of the tp-review fixture workflow: makes and assigns a temp
# project, then copies selected project files into the fixture directory.
class ReviewProjectFixtureGen1 < Typingpool::Test::Script
  # Creates the 'tp_review_project_temp' fixture project dir, runs tp_make
  # and tp_assign against it, and copies each (project file -> fixture file)
  # pair yielded by with_fixtures_in_temp_tp_dir. If making or assigning
  # fails, the project dir is deleted and the error is re-raised.
  def test_prep_for_fixture
    project_dir = make_fixture_project_dir('tp_review_project_temp')
    setup_temp_tp_dir(project_dir)
    begin
      tp_make(project_dir)
      tp_assign(project_dir)
    rescue StandardError
      # Do not leave a half-built project behind; propagate the failure.
      FileUtils.remove_entry_secure(project_dir)
      raise
    end
    # Copy key files over to permanent locations within the fixture dir.
    with_fixtures_in_temp_tp_dir(project_dir, 'tp_review_') { |destination, source| FileUtils.cp(source, destination) }
    next_steps = "Temp project assigned in Mechanical Turk sandbox. Complete SIX assignments and run make_tp_review_fixture_2.rb. Check for assignments at\nhttps://workersandbox.mturk.com/mturk/searchbar?minReward=0.00&searchWords=typingpooltest&selectedSearchType=hitgroups\n"
    add_goodbye_message(next_steps)
  end
end #ReviewProjectFixtureGen1
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
$:.unshift File.join(File.dirname(File.dirname($0)), 'lib')
|
|
4
|
+
|
|
5
|
+
require 'typingpool/test'
|
|
6
|
+
require 'fileutils'
|
|
7
|
+
|
|
8
|
+
# Final step of the tp-review fixture workflow: records two tp-review
# sessions against the temp project, then finishes and removes it.
class ReviewProjectFixtureGen2 < Typingpool::Test::Script
  # Runs tp-review twice with scripted input choices, recording to the
  # 'tp-review-1' and 'tp-review-2' fixture paths under <fixtures_dir>/vcr.
  # After each run it asserts a zero exit status and that the matching .yml
  # file exists. Finally calls tp_finish on the project and removes the
  # 'tp_review_project_temp' fixture project dir.
  def test_populate_fixture
    fixture_path = File.join(fixtures_dir, 'vcr', 'tp-review-1')
    dir = fixture_project_dir('tp_review_project_temp')
    output = nil
    assert_nothing_raised do
      output = tp_review_with_fixture(dir, fixture_path, %w(a r a r s q ))
    end
    assert_equal(0, output[:status].to_i, "Bad exit code: #{output[:status]} err: #{output[:err]}")
    # File.exists? is deprecated (removed in Ruby 3.2); File.exist? is the
    # supported spelling and is available on older Rubies as well.
    assert(File.exist?("#{fixture_path}.yml"))

    fixture_path = File.join(fixtures_dir, 'vcr', 'tp-review-2')
    assert_nothing_raised do
      output = tp_review_with_fixture(dir, fixture_path, %w(a r))
    end
    assert_equal(0, output[:status].to_i, "Bad exit code: #{output[:status]} err: #{output[:err]}")
    assert(File.exist?("#{fixture_path}.yml"))

    tp_finish(dir)
    remove_fixture_project_dir('tp_review_project_temp')
    add_goodbye_message("All done!")
  end
end
|