typingpool 0.7.0 → 0.7.1
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +20 -0
- data/README.markdown +452 -0
- data/lib/typingpool/amazon/hit/assignment/empty.rb +19 -0
- data/lib/typingpool/amazon/hit/assignment.rb +43 -0
- data/lib/typingpool/amazon/hit/full/fromsearchhits.rb +44 -0
- data/lib/typingpool/amazon/hit/full.rb +105 -0
- data/lib/typingpool/amazon/hit.rb +458 -0
- data/lib/typingpool/amazon/question.rb +45 -0
- data/lib/typingpool/amazon.rb +3 -677
- data/lib/typingpool/app/cli/formatter.rb +16 -0
- data/lib/typingpool/app/cli.rb +64 -0
- data/lib/typingpool/app/friendlyexceptions.rb +34 -0
- data/lib/typingpool/app.rb +2 -97
- data/lib/typingpool/config/root.rb +114 -0
- data/lib/typingpool/config.rb +13 -119
- data/lib/typingpool/filer/audio.rb +84 -0
- data/lib/typingpool/filer/csv.rb +57 -0
- data/lib/typingpool/filer/dir.rb +76 -0
- data/lib/typingpool/filer/files/audio.rb +63 -0
- data/lib/typingpool/filer/files.rb +55 -0
- data/lib/typingpool/filer.rb +4 -313
- data/lib/typingpool/project/local.rb +117 -0
- data/lib/typingpool/project/remote/s3.rb +135 -0
- data/lib/typingpool/project/remote/sftp.rb +100 -0
- data/lib/typingpool/project/remote.rb +65 -0
- data/lib/typingpool/project.rb +2 -396
- data/lib/typingpool/template/assignment.rb +17 -0
- data/lib/typingpool/template/env.rb +77 -0
- data/lib/typingpool/template.rb +2 -87
- data/lib/typingpool/test/script.rb +310 -0
- data/lib/typingpool/test.rb +1 -306
- data/lib/typingpool/transcript/chunk.rb +129 -0
- data/lib/typingpool/transcript.rb +1 -125
- data/lib/typingpool/utility/castable.rb +65 -0
- data/lib/typingpool/utility.rb +1 -61
- data/test/test_integration_script_6_tp_finish.rb +1 -0
- metadata +135 -81
@@ -0,0 +1,129 @@
|
|
1
|
+
module Typingpool
  class Transcript

    #Transcript::Chunk is the model class for one transcription by one
    #Mechanical Turk worker of one "chunk" (a file) of audio, which in
    #turn is a portion of a larger recording (for example, one minute
    #of a 60 minute interview). It is basically parallel and similar
    #to an Amazon::HIT instance. Transcript is a container for these
    #chunks, which know how to render themselves as text and HTML.
    class Chunk
      require 'cgi'
      require 'rubygems/text'
      include Gem::Text

      #Get/set the raw text of the transcript
      attr_accessor :body

      #Get/set the Amazon ID of the Mechanical Turk worker who
      #transcribed the audio into text
      attr_accessor :worker

      #Get/set the id of the Amazon::HIT associated with this chunk
      attr_accessor :hit

      #Get/set the id of the Project#local associated with this chunk
      attr_accessor :project

      #Return the offset associated with the chunk, in MM:SS
      #format. This corresponds to the associated audio file, which is
      #a chunk of a larger recording and which starts at a particular
      #time offset, for example from 1:00 (the offset) to 2:00 (the
      #next offset).
      #
      #
      #This should be updated to return HH:MM:SS and MM:SS.sss when
      #appropriate, since in Project#interval we use that format and
      #allow audio to be divided into such units. (TODO)
      attr_reader :offset

      #Returns the offset in seconds. So an offset of 1:00 would
      #return 60. Only populated once url= has been called.
      attr_reader :offset_seconds

      #Returns the name of the remote audio file corresponding to this
      #chunk. The remote file has the project ID and pseudo random
      #characters added to it.
      attr_reader :filename

      #Returns the name of the local audio file corresponding to this
      #chunk.
      attr_reader :filename_local

      #Returns the URL of the remote audio transcribed in the body of
      #this chunk.
      attr_reader :url

      #Constructor. Takes the raw text of the transcription.
      def initialize(body)
        @body = body
      end

      #Sorts by offset seconds. Both chunks need to have had url= set
      #for the comparison to be meaningful, since that is what
      #populates offset_seconds.
      def <=>(other)
        offset_seconds <=> other.offset_seconds
      end

      #Takes an URL. As an important side effect, sets various
      #attributes, including url, filename, filename_local, offset and
      #offset_seconds. So setting Chunk#url= http://whateverwhatever
      #is an important step in populating the instance.
      #
      #Raises Error::Argument::Format when the URL does not match
      #Project.url_regex; in that case no attribute is changed.
      def url=(url)
        #http://ryantate.com/transfer/Speech.01.00.ede9b0f2aed0d35a26cef7160bc9e35e.ISEAOM.mp3
        matches = Project.url_regex.match(url)
        raise Error::Argument::Format, "Unexpected format to url '#{url}'" unless matches
        minutes = matches[3]
        seconds = matches[4]
        @url = matches[0]
        @filename = matches[1]
        @filename_local = Project.local_basename_from_url(@url)
        @offset = "#{minutes}:#{seconds}"
        @offset_seconds = (minutes.to_i * 60) + seconds.to_i
      end

      #Takes an optional specification of how many spaces to indent
      #the text by (default 0) and an optional specification of how
      #many characters to wrap at (default no wrapping).
      #
      #Returns the text with newlines normalized to Unix format, runs
      #of newlines shortened to a maximum of two newlines, leading and
      #trailing whitespace removed from each line, and the text
      #wrapped/indented as specified.
      def body_as_text(indent=nil, wrap=nil)
        text = Utility.normalize_newlines(self.body)
        #Non-destructive gsub: whether normalize_newlines hands back a
        #copy is not visible from here, so never edit its result in
        #place (that could alter, or fail on a frozen, body).
        text = text.gsub(/\n\n+/, "\n\n")
        text = text.split("\n").map{|line| line.strip }.join("\n")
        text = wrap_text(text, wrap) if wrap
        text = indent_text(text, indent) if indent
        text
      end
      alias :to_s :body_as_text
      alias :to_str :body_as_text

      #Takes an optional count of how many characters to wrap at
      #(default 72; nil is treated as the default). Returns the body,
      #presumed to be raw text, as HTML. Any HTML tags in the body are
      #escaped. Text blocks separated by double newlines are converted
      #to HTML paragraphs, while single newlines are converted to HTML
      #BR tags (both via Utility.newlines_to_html). Newlines are
      #normalized as in body_as_text, and lines in the HTML source are
      #automatically wrapped as specified.
      def body_as_html(wrap=72)
        #Previously the wrap argument was ignored and 72 was always
        #used; honor it now, keeping 72 for callers that pass nil.
        wrap ||= 72
        html = CGI::escapeHTML(body_as_text)
        html = Utility.newlines_to_html(html)
        html.split("\n").map{|line| wrap_text(line, wrap).chomp }.join("\n")
      end

      protected

      #Prefixes every line of text with indent spaces. Modifies text
      #in place and returns it.
      def indent_text(text, indent)
        text.gsub!(/^/, " " * indent)
        text
      end

      #Wraps text at wrap characters (default 72) using
      #Gem::Text#format_text.
      def wrap_text(text, wrap=72)
        format_text(text, wrap)
      end
    end #Chunk
  end #Transcript
end #Typingpool
|
@@ -52,130 +52,6 @@ module Typingpool
|
|
52
52
|
@chunks.push(chunk)
|
53
53
|
end
|
54
54
|
|
55
|
-
|
56
|
-
#Mechanical Turk worker of one "chunk" (a file) of audio, which in
|
57
|
-
#turn is a portion of a larger recording (for example, one minute
|
58
|
-
#of a 60 minute interview). It is basically parallel and similar
|
59
|
-
#to an Amazon::HIT instance. Transcript is a container for these
|
60
|
-
#chunks, which know how to render themselves as text and HTML.
|
61
|
-
class Chunk
|
62
|
-
require 'cgi'
|
63
|
-
require 'rubygems/text'
|
64
|
-
include Gem::Text
|
65
|
-
|
66
|
-
#Get/set the raw text of the transcript
|
67
|
-
attr_accessor :body
|
68
|
-
|
69
|
-
#Get/set the Amazon ID of the Mechanical Turk worker who
|
70
|
-
#transcribed the audio into text
|
71
|
-
attr_accessor :worker
|
72
|
-
|
73
|
-
#Get/set the id of the Amazon::HIT associated with this chunk
|
74
|
-
attr_accessor :hit
|
75
|
-
|
76
|
-
#Get/set the id of the Project#local associated with this chunk
|
77
|
-
attr_accessor :project
|
78
|
-
|
79
|
-
#Return the offset associated with the chunk, in MM:SS
|
80
|
-
#format. This corresponds to the associated audio file, which is
|
81
|
-
#a chunk of a larger recording and which starts at a particular
|
82
|
-
#time offset, for example from 1:00 (the offset) to 2:00 (the
|
83
|
-
#next offset).
|
84
|
-
#
|
85
|
-
#
|
86
|
-
#This should be updated to return HH:MM:SS and MM:SS.sss when
|
87
|
-
#appropriate, since in Project#interval we use that format and
|
88
|
-
#allow audio to be divided into such units. (TODO)
|
89
|
-
attr_reader :offset
|
90
|
-
|
91
|
-
#Returns the offset in seconds. So for an offset of 1:00 would return 60.
|
92
|
-
attr_reader :offset_seconds
|
93
|
-
|
94
|
-
#Returns the name of the remote audio file corresponding to this
|
95
|
-
#chunk. The remote file has the project ID and pseudo random
|
96
|
-
#characters added to it.
|
97
|
-
attr_reader :filename
|
98
|
-
|
99
|
-
#Returns the name of the local audio file corresponding to this
|
100
|
-
#chunk.
|
101
|
-
attr_reader :filename_local
|
102
|
-
|
103
|
-
#Returns the URL of the remote audio transcribed in the body of
|
104
|
-
#this chunk.
|
105
|
-
attr_reader :url
|
106
|
-
|
107
|
-
#Constructor. Takes the raw text of the transcription.
|
108
|
-
def initialize(body)
|
109
|
-
@body = body
|
110
|
-
end
|
111
|
-
|
112
|
-
#Sorts by offset seconds.
|
113
|
-
def <=>(other)
|
114
|
-
self.offset_seconds <=> other.offset_seconds
|
115
|
-
end
|
116
|
-
|
117
|
-
#Takes an URL. As an important side effect, sets various
|
118
|
-
#attributes, including url, filename, filename_local, offset and
|
119
|
-
#offset_seconds. So setting Chunk#url= http://whateverwhatever
|
120
|
-
#is an important step in populating the instance.
|
121
|
-
def url=(url)
|
122
|
-
#http://ryantate.com/transfer/Speech.01.00.ede9b0f2aed0d35a26cef7160bc9e35e.ISEAOM.mp3
|
123
|
-
matches = Project.url_regex.match(url) or raise Error::Argument::Format, "Unexpected format to url '#{url}'"
|
124
|
-
@url = matches[0]
|
125
|
-
@filename = matches[1]
|
126
|
-
@filename_local = Project.local_basename_from_url(@url)
|
127
|
-
@offset = "#{matches[3]}:#{matches[4]}"
|
128
|
-
@offset_seconds = (matches[3].to_i * 60) + matches[4].to_i
|
129
|
-
end
|
130
|
-
|
131
|
-
#Takes an optional specification of how many spaces to indent
|
132
|
-
#the text by (default 0) and an optional specification of how
|
133
|
-
#many characters to wrap at (default no wrapping).
|
134
|
-
#
|
135
|
-
#Returns the text with newlines normalized to Unix format, runs
|
136
|
-
#of newlines shortened to a maximum of two newlines, leading and
|
137
|
-
#trailing whitespace removed from each line, and the text
|
138
|
-
#wrapped/indented as specified.
|
139
|
-
def body_as_text(indent=nil, wrap=nil)
|
140
|
-
text = self.body
|
141
|
-
text = Utility.normalize_newlines(text)
|
142
|
-
text.gsub!(/\n\n+/, "\n\n")
|
143
|
-
text = text.split("\n").map{|line| line.strip }.join("\n")
|
144
|
-
text = wrap_text(text, wrap) if wrap
|
145
|
-
text = indent_text(text, indent) if indent
|
146
|
-
text
|
147
|
-
end
|
148
|
-
alias :to_s :body_as_text
|
149
|
-
alias :to_str :body_as_text
|
150
|
-
|
151
|
-
#Takes an optional count of how many characters to wrap at
|
152
|
-
#(default 72). Returns the body, presumed to be raw text, as
|
153
|
-
#HTML. Any HTML tags in the body are escaped. Text blocks
|
154
|
-
#separated by double newlines are converted to HTML paragraphs,
|
155
|
-
#while single newlines are converted to HTML BR tags. Newlines
|
156
|
-
#are normalized as in body_as_text, and lines in the HTML source
|
157
|
-
#are automatically wrapped as specified.
|
158
|
-
def body_as_html(wrap=72)
|
159
|
-
text = body_as_text
|
160
|
-
text = CGI::escapeHTML(text)
|
161
|
-
text = Utility.newlines_to_html(text)
|
162
|
-
text = text.split("\n").map do |line|
|
163
|
-
wrap_text(line, 72).chomp
|
164
|
-
end.join("\n")
|
165
|
-
text
|
166
|
-
end
|
167
|
-
|
168
|
-
protected
|
169
|
-
|
170
|
-
def indent_text(text, indent)
|
171
|
-
text.gsub!(/^/, " " * indent)
|
172
|
-
text
|
173
|
-
end
|
174
|
-
|
175
|
-
def wrap_text(text, wrap=72)
|
176
|
-
format_text(text, wrap)
|
177
|
-
end
|
178
|
-
|
179
|
-
end #Chunk
|
55
|
+
require 'typingpool/transcript/chunk'
|
180
56
|
end #Transcript
|
181
57
|
end #Typingpool
|
@@ -0,0 +1,65 @@
|
|
1
|
+
module Typingpool
  module Utility
    module Castable

      #Cast this object instance to a relative class. Call this from
      #super in your own class if you want to pass args to the
      #relative class constructor. All args after the first will be
      #passed to new.
      #
      #A relative class can be a subclass and in some cases a sibling
      #class, parent class, parent sibling class, grandparent class,
      #grandparent sibling class, and so on. A relative class will
      #never be higher up the inheritance tree than the subclasses of
      #the class where Castable was included.
      # ==== Params
      # [sym] Symbol corresponding to relative class to cast into. For
      #       example, Class#as(:audio) will cast into a Class::Audio
      #       and Class#as(:csv) will cast into Class::CSV. Casting
      #       is case insensitive, which means you can't have class
      #       CSV and class Csv. To cast into a related class whose
      #       name is not directly under that of its parent, you
      #       must either specify the full name,
      #       e.g. Class#as(:foo_bar_baz) to cast to Foo::Bar::Baz,
      #       or a name relative to the parent,
      #       e.g. Class#as(:remote_html), where Class::Remote does
      #       not inherit from Class but Class::Remote::HTML does.
      # ==== Returns
      # New instance of the relative class
      # ==== Raises
      # Error when no relative class is registered under sym
      def as(sym, *args)
        klass = self.class.relative_klass(sym.to_s.downcase)
        raise Error, "Can't find class '#{sym.to_s}' to cast to" unless klass
        klass.new(*args)
      end

      #Hook run when Castable is included: gives the including class
      #the registry methods in ClassMethods.
      def self.included(receiver)
        receiver.extend(ClassMethods)
      end

      module ClassMethods
        #Records each new subclass under a lowercased key: the class
        #name with the parent's "Name::" prefix stripped and any
        #remaining "::" turned into "_".
        def inherited(subklass)
          #Keep the hook chain intact so an inherited hook defined
          #further up the ancestry still sees the new subclass.
          super
          key = subklass.to_s.split("#{self.name}::").last.downcase.gsub(/::/, '_')
          subklasses[key] = subklass
        end

        #Hash of registry key => direct subclass of this class.
        def subklasses
          @subklasses ||= {}
        end

        #Returns the direct subclass registered under subklass_key,
        #or nil.
        def subklass(subklass_key)
          subklasses[subklass_key]
        end

        #Looks key up among this class's direct subclasses, then
        #walks up through each superclass that also has a registry.
        #Returns the class found, or nil.
        def relative_klass(key)
          found = subklasses[key]
          return found if found
          parent = self.superclass
          parent.relative_klass(key) if parent.respond_to? :relative_klass
        end

      end #module ClassMethods
    end #Castable
  end #Utility
end #Typingpool
|
data/lib/typingpool/utility.rb
CHANGED
@@ -207,66 +207,6 @@ module Typingpool
|
|
207
207
|
end
|
208
208
|
|
209
209
|
end #class << self
|
210
|
-
|
211
|
-
module Castable
|
212
|
-
#Cast this object instance to a relative class. Call this from
|
213
|
-
#super in your own class if you want to pass args to the
|
214
|
-
#relative class constructor. All args after the first will be
|
215
|
-
#passed to new.
|
216
|
-
#
|
217
|
-
#A relative class can be a subclass and in some cases a sibling
|
218
|
-
#class, parent class, parent sibling class, grandparent class,
|
219
|
-
#grandparent sibling class, and so on. A relative class will
|
220
|
-
#never be higher up the inheritance tree than the subclasses of
|
221
|
-
#the class where Castable was included.
|
222
|
-
# ==== Params
|
223
|
-
# [sym] Symbol corresponding to relative class to cast into. For
|
224
|
-
# example, Class#as(:audio) will cast into a Class::Audio
|
225
|
-
# and Class#as(:csv) will cast into Class::CSV. Casting
|
226
|
-
# is class insensitive, which means you can't have class
|
227
|
-
# CSV and class Csv. To cast into a related class whose
|
228
|
-
# name is not not directly under that of its parent, you
|
229
|
-
# must either specify the full name,
|
230
|
-
# e.g. Class#as(:foo_bar_baz) to cast to Foo::Bar::Baz,
|
231
|
-
# or a name relative to the parent,
|
232
|
-
# e.g. Class#as(:remote_html), where Class::Remote does
|
233
|
-
# not inherit from Class but Class::Remote::HTML does.
|
234
|
-
# ==== Returns
|
235
|
-
# New instance of subclass
|
236
|
-
def as(sym, *args)
|
237
|
-
if klass = self.class.relative_klass(sym.to_s.downcase)
|
238
|
-
klass.new(*args)
|
239
|
-
else
|
240
|
-
raise Error, "Can't find class '#{sym.to_s}' to cast to"
|
241
|
-
end #if subklass =...
|
242
|
-
end
|
243
|
-
|
244
|
-
def self.included(receiver)
|
245
|
-
receiver.extend(ClassMethods)
|
246
|
-
end
|
247
|
-
|
248
|
-
module ClassMethods
|
249
|
-
def inherited(subklass)
|
250
|
-
subklasses[subklass.to_s.split("#{self.name}::").last.downcase.gsub(/::/, '_')] = subklass
|
251
|
-
end
|
252
|
-
|
253
|
-
def subklasses
|
254
|
-
@subklasses ||= {}
|
255
|
-
end
|
256
|
-
|
257
|
-
def subklass(subklass_key)
|
258
|
-
subklasses[subklass_key]
|
259
|
-
end
|
260
|
-
|
261
|
-
def relative_klass(key)
|
262
|
-
if subklasses[key]
|
263
|
-
subklasses[key]
|
264
|
-
elsif self.superclass.respond_to? :relative_klass
|
265
|
-
self.superclass.relative_klass(key)
|
266
|
-
end
|
267
|
-
end
|
268
|
-
|
269
|
-
end #module ClassMethods
|
270
|
-
end #Castable
|
210
|
+
require 'typingpool/utility/castable'
|
271
211
|
end #Utility
|
272
212
|
end #Typingpool
|
@@ -19,6 +19,7 @@ class TestTpFinish < Typingpool::Test::Script
|
|
19
19
|
assert_nothing_raised do
|
20
20
|
tp_finish_outside_sandbox(dir, config_path)
|
21
21
|
end
|
22
|
+
sleep 1 #pause before checking URLs so remote server has time to fully delete
|
22
23
|
assert_empty(urls.select{|url| working_url? url })
|
23
24
|
assert_all_assets_have_upload_status(csv, ['audio'], 'no')
|
24
25
|
end
|