typingpool 0.7.0 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +20 -0
- data/README.markdown +452 -0
- data/lib/typingpool/amazon/hit/assignment/empty.rb +19 -0
- data/lib/typingpool/amazon/hit/assignment.rb +43 -0
- data/lib/typingpool/amazon/hit/full/fromsearchhits.rb +44 -0
- data/lib/typingpool/amazon/hit/full.rb +105 -0
- data/lib/typingpool/amazon/hit.rb +458 -0
- data/lib/typingpool/amazon/question.rb +45 -0
- data/lib/typingpool/amazon.rb +3 -677
- data/lib/typingpool/app/cli/formatter.rb +16 -0
- data/lib/typingpool/app/cli.rb +64 -0
- data/lib/typingpool/app/friendlyexceptions.rb +34 -0
- data/lib/typingpool/app.rb +2 -97
- data/lib/typingpool/config/root.rb +114 -0
- data/lib/typingpool/config.rb +13 -119
- data/lib/typingpool/filer/audio.rb +84 -0
- data/lib/typingpool/filer/csv.rb +57 -0
- data/lib/typingpool/filer/dir.rb +76 -0
- data/lib/typingpool/filer/files/audio.rb +63 -0
- data/lib/typingpool/filer/files.rb +55 -0
- data/lib/typingpool/filer.rb +4 -313
- data/lib/typingpool/project/local.rb +117 -0
- data/lib/typingpool/project/remote/s3.rb +135 -0
- data/lib/typingpool/project/remote/sftp.rb +100 -0
- data/lib/typingpool/project/remote.rb +65 -0
- data/lib/typingpool/project.rb +2 -396
- data/lib/typingpool/template/assignment.rb +17 -0
- data/lib/typingpool/template/env.rb +77 -0
- data/lib/typingpool/template.rb +2 -87
- data/lib/typingpool/test/script.rb +310 -0
- data/lib/typingpool/test.rb +1 -306
- data/lib/typingpool/transcript/chunk.rb +129 -0
- data/lib/typingpool/transcript.rb +1 -125
- data/lib/typingpool/utility/castable.rb +65 -0
- data/lib/typingpool/utility.rb +1 -61
- data/test/test_integration_script_6_tp_finish.rb +1 -0
- metadata +135 -81
@@ -0,0 +1,129 @@
|
|
1
|
+
module Typingpool
|
2
|
+
class Transcript
|
3
|
+
|
4
|
+
#Transcript::Chunk is the model class for one transcription by one
|
5
|
+
#Mechanical Turk worker of one "chunk" (a file) of audio, which in
|
6
|
+
#turn is a portion of a larger recording (for example, one minute
|
7
|
+
#of a 60 minute interview). It is basically parallel and similar
|
8
|
+
#to an Amazon::HIT instance. Transcript is a container for these
|
9
|
+
#chunks, which know how to render themselves as text and HTML.
|
10
|
+
class Chunk
|
11
|
+
require 'cgi'
|
12
|
+
require 'rubygems/text'
|
13
|
+
include Gem::Text
|
14
|
+
|
15
|
+
#Get/set the raw text of the transcript
|
16
|
+
attr_accessor :body
|
17
|
+
|
18
|
+
#Get/set the Amazon ID of the Mechanical Turk worker who
|
19
|
+
#transcribed the audio into text
|
20
|
+
attr_accessor :worker
|
21
|
+
|
22
|
+
#Get/set the id of the Amazon::HIT associated with this chunk
|
23
|
+
attr_accessor :hit
|
24
|
+
|
25
|
+
#Get/set the id of the Project#local associated with this chunk
|
26
|
+
attr_accessor :project
|
27
|
+
|
28
|
+
#Return the offset associated with the chunk, in MM:SS
|
29
|
+
#format. This corresponds to the associated audio file, which is
|
30
|
+
#a chunk of a larger recording and which starts at a particular
|
31
|
+
#time offset, for example from 1:00 (the offset) to 2:00 (the
|
32
|
+
#next offset).
|
33
|
+
#
|
34
|
+
#
|
35
|
+
#This should be updated to return HH:MM:SS and MM:SS.sss when
|
36
|
+
#appropriate, since in Project#interval we use that format and
|
37
|
+
#allow audio to be divided into such units. (TODO)
|
38
|
+
attr_reader :offset
|
39
|
+
|
40
|
+
#Returns the offset in seconds. For example, an offset of 1:00 returns 60.
|
41
|
+
attr_reader :offset_seconds
|
42
|
+
|
43
|
+
#Returns the name of the remote audio file corresponding to this
|
44
|
+
#chunk. The remote file has the project ID and pseudo random
|
45
|
+
#characters added to it.
|
46
|
+
attr_reader :filename
|
47
|
+
|
48
|
+
#Returns the name of the local audio file corresponding to this
|
49
|
+
#chunk.
|
50
|
+
attr_reader :filename_local
|
51
|
+
|
52
|
+
#Returns the URL of the remote audio transcribed in the body of
|
53
|
+
#this chunk.
|
54
|
+
attr_reader :url
|
55
|
+
|
56
|
+
#Constructor. Takes the raw text of the transcription.
|
57
|
+
def initialize(body)
|
58
|
+
@body = body
|
59
|
+
end
|
60
|
+
|
61
|
+
#Sorts by offset seconds.
|
62
|
+
def <=>(other)
|
63
|
+
self.offset_seconds <=> other.offset_seconds
|
64
|
+
end
|
65
|
+
|
66
|
+
#Takes an URL. As an important side effect, sets various
|
67
|
+
#attributes, including url, filename, filename_local, offset and
|
68
|
+
#offset_seconds. So setting Chunk#url= http://whateverwhatever
|
69
|
+
#is an important step in populating the instance.
|
70
|
+
def url=(url)
|
71
|
+
#http://ryantate.com/transfer/Speech.01.00.ede9b0f2aed0d35a26cef7160bc9e35e.ISEAOM.mp3
|
72
|
+
matches = Project.url_regex.match(url) or raise Error::Argument::Format, "Unexpected format to url '#{url}'"
|
73
|
+
@url = matches[0]
|
74
|
+
@filename = matches[1]
|
75
|
+
@filename_local = Project.local_basename_from_url(@url)
|
76
|
+
@offset = "#{matches[3]}:#{matches[4]}"
|
77
|
+
@offset_seconds = (matches[3].to_i * 60) + matches[4].to_i
|
78
|
+
end
|
79
|
+
|
80
|
+
#Takes an optional specification of how many spaces to indent
|
81
|
+
#the text by (default 0) and an optional specification of how
|
82
|
+
#many characters to wrap at (default no wrapping).
|
83
|
+
#
|
84
|
+
#Returns the text with newlines normalized to Unix format, runs
|
85
|
+
#of newlines shortened to a maximum of two newlines, leading and
|
86
|
+
#trailing whitespace removed from each line, and the text
|
87
|
+
#wrapped/indented as specified.
|
88
|
+
def body_as_text(indent=nil, wrap=nil)
|
89
|
+
text = self.body
|
90
|
+
text = Utility.normalize_newlines(text)
|
91
|
+
text.gsub!(/\n\n+/, "\n\n")
|
92
|
+
text = text.split("\n").map{|line| line.strip }.join("\n")
|
93
|
+
text = wrap_text(text, wrap) if wrap
|
94
|
+
text = indent_text(text, indent) if indent
|
95
|
+
text
|
96
|
+
end
|
97
|
+
alias :to_s :body_as_text
|
98
|
+
alias :to_str :body_as_text
|
99
|
+
|
100
|
+
#Takes an optional count of how many characters to wrap at
|
101
|
+
#(default 72). Returns the body, presumed to be raw text, as
|
102
|
+
#HTML. Any HTML tags in the body are escaped. Text blocks
|
103
|
+
#separated by double newlines are converted to HTML paragraphs,
|
104
|
+
#while single newlines are converted to HTML BR tags. Newlines
|
105
|
+
#are normalized as in body_as_text, and lines in the HTML source
|
106
|
+
#are automatically wrapped as specified.
|
107
|
+
def body_as_html(wrap=72)
|
108
|
+
text = body_as_text
|
109
|
+
text = CGI::escapeHTML(text)
|
110
|
+
text = Utility.newlines_to_html(text)
|
111
|
+
text = text.split("\n").map do |line|
|
112
|
+
wrap_text(line, 72).chomp
|
113
|
+
end.join("\n")
|
114
|
+
text
|
115
|
+
end
|
116
|
+
|
117
|
+
protected
|
118
|
+
|
119
|
+
def indent_text(text, indent)
|
120
|
+
text.gsub!(/^/, " " * indent)
|
121
|
+
text
|
122
|
+
end
|
123
|
+
|
124
|
+
def wrap_text(text, wrap=72)
|
125
|
+
format_text(text, wrap)
|
126
|
+
end
|
127
|
+
end #Chunk
|
128
|
+
end #Transcript
|
129
|
+
end #Typingpool
|
@@ -52,130 +52,6 @@ module Typingpool
|
|
52
52
|
@chunks.push(chunk)
|
53
53
|
end
|
54
54
|
|
55
|
-
|
56
|
-
#Mechanical Turk worker of one "chunk" (a file) of audio, which in
|
57
|
-
#turn is a portion of a larger recording (for example, one minute
|
58
|
-
#of a 60 minute interview). It is basically parallel and similar
|
59
|
-
#to an Amazon::HIT instance. Transcript is a container for these
|
60
|
-
#chunks, which know how to render themselves as text and HTML.
|
61
|
-
class Chunk
|
62
|
-
require 'cgi'
|
63
|
-
require 'rubygems/text'
|
64
|
-
include Gem::Text
|
65
|
-
|
66
|
-
#Get/set the raw text of the transcript
|
67
|
-
attr_accessor :body
|
68
|
-
|
69
|
-
#Get/set the Amazon ID of the Mechanical Turk worker who
|
70
|
-
#transcribed the audio into text
|
71
|
-
attr_accessor :worker
|
72
|
-
|
73
|
-
#Get/set the id of the Amazon::HIT associated with this chunk
|
74
|
-
attr_accessor :hit
|
75
|
-
|
76
|
-
#Get/set the id of the Project#local associated with this chunk
|
77
|
-
attr_accessor :project
|
78
|
-
|
79
|
-
#Return the offset associated with the chunk, in MM:SS
|
80
|
-
#format. This corresponds to the associated audio file, which is
|
81
|
-
#a chunk of a larger recording and which starts at a particular
|
82
|
-
#time offset, for example from 1:00 (the offset) to 2:00 (the
|
83
|
-
#next offset).
|
84
|
-
#
|
85
|
-
#
|
86
|
-
#This should be updated to return HH:MM:SS and MM:SS.sss when
|
87
|
-
#appropriate, since in Project#interval we use that format and
|
88
|
-
#allow audio to be divided into such units. (TODO)
|
89
|
-
attr_reader :offset
|
90
|
-
|
91
|
-
#Returns the offset in seconds. For example, an offset of 1:00 returns 60.
|
92
|
-
attr_reader :offset_seconds
|
93
|
-
|
94
|
-
#Returns the name of the remote audio file corresponding to this
|
95
|
-
#chunk. The remote file has the project ID and pseudo random
|
96
|
-
#characters added to it.
|
97
|
-
attr_reader :filename
|
98
|
-
|
99
|
-
#Returns the name of the local audio file corresponding to this
|
100
|
-
#chunk.
|
101
|
-
attr_reader :filename_local
|
102
|
-
|
103
|
-
#Returns the URL of the remote audio transcribed in the body of
|
104
|
-
#this chunk.
|
105
|
-
attr_reader :url
|
106
|
-
|
107
|
-
#Constructor. Takes the raw text of the transcription.
|
108
|
-
def initialize(body)
|
109
|
-
@body = body
|
110
|
-
end
|
111
|
-
|
112
|
-
#Sorts by offset seconds.
|
113
|
-
def <=>(other)
|
114
|
-
self.offset_seconds <=> other.offset_seconds
|
115
|
-
end
|
116
|
-
|
117
|
-
#Takes an URL. As an important side effect, sets various
|
118
|
-
#attributes, including url, filename, filename_local, offset and
|
119
|
-
#offset_seconds. So setting Chunk#url= http://whateverwhatever
|
120
|
-
#is an important step in populating the instance.
|
121
|
-
def url=(url)
|
122
|
-
#http://ryantate.com/transfer/Speech.01.00.ede9b0f2aed0d35a26cef7160bc9e35e.ISEAOM.mp3
|
123
|
-
matches = Project.url_regex.match(url) or raise Error::Argument::Format, "Unexpected format to url '#{url}'"
|
124
|
-
@url = matches[0]
|
125
|
-
@filename = matches[1]
|
126
|
-
@filename_local = Project.local_basename_from_url(@url)
|
127
|
-
@offset = "#{matches[3]}:#{matches[4]}"
|
128
|
-
@offset_seconds = (matches[3].to_i * 60) + matches[4].to_i
|
129
|
-
end
|
130
|
-
|
131
|
-
#Takes an optional specification of how many spaces to indent
|
132
|
-
#the text by (default 0) and an optional specification of how
|
133
|
-
#many characters to wrap at (default no wrapping).
|
134
|
-
#
|
135
|
-
#Returns the text with newlines normalized to Unix format, runs
|
136
|
-
#of newlines shortened to a maximum of two newlines, leading and
|
137
|
-
#trailing whitespace removed from each line, and the text
|
138
|
-
#wrapped/indented as specified.
|
139
|
-
def body_as_text(indent=nil, wrap=nil)
|
140
|
-
text = self.body
|
141
|
-
text = Utility.normalize_newlines(text)
|
142
|
-
text.gsub!(/\n\n+/, "\n\n")
|
143
|
-
text = text.split("\n").map{|line| line.strip }.join("\n")
|
144
|
-
text = wrap_text(text, wrap) if wrap
|
145
|
-
text = indent_text(text, indent) if indent
|
146
|
-
text
|
147
|
-
end
|
148
|
-
alias :to_s :body_as_text
|
149
|
-
alias :to_str :body_as_text
|
150
|
-
|
151
|
-
#Takes an optional count of how many characters to wrap at
|
152
|
-
#(default 72). Returns the body, presumed to be raw text, as
|
153
|
-
#HTML. Any HTML tags in the body are escaped. Text blocks
|
154
|
-
#separated by double newlines are converted to HTML paragraphs,
|
155
|
-
#while single newlines are converted to HTML BR tags. Newlines
|
156
|
-
#are normalized as in body_as_text, and lines in the HTML source
|
157
|
-
#are automatically wrapped as specified.
|
158
|
-
def body_as_html(wrap=72)
|
159
|
-
text = body_as_text
|
160
|
-
text = CGI::escapeHTML(text)
|
161
|
-
text = Utility.newlines_to_html(text)
|
162
|
-
text = text.split("\n").map do |line|
|
163
|
-
wrap_text(line, 72).chomp
|
164
|
-
end.join("\n")
|
165
|
-
text
|
166
|
-
end
|
167
|
-
|
168
|
-
protected
|
169
|
-
|
170
|
-
def indent_text(text, indent)
|
171
|
-
text.gsub!(/^/, " " * indent)
|
172
|
-
text
|
173
|
-
end
|
174
|
-
|
175
|
-
def wrap_text(text, wrap=72)
|
176
|
-
format_text(text, wrap)
|
177
|
-
end
|
178
|
-
|
179
|
-
end #Chunk
|
55
|
+
require 'typingpool/transcript/chunk'
|
180
56
|
end #Transcript
|
181
57
|
end #Typingpool
|
@@ -0,0 +1,65 @@
|
|
1
|
+
module Typingpool
|
2
|
+
module Utility
|
3
|
+
module Castable
|
4
|
+
|
5
|
+
#Cast this object instance to a relative class. Call this from
|
6
|
+
#super in your own class if you want to pass args to the
|
7
|
+
#relative class constructor. All args after the first will be
|
8
|
+
#passed to new.
|
9
|
+
#
|
10
|
+
#A relative class can be a subclass and in some cases a sibling
|
11
|
+
#class, parent class, parent sibling class, grandparent class,
|
12
|
+
#grandparent sibling class, and so on. A relative class will
|
13
|
+
#never be higher up the inheritance tree than the subclasses of
|
14
|
+
#the class where Castable was included.
|
15
|
+
# ==== Params
|
16
|
+
# [sym] Symbol corresponding to relative class to cast into. For
|
17
|
+
# example, Class#as(:audio) will cast into a Class::Audio
|
18
|
+
# and Class#as(:csv) will cast into Class::CSV. Casting
|
19
|
+
# is case insensitive, which means you can't have class
|
20
|
+
# CSV and class Csv. To cast into a related class whose
|
21
|
+
# name is not directly under that of its parent, you
|
22
|
+
# must either specify the full name,
|
23
|
+
# e.g. Class#as(:foo_bar_baz) to cast to Foo::Bar::Baz,
|
24
|
+
# or a name relative to the parent,
|
25
|
+
# e.g. Class#as(:remote_html), where Class::Remote does
|
26
|
+
# not inherit from Class but Class::Remote::HTML does.
|
27
|
+
# ==== Returns
|
28
|
+
# New instance of subclass
|
29
|
+
def as(sym, *args)
|
30
|
+
if klass = self.class.relative_klass(sym.to_s.downcase)
|
31
|
+
klass.new(*args)
|
32
|
+
else
|
33
|
+
raise Error, "Can't find class '#{sym.to_s}' to cast to"
|
34
|
+
end #if subklass =...
|
35
|
+
end
|
36
|
+
|
37
|
+
def self.included(receiver)
|
38
|
+
receiver.extend(ClassMethods)
|
39
|
+
end
|
40
|
+
|
41
|
+
module ClassMethods
|
42
|
+
def inherited(subklass)
|
43
|
+
subklasses[subklass.to_s.split("#{self.name}::").last.downcase.gsub(/::/, '_')] = subklass
|
44
|
+
end
|
45
|
+
|
46
|
+
def subklasses
|
47
|
+
@subklasses ||= {}
|
48
|
+
end
|
49
|
+
|
50
|
+
def subklass(subklass_key)
|
51
|
+
subklasses[subklass_key]
|
52
|
+
end
|
53
|
+
|
54
|
+
def relative_klass(key)
|
55
|
+
if subklasses[key]
|
56
|
+
subklasses[key]
|
57
|
+
elsif self.superclass.respond_to? :relative_klass
|
58
|
+
self.superclass.relative_klass(key)
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
end #module ClassMethods
|
63
|
+
end #Castable
|
64
|
+
end #Utility
|
65
|
+
end #Typingpool
|
data/lib/typingpool/utility.rb
CHANGED
@@ -207,66 +207,6 @@ module Typingpool
|
|
207
207
|
end
|
208
208
|
|
209
209
|
end #class << self
|
210
|
-
|
211
|
-
module Castable
|
212
|
-
#Cast this object instance to a relative class. Call this from
|
213
|
-
#super in your own class if you want to pass args to the
|
214
|
-
#relative class constructor. All args after the first will be
|
215
|
-
#passed to new.
|
216
|
-
#
|
217
|
-
#A relative class can be a subclass and in some cases a sibling
|
218
|
-
#class, parent class, parent sibling class, grandparent class,
|
219
|
-
#grandparent sibling class, and so on. A relative class will
|
220
|
-
#never be higher up the inheritance tree than the subclasses of
|
221
|
-
#the class where Castable was included.
|
222
|
-
# ==== Params
|
223
|
-
# [sym] Symbol corresponding to relative class to cast into. For
|
224
|
-
# example, Class#as(:audio) will cast into a Class::Audio
|
225
|
-
# and Class#as(:csv) will cast into Class::CSV. Casting
|
226
|
-
# is case insensitive, which means you can't have class
|
227
|
-
# CSV and class Csv. To cast into a related class whose
|
228
|
-
# name is not directly under that of its parent, you
|
229
|
-
# must either specify the full name,
|
230
|
-
# e.g. Class#as(:foo_bar_baz) to cast to Foo::Bar::Baz,
|
231
|
-
# or a name relative to the parent,
|
232
|
-
# e.g. Class#as(:remote_html), where Class::Remote does
|
233
|
-
# not inherit from Class but Class::Remote::HTML does.
|
234
|
-
# ==== Returns
|
235
|
-
# New instance of subclass
|
236
|
-
def as(sym, *args)
|
237
|
-
if klass = self.class.relative_klass(sym.to_s.downcase)
|
238
|
-
klass.new(*args)
|
239
|
-
else
|
240
|
-
raise Error, "Can't find class '#{sym.to_s}' to cast to"
|
241
|
-
end #if subklass =...
|
242
|
-
end
|
243
|
-
|
244
|
-
def self.included(receiver)
|
245
|
-
receiver.extend(ClassMethods)
|
246
|
-
end
|
247
|
-
|
248
|
-
module ClassMethods
|
249
|
-
def inherited(subklass)
|
250
|
-
subklasses[subklass.to_s.split("#{self.name}::").last.downcase.gsub(/::/, '_')] = subklass
|
251
|
-
end
|
252
|
-
|
253
|
-
def subklasses
|
254
|
-
@subklasses ||= {}
|
255
|
-
end
|
256
|
-
|
257
|
-
def subklass(subklass_key)
|
258
|
-
subklasses[subklass_key]
|
259
|
-
end
|
260
|
-
|
261
|
-
def relative_klass(key)
|
262
|
-
if subklasses[key]
|
263
|
-
subklasses[key]
|
264
|
-
elsif self.superclass.respond_to? :relative_klass
|
265
|
-
self.superclass.relative_klass(key)
|
266
|
-
end
|
267
|
-
end
|
268
|
-
|
269
|
-
end #module ClassMethods
|
270
|
-
end #Castable
|
210
|
+
require 'typingpool/utility/castable'
|
271
211
|
end #Utility
|
272
212
|
end #Typingpool
|
@@ -19,6 +19,7 @@ class TestTpFinish < Typingpool::Test::Script
|
|
19
19
|
assert_nothing_raised do
|
20
20
|
tp_finish_outside_sandbox(dir, config_path)
|
21
21
|
end
|
22
|
+
sleep 1 #pause before checking URLs so remote server has time to fully delete
|
22
23
|
assert_empty(urls.select{|url| working_url? url })
|
23
24
|
assert_all_assets_have_upload_status(csv, ['audio'], 'no')
|
24
25
|
end
|