sm-transcript 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE.txt +23 -0
- data/README.txt +140 -0
- data/Rakefile +31 -0
- data/bin/results/PLACEHOLDER.txt +8 -0
- data/bin/sm-transcript +12 -0
- data/bin/transcripts/PLACEHOLDER.txt +8 -0
- data/lib/sm_transcript/LICENSE.txt +23 -0
- data/lib/sm_transcript/metadata.rb +69 -0
- data/lib/sm_transcript/metadata_reader.rb +56 -0
- data/lib/sm_transcript/options.rb +89 -0
- data/lib/sm_transcript/optparseExample.rb +113 -0
- data/lib/sm_transcript/process_csv_files_to_html.rb +58 -0
- data/lib/sm_transcript/process_seg_files.rb +21 -0
- data/lib/sm_transcript/process_seg_files_to_csv.rb +24 -0
- data/lib/sm_transcript/process_seg_files_to_html.rb +31 -0
- data/lib/sm_transcript/require_relative.rb +14 -0
- data/lib/sm_transcript/runner.rb +70 -0
- data/lib/sm_transcript/seg_reader.rb +42 -0
- data/lib/sm_transcript/transcript.rb +130 -0
- data/lib/sm_transcript/word.rb +31 -0
- data/lib/sm_transcript/wrd_reader.rb +42 -0
- data/test/Rakefile +14 -0
- data/test/results/IIHS_Diane_Davis_Nov2009.seg +425 -0
- data/test/results/NERCOMP-SpokenMedia4.wrd +6791 -0
- data/test/results/PLACEHOLDER.txt +8 -0
- data/test/results/PLACEHOLDER.txt.ignore +8 -0
- data/test/results/vijay_kumar.wrd +1675 -0
- data/test/results/wirehair-beetle.txt +6 -0
- data/test/test_metadata.rb +39 -0
- data/test/test_metadatareader.rb +30 -0
- data/test/test_options.rb +47 -0
- data/test/test_runner.rb +52 -0
- data/test/test_segreader.rb +39 -0
- data/test/test_transcript.rb +62 -0
- data/test/test_wrdreader.rb +43 -0
- data/test/transcripts/IIHS_Diane_Davis_Nov2009-t1.html +148 -0
- data/test/transcripts/PLACEHOLDER.txt +8 -0
- data/test/transcripts/data.js +24 -0
- data/test/transcripts/vijay_kumar-1.-t1.html +557 -0
- data/test/transcripts/vijay_kumar-1.t1.html +557 -0
- data/test/transcripts/vijay_kumar-t1.html +557 -0
- data/test/transcripts/vijay_kumar-t1.ttml +569 -0
- data/test/transcripts/vijay_kumar.data.js +2 -0
- data/test/transcripts/wirehair-beetle.data.js +3 -0
- metadata +234 -0
@@ -0,0 +1,31 @@
|
|
1
|
+
# $Id: word.rb 182 2010-03-12 22:07:34Z pwilkins $
|
2
|
+
# Copyright (c) 2010 Massachusetts Institute of Technology
|
3
|
+
# see LICENSE.txt for license text
|
4
|
+
|
5
|
+
module SmTranscript
  # A single transcript token together with the start and end times that
  # were supplied for it. The times are stored exactly as given; the token
  # is stored after the capitalization rules in apply_word_rules are applied.
  class Word
    # Lowercase first-person forms whose leading "i" must be capitalized.
    # Includes the "i'll" and "i'd" contractions in addition to the bare
    # pronoun, "i'm" and "i've".
    FIRST_PERSON_FORMS = ["i", "i'm", "i've", "i'll", "i'd"].freeze

    # Lowercase proper nouns that must be capitalized.
    PROPER_NOUNS = ["iranian"].freeze

    attr_reader :start_time
    attr_reader :end_time
    attr_reader :word

    # start_time:: start time of the token, stored unchanged
    # end_time::   end time of the token, stored unchanged
    # word::       raw token; run through apply_word_rules before storing
    def initialize(start_time, end_time, word)
      @start_time = start_time
      @end_time = end_time
      @word = apply_word_rules(word)
    end

    # Returns +word+ with the (English) capitalization rules applied.
    # Matching is exact and case-sensitive, so only all-lowercase input is
    # rewritten. Anything that matches no rule, including nil, is returned
    # unchanged.
    def apply_word_rules(word)
      case word
      when *FIRST_PERSON_FORMS
        # "i" -> "I", "i'm" -> "I'm", "i'll" -> "I'll", ...
        word.capitalize
      when *PROPER_NOUNS
        word.capitalize
      else
        word
      end
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# $Id: wrd_reader.rb 182 2010-03-12 22:07:34Z pwilkins $
|
2
|
+
# Copyright (c) 2010 Massachusetts Institute of Technology
|
3
|
+
# see LICENSE.txt for license text
|
4
|
+
|
5
|
+
require 'rubygems'
|
6
|
+
require 'extensions/kernel'
|
7
|
+
require_relative 'word'
|
8
|
+
|
9
|
+
module SmTranscript
  # Reads word-alignment lines from a source and collects them as
  # SmTranscript::Word objects in +words+. +metadata+ is always an empty
  # hash because parse_metadata currently does nothing.
  class WrdReader
    attr_reader :metadata
    attr_reader :words

    # Builds a reader from the file at +file_name+.
    #
    # The block form of File.open is used so the handle is closed as soon
    # as parsing finishes (or raises), instead of being left open.
    # Errors from File.open (e.g. a missing file) propagate to the caller.
    def self.from_file(file_name)
      File.open(file_name) { |src| new(src) }
    end

    # src_file:: any object whose #each yields lines (an open File, a
    #            StringIO, ...). It is read during construction.
    def initialize(src_file)
      @metadata = {}
      @words = []
      parse_metadata()
      parse_words(src_file)
    end

    # Placeholder: there is currently no metadata in .wrd files.
    def parse_metadata()
    end

    # Appends one Word to @words for every line of +src_file+ that matches
    # the expected layout; all other lines are silently skipped.
    def parse_words(src_file)
      src_file.each do |ln|
        # A line is expected to hold two integers separated by a space,
        # followed by a space and up to two word tokens made of word
        # characters or apostrophes.
        # NOTE(review): only the first word token (arr[2]) is stored; a
        # second token on the line is matched but dropped -- confirm that
        # this is intended.
        ln.scan(/^\d* \d* [\w']* *[\w']*$/) do |t|
          arr = t.split
          @words << SmTranscript::Word.new(arr[0], arr[1], arr[2])
        end
      end
    end
  end
end
|
data/test/Rakefile
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# $Id: Rakefile 182 2010-03-12 22:07:34Z pwilkins $
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'rake'
|
5
|
+
require 'rake/testtask'
|
6
|
+
|
7
|
+
# Running rake with no arguments runs the unit tests.
task :default => [:test_units]

desc "Run basic tests"
Rake::TestTask.new("test_units") do |test_task|
  # Run every file matching test_*.rb, without verbose output or warnings.
  test_task.pattern = 'test_*.rb'
  test_task.verbose = false
  test_task.warning = false
end
|
@@ -0,0 +1,425 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<document fileName="/Users/mckinney/Developer/Projects/SpokenMedia/workspace/standalone/results/IIHS_Diane_Davis_Nov2009.seg">
|
3
|
+
<lecture title="NA" keywords="global">
|
4
|
+
<segment id="1" title="global">
|
5
|
+
11406 11500 oh
|
6
|
+
11500 11585 i
|
7
|
+
11585 11790 think
|
8
|
+
11790 12188 there're
|
9
|
+
12188 12584 several
|
10
|
+
12584 12951 critical
|
11
|
+
12951 13438 questions
|
12
|
+
13438 13560 of
|
13
|
+
13560 13850 ones
|
14
|
+
13850 13995 that
|
15
|
+
13995 14405 i
|
16
|
+
14405 14822 found
|
17
|
+
14822 14943 in
|
18
|
+
14943 15300 steady
|
19
|
+
15300 15745 most
|
20
|
+
15745 16559 myself
|
21
|
+
17214 17415 are
|
22
|
+
17415 17813 questions
|
23
|
+
17813 17937 of
|
24
|
+
17937 18630 insecurity
|
25
|
+
18630 18775 into
|
26
|
+
18775 19510 violence
|
27
|
+
19510 20055 i
|
28
|
+
20055 20445 see
|
29
|
+
20445 20853 those
|
30
|
+
20853 21200 and
|
31
|
+
21200 22070 some
|
32
|
+
22070 22655 pervasive
|
33
|
+
22655 22770 the
|
34
|
+
22770 23260 class
|
35
|
+
23260 23553 much
|
36
|
+
23553 23680 of
|
37
|
+
23680 23756 a
|
38
|
+
23756 24108 global
|
39
|
+
24108 24516 found
|
40
|
+
25147 25277 and
|
41
|
+
25277 25427 i
|
42
|
+
25427 25947 think
|
43
|
+
27342 27461 have
|
44
|
+
27461 27522 a
|
45
|
+
27522 27677 lot
|
46
|
+
27677 27781 to
|
47
|
+
27781 28032 do
|
48
|
+
28032 28357 with
|
49
|
+
28357 28422 her
|
50
|
+
28422 28722 and
|
51
|
+
28722 28967 devil
|
52
|
+
28967 29672 transition
|
53
|
+
29672 29963 that
|
54
|
+
29963 30217 in
|
55
|
+
30217 30286 the
|
56
|
+
30286 30762 developing
|
57
|
+
30762 30936 world
|
58
|
+
30936 31172 is
|
59
|
+
31172 31777 facebook
|
60
|
+
31777 31953 pull
|
61
|
+
31953 32034 the
|
62
|
+
32034 32287 climb
|
63
|
+
32287 32827 economic
|
64
|
+
32827 34027 globalization
|
65
|
+
34027 34312 never
|
66
|
+
34312 34862 flat
|
67
|
+
35122 35285 it's
|
68
|
+
35285 35350 a
|
69
|
+
35350 35685 fairly
|
70
|
+
35685 36235 standard
|
71
|
+
36235 36460 way
|
72
|
+
36460 36548 of
|
73
|
+
36548 37129 understanding
|
74
|
+
37129 37665 the
|
75
|
+
37665 37943 crime
|
76
|
+
37943 38040 and
|
77
|
+
38040 38570 violence
|
78
|
+
38570 38972 so
|
79
|
+
40637 40696 i
|
80
|
+
40696 40905 think
|
81
|
+
40905 41057 it's
|
82
|
+
41057 41432 import
|
83
|
+
41432 41472 and
|
84
|
+
41472 41705 because
|
85
|
+
41705 41787 it
|
86
|
+
41787 42172 means
|
87
|
+
42172 42302 is
|
88
|
+
42302 42687 that
|
89
|
+
42687 43083 problem
|
90
|
+
43083 43191 of
|
91
|
+
43191 43306 r
|
92
|
+
43306 43907 times
|
93
|
+
44811 44972 and
|
94
|
+
44972 45157 then
|
95
|
+
45157 45619 many
|
96
|
+
45619 46167 developing
|
97
|
+
46167 46903 countries
|
98
|
+
46903 47042 in
|
99
|
+
47042 47402 different
|
100
|
+
47402 47782 regions
|
101
|
+
47782 47892 of
|
102
|
+
47892 47982 the
|
103
|
+
47982 48317 world
|
104
|
+
48317 48421 are
|
105
|
+
48421 49423 facing
|
106
|
+
49423 50356 insecurity
|
107
|
+
50356 50582 and
|
108
|
+
50582 51492 violence
|
109
|
+
51766 52265 that
|
110
|
+
53319 53525 and
|
111
|
+
53525 53662 a
|
112
|
+
53662 53821 lot
|
113
|
+
53821 53961 to
|
114
|
+
53961 54096 do
|
115
|
+
54096 54267 with
|
116
|
+
54267 54436 the
|
117
|
+
54436 54916 changing
|
118
|
+
54916 55391 nature
|
119
|
+
55391 55656 of
|
120
|
+
55656 55938 an
|
121
|
+
55938 56182 urban
|
122
|
+
56182 56898 economies
|
123
|
+
56898 57071 and
|
124
|
+
57071 57214 how
|
125
|
+
57214 57635 they
|
126
|
+
57874 58346 linked
|
127
|
+
58346 58459 to
|
128
|
+
58459 58566 the
|
129
|
+
58566 59276 global
|
130
|
+
59276 59936 economy
|
131
|
+
59936 60151 so
|
132
|
+
60151 60263 and
|
133
|
+
60263 60685 that's
|
134
|
+
61248 61343 i
|
135
|
+
61343 61898 think
|
136
|
+
61898 63003 come
|
137
|
+
63003 63518 pervasive
|
138
|
+
63518 64288 challenge
|
139
|
+
67678 67836 is
|
140
|
+
67836 68090 that
|
141
|
+
68090 68239 is
|
142
|
+
68239 68457 one
|
143
|
+
68457 68557 and
|
144
|
+
68557 69049 two
|
145
|
+
69049 69146 are
|
146
|
+
69146 69451 three
|
147
|
+
69451 70386 specializations
|
148
|
+
70386 70526 that
|
149
|
+
70526 70776 when
|
150
|
+
70776 71121 means
|
151
|
+
71121 71427 in
|
152
|
+
71769 71906 and
|
153
|
+
71906 72691 interconnected
|
154
|
+
72691 73066 the
|
155
|
+
73066 73371 global
|
156
|
+
73371 73866 world
|
157
|
+
73866 74411 aware
|
158
|
+
74765 75695 technology
|
159
|
+
75695 75895 and
|
160
|
+
75895 76625 treat
|
161
|
+
76625 77568 endowment
|
162
|
+
78727 78947 of
|
163
|
+
78947 79777 communication
|
164
|
+
79777 80012 and
|
165
|
+
80012 80287 this
|
166
|
+
81408 81577 i
|
167
|
+
81577 81792 d
|
168
|
+
81792 82247 l
|
169
|
+
82247 83052 ideology
|
170
|
+
83052 83167 of
|
171
|
+
83167 83368 those
|
172
|
+
83368 83437 are
|
173
|
+
83437 83572 all
|
174
|
+
83572 84132 important
|
175
|
+
84132 84202 to
|
176
|
+
84875 85342 aspects
|
177
|
+
85342 85470 of
|
178
|
+
85470 85791 that
|
179
|
+
85791 85915 i'm
|
180
|
+
85915 86000 the
|
181
|
+
86000 86660 planning
|
182
|
+
86660 86930 to
|
183
|
+
86930 87295 kept
|
184
|
+
88377 88488 but
|
185
|
+
88488 88636 i
|
186
|
+
88636 88894 would
|
187
|
+
88894 89157 say
|
188
|
+
89157 89903 definitely
|
189
|
+
89903 90337 need
|
190
|
+
90337 90426 to
|
191
|
+
90426 91267 people
|
192
|
+
91267 91641 who
|
193
|
+
91641 92057 are
|
194
|
+
92739 92857 i
|
195
|
+
92857 93002 had
|
196
|
+
93002 93312 the
|
197
|
+
93312 93544 think
|
198
|
+
93544 93942 about
|
199
|
+
93942 94487 them
|
200
|
+
94487 94777 meaning
|
201
|
+
94777 95260 regulator
|
202
|
+
95260 95597 people
|
203
|
+
95597 95652 and
|
204
|
+
95652 95791 now
|
205
|
+
95791 95947 but
|
206
|
+
95947 96862 organizations
|
207
|
+
96862 97052 howell
|
208
|
+
97052 97782 organizations
|
209
|
+
97782 98102 function
|
210
|
+
98102 98397 because
|
211
|
+
98397 99367 organizations
|
212
|
+
100540 100751 there
|
213
|
+
100751 100916 it's
|
214
|
+
100916 100971 a
|
215
|
+
100971 101366 whole
|
216
|
+
101366 101991 literature
|
217
|
+
101991 102072 in
|
218
|
+
102072 102102 a
|
219
|
+
102102 102567 field
|
220
|
+
102567 102726 of
|
221
|
+
102726 103460 organizational
|
222
|
+
103460 103916 fury
|
223
|
+
103916 104143 there
|
224
|
+
104143 104306 would
|
225
|
+
104306 104456 be
|
226
|
+
104456 104869 important
|
227
|
+
104869 105451 understand
|
228
|
+
105451 105775 amiga
|
229
|
+
105775 106191 small
|
230
|
+
106191 106256 is
|
231
|
+
106256 106518 that
|
232
|
+
106518 106675 is
|
233
|
+
106675 106896 our
|
234
|
+
106896 107196 local
|
235
|
+
107196 107776 government
|
236
|
+
107776 107906 our
|
237
|
+
107906 108172 rap
|
238
|
+
108172 109141 community
|
239
|
+
109141 109511 iranian
|
240
|
+
109511 109691 g
|
241
|
+
109691 109964 l
|
242
|
+
110415 110638 and
|
243
|
+
110638 111084 something
|
244
|
+
111084 111284 as
|
245
|
+
111284 111677 large
|
246
|
+
111677 111939 as
|
247
|
+
112581 112952 steak
|
248
|
+
112952 113167 ever
|
249
|
+
113167 113497 meant
|
250
|
+
113497 113662 and
|
251
|
+
113662 114072 national
|
252
|
+
114072 114836 government
|
253
|
+
114836 115485 maybe
|
254
|
+
115485 116278 at
|
255
|
+
116278 116382 the
|
256
|
+
116382 116602 u
|
257
|
+
116602 116847 n
|
258
|
+
116847 116952 r
|
259
|
+
116952 117042 a
|
260
|
+
117042 117492 global
|
261
|
+
117492 118022 agency
|
262
|
+
118022 118377 news
|
263
|
+
118377 119171 activities
|
264
|
+
119584 120310 influence
|
265
|
+
121832 121927 and
|
266
|
+
121927 122050 the
|
267
|
+
122050 122358 small
|
268
|
+
122358 122659 scale
|
269
|
+
122659 122781 so
|
270
|
+
122781 122876 i
|
271
|
+
122876 123140 think
|
272
|
+
123140 124361 organization
|
273
|
+
124361 125033 analysis
|
274
|
+
125033 125141 is
|
275
|
+
125141 125221 a
|
276
|
+
125221 125461 very
|
277
|
+
125461 126021 important
|
278
|
+
126021 126274 okay
|
279
|
+
129718 130024 line
|
280
|
+
130024 130126 is
|
281
|
+
130126 130540 lambda
|
282
|
+
130540 130728 t
|
283
|
+
130728 131096 v
|
284
|
+
131096 131291 for
|
285
|
+
131291 131866 television
|
286
|
+
131866 132116 the
|
287
|
+
132116 132381 only
|
288
|
+
132381 132631 about
|
289
|
+
132631 132964 it
|
290
|
+
132964 133061 i
|
291
|
+
133061 133423 think
|
292
|
+
133423 133791 and
|
293
|
+
133791 134086 there's
|
294
|
+
134086 134285 a
|
295
|
+
134285 134496 lot
|
296
|
+
134496 134596 of
|
297
|
+
134596 134764 x
|
298
|
+
134764 135074 segment
|
299
|
+
135074 135361 about
|
300
|
+
135361 135671 that
|
301
|
+
135748 135839 and
|
302
|
+
135839 135967 i
|
303
|
+
135967 136207 didn't
|
304
|
+
136207 136422 think
|
305
|
+
136422 136547 that
|
306
|
+
136547 136787 that
|
307
|
+
136787 137235 eminently
|
308
|
+
137235 137311 to
|
309
|
+
137311 137413 the
|
310
|
+
137413 137907 challenges
|
311
|
+
137907 138027 the
|
312
|
+
138027 138277 town
|
313
|
+
138277 138432 and
|
314
|
+
138432 138543 it's
|
315
|
+
138543 139365 hard
|
316
|
+
139365 139918 are
|
317
|
+
139918 140118 how
|
318
|
+
140118 140182 to
|
319
|
+
140182 140442 get
|
320
|
+
140442 140717 there
|
321
|
+
141033 141442 yeah
|
322
|
+
141442 141657 so
|
323
|
+
141657 141795 in
|
324
|
+
141795 142041 what
|
325
|
+
142041 142173 are
|
326
|
+
142173 142338 the
|
327
|
+
142338 143080 strategic
|
328
|
+
143080 143260 aims
|
329
|
+
143260 143662 least
|
330
|
+
143662 143920 okay
|
331
|
+
143920 144155 many
|
332
|
+
144155 144265 here
|
333
|
+
144265 144395 and
|
334
|
+
144395 144605 this
|
335
|
+
144605 144840 meeting
|
336
|
+
144840 145385 whatever
|
337
|
+
145385 145556 to
|
338
|
+
145556 145740 dick
|
339
|
+
145740 146073 steps
|
340
|
+
146073 146390 any
|
341
|
+
146390 146485 to
|
342
|
+
146485 146645 be
|
343
|
+
146645 147000 take
|
344
|
+
147000 147620 ten
|
345
|
+
147879 148133 where
|
346
|
+
148133 148284 you
|
347
|
+
148284 148642 can
|
348
|
+
148642 149163 accomplish
|
349
|
+
149163 149507 something
|
350
|
+
149507 149588 in
|
351
|
+
149588 149689 the
|
352
|
+
149689 149988 short
|
353
|
+
149988 150238 term
|
354
|
+
150238 150520 right
|
355
|
+
150520 150943 away
|
356
|
+
151340 151574 but
|
357
|
+
151574 152461 also
|
358
|
+
152461 152600 you
|
359
|
+
152600 152825 know
|
360
|
+
152825 152892 the
|
361
|
+
152892 153045 the
|
362
|
+
153045 153127 you're
|
363
|
+
153127 153295 not
|
364
|
+
153295 153422 going
|
365
|
+
153422 153550 to
|
366
|
+
153550 154016 accomplish
|
367
|
+
154016 154395 everything
|
368
|
+
154395 154543 that
|
369
|
+
154543 154725 you're
|
370
|
+
154725 155005 leading
|
371
|
+
155005 155193 out
|
372
|
+
155193 155360 in
|
373
|
+
155360 155564 year
|
374
|
+
155564 155815 vision
|
375
|
+
155815 156135 and
|
376
|
+
156323 156847 so
|
377
|
+
157318 157455 you
|
378
|
+
157455 157735 want
|
379
|
+
157735 157856 to
|
380
|
+
157856 158175 make
|
381
|
+
158175 159070 progress
|
382
|
+
159725 160753 continually
|
383
|
+
160753 161005 every
|
384
|
+
161005 161499 states
|
385
|
+
161499 161776 has
|
386
|
+
161776 161800 a
|
387
|
+
161800 162055 has
|
388
|
+
162055 162315 some
|
389
|
+
162315 162402 of
|
390
|
+
162402 162475 the
|
391
|
+
162475 162817 small
|
392
|
+
162817 163585 success
|
393
|
+
163795 164185 but
|
394
|
+
164185 164390 for
|
395
|
+
164390 164432 a
|
396
|
+
164432 164650 need
|
397
|
+
164650 164725 for
|
398
|
+
164725 164865 its
|
399
|
+
164865 165250 allies
|
400
|
+
165250 165680 success
|
401
|
+
165680 166000 because
|
402
|
+
166000 166135 you
|
403
|
+
166135 166935 can't
|
404
|
+
166935 167295 put
|
405
|
+
167295 167413 an
|
406
|
+
167413 168470 institution
|
407
|
+
168906 169036 it's
|
408
|
+
169036 169135 and
|
409
|
+
169135 169483 develop
|
410
|
+
169483 169540 an
|
411
|
+
169540 170090 institution
|
412
|
+
170090 170270 like
|
413
|
+
170270 170539 to
|
414
|
+
170539 171150 emulate
|
415
|
+
171769 171942 your
|
416
|
+
171942 172137 have
|
417
|
+
172137 172265 in
|
418
|
+
172265 173159 mind
|
419
|
+
173159 173270 of
|
420
|
+
173270 173339 the
|
421
|
+
173339 173770 night
|
422
|
+
</segment>
|
423
|
+
</lecture>
|
424
|
+
</document>
|
425
|
+
|