sm-transcript 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE.txt +23 -0
- data/README.txt +140 -0
- data/Rakefile +31 -0
- data/bin/results/PLACEHOLDER.txt +8 -0
- data/bin/sm-transcript +12 -0
- data/bin/transcripts/PLACEHOLDER.txt +8 -0
- data/lib/sm_transcript/LICENSE.txt +23 -0
- data/lib/sm_transcript/metadata.rb +69 -0
- data/lib/sm_transcript/metadata_reader.rb +56 -0
- data/lib/sm_transcript/options.rb +89 -0
- data/lib/sm_transcript/optparseExample.rb +113 -0
- data/lib/sm_transcript/process_csv_files_to_html.rb +58 -0
- data/lib/sm_transcript/process_seg_files.rb +21 -0
- data/lib/sm_transcript/process_seg_files_to_csv.rb +24 -0
- data/lib/sm_transcript/process_seg_files_to_html.rb +31 -0
- data/lib/sm_transcript/require_relative.rb +14 -0
- data/lib/sm_transcript/runner.rb +70 -0
- data/lib/sm_transcript/seg_reader.rb +42 -0
- data/lib/sm_transcript/transcript.rb +130 -0
- data/lib/sm_transcript/word.rb +31 -0
- data/lib/sm_transcript/wrd_reader.rb +42 -0
- data/test/Rakefile +14 -0
- data/test/results/IIHS_Diane_Davis_Nov2009.seg +425 -0
- data/test/results/NERCOMP-SpokenMedia4.wrd +6791 -0
- data/test/results/PLACEHOLDER.txt +8 -0
- data/test/results/PLACEHOLDER.txt.ignore +8 -0
- data/test/results/vijay_kumar.wrd +1675 -0
- data/test/results/wirehair-beetle.txt +6 -0
- data/test/test_metadata.rb +39 -0
- data/test/test_metadatareader.rb +30 -0
- data/test/test_options.rb +47 -0
- data/test/test_runner.rb +52 -0
- data/test/test_segreader.rb +39 -0
- data/test/test_transcript.rb +62 -0
- data/test/test_wrdreader.rb +43 -0
- data/test/transcripts/IIHS_Diane_Davis_Nov2009-t1.html +148 -0
- data/test/transcripts/PLACEHOLDER.txt +8 -0
- data/test/transcripts/data.js +24 -0
- data/test/transcripts/vijay_kumar-1.-t1.html +557 -0
- data/test/transcripts/vijay_kumar-1.t1.html +557 -0
- data/test/transcripts/vijay_kumar-t1.html +557 -0
- data/test/transcripts/vijay_kumar-t1.ttml +569 -0
- data/test/transcripts/vijay_kumar.data.js +2 -0
- data/test/transcripts/wirehair-beetle.data.js +3 -0
- metadata +234 -0
@@ -0,0 +1,31 @@
|
|
1
|
+
# $Id: word.rb 182 2010-03-12 22:07:34Z pwilkins $
|
2
|
+
# Copyright (c) 2010 Massachusetts Institute of Technology
|
3
|
+
# see LICENSE.txt for license text
|
4
|
+
|
5
|
+
module SmTranscript
  # A single transcript word together with the start and end time it was
  # given. The times are stored exactly as passed in (no conversion is done
  # here); the word text is run through #apply_word_rules before storing.
  class Word
    # Lower-case proper nouns that must be capitalized in the output.
    PROPER_NOUNS = ['iranian'].freeze

    attr_reader :start_time
    attr_reader :end_time
    attr_reader :word

    # start_time:: start of the word, stored unchanged
    # end_time::   end of the word, stored unchanged
    # word::       the word text; English capitalization rules are applied
    def initialize(start_time, end_time, word)
      @start_time = start_time
      @end_time = end_time
      @word = apply_word_rules(word)
    end

    # Applies simple (English) capitalization rules to a lower-case word and
    # returns the result. Anything not covered by a rule (including nil) is
    # returned unchanged.
    def apply_word_rules(word)
      case word
      when "i"            # "i" appearing as a word is capitalized
        word.upcase
      when /\Ai'/         # "i" as personal pronoun in any contraction
        word.capitalize   # (i'm, i've, i'll, i'd, ...) is capitalized
      when *PROPER_NOUNS  # proper nouns are capitalized
        word.capitalize
      else
        word
      end
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# $Id: wrd_reader.rb 182 2010-03-12 22:07:34Z pwilkins $
|
2
|
+
# Copyright (c) 2010 Massachusetts Institute of Technology
|
3
|
+
# see LICENSE.txt for license text
|
4
|
+
|
5
|
+
require 'rubygems'
|
6
|
+
require 'extensions/kernel'
|
7
|
+
require_relative 'word'
|
8
|
+
|
9
|
+
module SmTranscript
  # Reads word entries from a line-oriented source and collects them as
  # SmTranscript::Word objects in #words. #metadata is always an empty hash
  # because no metadata is parsed.
  class WrdReader
    attr_reader :metadata
    attr_reader :words

    # Builds a reader from the file at +file_name+.
    #
    # The block form of File.open is used so the file handle is closed as
    # soon as parsing finishes (all reading happens inside #initialize).
    def self.from_file(file_name)
      File.open(file_name) { |src_file| new(src_file) }
    end

    # src_file:: any object whose #each yields the source one line (String)
    #            at a time, e.g. an open File
    def initialize(src_file)
      @metadata = {}
      @words = []
      parse_metadata
      parse_words(src_file)
    end

    def parse_metadata
      # there is currently no metadata in .wrd files
    end

    # Appends one Word to #words for every line of +src_file+ that matches
    # the expected layout; all other lines are ignored.
    def parse_words(src_file)
      src_file.each do |ln|
        # line is expected to contain two integers separated by a space,
        # followed by a space and one or more words. The words may contain
        # characters or an apostrophe
        ln.scan(/^\d* \d* [\w']* *[\w']*$/) do |t|
          # only the first word after the two numbers is kept
          start_time, end_time, word = t.split
          @words << SmTranscript::Word.new(start_time, end_time, word)
        end
      end
    end
  end
end
|
data/test/Rakefile
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# $Id: Rakefile 182 2010-03-12 22:07:34Z pwilkins $
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'rake'
|
5
|
+
require 'rake/testtask'
|
6
|
+
|
7
|
+
# Running rake with no task name runs the unit tests.
task(:default => [:test_units])

desc "Run basic tests"
Rake::TestTask.new("test_units") do |test_task|
  # pick up every file matching test_*.rb
  # (presumably relative to the directory rake is run from -- confirm)
  test_task.pattern = 'test_*.rb'
  # verbose output and Ruby warnings are both switched off
  test_task.verbose = false
  test_task.warning = false
end
|
@@ -0,0 +1,425 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<document fileName="/Users/mckinney/Developer/Projects/SpokenMedia/workspace/standalone/results/IIHS_Diane_Davis_Nov2009.seg">
|
3
|
+
<lecture title="NA" keywords="global">
|
4
|
+
<segment id="1" title="global">
|
5
|
+
11406 11500 oh
|
6
|
+
11500 11585 i
|
7
|
+
11585 11790 think
|
8
|
+
11790 12188 there're
|
9
|
+
12188 12584 several
|
10
|
+
12584 12951 critical
|
11
|
+
12951 13438 questions
|
12
|
+
13438 13560 of
|
13
|
+
13560 13850 ones
|
14
|
+
13850 13995 that
|
15
|
+
13995 14405 i
|
16
|
+
14405 14822 found
|
17
|
+
14822 14943 in
|
18
|
+
14943 15300 steady
|
19
|
+
15300 15745 most
|
20
|
+
15745 16559 myself
|
21
|
+
17214 17415 are
|
22
|
+
17415 17813 questions
|
23
|
+
17813 17937 of
|
24
|
+
17937 18630 insecurity
|
25
|
+
18630 18775 into
|
26
|
+
18775 19510 violence
|
27
|
+
19510 20055 i
|
28
|
+
20055 20445 see
|
29
|
+
20445 20853 those
|
30
|
+
20853 21200 and
|
31
|
+
21200 22070 some
|
32
|
+
22070 22655 pervasive
|
33
|
+
22655 22770 the
|
34
|
+
22770 23260 class
|
35
|
+
23260 23553 much
|
36
|
+
23553 23680 of
|
37
|
+
23680 23756 a
|
38
|
+
23756 24108 global
|
39
|
+
24108 24516 found
|
40
|
+
25147 25277 and
|
41
|
+
25277 25427 i
|
42
|
+
25427 25947 think
|
43
|
+
27342 27461 have
|
44
|
+
27461 27522 a
|
45
|
+
27522 27677 lot
|
46
|
+
27677 27781 to
|
47
|
+
27781 28032 do
|
48
|
+
28032 28357 with
|
49
|
+
28357 28422 her
|
50
|
+
28422 28722 and
|
51
|
+
28722 28967 devil
|
52
|
+
28967 29672 transition
|
53
|
+
29672 29963 that
|
54
|
+
29963 30217 in
|
55
|
+
30217 30286 the
|
56
|
+
30286 30762 developing
|
57
|
+
30762 30936 world
|
58
|
+
30936 31172 is
|
59
|
+
31172 31777 facebook
|
60
|
+
31777 31953 pull
|
61
|
+
31953 32034 the
|
62
|
+
32034 32287 climb
|
63
|
+
32287 32827 economic
|
64
|
+
32827 34027 globalization
|
65
|
+
34027 34312 never
|
66
|
+
34312 34862 flat
|
67
|
+
35122 35285 it's
|
68
|
+
35285 35350 a
|
69
|
+
35350 35685 fairly
|
70
|
+
35685 36235 standard
|
71
|
+
36235 36460 way
|
72
|
+
36460 36548 of
|
73
|
+
36548 37129 understanding
|
74
|
+
37129 37665 the
|
75
|
+
37665 37943 crime
|
76
|
+
37943 38040 and
|
77
|
+
38040 38570 violence
|
78
|
+
38570 38972 so
|
79
|
+
40637 40696 i
|
80
|
+
40696 40905 think
|
81
|
+
40905 41057 it's
|
82
|
+
41057 41432 import
|
83
|
+
41432 41472 and
|
84
|
+
41472 41705 because
|
85
|
+
41705 41787 it
|
86
|
+
41787 42172 means
|
87
|
+
42172 42302 is
|
88
|
+
42302 42687 that
|
89
|
+
42687 43083 problem
|
90
|
+
43083 43191 of
|
91
|
+
43191 43306 r
|
92
|
+
43306 43907 times
|
93
|
+
44811 44972 and
|
94
|
+
44972 45157 then
|
95
|
+
45157 45619 many
|
96
|
+
45619 46167 developing
|
97
|
+
46167 46903 countries
|
98
|
+
46903 47042 in
|
99
|
+
47042 47402 different
|
100
|
+
47402 47782 regions
|
101
|
+
47782 47892 of
|
102
|
+
47892 47982 the
|
103
|
+
47982 48317 world
|
104
|
+
48317 48421 are
|
105
|
+
48421 49423 facing
|
106
|
+
49423 50356 insecurity
|
107
|
+
50356 50582 and
|
108
|
+
50582 51492 violence
|
109
|
+
51766 52265 that
|
110
|
+
53319 53525 and
|
111
|
+
53525 53662 a
|
112
|
+
53662 53821 lot
|
113
|
+
53821 53961 to
|
114
|
+
53961 54096 do
|
115
|
+
54096 54267 with
|
116
|
+
54267 54436 the
|
117
|
+
54436 54916 changing
|
118
|
+
54916 55391 nature
|
119
|
+
55391 55656 of
|
120
|
+
55656 55938 an
|
121
|
+
55938 56182 urban
|
122
|
+
56182 56898 economies
|
123
|
+
56898 57071 and
|
124
|
+
57071 57214 how
|
125
|
+
57214 57635 they
|
126
|
+
57874 58346 linked
|
127
|
+
58346 58459 to
|
128
|
+
58459 58566 the
|
129
|
+
58566 59276 global
|
130
|
+
59276 59936 economy
|
131
|
+
59936 60151 so
|
132
|
+
60151 60263 and
|
133
|
+
60263 60685 that's
|
134
|
+
61248 61343 i
|
135
|
+
61343 61898 think
|
136
|
+
61898 63003 come
|
137
|
+
63003 63518 pervasive
|
138
|
+
63518 64288 challenge
|
139
|
+
67678 67836 is
|
140
|
+
67836 68090 that
|
141
|
+
68090 68239 is
|
142
|
+
68239 68457 one
|
143
|
+
68457 68557 and
|
144
|
+
68557 69049 two
|
145
|
+
69049 69146 are
|
146
|
+
69146 69451 three
|
147
|
+
69451 70386 specializations
|
148
|
+
70386 70526 that
|
149
|
+
70526 70776 when
|
150
|
+
70776 71121 means
|
151
|
+
71121 71427 in
|
152
|
+
71769 71906 and
|
153
|
+
71906 72691 interconnected
|
154
|
+
72691 73066 the
|
155
|
+
73066 73371 global
|
156
|
+
73371 73866 world
|
157
|
+
73866 74411 aware
|
158
|
+
74765 75695 technology
|
159
|
+
75695 75895 and
|
160
|
+
75895 76625 treat
|
161
|
+
76625 77568 endowment
|
162
|
+
78727 78947 of
|
163
|
+
78947 79777 communication
|
164
|
+
79777 80012 and
|
165
|
+
80012 80287 this
|
166
|
+
81408 81577 i
|
167
|
+
81577 81792 d
|
168
|
+
81792 82247 l
|
169
|
+
82247 83052 ideology
|
170
|
+
83052 83167 of
|
171
|
+
83167 83368 those
|
172
|
+
83368 83437 are
|
173
|
+
83437 83572 all
|
174
|
+
83572 84132 important
|
175
|
+
84132 84202 to
|
176
|
+
84875 85342 aspects
|
177
|
+
85342 85470 of
|
178
|
+
85470 85791 that
|
179
|
+
85791 85915 i'm
|
180
|
+
85915 86000 the
|
181
|
+
86000 86660 planning
|
182
|
+
86660 86930 to
|
183
|
+
86930 87295 kept
|
184
|
+
88377 88488 but
|
185
|
+
88488 88636 i
|
186
|
+
88636 88894 would
|
187
|
+
88894 89157 say
|
188
|
+
89157 89903 definitely
|
189
|
+
89903 90337 need
|
190
|
+
90337 90426 to
|
191
|
+
90426 91267 people
|
192
|
+
91267 91641 who
|
193
|
+
91641 92057 are
|
194
|
+
92739 92857 i
|
195
|
+
92857 93002 had
|
196
|
+
93002 93312 the
|
197
|
+
93312 93544 think
|
198
|
+
93544 93942 about
|
199
|
+
93942 94487 them
|
200
|
+
94487 94777 meaning
|
201
|
+
94777 95260 regulator
|
202
|
+
95260 95597 people
|
203
|
+
95597 95652 and
|
204
|
+
95652 95791 now
|
205
|
+
95791 95947 but
|
206
|
+
95947 96862 organizations
|
207
|
+
96862 97052 howell
|
208
|
+
97052 97782 organizations
|
209
|
+
97782 98102 function
|
210
|
+
98102 98397 because
|
211
|
+
98397 99367 organizations
|
212
|
+
100540 100751 there
|
213
|
+
100751 100916 it's
|
214
|
+
100916 100971 a
|
215
|
+
100971 101366 whole
|
216
|
+
101366 101991 literature
|
217
|
+
101991 102072 in
|
218
|
+
102072 102102 a
|
219
|
+
102102 102567 field
|
220
|
+
102567 102726 of
|
221
|
+
102726 103460 organizational
|
222
|
+
103460 103916 fury
|
223
|
+
103916 104143 there
|
224
|
+
104143 104306 would
|
225
|
+
104306 104456 be
|
226
|
+
104456 104869 important
|
227
|
+
104869 105451 understand
|
228
|
+
105451 105775 amiga
|
229
|
+
105775 106191 small
|
230
|
+
106191 106256 is
|
231
|
+
106256 106518 that
|
232
|
+
106518 106675 is
|
233
|
+
106675 106896 our
|
234
|
+
106896 107196 local
|
235
|
+
107196 107776 government
|
236
|
+
107776 107906 our
|
237
|
+
107906 108172 rap
|
238
|
+
108172 109141 community
|
239
|
+
109141 109511 iranian
|
240
|
+
109511 109691 g
|
241
|
+
109691 109964 l
|
242
|
+
110415 110638 and
|
243
|
+
110638 111084 something
|
244
|
+
111084 111284 as
|
245
|
+
111284 111677 large
|
246
|
+
111677 111939 as
|
247
|
+
112581 112952 steak
|
248
|
+
112952 113167 ever
|
249
|
+
113167 113497 meant
|
250
|
+
113497 113662 and
|
251
|
+
113662 114072 national
|
252
|
+
114072 114836 government
|
253
|
+
114836 115485 maybe
|
254
|
+
115485 116278 at
|
255
|
+
116278 116382 the
|
256
|
+
116382 116602 u
|
257
|
+
116602 116847 n
|
258
|
+
116847 116952 r
|
259
|
+
116952 117042 a
|
260
|
+
117042 117492 global
|
261
|
+
117492 118022 agency
|
262
|
+
118022 118377 news
|
263
|
+
118377 119171 activities
|
264
|
+
119584 120310 influence
|
265
|
+
121832 121927 and
|
266
|
+
121927 122050 the
|
267
|
+
122050 122358 small
|
268
|
+
122358 122659 scale
|
269
|
+
122659 122781 so
|
270
|
+
122781 122876 i
|
271
|
+
122876 123140 think
|
272
|
+
123140 124361 organization
|
273
|
+
124361 125033 analysis
|
274
|
+
125033 125141 is
|
275
|
+
125141 125221 a
|
276
|
+
125221 125461 very
|
277
|
+
125461 126021 important
|
278
|
+
126021 126274 okay
|
279
|
+
129718 130024 line
|
280
|
+
130024 130126 is
|
281
|
+
130126 130540 lambda
|
282
|
+
130540 130728 t
|
283
|
+
130728 131096 v
|
284
|
+
131096 131291 for
|
285
|
+
131291 131866 television
|
286
|
+
131866 132116 the
|
287
|
+
132116 132381 only
|
288
|
+
132381 132631 about
|
289
|
+
132631 132964 it
|
290
|
+
132964 133061 i
|
291
|
+
133061 133423 think
|
292
|
+
133423 133791 and
|
293
|
+
133791 134086 there's
|
294
|
+
134086 134285 a
|
295
|
+
134285 134496 lot
|
296
|
+
134496 134596 of
|
297
|
+
134596 134764 x
|
298
|
+
134764 135074 segment
|
299
|
+
135074 135361 about
|
300
|
+
135361 135671 that
|
301
|
+
135748 135839 and
|
302
|
+
135839 135967 i
|
303
|
+
135967 136207 didn't
|
304
|
+
136207 136422 think
|
305
|
+
136422 136547 that
|
306
|
+
136547 136787 that
|
307
|
+
136787 137235 eminently
|
308
|
+
137235 137311 to
|
309
|
+
137311 137413 the
|
310
|
+
137413 137907 challenges
|
311
|
+
137907 138027 the
|
312
|
+
138027 138277 town
|
313
|
+
138277 138432 and
|
314
|
+
138432 138543 it's
|
315
|
+
138543 139365 hard
|
316
|
+
139365 139918 are
|
317
|
+
139918 140118 how
|
318
|
+
140118 140182 to
|
319
|
+
140182 140442 get
|
320
|
+
140442 140717 there
|
321
|
+
141033 141442 yeah
|
322
|
+
141442 141657 so
|
323
|
+
141657 141795 in
|
324
|
+
141795 142041 what
|
325
|
+
142041 142173 are
|
326
|
+
142173 142338 the
|
327
|
+
142338 143080 strategic
|
328
|
+
143080 143260 aims
|
329
|
+
143260 143662 least
|
330
|
+
143662 143920 okay
|
331
|
+
143920 144155 many
|
332
|
+
144155 144265 here
|
333
|
+
144265 144395 and
|
334
|
+
144395 144605 this
|
335
|
+
144605 144840 meeting
|
336
|
+
144840 145385 whatever
|
337
|
+
145385 145556 to
|
338
|
+
145556 145740 dick
|
339
|
+
145740 146073 steps
|
340
|
+
146073 146390 any
|
341
|
+
146390 146485 to
|
342
|
+
146485 146645 be
|
343
|
+
146645 147000 take
|
344
|
+
147000 147620 ten
|
345
|
+
147879 148133 where
|
346
|
+
148133 148284 you
|
347
|
+
148284 148642 can
|
348
|
+
148642 149163 accomplish
|
349
|
+
149163 149507 something
|
350
|
+
149507 149588 in
|
351
|
+
149588 149689 the
|
352
|
+
149689 149988 short
|
353
|
+
149988 150238 term
|
354
|
+
150238 150520 right
|
355
|
+
150520 150943 away
|
356
|
+
151340 151574 but
|
357
|
+
151574 152461 also
|
358
|
+
152461 152600 you
|
359
|
+
152600 152825 know
|
360
|
+
152825 152892 the
|
361
|
+
152892 153045 the
|
362
|
+
153045 153127 you're
|
363
|
+
153127 153295 not
|
364
|
+
153295 153422 going
|
365
|
+
153422 153550 to
|
366
|
+
153550 154016 accomplish
|
367
|
+
154016 154395 everything
|
368
|
+
154395 154543 that
|
369
|
+
154543 154725 you're
|
370
|
+
154725 155005 leading
|
371
|
+
155005 155193 out
|
372
|
+
155193 155360 in
|
373
|
+
155360 155564 year
|
374
|
+
155564 155815 vision
|
375
|
+
155815 156135 and
|
376
|
+
156323 156847 so
|
377
|
+
157318 157455 you
|
378
|
+
157455 157735 want
|
379
|
+
157735 157856 to
|
380
|
+
157856 158175 make
|
381
|
+
158175 159070 progress
|
382
|
+
159725 160753 continually
|
383
|
+
160753 161005 every
|
384
|
+
161005 161499 states
|
385
|
+
161499 161776 has
|
386
|
+
161776 161800 a
|
387
|
+
161800 162055 has
|
388
|
+
162055 162315 some
|
389
|
+
162315 162402 of
|
390
|
+
162402 162475 the
|
391
|
+
162475 162817 small
|
392
|
+
162817 163585 success
|
393
|
+
163795 164185 but
|
394
|
+
164185 164390 for
|
395
|
+
164390 164432 a
|
396
|
+
164432 164650 need
|
397
|
+
164650 164725 for
|
398
|
+
164725 164865 its
|
399
|
+
164865 165250 allies
|
400
|
+
165250 165680 success
|
401
|
+
165680 166000 because
|
402
|
+
166000 166135 you
|
403
|
+
166135 166935 can't
|
404
|
+
166935 167295 put
|
405
|
+
167295 167413 an
|
406
|
+
167413 168470 institution
|
407
|
+
168906 169036 it's
|
408
|
+
169036 169135 and
|
409
|
+
169135 169483 develop
|
410
|
+
169483 169540 an
|
411
|
+
169540 170090 institution
|
412
|
+
170090 170270 like
|
413
|
+
170270 170539 to
|
414
|
+
170539 171150 emulate
|
415
|
+
171769 171942 your
|
416
|
+
171942 172137 have
|
417
|
+
172137 172265 in
|
418
|
+
172265 173159 mind
|
419
|
+
173159 173270 of
|
420
|
+
173270 173339 the
|
421
|
+
173339 173770 night
|
422
|
+
</segment>
|
423
|
+
</lecture>
|
424
|
+
</document>
|
425
|
+
|