sm-transcript 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. data/LICENSE.txt +23 -0
  2. data/README.txt +140 -0
  3. data/Rakefile +31 -0
  4. data/bin/results/PLACEHOLDER.txt +8 -0
  5. data/bin/sm-transcript +12 -0
  6. data/bin/transcripts/PLACEHOLDER.txt +8 -0
  7. data/lib/sm_transcript/LICENSE.txt +23 -0
  8. data/lib/sm_transcript/metadata.rb +69 -0
  9. data/lib/sm_transcript/metadata_reader.rb +56 -0
  10. data/lib/sm_transcript/options.rb +89 -0
  11. data/lib/sm_transcript/optparseExample.rb +113 -0
  12. data/lib/sm_transcript/process_csv_files_to_html.rb +58 -0
  13. data/lib/sm_transcript/process_seg_files.rb +21 -0
  14. data/lib/sm_transcript/process_seg_files_to_csv.rb +24 -0
  15. data/lib/sm_transcript/process_seg_files_to_html.rb +31 -0
  16. data/lib/sm_transcript/require_relative.rb +14 -0
  17. data/lib/sm_transcript/runner.rb +70 -0
  18. data/lib/sm_transcript/seg_reader.rb +42 -0
  19. data/lib/sm_transcript/transcript.rb +130 -0
  20. data/lib/sm_transcript/word.rb +31 -0
  21. data/lib/sm_transcript/wrd_reader.rb +42 -0
  22. data/test/Rakefile +14 -0
  23. data/test/results/IIHS_Diane_Davis_Nov2009.seg +425 -0
  24. data/test/results/NERCOMP-SpokenMedia4.wrd +6791 -0
  25. data/test/results/PLACEHOLDER.txt +8 -0
  26. data/test/results/PLACEHOLDER.txt.ignore +8 -0
  27. data/test/results/vijay_kumar.wrd +1675 -0
  28. data/test/results/wirehair-beetle.txt +6 -0
  29. data/test/test_metadata.rb +39 -0
  30. data/test/test_metadatareader.rb +30 -0
  31. data/test/test_options.rb +47 -0
  32. data/test/test_runner.rb +52 -0
  33. data/test/test_segreader.rb +39 -0
  34. data/test/test_transcript.rb +62 -0
  35. data/test/test_wrdreader.rb +43 -0
  36. data/test/transcripts/IIHS_Diane_Davis_Nov2009-t1.html +148 -0
  37. data/test/transcripts/PLACEHOLDER.txt +8 -0
  38. data/test/transcripts/data.js +24 -0
  39. data/test/transcripts/vijay_kumar-1.-t1.html +557 -0
  40. data/test/transcripts/vijay_kumar-1.t1.html +557 -0
  41. data/test/transcripts/vijay_kumar-t1.html +557 -0
  42. data/test/transcripts/vijay_kumar-t1.ttml +569 -0
  43. data/test/transcripts/vijay_kumar.data.js +2 -0
  44. data/test/transcripts/wirehair-beetle.data.js +3 -0
  45. metadata +234 -0
@@ -0,0 +1,31 @@
1
+ # $Id: word.rb 182 2010-03-12 22:07:34Z pwilkins $
2
+ # Copyright (c) 2010 Massachusetts Institute of Technology
3
+ # see LICENSE.txt for license text
4
+
5
module SmTranscript
  # A single transcript word together with the start and end time it was
  # given. The times are stored exactly as supplied by the caller; the word
  # text is passed through #apply_word_rules before it is stored.
  class Word
    attr_reader :start_time, :end_time, :word

    # start_time:: time at which the word begins (stored untouched)
    # end_time::   time at which the word ends (stored untouched)
    # word::       the raw, lower-case word text
    def initialize( start_time, end_time, word )
      @start_time = start_time
      @end_time = end_time
      @word = apply_word_rules(word)
    end

    # Applies (English) capitalization rules to a single raw word and
    # returns the result. Anything that matches no rule, including nil,
    # is returned unchanged.
    def apply_word_rules(word)
      case word
      when "i"  # (English) "i" appearing as a word is capitalized
        word.upcase
      when /\Ai'[a-z]+\z/ # (English) "i" as personal pronoun is capitalized
        # covers every first-person contraction: i'm, i've, i'll, i'd, ...
        word.capitalize
      when "iranian" # (English) proper nouns are capitalized
        word.capitalize
      else
        word
      end
    end

  end
end
@@ -0,0 +1,42 @@
1
+ # $Id: wrd_reader.rb 182 2010-03-12 22:07:34Z pwilkins $
2
+ # Copyright (c) 2010 Massachusetts Institute of Technology
3
+ # see LICENSE.txt for license text
4
+
5
+ require 'rubygems'
6
+ require 'extensions/kernel'
7
+ require_relative 'word'
8
+
9
module SmTranscript
  # Reads a .wrd word file and exposes its contents as a list of
  # SmTranscript::Word objects (#words) plus a metadata hash (#metadata),
  # which is always empty because .wrd files carry no metadata.
  class WrdReader
    attr_reader :metadata
    attr_reader :words

    # Builds a reader from the file at +file_name+.
    #
    # The block form of File.open is used so the handle is closed as soon
    # as the constructor has consumed the file, instead of being left open
    # until garbage collection.
    def self.from_file(file_name)
      File.open(file_name) { |src_file| new(src_file) }
    end

    # src_file:: any object whose #each yields the lines of a .wrd file
    #            (only #each is called on it). It is read completely
    #            during construction.
    def initialize(src_file)
      @metadata = {}
      @words = []
      parse_metadata()
      parse_words(src_file)
    end

    def parse_metadata()
      # there is currently no metadata in .wrd files
    end

    # Appends one Word to @words for every line of +src_file+ that has the
    # expected shape; lines that do not match are skipped silently.
    def parse_words(src_file)
      src_file.each do |ln|
        # line is expected to contain two integers separated by a space,
        # followed by a space and one or two words. The words may contain
        # word characters or an apostrophe.
        ln.scan(/^\d* \d* [\w']* *[\w']*$/) do |t|
          arr = t.split
          # The two times are passed on as strings, exactly as read. Only
          # the first word after the times is kept; a second word on the
          # same line is matched by the pattern but not stored.
          @words << SmTranscript::Word.new(arr[0], arr[1], arr[2])
        end
      end
    end
  end
end
@@ -0,0 +1,14 @@
1
+ # $Id: Rakefile 182 2010-03-12 22:07:34Z pwilkins $
2
+
3
+ require 'rubygems'
4
+ require 'rake'
5
+ require 'rake/testtask'
6
+
7
# Running `rake` with no task name runs the unit tests.
task :default => [:test_units]

desc "Run basic tests"
Rake::TestTask.new("test_units") do |test_task|
  # Glob of test files to run; presumably resolved relative to the
  # directory rake is invoked from - confirm against Rake::TestTask docs.
  test_task.pattern = 'test_*.rb'
  # Verbose output and the warning option are both switched off.
  test_task.verbose = false
  test_task.warning = false
end
@@ -0,0 +1,425 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <document fileName="/Users/mckinney/Developer/Projects/SpokenMedia/workspace/standalone/results/IIHS_Diane_Davis_Nov2009.seg">
3
+ <lecture title="NA" keywords="global">
4
+ <segment id="1" title="global">
5
+ 11406 11500 oh
6
+ 11500 11585 i
7
+ 11585 11790 think
8
+ 11790 12188 there're
9
+ 12188 12584 several
10
+ 12584 12951 critical
11
+ 12951 13438 questions
12
+ 13438 13560 of
13
+ 13560 13850 ones
14
+ 13850 13995 that
15
+ 13995 14405 i
16
+ 14405 14822 found
17
+ 14822 14943 in
18
+ 14943 15300 steady
19
+ 15300 15745 most
20
+ 15745 16559 myself
21
+ 17214 17415 are
22
+ 17415 17813 questions
23
+ 17813 17937 of
24
+ 17937 18630 insecurity
25
+ 18630 18775 into
26
+ 18775 19510 violence
27
+ 19510 20055 i
28
+ 20055 20445 see
29
+ 20445 20853 those
30
+ 20853 21200 and
31
+ 21200 22070 some
32
+ 22070 22655 pervasive
33
+ 22655 22770 the
34
+ 22770 23260 class
35
+ 23260 23553 much
36
+ 23553 23680 of
37
+ 23680 23756 a
38
+ 23756 24108 global
39
+ 24108 24516 found
40
+ 25147 25277 and
41
+ 25277 25427 i
42
+ 25427 25947 think
43
+ 27342 27461 have
44
+ 27461 27522 a
45
+ 27522 27677 lot
46
+ 27677 27781 to
47
+ 27781 28032 do
48
+ 28032 28357 with
49
+ 28357 28422 her
50
+ 28422 28722 and
51
+ 28722 28967 devil
52
+ 28967 29672 transition
53
+ 29672 29963 that
54
+ 29963 30217 in
55
+ 30217 30286 the
56
+ 30286 30762 developing
57
+ 30762 30936 world
58
+ 30936 31172 is
59
+ 31172 31777 facebook
60
+ 31777 31953 pull
61
+ 31953 32034 the
62
+ 32034 32287 climb
63
+ 32287 32827 economic
64
+ 32827 34027 globalization
65
+ 34027 34312 never
66
+ 34312 34862 flat
67
+ 35122 35285 it's
68
+ 35285 35350 a
69
+ 35350 35685 fairly
70
+ 35685 36235 standard
71
+ 36235 36460 way
72
+ 36460 36548 of
73
+ 36548 37129 understanding
74
+ 37129 37665 the
75
+ 37665 37943 crime
76
+ 37943 38040 and
77
+ 38040 38570 violence
78
+ 38570 38972 so
79
+ 40637 40696 i
80
+ 40696 40905 think
81
+ 40905 41057 it's
82
+ 41057 41432 import
83
+ 41432 41472 and
84
+ 41472 41705 because
85
+ 41705 41787 it
86
+ 41787 42172 means
87
+ 42172 42302 is
88
+ 42302 42687 that
89
+ 42687 43083 problem
90
+ 43083 43191 of
91
+ 43191 43306 r
92
+ 43306 43907 times
93
+ 44811 44972 and
94
+ 44972 45157 then
95
+ 45157 45619 many
96
+ 45619 46167 developing
97
+ 46167 46903 countries
98
+ 46903 47042 in
99
+ 47042 47402 different
100
+ 47402 47782 regions
101
+ 47782 47892 of
102
+ 47892 47982 the
103
+ 47982 48317 world
104
+ 48317 48421 are
105
+ 48421 49423 facing
106
+ 49423 50356 insecurity
107
+ 50356 50582 and
108
+ 50582 51492 violence
109
+ 51766 52265 that
110
+ 53319 53525 and
111
+ 53525 53662 a
112
+ 53662 53821 lot
113
+ 53821 53961 to
114
+ 53961 54096 do
115
+ 54096 54267 with
116
+ 54267 54436 the
117
+ 54436 54916 changing
118
+ 54916 55391 nature
119
+ 55391 55656 of
120
+ 55656 55938 an
121
+ 55938 56182 urban
122
+ 56182 56898 economies
123
+ 56898 57071 and
124
+ 57071 57214 how
125
+ 57214 57635 they
126
+ 57874 58346 linked
127
+ 58346 58459 to
128
+ 58459 58566 the
129
+ 58566 59276 global
130
+ 59276 59936 economy
131
+ 59936 60151 so
132
+ 60151 60263 and
133
+ 60263 60685 that's
134
+ 61248 61343 i
135
+ 61343 61898 think
136
+ 61898 63003 come
137
+ 63003 63518 pervasive
138
+ 63518 64288 challenge
139
+ 67678 67836 is
140
+ 67836 68090 that
141
+ 68090 68239 is
142
+ 68239 68457 one
143
+ 68457 68557 and
144
+ 68557 69049 two
145
+ 69049 69146 are
146
+ 69146 69451 three
147
+ 69451 70386 specializations
148
+ 70386 70526 that
149
+ 70526 70776 when
150
+ 70776 71121 means
151
+ 71121 71427 in
152
+ 71769 71906 and
153
+ 71906 72691 interconnected
154
+ 72691 73066 the
155
+ 73066 73371 global
156
+ 73371 73866 world
157
+ 73866 74411 aware
158
+ 74765 75695 technology
159
+ 75695 75895 and
160
+ 75895 76625 treat
161
+ 76625 77568 endowment
162
+ 78727 78947 of
163
+ 78947 79777 communication
164
+ 79777 80012 and
165
+ 80012 80287 this
166
+ 81408 81577 i
167
+ 81577 81792 d
168
+ 81792 82247 l
169
+ 82247 83052 ideology
170
+ 83052 83167 of
171
+ 83167 83368 those
172
+ 83368 83437 are
173
+ 83437 83572 all
174
+ 83572 84132 important
175
+ 84132 84202 to
176
+ 84875 85342 aspects
177
+ 85342 85470 of
178
+ 85470 85791 that
179
+ 85791 85915 i'm
180
+ 85915 86000 the
181
+ 86000 86660 planning
182
+ 86660 86930 to
183
+ 86930 87295 kept
184
+ 88377 88488 but
185
+ 88488 88636 i
186
+ 88636 88894 would
187
+ 88894 89157 say
188
+ 89157 89903 definitely
189
+ 89903 90337 need
190
+ 90337 90426 to
191
+ 90426 91267 people
192
+ 91267 91641 who
193
+ 91641 92057 are
194
+ 92739 92857 i
195
+ 92857 93002 had
196
+ 93002 93312 the
197
+ 93312 93544 think
198
+ 93544 93942 about
199
+ 93942 94487 them
200
+ 94487 94777 meaning
201
+ 94777 95260 regulator
202
+ 95260 95597 people
203
+ 95597 95652 and
204
+ 95652 95791 now
205
+ 95791 95947 but
206
+ 95947 96862 organizations
207
+ 96862 97052 howell
208
+ 97052 97782 organizations
209
+ 97782 98102 function
210
+ 98102 98397 because
211
+ 98397 99367 organizations
212
+ 100540 100751 there
213
+ 100751 100916 it's
214
+ 100916 100971 a
215
+ 100971 101366 whole
216
+ 101366 101991 literature
217
+ 101991 102072 in
218
+ 102072 102102 a
219
+ 102102 102567 field
220
+ 102567 102726 of
221
+ 102726 103460 organizational
222
+ 103460 103916 fury
223
+ 103916 104143 there
224
+ 104143 104306 would
225
+ 104306 104456 be
226
+ 104456 104869 important
227
+ 104869 105451 understand
228
+ 105451 105775 amiga
229
+ 105775 106191 small
230
+ 106191 106256 is
231
+ 106256 106518 that
232
+ 106518 106675 is
233
+ 106675 106896 our
234
+ 106896 107196 local
235
+ 107196 107776 government
236
+ 107776 107906 our
237
+ 107906 108172 rap
238
+ 108172 109141 community
239
+ 109141 109511 iranian
240
+ 109511 109691 g
241
+ 109691 109964 l
242
+ 110415 110638 and
243
+ 110638 111084 something
244
+ 111084 111284 as
245
+ 111284 111677 large
246
+ 111677 111939 as
247
+ 112581 112952 steak
248
+ 112952 113167 ever
249
+ 113167 113497 meant
250
+ 113497 113662 and
251
+ 113662 114072 national
252
+ 114072 114836 government
253
+ 114836 115485 maybe
254
+ 115485 116278 at
255
+ 116278 116382 the
256
+ 116382 116602 u
257
+ 116602 116847 n
258
+ 116847 116952 r
259
+ 116952 117042 a
260
+ 117042 117492 global
261
+ 117492 118022 agency
262
+ 118022 118377 news
263
+ 118377 119171 activities
264
+ 119584 120310 influence
265
+ 121832 121927 and
266
+ 121927 122050 the
267
+ 122050 122358 small
268
+ 122358 122659 scale
269
+ 122659 122781 so
270
+ 122781 122876 i
271
+ 122876 123140 think
272
+ 123140 124361 organization
273
+ 124361 125033 analysis
274
+ 125033 125141 is
275
+ 125141 125221 a
276
+ 125221 125461 very
277
+ 125461 126021 important
278
+ 126021 126274 okay
279
+ 129718 130024 line
280
+ 130024 130126 is
281
+ 130126 130540 lambda
282
+ 130540 130728 t
283
+ 130728 131096 v
284
+ 131096 131291 for
285
+ 131291 131866 television
286
+ 131866 132116 the
287
+ 132116 132381 only
288
+ 132381 132631 about
289
+ 132631 132964 it
290
+ 132964 133061 i
291
+ 133061 133423 think
292
+ 133423 133791 and
293
+ 133791 134086 there's
294
+ 134086 134285 a
295
+ 134285 134496 lot
296
+ 134496 134596 of
297
+ 134596 134764 x
298
+ 134764 135074 segment
299
+ 135074 135361 about
300
+ 135361 135671 that
301
+ 135748 135839 and
302
+ 135839 135967 i
303
+ 135967 136207 didn't
304
+ 136207 136422 think
305
+ 136422 136547 that
306
+ 136547 136787 that
307
+ 136787 137235 eminently
308
+ 137235 137311 to
309
+ 137311 137413 the
310
+ 137413 137907 challenges
311
+ 137907 138027 the
312
+ 138027 138277 town
313
+ 138277 138432 and
314
+ 138432 138543 it's
315
+ 138543 139365 hard
316
+ 139365 139918 are
317
+ 139918 140118 how
318
+ 140118 140182 to
319
+ 140182 140442 get
320
+ 140442 140717 there
321
+ 141033 141442 yeah
322
+ 141442 141657 so
323
+ 141657 141795 in
324
+ 141795 142041 what
325
+ 142041 142173 are
326
+ 142173 142338 the
327
+ 142338 143080 strategic
328
+ 143080 143260 aims
329
+ 143260 143662 least
330
+ 143662 143920 okay
331
+ 143920 144155 many
332
+ 144155 144265 here
333
+ 144265 144395 and
334
+ 144395 144605 this
335
+ 144605 144840 meeting
336
+ 144840 145385 whatever
337
+ 145385 145556 to
338
+ 145556 145740 dick
339
+ 145740 146073 steps
340
+ 146073 146390 any
341
+ 146390 146485 to
342
+ 146485 146645 be
343
+ 146645 147000 take
344
+ 147000 147620 ten
345
+ 147879 148133 where
346
+ 148133 148284 you
347
+ 148284 148642 can
348
+ 148642 149163 accomplish
349
+ 149163 149507 something
350
+ 149507 149588 in
351
+ 149588 149689 the
352
+ 149689 149988 short
353
+ 149988 150238 term
354
+ 150238 150520 right
355
+ 150520 150943 away
356
+ 151340 151574 but
357
+ 151574 152461 also
358
+ 152461 152600 you
359
+ 152600 152825 know
360
+ 152825 152892 the
361
+ 152892 153045 the
362
+ 153045 153127 you're
363
+ 153127 153295 not
364
+ 153295 153422 going
365
+ 153422 153550 to
366
+ 153550 154016 accomplish
367
+ 154016 154395 everything
368
+ 154395 154543 that
369
+ 154543 154725 you're
370
+ 154725 155005 leading
371
+ 155005 155193 out
372
+ 155193 155360 in
373
+ 155360 155564 year
374
+ 155564 155815 vision
375
+ 155815 156135 and
376
+ 156323 156847 so
377
+ 157318 157455 you
378
+ 157455 157735 want
379
+ 157735 157856 to
380
+ 157856 158175 make
381
+ 158175 159070 progress
382
+ 159725 160753 continually
383
+ 160753 161005 every
384
+ 161005 161499 states
385
+ 161499 161776 has
386
+ 161776 161800 a
387
+ 161800 162055 has
388
+ 162055 162315 some
389
+ 162315 162402 of
390
+ 162402 162475 the
391
+ 162475 162817 small
392
+ 162817 163585 success
393
+ 163795 164185 but
394
+ 164185 164390 for
395
+ 164390 164432 a
396
+ 164432 164650 need
397
+ 164650 164725 for
398
+ 164725 164865 its
399
+ 164865 165250 allies
400
+ 165250 165680 success
401
+ 165680 166000 because
402
+ 166000 166135 you
403
+ 166135 166935 can't
404
+ 166935 167295 put
405
+ 167295 167413 an
406
+ 167413 168470 institution
407
+ 168906 169036 it's
408
+ 169036 169135 and
409
+ 169135 169483 develop
410
+ 169483 169540 an
411
+ 169540 170090 institution
412
+ 170090 170270 like
413
+ 170270 170539 to
414
+ 170539 171150 emulate
415
+ 171769 171942 your
416
+ 171942 172137 have
417
+ 172137 172265 in
418
+ 172265 173159 mind
419
+ 173159 173270 of
420
+ 173270 173339 the
421
+ 173339 173770 night
422
+ </segment>
423
+ </lecture>
424
+ </document>
425
+