sm-transcript 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. data/LICENSE.txt +23 -0
  2. data/README.txt +140 -0
  3. data/Rakefile +31 -0
  4. data/bin/results/PLACEHOLDER.txt +8 -0
  5. data/bin/sm-transcript +12 -0
  6. data/bin/transcripts/PLACEHOLDER.txt +8 -0
  7. data/lib/sm_transcript/LICENSE.txt +23 -0
  8. data/lib/sm_transcript/metadata.rb +69 -0
  9. data/lib/sm_transcript/metadata_reader.rb +56 -0
  10. data/lib/sm_transcript/options.rb +89 -0
  11. data/lib/sm_transcript/optparseExample.rb +113 -0
  12. data/lib/sm_transcript/process_csv_files_to_html.rb +58 -0
  13. data/lib/sm_transcript/process_seg_files.rb +21 -0
  14. data/lib/sm_transcript/process_seg_files_to_csv.rb +24 -0
  15. data/lib/sm_transcript/process_seg_files_to_html.rb +31 -0
  16. data/lib/sm_transcript/require_relative.rb +14 -0
  17. data/lib/sm_transcript/runner.rb +70 -0
  18. data/lib/sm_transcript/seg_reader.rb +42 -0
  19. data/lib/sm_transcript/transcript.rb +130 -0
  20. data/lib/sm_transcript/word.rb +31 -0
  21. data/lib/sm_transcript/wrd_reader.rb +42 -0
  22. data/test/Rakefile +14 -0
  23. data/test/results/IIHS_Diane_Davis_Nov2009.seg +425 -0
  24. data/test/results/NERCOMP-SpokenMedia4.wrd +6791 -0
  25. data/test/results/PLACEHOLDER.txt +8 -0
  26. data/test/results/PLACEHOLDER.txt.ignore +8 -0
  27. data/test/results/vijay_kumar.wrd +1675 -0
  28. data/test/results/wirehair-beetle.txt +6 -0
  29. data/test/test_metadata.rb +39 -0
  30. data/test/test_metadatareader.rb +30 -0
  31. data/test/test_options.rb +47 -0
  32. data/test/test_runner.rb +52 -0
  33. data/test/test_segreader.rb +39 -0
  34. data/test/test_transcript.rb +62 -0
  35. data/test/test_wrdreader.rb +43 -0
  36. data/test/transcripts/IIHS_Diane_Davis_Nov2009-t1.html +148 -0
  37. data/test/transcripts/PLACEHOLDER.txt +8 -0
  38. data/test/transcripts/data.js +24 -0
  39. data/test/transcripts/vijay_kumar-1.-t1.html +557 -0
  40. data/test/transcripts/vijay_kumar-1.t1.html +557 -0
  41. data/test/transcripts/vijay_kumar-t1.html +557 -0
  42. data/test/transcripts/vijay_kumar-t1.ttml +569 -0
  43. data/test/transcripts/vijay_kumar.data.js +2 -0
  44. data/test/transcripts/wirehair-beetle.data.js +3 -0
  45. metadata +234 -0
@@ -0,0 +1,31 @@
1
+ # $Id: word.rb 182 2010-03-12 22:07:34Z pwilkins $
2
+ # Copyright (c) 2010 Massachusetts Institute of Technology
3
+ # see LICENSE.txt for license text
4
+
5
module SmTranscript
  # A single transcribed word together with the start and end time values it
  # was constructed with. The time values are stored exactly as passed in;
  # the word text is run through apply_word_rules before being stored.
  class Word
    # (English) forms of the personal pronoun "I" that must be capitalized.
    # Compared against the word exactly, so only all-lower-case input matches.
    FIRST_PERSON_FORMS = ["i", "i'm", "i've", "i'll", "i'd"].freeze

    # (English) proper nouns that must be capitalized.
    PROPER_NOUNS = ["iranian"].freeze

    attr_reader :start_time
    attr_reader :end_time
    attr_reader :word

    # start_time:: value marking where the word begins (kept unmodified)
    # end_time::   value marking where the word ends (kept unmodified)
    # word::       the word text; capitalization rules are applied to it
    def initialize(start_time, end_time, word)
      @start_time = start_time
      @end_time = end_time
      @word = apply_word_rules(word)
    end

    # Returns a capitalized copy of +word+ when it is one of the known
    # lower-case forms listed in FIRST_PERSON_FORMS or PROPER_NOUNS.
    # Anything else (including nil) is returned unchanged.
    def apply_word_rules(word)
      case word
      when *FIRST_PERSON_FORMS # "i" -> "I", "i'm" -> "I'm", "i'll" -> "I'll"
        word.capitalize
      when *PROPER_NOUNS       # "iranian" -> "Iranian"
        word.capitalize
      else
        word
      end
    end
  end
end
@@ -0,0 +1,42 @@
1
+ # $Id: wrd_reader.rb 182 2010-03-12 22:07:34Z pwilkins $
2
+ # Copyright (c) 2010 Massachusetts Institute of Technology
3
+ # see LICENSE.txt for license text
4
+
5
+ require 'rubygems'
6
+ require 'extensions/kernel'
7
+ require_relative 'word'
8
+
9
module SmTranscript
  # Reads word-timing lines ("<integer> <integer> <word>") from a line-oriented
  # source and collects them as SmTranscript::Word objects in #words.
  class WrdReader
    # A usable line is two integers and a word, each separated by a single
    # space; an optional second word may follow. All three leading fields are
    # required so that lines with a missing field are skipped instead of
    # yielding a Word with shifted or nil values.
    WORD_LINE = /^\d+ \d+ [\w']+ *[\w']*$/

    attr_reader :metadata
    attr_reader :words

    # Builds a reader from the file at +file_name+. The block form of
    # File.open closes the file once parsing has finished, whether or not an
    # error is raised while reading.
    def self.from_file(file_name)
      File.open(file_name) { |src_file| new(src_file) }
    end

    # src_file:: any object whose #each yields lines of text
    #            (an open File, a StringIO, ...)
    def initialize(src_file)
      @metadata = {}
      @words = []
      parse_metadata
      parse_words(src_file)
    end

    # there is currently no metadata in .wrd files, so @metadata stays empty
    def parse_metadata
    end

    # Appends one Word to @words for every line of +src_file+ that matches
    # WORD_LINE; all other lines are ignored. Only the first word on a line
    # is kept, even when the line carries a second one.
    def parse_words(src_file)
      src_file.each do |ln|
        ln.scan(WORD_LINE) do |matched|
          start_time, end_time, text = matched.split
          @words << SmTranscript::Word.new(start_time, end_time, text)
        end
      end
    end
  end
end
@@ -0,0 +1,14 @@
1
+ # $Id: Rakefile 182 2010-03-12 22:07:34Z pwilkins $
2
+
3
+ require 'rubygems'
4
+ require 'rake'
5
+ require 'rake/testtask'
6
+
7
# Invoking rake without a task name runs the unit tests.
task :default => [:test_units]

desc "Run basic tests"
Rake::TestTask.new("test_units") do |test_task|
  # glob selecting the test files to run
  test_task.pattern = 'test_*.rb'
  # keep the run quiet: no Ruby warnings, no verbose output
  test_task.warning = false
  test_task.verbose = false
end
@@ -0,0 +1,425 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <document fileName="/Users/mckinney/Developer/Projects/SpokenMedia/workspace/standalone/results/IIHS_Diane_Davis_Nov2009.seg">
3
+ <lecture title="NA" keywords="global">
4
+ <segment id="1" title="global">
5
+ 11406 11500 oh
6
+ 11500 11585 i
7
+ 11585 11790 think
8
+ 11790 12188 there're
9
+ 12188 12584 several
10
+ 12584 12951 critical
11
+ 12951 13438 questions
12
+ 13438 13560 of
13
+ 13560 13850 ones
14
+ 13850 13995 that
15
+ 13995 14405 i
16
+ 14405 14822 found
17
+ 14822 14943 in
18
+ 14943 15300 steady
19
+ 15300 15745 most
20
+ 15745 16559 myself
21
+ 17214 17415 are
22
+ 17415 17813 questions
23
+ 17813 17937 of
24
+ 17937 18630 insecurity
25
+ 18630 18775 into
26
+ 18775 19510 violence
27
+ 19510 20055 i
28
+ 20055 20445 see
29
+ 20445 20853 those
30
+ 20853 21200 and
31
+ 21200 22070 some
32
+ 22070 22655 pervasive
33
+ 22655 22770 the
34
+ 22770 23260 class
35
+ 23260 23553 much
36
+ 23553 23680 of
37
+ 23680 23756 a
38
+ 23756 24108 global
39
+ 24108 24516 found
40
+ 25147 25277 and
41
+ 25277 25427 i
42
+ 25427 25947 think
43
+ 27342 27461 have
44
+ 27461 27522 a
45
+ 27522 27677 lot
46
+ 27677 27781 to
47
+ 27781 28032 do
48
+ 28032 28357 with
49
+ 28357 28422 her
50
+ 28422 28722 and
51
+ 28722 28967 devil
52
+ 28967 29672 transition
53
+ 29672 29963 that
54
+ 29963 30217 in
55
+ 30217 30286 the
56
+ 30286 30762 developing
57
+ 30762 30936 world
58
+ 30936 31172 is
59
+ 31172 31777 facebook
60
+ 31777 31953 pull
61
+ 31953 32034 the
62
+ 32034 32287 climb
63
+ 32287 32827 economic
64
+ 32827 34027 globalization
65
+ 34027 34312 never
66
+ 34312 34862 flat
67
+ 35122 35285 it's
68
+ 35285 35350 a
69
+ 35350 35685 fairly
70
+ 35685 36235 standard
71
+ 36235 36460 way
72
+ 36460 36548 of
73
+ 36548 37129 understanding
74
+ 37129 37665 the
75
+ 37665 37943 crime
76
+ 37943 38040 and
77
+ 38040 38570 violence
78
+ 38570 38972 so
79
+ 40637 40696 i
80
+ 40696 40905 think
81
+ 40905 41057 it's
82
+ 41057 41432 import
83
+ 41432 41472 and
84
+ 41472 41705 because
85
+ 41705 41787 it
86
+ 41787 42172 means
87
+ 42172 42302 is
88
+ 42302 42687 that
89
+ 42687 43083 problem
90
+ 43083 43191 of
91
+ 43191 43306 r
92
+ 43306 43907 times
93
+ 44811 44972 and
94
+ 44972 45157 then
95
+ 45157 45619 many
96
+ 45619 46167 developing
97
+ 46167 46903 countries
98
+ 46903 47042 in
99
+ 47042 47402 different
100
+ 47402 47782 regions
101
+ 47782 47892 of
102
+ 47892 47982 the
103
+ 47982 48317 world
104
+ 48317 48421 are
105
+ 48421 49423 facing
106
+ 49423 50356 insecurity
107
+ 50356 50582 and
108
+ 50582 51492 violence
109
+ 51766 52265 that
110
+ 53319 53525 and
111
+ 53525 53662 a
112
+ 53662 53821 lot
113
+ 53821 53961 to
114
+ 53961 54096 do
115
+ 54096 54267 with
116
+ 54267 54436 the
117
+ 54436 54916 changing
118
+ 54916 55391 nature
119
+ 55391 55656 of
120
+ 55656 55938 an
121
+ 55938 56182 urban
122
+ 56182 56898 economies
123
+ 56898 57071 and
124
+ 57071 57214 how
125
+ 57214 57635 they
126
+ 57874 58346 linked
127
+ 58346 58459 to
128
+ 58459 58566 the
129
+ 58566 59276 global
130
+ 59276 59936 economy
131
+ 59936 60151 so
132
+ 60151 60263 and
133
+ 60263 60685 that's
134
+ 61248 61343 i
135
+ 61343 61898 think
136
+ 61898 63003 come
137
+ 63003 63518 pervasive
138
+ 63518 64288 challenge
139
+ 67678 67836 is
140
+ 67836 68090 that
141
+ 68090 68239 is
142
+ 68239 68457 one
143
+ 68457 68557 and
144
+ 68557 69049 two
145
+ 69049 69146 are
146
+ 69146 69451 three
147
+ 69451 70386 specializations
148
+ 70386 70526 that
149
+ 70526 70776 when
150
+ 70776 71121 means
151
+ 71121 71427 in
152
+ 71769 71906 and
153
+ 71906 72691 interconnected
154
+ 72691 73066 the
155
+ 73066 73371 global
156
+ 73371 73866 world
157
+ 73866 74411 aware
158
+ 74765 75695 technology
159
+ 75695 75895 and
160
+ 75895 76625 treat
161
+ 76625 77568 endowment
162
+ 78727 78947 of
163
+ 78947 79777 communication
164
+ 79777 80012 and
165
+ 80012 80287 this
166
+ 81408 81577 i
167
+ 81577 81792 d
168
+ 81792 82247 l
169
+ 82247 83052 ideology
170
+ 83052 83167 of
171
+ 83167 83368 those
172
+ 83368 83437 are
173
+ 83437 83572 all
174
+ 83572 84132 important
175
+ 84132 84202 to
176
+ 84875 85342 aspects
177
+ 85342 85470 of
178
+ 85470 85791 that
179
+ 85791 85915 i'm
180
+ 85915 86000 the
181
+ 86000 86660 planning
182
+ 86660 86930 to
183
+ 86930 87295 kept
184
+ 88377 88488 but
185
+ 88488 88636 i
186
+ 88636 88894 would
187
+ 88894 89157 say
188
+ 89157 89903 definitely
189
+ 89903 90337 need
190
+ 90337 90426 to
191
+ 90426 91267 people
192
+ 91267 91641 who
193
+ 91641 92057 are
194
+ 92739 92857 i
195
+ 92857 93002 had
196
+ 93002 93312 the
197
+ 93312 93544 think
198
+ 93544 93942 about
199
+ 93942 94487 them
200
+ 94487 94777 meaning
201
+ 94777 95260 regulator
202
+ 95260 95597 people
203
+ 95597 95652 and
204
+ 95652 95791 now
205
+ 95791 95947 but
206
+ 95947 96862 organizations
207
+ 96862 97052 howell
208
+ 97052 97782 organizations
209
+ 97782 98102 function
210
+ 98102 98397 because
211
+ 98397 99367 organizations
212
+ 100540 100751 there
213
+ 100751 100916 it's
214
+ 100916 100971 a
215
+ 100971 101366 whole
216
+ 101366 101991 literature
217
+ 101991 102072 in
218
+ 102072 102102 a
219
+ 102102 102567 field
220
+ 102567 102726 of
221
+ 102726 103460 organizational
222
+ 103460 103916 fury
223
+ 103916 104143 there
224
+ 104143 104306 would
225
+ 104306 104456 be
226
+ 104456 104869 important
227
+ 104869 105451 understand
228
+ 105451 105775 amiga
229
+ 105775 106191 small
230
+ 106191 106256 is
231
+ 106256 106518 that
232
+ 106518 106675 is
233
+ 106675 106896 our
234
+ 106896 107196 local
235
+ 107196 107776 government
236
+ 107776 107906 our
237
+ 107906 108172 rap
238
+ 108172 109141 community
239
+ 109141 109511 iranian
240
+ 109511 109691 g
241
+ 109691 109964 l
242
+ 110415 110638 and
243
+ 110638 111084 something
244
+ 111084 111284 as
245
+ 111284 111677 large
246
+ 111677 111939 as
247
+ 112581 112952 steak
248
+ 112952 113167 ever
249
+ 113167 113497 meant
250
+ 113497 113662 and
251
+ 113662 114072 national
252
+ 114072 114836 government
253
+ 114836 115485 maybe
254
+ 115485 116278 at
255
+ 116278 116382 the
256
+ 116382 116602 u
257
+ 116602 116847 n
258
+ 116847 116952 r
259
+ 116952 117042 a
260
+ 117042 117492 global
261
+ 117492 118022 agency
262
+ 118022 118377 news
263
+ 118377 119171 activities
264
+ 119584 120310 influence
265
+ 121832 121927 and
266
+ 121927 122050 the
267
+ 122050 122358 small
268
+ 122358 122659 scale
269
+ 122659 122781 so
270
+ 122781 122876 i
271
+ 122876 123140 think
272
+ 123140 124361 organization
273
+ 124361 125033 analysis
274
+ 125033 125141 is
275
+ 125141 125221 a
276
+ 125221 125461 very
277
+ 125461 126021 important
278
+ 126021 126274 okay
279
+ 129718 130024 line
280
+ 130024 130126 is
281
+ 130126 130540 lambda
282
+ 130540 130728 t
283
+ 130728 131096 v
284
+ 131096 131291 for
285
+ 131291 131866 television
286
+ 131866 132116 the
287
+ 132116 132381 only
288
+ 132381 132631 about
289
+ 132631 132964 it
290
+ 132964 133061 i
291
+ 133061 133423 think
292
+ 133423 133791 and
293
+ 133791 134086 there's
294
+ 134086 134285 a
295
+ 134285 134496 lot
296
+ 134496 134596 of
297
+ 134596 134764 x
298
+ 134764 135074 segment
299
+ 135074 135361 about
300
+ 135361 135671 that
301
+ 135748 135839 and
302
+ 135839 135967 i
303
+ 135967 136207 didn't
304
+ 136207 136422 think
305
+ 136422 136547 that
306
+ 136547 136787 that
307
+ 136787 137235 eminently
308
+ 137235 137311 to
309
+ 137311 137413 the
310
+ 137413 137907 challenges
311
+ 137907 138027 the
312
+ 138027 138277 town
313
+ 138277 138432 and
314
+ 138432 138543 it's
315
+ 138543 139365 hard
316
+ 139365 139918 are
317
+ 139918 140118 how
318
+ 140118 140182 to
319
+ 140182 140442 get
320
+ 140442 140717 there
321
+ 141033 141442 yeah
322
+ 141442 141657 so
323
+ 141657 141795 in
324
+ 141795 142041 what
325
+ 142041 142173 are
326
+ 142173 142338 the
327
+ 142338 143080 strategic
328
+ 143080 143260 aims
329
+ 143260 143662 least
330
+ 143662 143920 okay
331
+ 143920 144155 many
332
+ 144155 144265 here
333
+ 144265 144395 and
334
+ 144395 144605 this
335
+ 144605 144840 meeting
336
+ 144840 145385 whatever
337
+ 145385 145556 to
338
+ 145556 145740 dick
339
+ 145740 146073 steps
340
+ 146073 146390 any
341
+ 146390 146485 to
342
+ 146485 146645 be
343
+ 146645 147000 take
344
+ 147000 147620 ten
345
+ 147879 148133 where
346
+ 148133 148284 you
347
+ 148284 148642 can
348
+ 148642 149163 accomplish
349
+ 149163 149507 something
350
+ 149507 149588 in
351
+ 149588 149689 the
352
+ 149689 149988 short
353
+ 149988 150238 term
354
+ 150238 150520 right
355
+ 150520 150943 away
356
+ 151340 151574 but
357
+ 151574 152461 also
358
+ 152461 152600 you
359
+ 152600 152825 know
360
+ 152825 152892 the
361
+ 152892 153045 the
362
+ 153045 153127 you're
363
+ 153127 153295 not
364
+ 153295 153422 going
365
+ 153422 153550 to
366
+ 153550 154016 accomplish
367
+ 154016 154395 everything
368
+ 154395 154543 that
369
+ 154543 154725 you're
370
+ 154725 155005 leading
371
+ 155005 155193 out
372
+ 155193 155360 in
373
+ 155360 155564 year
374
+ 155564 155815 vision
375
+ 155815 156135 and
376
+ 156323 156847 so
377
+ 157318 157455 you
378
+ 157455 157735 want
379
+ 157735 157856 to
380
+ 157856 158175 make
381
+ 158175 159070 progress
382
+ 159725 160753 continually
383
+ 160753 161005 every
384
+ 161005 161499 states
385
+ 161499 161776 has
386
+ 161776 161800 a
387
+ 161800 162055 has
388
+ 162055 162315 some
389
+ 162315 162402 of
390
+ 162402 162475 the
391
+ 162475 162817 small
392
+ 162817 163585 success
393
+ 163795 164185 but
394
+ 164185 164390 for
395
+ 164390 164432 a
396
+ 164432 164650 need
397
+ 164650 164725 for
398
+ 164725 164865 its
399
+ 164865 165250 allies
400
+ 165250 165680 success
401
+ 165680 166000 because
402
+ 166000 166135 you
403
+ 166135 166935 can't
404
+ 166935 167295 put
405
+ 167295 167413 an
406
+ 167413 168470 institution
407
+ 168906 169036 it's
408
+ 169036 169135 and
409
+ 169135 169483 develop
410
+ 169483 169540 an
411
+ 169540 170090 institution
412
+ 170090 170270 like
413
+ 170270 170539 to
414
+ 170539 171150 emulate
415
+ 171769 171942 your
416
+ 171942 172137 have
417
+ 172137 172265 in
418
+ 172265 173159 mind
419
+ 173159 173270 of
420
+ 173270 173339 the
421
+ 173339 173770 night
422
+ </segment>
423
+ </lecture>
424
+ </document>
425
+