sm-transcript 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. data/LICENSE.txt +23 -0
  2. data/README.txt +140 -0
  3. data/Rakefile +31 -0
  4. data/bin/results/PLACEHOLDER.txt +8 -0
  5. data/bin/sm-transcript +12 -0
  6. data/bin/transcripts/PLACEHOLDER.txt +8 -0
  7. data/lib/sm_transcript/LICENSE.txt +23 -0
  8. data/lib/sm_transcript/metadata.rb +69 -0
  9. data/lib/sm_transcript/metadata_reader.rb +56 -0
  10. data/lib/sm_transcript/options.rb +89 -0
  11. data/lib/sm_transcript/optparseExample.rb +113 -0
  12. data/lib/sm_transcript/process_csv_files_to_html.rb +58 -0
  13. data/lib/sm_transcript/process_seg_files.rb +21 -0
  14. data/lib/sm_transcript/process_seg_files_to_csv.rb +24 -0
  15. data/lib/sm_transcript/process_seg_files_to_html.rb +31 -0
  16. data/lib/sm_transcript/require_relative.rb +14 -0
  17. data/lib/sm_transcript/runner.rb +70 -0
  18. data/lib/sm_transcript/seg_reader.rb +42 -0
  19. data/lib/sm_transcript/transcript.rb +130 -0
  20. data/lib/sm_transcript/word.rb +31 -0
  21. data/lib/sm_transcript/wrd_reader.rb +42 -0
  22. data/test/Rakefile +14 -0
  23. data/test/results/IIHS_Diane_Davis_Nov2009.seg +425 -0
  24. data/test/results/NERCOMP-SpokenMedia4.wrd +6791 -0
  25. data/test/results/PLACEHOLDER.txt +8 -0
  26. data/test/results/PLACEHOLDER.txt.ignore +8 -0
  27. data/test/results/vijay_kumar.wrd +1675 -0
  28. data/test/results/wirehair-beetle.txt +6 -0
  29. data/test/test_metadata.rb +39 -0
  30. data/test/test_metadatareader.rb +30 -0
  31. data/test/test_options.rb +47 -0
  32. data/test/test_runner.rb +52 -0
  33. data/test/test_segreader.rb +39 -0
  34. data/test/test_transcript.rb +62 -0
  35. data/test/test_wrdreader.rb +43 -0
  36. data/test/transcripts/IIHS_Diane_Davis_Nov2009-t1.html +148 -0
  37. data/test/transcripts/PLACEHOLDER.txt +8 -0
  38. data/test/transcripts/data.js +24 -0
  39. data/test/transcripts/vijay_kumar-1.-t1.html +557 -0
  40. data/test/transcripts/vijay_kumar-1.t1.html +557 -0
  41. data/test/transcripts/vijay_kumar-t1.html +557 -0
  42. data/test/transcripts/vijay_kumar-t1.ttml +569 -0
  43. data/test/transcripts/vijay_kumar.data.js +2 -0
  44. data/test/transcripts/wirehair-beetle.data.js +3 -0
  45. metadata +234 -0
@@ -0,0 +1,113 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'rubygems'
4
+ require 'optparse'
5
+ require 'optparse/time'
6
+ require 'ostruct'
7
+ require 'pp'
8
+
9
# Worked example of OptionParser: shows mandatory and optional arguments,
# typed (coerced) arguments, keyword completion, boolean switches and the
# conventional --help / --version tail options.
class OptparseExample

  # Encoding names accepted by --code.
  CODES = %w[iso-2022-jp shift_jis euc-jp utf8 binary].freeze
  # Short spellings accepted by --code, mapped to the encoding they select.
  CODE_ALIASES = { "jis" => "iso-2022-jp", "sjis" => "shift_jis" }.freeze

  #
  # Return a structure describing the options.
  #
  # args:: array of command-line words (normally ARGV). It is parsed with
  #        OptionParser#parse!, so recognised switches are removed from it.
  #
  # Returns an OpenStruct. library, inplace, encoding, transfer_type and
  # verbose always carry a value (the defaults set below); extension, delay,
  # time, record_separator and list are only set when their switch is given.
  #
  # -h/--help and --version print to standard output and then exit the
  # process. Errors raised by OptionParser#parse! are not rescued here.
  #
  def self.parse(args)
    # The options specified on the command line will be collected in *options*.
    # We set default values here.
    options = OpenStruct.new
    options.library = []
    options.inplace = false
    options.encoding = "utf8"
    options.transfer_type = :auto
    options.verbose = false

    # The block parameter is named +opts+; the parser itself is kept in
    # +parser+ so the two names do not shadow each other.
    parser = OptionParser.new do |opts|
      opts.banner = "Usage: example.rb [options]"

      opts.separator ""
      opts.separator "Specific options:"

      # Mandatory argument.
      opts.on("-r", "--require LIBRARY",
              "Require the LIBRARY before executing your script") do |lib|
        options.library << lib
      end

      # Optional argument; multi-line description.
      opts.on("-i", "--inplace [EXTENSION]",
              "Edit ARGV files in place",
              "  (make backup if EXTENSION supplied)") do |ext|
        options.inplace = true
        # Ensure a non-empty extension begins with a dot. The non-mutating
        # sub leaves the caller's argument string untouched.
        options.extension = (ext || '').sub(/\A\.?(?=.)/, ".")
      end

      # Cast 'delay' argument to a Float.
      opts.on("--delay N", Float, "Delay N seconds before executing") do |n|
        options.delay = n
      end

      # Cast 'time' argument to a Time object.
      opts.on("-t", "--time [TIME]", Time, "Begin execution at given time") do |time|
        options.time = time
      end

      # Cast to octal integer.
      opts.on("-F", "--irs [OCTAL]", OptionParser::OctalInteger,
              "Specify record separator (default \\0)") do |rs|
        options.record_separator = rs
      end

      # List of arguments.
      opts.on("--list x,y,z", Array, "Example 'list' of arguments") do |list|
        options.list = list
      end

      # Keyword completion.  We are specifying a specific set of arguments (CODES
      # and CODE_ALIASES - notice the latter is a Hash), and the user may provide
      # the shortest unambiguous text.
      code_list = (CODE_ALIASES.keys + CODES).join(',')
      opts.on("--code CODE", CODES, CODE_ALIASES, "Select encoding",
              "  (#{code_list})") do |encoding|
        options.encoding = encoding
      end

      # Optional argument with keyword completion.
      opts.on("--type [TYPE]", [:text, :binary, :auto],
              "Select transfer type (text, binary, auto)") do |t|
        options.transfer_type = t
      end

      # Boolean switch.
      opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
        options.verbose = v
      end

      opts.separator ""
      opts.separator "Common options:"

      # No argument, shows at tail.  This will print an options summary.
      # Try it and see!
      opts.on_tail("-h", "--help", "Show this message") do
        puts opts
        exit
      end

      # Another typical switch to print the version.
      opts.on_tail("--version", "Show version") do
        # OptionParser::Version is an Array of integers in old releases and a
        # plain String in current ones; Array() makes join work for both.
        puts Array(OptionParser::Version).join('.')
        exit
      end
    end

    parser.parse!(args)
    options
  end  # parse()

end  # class OptparseExample
111
+
112
# Parse this process's command line and pretty-print the resulting options.
parsed_options = OptparseExample.parse(ARGV)
pp parsed_options
@@ -0,0 +1,58 @@
1
+ #!/usr/bin/env ruby -wKU
2
+ $KCODE = "UTF8"
3
+ # p "Résumé"
4
+
5
+ # require "rexml/document"
6
+ require "rubygems"
7
+ require "htmlentities"
8
+
9
# Groups the words of CSV transcript rows into HTML <span> elements, one span
# per run of consecutive rows whose start time falls in the same second.
#
# lines:: any object whose #each yields CSV row strings (an Array, an open
#         File, ...). Rows are split on ","; column 2 (zero-based) is read as
#         the word's start time in milliseconds and column 8 as the word.
#         Rows that have no column 8 (blank or short rows) are skipped.
#
# Returns an Array of strings of the form "<span id='T<sec>'>w1 w2</span> ".
# The words are inserted as-is; they are not HTML-escaped.
def csv_rows_to_spans(lines)
  spans = []
  current_second = nil   # nil until the first usable row has been seen
  phrase = []
  lines.each do |line|
    cols = line.split(",")
    word = cols[8]
    next if word.nil?
    # Integer division drops the milliseconds so that several words share
    # one <span>.
    second = cols[2].to_i / 1000
    if second != current_second
      spans << "<span id='T#{current_second}'>#{phrase.join(' ')}</span> " unless phrase.empty?
      current_second = second
      phrase = []
    end
    phrase << word.strip
  end
  # The loop only emits a span when the next one starts, so the final span
  # has to be flushed here.
  spans << "<span id='T#{current_second}'>#{phrase.join(' ')}</span> " unless phrase.empty?
  spans
end

if __FILE__ == $0

  Dir.glob("/Users/pwilkins/Downloads/*transcript\ ????09.csv") do |csv_path|
    puts csv_path
    # The block form of File.open closes the file once its rows are read.
    File.open(csv_path) do |csv|
      csv_rows_to_spans(csv).each { |span| puts span }
    end
    puts
  end
end
58
+
@@ -0,0 +1,21 @@
1
+ require "rexml/document"
2
+ require "optparse"
3
+
4
# Formats the word entries found in one segment's text as CSV rows.
#
# file_name::  value for the first CSV column.
# segment_id:: value for the second CSV column.
# text::       the segment's text; each line of the form
#              "<digits> <digits> <word>" is one entry. nil is treated as
#              empty text.
#
# Returns an Array of strings "file_name,segment_id,<digits>,<digits>,<word>";
# lines that do not have the entry form are ignored.
def seg_words_to_csv_rows(file_name, segment_id, text)
  text.to_s.scan(/^\d* \d* [\w']*$/).map do |entry|
    "#{file_name},#{segment_id},#{entry.gsub(/ /, ',')}"
  end
end

if __FILE__ == $0

  Dir.glob("/Users/pwilkins/work/Spoken\ Lecture/IIHS/*.seg") do |seg_path|
    # REXML builds the whole tree inside Document.new, so the file can be
    # closed as soon as the block returns.
    doc = File.open(seg_path) { |io| REXML::Document.new(io) }
    root = doc.root
    # Keep only the trailing "<name>.seg" part of the fileName attribute.
    file_name = root.attributes["fileName"].to_s[/[\w\-_]*\.seg$/]
    if file_name.nil?
      warn "#{seg_path}: fileName attribute does not name a .seg file, skipped"
      next
    end
    root.elements.each("/document/lecture/segment") do |segment|
      rows = seg_words_to_csv_rows(file_name, segment.attributes["id"], segment.text)
      rows.each { |row| puts row }
    end
    puts
  end
end
19
+
20
+
21
+
@@ -0,0 +1,24 @@
1
+ require "rexml/document"
2
+
3
# Formats the word entries found in one segment's text as CSV rows.
#
# file_name::  value for the "Filename" column.
# segment_id:: value for the "Segment" column.
# text::       the segment's text; each line of the form
#              "<digits> <digits> <word>" is one entry. nil is treated as
#              empty text.
#
# Returns an Array of strings "file_name,segment_id,<digits>,<digits>,<word>";
# lines that do not have the entry form are ignored.
def segment_csv_rows(file_name, segment_id, text)
  text.to_s.scan(/^\d* \d* [\w']*$/).map do |entry|
    "#{file_name},#{segment_id},#{entry.gsub(/ /, ',')}"
  end
end

if __FILE__ == $0

  csv_header = "Filename (do not edit),Segment (do not edit),Start Time (do not edit),End Time (do not edit),Unedited Word (do not edit),Edited English Word,Hindi Word(s)"

  Dir.glob("/Users/pwilkins/work/Spoken\ Lecture/IIHS/*.seg") do |seg_path|
    # REXML builds the whole tree inside Document.new, so the file can be
    # closed as soon as the block returns.
    doc = File.open(seg_path) { |io| REXML::Document.new(io) }
    root = doc.root
    # Keep only the trailing "<name>.seg" part of the fileName attribute.
    file_name = root.attributes["fileName"].to_s[/[\w\-_]*\.seg$/]
    if file_name.nil?
      warn "#{seg_path}: fileName attribute does not name a .seg file, skipped"
      next
    end
    root.elements.each("/document/lecture/segment") do |segment|
      # The header line is written before the rows of every segment.
      puts csv_header
      rows = segment_csv_rows(file_name, segment.attributes["id"], segment.text)
      rows.each { |row| puts row }
    end
    puts
  end
end
20
+
21
+ # Header for csv file:
22
+ # Filename (do not edit),Segment (do not edit),Start Time (do not edit),End Time (do not edit),Unedited Word (do not edit),Edited English Word,Hindi Word(s)
23
+
24
+
@@ -0,0 +1,31 @@
1
+ require "rexml/document"
2
+
3
# Groups the word entries of one segment's text into HTML <span> elements,
# one span per run of consecutive words that start within the same second.
#
# text:: the segment's text; each line of the form
#        "<start ms> <end ms> <word>" is one entry. nil is treated as empty.
#
# Returns an Array of strings of the form "<span id='T<sec>'>w1 w2</span> ".
def seg_words_to_spans(text)
  spans = []
  current_second = nil   # nil until the first entry has been seen
  phrase = []
  text.to_s.scan(/^\d* \d* [\w']*$/) do |entry|
    # Split on single spaces with a limit so that empty fields keep their
    # position instead of shifting the later ones.
    start_ms, _end_ms, word = entry.split(/ /, 3)
    # get the start time and reduce its granularity so that multiple words
    # fall within a <span> element.
    second = start_ms.to_i / 1000
    if second != current_second
      spans << "<span id='T#{current_second}'>#{phrase.join(' ')}</span> " unless phrase.empty?
      current_second = second
      phrase = []
    end
    # The word is taken as a String; interpolating the Array returned by
    # String#scan would print it in inspect form on Ruby 1.9 and later.
    phrase << word.to_s
  end
  # A span is only emitted when the next one starts, so flush the last one.
  spans << "<span id='T#{current_second}'>#{phrase.join(' ')}</span> " unless phrase.empty?
  spans
end

if __FILE__ == $0

  Dir.glob("/Users/pwilkins/work/Spoken\ Lecture/IIHS/*.seg") do |seg_path|
    # REXML builds the whole tree inside Document.new, so the file can be
    # closed as soon as the block returns.
    doc = File.open(seg_path) { |io| REXML::Document.new(io) }
    root = doc.root
    # Print the trailing "<name>.seg" part of the fileName attribute (an
    # empty line when there is none).
    puts root.attributes["fileName"].to_s[/[\w\-_]*\.seg$/]
    root.elements.each("/document/lecture/segment") do |segment|
      seg_words_to_spans(segment.text).each { |span| puts span }
    end
    puts
  end
end
31
+
@@ -0,0 +1,14 @@
1
+ # This file implements the Ruby 1.9 feature 'require_relative'. It is needed
2
+ # only when running Ruby 1.8, so that code doesn't break. It is taken from
3
+ # the Pickaxe book for 1.9.
4
+
5
# Requires +relative_feature+ resolved against the directory of the file
# that called this method, instead of against $LOAD_PATH.
#
# relative_feature:: path of the feature, relative to the calling file
#                    (an absolute path is used unchanged).
#
# Returns whatever Kernel#require returns for the resolved path.
# Raises RuntimeError when the caller's backtrace line cannot be parsed and
# LoadError when the call originates from evaluated code, which has no file.
def require_relative(relative_feature)
  call_site = caller.first
  # A backtrace line is "file:LINE" optionally followed by ":in <label>".
  # The label is written `name' by older interpreters and 'name' by
  # Ruby 3.4 and later, so both opening quotes are accepted.
  unless call_site.rindex(/:\d+(:in [`'].*')?$/)
    fail "Can't parse #{call_site}"
  end
  # Everything before the ":LINE" part is the calling file's path.
  file = Regexp.last_match.pre_match
  if /\A\((.*)\)/ =~ file # eval, etc.
    raise LoadError, "require_relative is called in #{Regexp.last_match(1)}"
  end
  absolute = File.expand_path(relative_feature, File.dirname(file))
  require absolute
end
@@ -0,0 +1,70 @@
1
+ # $Id: runner.rb 183 2010-03-15 19:07:50Z pwilkins $
2
+ # Copyright (c) 2010 Massachusetts Institute of Technology
3
+ # see LICENSE.txt for license text
4
+
5
+ require 'rubygems'
6
+ require 'extensions/kernel'
7
+ require_relative 'options'
8
+ require_relative 'seg_reader'
9
+ require_relative 'wrd_reader'
10
+ require_relative 'transcript'
11
+ require_relative 'metadata'
12
+ require_relative 'metadata_reader'
13
+
14
module SmTranscript
  # Command-line driver. Reads every file in the source directory whose
  # extension equals the source type and writes one converted file per
  # source file into the destination directory.
  class Runner

    # The Options object built from the argument vector.
    attr_reader :options

    # argv:: command-line arguments, handed unchanged to Options.new.
    def initialize(argv)
      @options = Options.new(argv)
    end

    # Converts every matching file in options.srcdir.
    #
    # Raises RuntimeError when the source or destination directory does not
    # exist, or when the source type cannot be converted to the destination
    # type. SystemCallError (Errno::*) is reported on STDERR and not
    # re-raised.
    def run
      # File.exist? replaces FileTest.exists?, which Ruby 3.2 removed.
      raise "source directory doesn't exist" unless File.exist?(@options.srcdir)
      raise "destination directory doesn't exist" unless File.exist?(@options.destdir)

      # process each file in srcdir whose extension is the same as srctype
      Dir.glob("#{@options.srcdir}/*.#{@options.srctype}") do |src_path|
        convert(src_path)
      end
    rescue SystemCallError => e
      STDERR.puts e
    end  # run

    private

    # Converts one source file according to options.desttype. Any
    # destination type other than html and datajs is written as ttml.
    def convert(src_path)
      from_txt = (@options.srctype == Options::TXT_SRC_TYPE)
      # Only the type is stripped, not the dot: "talk.wrd" gives "talk.".
      stem = File.basename(src_path, @options.srctype)

      case @options.desttype
      when Options::HTML_DEST_TYPE
        raise "txt invalid srctype for html desttype" if from_txt
        read_transcript(src_path).write_html("#{@options.destdir}/#{stem}t1.html")
      when Options::DATAJS_DEST_TYPE
        raise "txt is only valid srctype for datajs desttype" unless from_txt
        md = MetadataReader.from_file(src_path).metadata
        Metadata.new(md).write_datajs("#{@options.destdir}/#{stem}data.js")
      else
        # A txt source yields metadata, not words, so there is nothing to
        # put into a ttml transcript.
        raise "txt invalid srctype for ttml desttype" if from_txt
        read_transcript(src_path).write_ttml("#{@options.destdir}/#{stem}t1.ttml")
      end
    end

    # Builds a Transcript from the words of a source file. seg files are
    # read with SegReader; every other source type is read with WrdReader.
    def read_transcript(src_path)
      reader = if @options.srctype == Options::SEG_SRC_TYPE
                 SegReader.from_file(src_path)
               else
                 WrdReader.from_file(src_path)
               end
      Transcript.new(reader.words)
    end
  end
end
@@ -0,0 +1,42 @@
1
+ # $Id: seg_reader.rb 182 2010-03-12 22:07:34Z pwilkins $
2
+ # Copyright (c) 2010 Massachusetts Institute of Technology
3
+ # see LICENSE.txt for license text
4
+
5
+ require "rexml/document"
6
+ require 'extensions/kernel'
7
+ require_relative 'word'
8
+
9
module SmTranscript
  # Reads a .seg XML document and exposes the words found in its
  # /document/lecture/segment elements.
  class SegReader
    # Hash of document-level values; see parse_metadata.
    attr_reader :metadata
    # Array of SmTranscript::Word, in document order.
    attr_reader :words

    # Builds a reader from the .seg file at +file_name+. The file is closed
    # before this method returns.
    def self.from_file(file_name)
      # REXML builds the whole tree inside Document.new, so nothing is read
      # from the file after the block ends.
      File.open(file_name) { |io| new(REXML::Document.new(io)) }
    end

    # xml_doc:: a parsed REXML::Document. Metadata and words are extracted
    #           immediately.
    def initialize(xml_doc)
      @metadata = {}
      @words = []
      @root = xml_doc.root
      parse_metadata()
      parse_words()
    end

    # Stores under "orig_seg_path" the match of the trailing "<name>.seg"
    # part of the root element's fileName attribute. The stored value is a
    # MatchData (use to_s for the text), or nil when the attribute is absent
    # or does not end in ".seg".
    def parse_metadata()
      file_name = @root.attributes["fileName"]
      @metadata["orig_seg_path"] =
        file_name && /[\w\-_]*\.seg$/.match(file_name)
    end

    # Appends one Word per "<digits> <digits> <word>" line of every segment's
    # text. The three whitespace-separated fields are passed to Word.new as
    # strings, in that order. Segments without text contribute nothing.
    def parse_words()
      @root.elements.each("/document/lecture/segment") do |segment|
        segment.text.to_s.scan(/^\d* \d* [\w']*$/) do |entry|
          fields = entry.split
          @words << SmTranscript::Word.new(fields[0], fields[1], fields[2])
        end
      end
    end
  end
end
@@ -0,0 +1,130 @@
1
+ # $Id: transcript.rb 182 2010-03-12 22:07:34Z pwilkins $
2
+ # Copyright (c) 2010 Massachusetts Institute of Technology
3
+ # see LICENSE.txt for license text
4
+
5
+ require "rexml/document"
6
+ require 'extensions/kernel'
7
+ require 'builder'
8
+ require_relative 'word'
9
+
10
module SmTranscript
  # Writes a list of timed words as an HTML or TTML transcript. Consecutive
  # words that start within the same second are grouped into one phrase.
  class Transcript

    # word_arr:: words in playback order. Each must respond to start_time
    #            and end_time (milliseconds; converted with to_i) and word.
    def initialize(word_arr)
      @metadata = {}
      @words = word_arr
    end

    # Writes one "<span id='T<sec>'>w1 w2</span> " line per phrase to
    # +dest_file+, replacing any existing file. Nothing is written when
    # there are no words.
    def write_html(dest_file)
      # TODO: Do we want to notify user when overwriting existing file?
      File.open(dest_file, "w") do |f|
        each_phrase do |secs, phrase|
          text = phrase.map { |w| w.word }.join(' ')
          f.puts "<span id='T#{secs}'>#{text}</span> "
        end
      end
    end  # write_html()

    # Writes a TTML document to +dest_file+, replacing any existing file.
    # Each phrase becomes one <p> whose xml:id is "T<sec>" for the phrase's
    # own second, whose begin is the start time of its first word and whose
    # end is the end time of its last word, both in milliseconds.
    def write_ttml(dest_file)
      # TODO: Do we want to notify user when overwriting existing file?
      buf = ""
      bldr = Builder::XmlMarkup.new( :target => buf, :indent => 2 )
      bldr.instruct!
      bldr.tt("xmlns" => "http://www.w3.org/2006/04/ttaf1",
              "xmlns:tts" => "http://www.w3.org/ns/ttml#styling",
              "xmlns:ttm" => "http://www.w3.org/ns/ttml#metadata",
              "xml:lang" => "en" ) {
        bldr.head { |b|
          b.ttm :title, 'Document Metadata Example'
          b.ttm :desc, 'This document employs document metadata.'
        }
        bldr.body {
          bldr.div {
            each_phrase do |secs, phrase|
              # Every word, including the first, is preceded by one space.
              text = phrase.map { |w| " #{w.word}" }.join
              bldr.p( text,
                "xml:id" => "T#{secs}",
                "begin"  => "#{phrase.first.start_time.to_i}ms",
                "end"    => "#{phrase.last.end_time.to_i}ms" )
            end
          }
        }
      }
      File.open(dest_file, "w") do |f|
        f.puts buf
      end
    end

    # Times are expressed in milliseconds, far more granularity than is
    # useful for most user-facing apps, especially since the player reports
    # elapsed time only ten times a second.
    # Reducing the time to whole seconds provides these benefits:
    #   1) Multiple words fall within a <span> element.
    #   2) Better mapping between start times and player time tracking
    def words_to_phrase(start_time)
      start_time.to_i/1000
    end  # words_to_phrase

    # Currently returns +milliseconds+ unchanged.
    def get_time_expression(milliseconds)
      milliseconds
    end

    # There are some word combinations that occur with such regularity that
    # they call out to be fixed.  For example, "m I t" is unambiguously MIT.
    # These edits can only be done when the phrase has been assembled.
    # Currently returns +phrase+ unchanged.
    def cleanup_phrase(phrase)
      phrase
    end

    private

    # Yields (second, words) once for each run of consecutive words whose
    # start time falls in the same second, in order. The run's second is
    # tracked with a nil sentinel so that words starting in second 0 form a
    # phrase like any other, and the last run is yielded after the loop.
    # Yields nothing when there are no words. Returns nil.
    def each_phrase
      phrase = []
      phrase_secs = nil
      (@words || []).each do |w|
        secs = words_to_phrase(w.start_time)
        if secs != phrase_secs && !phrase.empty?
          yield phrase_secs, phrase
          phrase = []
        end
        phrase_secs = secs
        phrase << w
      end
      yield phrase_secs, phrase unless phrase.empty?
      nil
    end

  end  # class
end