RubyGems - sm-transcript - Versions diffs - 0.0.3 - Mend

sm-transcript 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

data/LICENSE.txt +23 -0
data/README.txt +140 -0
data/Rakefile +31 -0
data/bin/results/PLACEHOLDER.txt +8 -0
data/bin/sm-transcript +12 -0
data/bin/transcripts/PLACEHOLDER.txt +8 -0
data/lib/sm_transcript/LICENSE.txt +23 -0
data/lib/sm_transcript/metadata.rb +69 -0
data/lib/sm_transcript/metadata_reader.rb +56 -0
data/lib/sm_transcript/options.rb +89 -0
data/lib/sm_transcript/optparseExample.rb +113 -0
data/lib/sm_transcript/process_csv_files_to_html.rb +58 -0
data/lib/sm_transcript/process_seg_files.rb +21 -0
data/lib/sm_transcript/process_seg_files_to_csv.rb +24 -0
data/lib/sm_transcript/process_seg_files_to_html.rb +31 -0
data/lib/sm_transcript/require_relative.rb +14 -0
data/lib/sm_transcript/runner.rb +70 -0
data/lib/sm_transcript/seg_reader.rb +42 -0
data/lib/sm_transcript/transcript.rb +130 -0
data/lib/sm_transcript/word.rb +31 -0
data/lib/sm_transcript/wrd_reader.rb +42 -0
data/test/Rakefile +14 -0
data/test/results/IIHS_Diane_Davis_Nov2009.seg +425 -0
data/test/results/NERCOMP-SpokenMedia4.wrd +6791 -0
data/test/results/PLACEHOLDER.txt +8 -0
data/test/results/PLACEHOLDER.txt.ignore +8 -0
data/test/results/vijay_kumar.wrd +1675 -0
data/test/results/wirehair-beetle.txt +6 -0
data/test/test_metadata.rb +39 -0
data/test/test_metadatareader.rb +30 -0
data/test/test_options.rb +47 -0
data/test/test_runner.rb +52 -0
data/test/test_segreader.rb +39 -0
data/test/test_transcript.rb +62 -0
data/test/test_wrdreader.rb +43 -0
data/test/transcripts/IIHS_Diane_Davis_Nov2009-t1.html +148 -0
data/test/transcripts/PLACEHOLDER.txt +8 -0
data/test/transcripts/data.js +24 -0
data/test/transcripts/vijay_kumar-1.-t1.html +557 -0
data/test/transcripts/vijay_kumar-1.t1.html +557 -0
data/test/transcripts/vijay_kumar-t1.html +557 -0
data/test/transcripts/vijay_kumar-t1.ttml +569 -0
data/test/transcripts/vijay_kumar.data.js +2 -0
data/test/transcripts/wirehair-beetle.data.js +3 -0
metadata +234 -0

data/lib/sm_transcript/optparseExample.rb ADDED

@@ -0,0 +1,113 @@
+#!/usr/bin/env ruby
+require 'rubygems'
+require 'optparse'
+require 'optparse/time'
+require 'ostruct'
+require 'pp'
+ class OptparseExample
+   CODES = %w[iso-2022-jp shift_jis euc-jp utf8 binary]
+   CODE_ALIASES = { "jis" => "iso-2022-jp", "sjis" => "shift_jis" }
+   #
+   # Return a structure describing the options.
+   #
+   def self.parse(args)
+     # The options specified on the command line will be collected in *options*.
+     # We set default values here.
+     options = OpenStruct.new
+     options.library = []
+     options.inplace = false
+     options.encoding = "utf8"
+     options.transfer_type = :auto
+     options.verbose = false
+     opts = OptionParser.new do |opts|
+       opts.banner = "Usage: example.rb [options]"
+       opts.separator ""
+       opts.separator "Specific options:"
+       # Mandatory argument.
+       opts.on("-r", "--require LIBRARY",
+               "Require the LIBRARY before executing your script") do |lib|
+         options.library << lib
+       end
+       # Optional argument; multi-line description.
+       opts.on("-i", "--inplace [EXTENSION]",
+               "Edit ARGV files in place",
+               "  (make backup if EXTENSION supplied)") do |ext|
+         options.inplace = true
+         options.extension = ext || ''
+         options.extension.sub!(/\A\.?(?=.)/, ".")  # Ensure extension begins with dot.
+       end
+       # Cast 'delay' argument to a Float.
+       opts.on("--delay N", Float, "Delay N seconds before executing") do |n|
+         options.delay = n
+       end
+       # Cast 'time' argument to a Time object.
+       opts.on("-t", "--time [TIME]", Time, "Begin execution at given time") do |time|
+         options.time = time
+       end
+       # Cast to octal integer.
+       opts.on("-F", "--irs [OCTAL]", OptionParser::OctalInteger,
+               "Specify record separator (default \\0)") do |rs|
+         options.record_separator = rs
+       end
+       # List of arguments.
+       opts.on("--list x,y,z", Array, "Example 'list' of arguments") do |list|
+         options.list = list
+       end
+       # Keyword completion.  We are specifying a specific set of arguments (CODES
+       # and CODE_ALIASES - notice the latter is a Hash), and the user may provide
+       # the shortest unambiguous text.
+       code_list = (CODE_ALIASES.keys + CODES).join(',')
+       opts.on("--code CODE", CODES, CODE_ALIASES, "Select encoding",
+               "  (#{code_list})") do |encoding|
+         options.encoding = encoding
+       end
+       # Optional argument with keyword completion.
+       opts.on("--type [TYPE]", [:text, :binary, :auto],
+               "Select transfer type (text, binary, auto)") do |t|
+         options.transfer_type = t
+       end
+       # Boolean switch.
+       opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
+         options.verbose = v
+       end
+       opts.separator ""
+       opts.separator "Common options:"
+       # No argument, shows at tail.  This will print an options summary.
+       # Try it and see!
+       opts.on_tail("-h", "--help", "Show this message") do
+         puts opts
+         exit
+       end
+       # Another typical switch to print the version.
+       opts.on_tail("--version", "Show version") do
+         puts OptionParser::Version.join('.')
+         exit
+       end
+     end
+     opts.parse!(args)
+     options
+   end  # parse()
+ end  # class OptparseExample
+ options = OptparseExample.parse(ARGV)
+ pp options

data/lib/sm_transcript/process_csv_files_to_html.rb ADDED

@@ -0,0 +1,58 @@
+#!/usr/bin/env ruby -wKU
+$KCODE = "UTF8"
+# p "Résumé"
+# require "rexml/document"
+require "rubygems"
+require "htmlentities"
+if __FILE__ == $0
+#  Dir.glob("/Users/pwilkins/work/Spoken\ Lecture/IIHS/*transcript\ ????09.csv") { |x|
+  Dir.glob("/Users/pwilkins/Downloads/*transcript\ ????09.csv") { |x|
+#      reg = Regexp.new('[\w\-_]*\.seg$')
+      coder = HTMLEntities.new
+      span_element = ""
+      prev_start_time = 0
+      f = File.new(x)
+      puts f.path
+      f.each("\n") { |line|
+        arr = line.split(",")
+#        p arr[8]
+        start_time = arr[2].to_i/1000
+      if start_time == prev_start_time # append word
+#        span_element << " #{coder.encode(arr[8].strip, :basic)}"
+        span_element << " #{arr[8].strip}"
+      else # create a new span_element
+        puts span_element << "</span> "
+#        span_element = "<span id='T#{start_time}'>#{coder.encode(arr[8].strip, :basic)}"
+        span_element = "<span id='T#{start_time}'>#{arr[8].strip}"
+        prev_start_time = start_time
+      end
+#       puts line.dump
+#        puts "<span id='T#{start_time}'>#{arr[6]}</span>"
+      }
+#      puts f
+#      root.elements.each("/document/lecture/segment") do |s|
+#        span_element = ""
+#        prev_start_time = 0
+#        s.text.scan(/^\d* \d* [\w']*$/) { |t|
+## get the start time and reduce its granularity so that multiple words fall
+## within a <span> element.
+#          start_time = t.scan(/^\d*/)[0].to_i/1000
+#          if start_time == prev_start_time # append word
+#            span_element << " #{t.scan(/[\w']*$/)}"
+#          else # create a new span_element
+#            puts span_element << "</span> "
+#            span_element = "<span id='T#{start_time}'>#{t.scan(/[\w']*$/)}"
+#            prev_start_time = start_time
+#          end
+##          puts "<span id='T#{t.scan(/^\d*/)[0].to_i/1000}'>#{t.scan(/[\w']*$/)}</span> "
+#        }
+#      end
+      puts
+  }
+end

data/lib/sm_transcript/process_seg_files.rb ADDED

@@ -0,0 +1,21 @@
+require "rexml/document"
+require "optparse"
+if __FILE__ == $0
+  Dir.glob("/Users/pwilkins/work/Spoken\ Lecture/IIHS/*.seg") { |x|
+    reg = Regexp.new('[\w\-_]*\.seg$')
+    doc = REXML::Document.new File.open(x)
+    root = doc.root
+    f = reg.match( root.attributes.get_attribute("fileName").value )
+    root.elements.each("/document/lecture/segment") do |s|
+      s.text.scan(/^\d* \d* [\w']*$/) { |t|
+      puts f[0] + "," + s.attributes.get_attribute("id").value + "," + t.gsub(/ /, ',')
+    }
+  end
+  puts
+}
+end

data/lib/sm_transcript/process_seg_files_to_csv.rb ADDED

@@ -0,0 +1,24 @@
+require "rexml/document"
+if __FILE__ == $0
+  Dir.glob("/Users/pwilkins/work/Spoken\ Lecture/IIHS/*.seg") { |x|
+      reg = Regexp.new('[\w\-_]*\.seg$')
+      doc = REXML::Document.new File.open(x)
+      root = doc.root
+      f = reg.match( root.attributes.get_attribute("fileName").value )
+      root.elements.each("/document/lecture/segment") do |s|
+      print "Filename (do not edit),Segment (do not edit),Start Time (do not edit),End Time (do not edit),Unedited Word (do not edit),Edited English Word,Hindi Word(s)"
+      puts
+        s.text.scan(/^\d* \d* [\w']*$/) { |t|
+          puts f[0] + "," + s.attributes.get_attribute("id").value + "," + t.gsub(/ /, ',')
+        }
+      end
+      puts
+  }
+end
+# Header for csv file:
+# Filename (do not edit),Segment (do not edit),Start Time (do not edit),End Time (do not edit),Unedited Word (do not edit),Edited English Word,Hindi Word(s)

data/lib/sm_transcript/process_seg_files_to_html.rb ADDED

@@ -0,0 +1,31 @@
+require "rexml/document"
+if __FILE__ == $0
+  Dir.glob("/Users/pwilkins/work/Spoken\ Lecture/IIHS/*.seg") { |x|
+      reg = Regexp.new('[\w\-_]*\.seg$')
+      doc = REXML::Document.new File.open(x)
+      root = doc.root
+      f = reg.match( root.attributes.get_attribute("fileName").value )
+      puts f
+      root.elements.each("/document/lecture/segment") do |s|
+        span_element = ""
+        prev_start_time = 0
+        s.text.scan(/^\d* \d* [\w']*$/) { |t|
+# get the start time and reduce its granularity so that multiple words fall
+# within a <span> element.
+          start_time = t.scan(/^\d*/)[0].to_i/1000
+          if start_time == prev_start_time # append word
+            span_element << " #{t.scan(/[\w']*$/)}"
+          else # create a new span_element
+            puts span_element << "</span> "
+            span_element = "<span id='T#{start_time}'>#{t.scan(/[\w']*$/)}"
+            prev_start_time = start_time
+          end
+#          puts "<span id='T#{t.scan(/^\d*/)[0].to_i/1000}'>#{t.scan(/[\w']*$/)}</span> "
+        }
+      end
+      puts
+  }
+end

data/lib/sm_transcript/require_relative.rb ADDED

@@ -0,0 +1,14 @@
+# This file implements the Ruby 1.9 feature 'require_relative'.  It is needed
+# only when running Ruby 1.8, so that code doesn't break.  It is taken from
+# the Pickaxe book for 1.9.
+def require_relative(relative_feature)
+  c = caller.first
+  fail "Can't parse #{c}" unless c.rindex(/:\d+(:in '.*')?$/)
+  file = $`
+  if /\A\((.*)\)/ =~ file # eval, etc.
+    raise LoadError, "require_relative is called in #{$1}"
+  end
+  absolute = File.expand_path(relative_feature, File.dirname(file))
+  require absolute
+end

data/lib/sm_transcript/runner.rb ADDED

@@ -0,0 +1,70 @@
+# $Id: runner.rb 183 2010-03-15 19:07:50Z pwilkins $
+# Copyright (c) 2010 Massachusetts Institute of Technology
+# see LICENSE.txt for license text
+require 'rubygems'
+require 'extensions/kernel'
+require_relative 'options'
+require_relative 'seg_reader'
+require_relative 'wrd_reader'
+require_relative 'transcript'
+require_relative 'metadata'
+require_relative 'metadata_reader'
+module SmTranscript
+  class Runner
+    attr_reader :options
+    def initialize(argv)
+      @options = Options.new(argv)
+    end
+    def run
+      # collect files to process
+      begin
+        raise "source directory doesn't exist" unless FileTest.exists?(@options.srcdir)
+        raise "destination directory doesn't exist" unless FileTest.exists?(@options.destdir)
+        # process each file in srcdir whose extension is the same as srctype
+        Dir.glob("#{@options.srcdir}/*.#{@options.srctype}") do |x|
+          case @options.srctype
+          when SmTranscript::Options::SEG_SRC_TYPE
+            words = SmTranscript::SegReader.from_file(x).words
+          when SmTranscript::Options::TXT_SRC_TYPE
+            md = SmTranscript::MetadataReader.from_file(x).metadata
+          else SmTranscript::Options::WRD_SRC_TYPE
+            words = SmTranscript::WrdReader.from_file(x).words
+          end
+          trans = SmTranscript::Transcript.new(words)
+          meta  = SmTranscript::Metadata.new(md)
+          destfile = File.basename( x, @options.srctype)
+          case @options.desttype
+          when SmTranscript::Options::HTML_DEST_TYPE
+            raise "txt invalid srctype for html desttype" if @options.srctype ==
+              SmTranscript::Options::TXT_SRC_TYPE
+            destfile = "#{destfile}t1.html"
+            # p "destfile is #{destfile}"
+            trans.write_html("#{@options.destdir}/#{destfile}")
+          when SmTranscript::Options::DATAJS_DEST_TYPE
+            raise "txt is only valid srctype for datajs desttype" unless @options.srctype ==
+              SmTranscript::Options::TXT_SRC_TYPE
+            destfile = "#{destfile}data.js"
+            # p "destfile is #{destfile}"
+            meta.write_datajs("#{@options.destdir}/#{destfile}")
+          else
+            destfile = "#{destfile}t1.ttml"
+            # p "destfile is #{destfile}"
+            trans.write_ttml("#{@options.destdir}/#{destfile}")
+          end
+        end  # Dir.glob()
+        rescue SystemCallError => e
+          STDERR.puts $!
+        end
+    end  # run
+  end
+end

data/lib/sm_transcript/seg_reader.rb ADDED

@@ -0,0 +1,42 @@
+# $Id: seg_reader.rb 182 2010-03-12 22:07:34Z pwilkins $
+# Copyright (c) 2010 Massachusetts Institute of Technology
+# see LICENSE.txt for license text
+require "rexml/document"
+require 'extensions/kernel'
+require_relative 'word'
+module SmTranscript
+  class SegReader
+    attr_reader :metadata
+    attr_reader :words
+    def self.from_file(file_name)
+      # p File.expand_path(file_name)
+      new(REXML::Document.new File.open(file_name))
+    end
+    def initialize(xml_doc)
+      @metadata = {}
+      @words = []
+      @root = xml_doc.root
+      parse_metadata()
+      parse_words()
+    end
+    def parse_metadata()
+      reg = Regexp.new('[\w\-_]*\.seg$')
+      @metadata["orig_seg_path"] =  # absolute path to segfile on processor
+      reg.match(@root.attributes.get_attribute("fileName").value)
+    end
+    def parse_words()
+      @root.elements.each("/document/lecture/segment") do |s|
+        s.text.scan(/^\d* \d* [\w']*$/) do |t|
+          arr = t.split
+          @words << SmTranscript::Word.new(arr[0], arr[1], arr[2])
+        end
+      end
+    end
+  end
+end

data/lib/sm_transcript/transcript.rb ADDED

@@ -0,0 +1,130 @@
+# $Id: transcript.rb 182 2010-03-12 22:07:34Z pwilkins $
+# Copyright (c) 2010 Massachusetts Institute of Technology
+# see LICENSE.txt for license text
+require "rexml/document"
+require 'extensions/kernel'
+require 'builder'
+require_relative 'word'
+module SmTranscript
+  class Transcript
+    @words = Array.new()
+    def initialize(word_arr)
+      @metadata = {}
+      @words = word_arr
+    end
+    def write_html(dest_file)
+      # TODO: Do we want to notify user when overwriting existing file?
+      # if File.exists?(dest_file)
+      #   p "overwriting existing destination file"
+      # end
+      File.open(dest_file, "w") do |f|
+        span_element = ""
+        prev_start_time = 0
+        start_time = 0
+        @words.each do |w|
+          # get the start time and reduce its granularity so that multiple
+          # words fall within a <span> element.
+          start_time = w.start_time.to_i/1000
+          if start_time.to_i == prev_start_time.to_i # append word
+            span_element << " #{w.word}"
+          else # create a new span_element
+            # since prev_start_time is zero on first line, this avoids
+            # writing a closing </span> with no opening <span>
+            f.puts span_element << "</span> " unless prev_start_time == 0
+            span_element = "<span id='T#{start_time}'>#{w.word}"
+            prev_start_time = start_time
+          end
+        end
+        # In the block above, the last word isn't written if
+        # the start_time and prev_start_time are the same.
+        f.puts span_element << "</span> " unless start_time != prev_start_time
+      end
+    end  # write_html()
+    def write_ttml(dest_file)
+      # TODO: Do we want to notify user when overwriting existing file?
+      # if File.exists?(dest_file)
+      #   p "overwriting existing destination file"
+      # end
+      buf = ""
+      bldr = Builder::XmlMarkup.new( :target => buf, :indent => 2 )
+      bldr.instruct!
+      bldr.tt("xmlns" => "http://www.w3.org/2006/04/ttaf1",
+      "xmlns:tts" => "http://www.w3.org/ns/ttml#styling",
+      "xmlns:ttm" => "http://www.w3.org/ns/ttml#metadata",
+      "xml:lang" => "en" ) {
+        bldr.head { |b|
+          b.ttm :title, 'Document Metadata Example'
+          b.ttm :desc,  'This document employs document metadata.'
+        }
+        bldr.body {
+          bldr.div {
+            span_element = ""
+            prev_start_secs = 0
+            start_ms = end_ms = 0
+            start_secs = 0
+            @words.each do |w|
+              # get the start time and reduce its granularity so that multiple
+              # words fall within a span element.
+              start_secs = w.start_time.to_i/1000
+              if start_secs == prev_start_secs # append word
+                end_ms   = w.end_time.to_i
+                span_element << " #{w.word}"
+              else # create a new span_element
+                bldr.p( span_element,
+                "xml:id" => "T#{start_secs.to_s}", "begin" => "#{start_ms.to_s}ms", "end" => "#{end_ms.to_s}ms" )
+                start_ms = w.start_time.to_i
+                end_ms   = w.end_time.to_i
+                span_element = " #{w.word}"
+                prev_start_secs = start_secs
+              end
+            end
+            # In the block above, the last word isn't written if
+            # the start_time and prev_start_time are the same.
+            bldr.p( span_element,
+              "xml:id" => "T#{start_secs.to_s}",
+              "begin" => "#{start_ms.to_s}ms",
+              "end" => "#{end_ms.to_s}ms" ) unless start_secs != prev_start_secs
+          }
+        }
+      }
+      # p buf
+      File.open(dest_file, "w") do |f|
+        f.puts buf
+        f.flush
+      end
+    end
+    # Times are expressed in milliseconds, far more granularity than is
+    # useful for most user-facing apps, especially since the player reports
+    # elapsed time only ten times a second.
+    # By reducing the time by orders of magnitude provides these benefits:
+    # 1) Multiple words fall within a <span> element.
+    # 2) Better mapping between start times and player time tracking
+    def words_to_phrase(start_time)
+      start_time.to_i/1000
+    end  # words_to_phrase
+    def get_time_expression(milliseconds)
+      milliseconds
+    end
+    # There are some word combinations that occur with such regularity that
+    # they call out to be fixed.  For example, "m I t" is unambiguously MIT.
+    # These edits can only be done when the phrase has been assembled.
+    def cleanup_phrase(phrase)
+      phrase
+    end
+  end  # class
+end