sm-transcript 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. data/LICENSE.txt +23 -0
  2. data/README.txt +140 -0
  3. data/Rakefile +31 -0
  4. data/bin/results/PLACEHOLDER.txt +8 -0
  5. data/bin/sm-transcript +12 -0
  6. data/bin/transcripts/PLACEHOLDER.txt +8 -0
  7. data/lib/sm_transcript/LICENSE.txt +23 -0
  8. data/lib/sm_transcript/metadata.rb +69 -0
  9. data/lib/sm_transcript/metadata_reader.rb +56 -0
  10. data/lib/sm_transcript/options.rb +89 -0
  11. data/lib/sm_transcript/optparseExample.rb +113 -0
  12. data/lib/sm_transcript/process_csv_files_to_html.rb +58 -0
  13. data/lib/sm_transcript/process_seg_files.rb +21 -0
  14. data/lib/sm_transcript/process_seg_files_to_csv.rb +24 -0
  15. data/lib/sm_transcript/process_seg_files_to_html.rb +31 -0
  16. data/lib/sm_transcript/require_relative.rb +14 -0
  17. data/lib/sm_transcript/runner.rb +70 -0
  18. data/lib/sm_transcript/seg_reader.rb +42 -0
  19. data/lib/sm_transcript/transcript.rb +130 -0
  20. data/lib/sm_transcript/word.rb +31 -0
  21. data/lib/sm_transcript/wrd_reader.rb +42 -0
  22. data/test/Rakefile +14 -0
  23. data/test/results/IIHS_Diane_Davis_Nov2009.seg +425 -0
  24. data/test/results/NERCOMP-SpokenMedia4.wrd +6791 -0
  25. data/test/results/PLACEHOLDER.txt +8 -0
  26. data/test/results/PLACEHOLDER.txt.ignore +8 -0
  27. data/test/results/vijay_kumar.wrd +1675 -0
  28. data/test/results/wirehair-beetle.txt +6 -0
  29. data/test/test_metadata.rb +39 -0
  30. data/test/test_metadatareader.rb +30 -0
  31. data/test/test_options.rb +47 -0
  32. data/test/test_runner.rb +52 -0
  33. data/test/test_segreader.rb +39 -0
  34. data/test/test_transcript.rb +62 -0
  35. data/test/test_wrdreader.rb +43 -0
  36. data/test/transcripts/IIHS_Diane_Davis_Nov2009-t1.html +148 -0
  37. data/test/transcripts/PLACEHOLDER.txt +8 -0
  38. data/test/transcripts/data.js +24 -0
  39. data/test/transcripts/vijay_kumar-1.-t1.html +557 -0
  40. data/test/transcripts/vijay_kumar-1.t1.html +557 -0
  41. data/test/transcripts/vijay_kumar-t1.html +557 -0
  42. data/test/transcripts/vijay_kumar-t1.ttml +569 -0
  43. data/test/transcripts/vijay_kumar.data.js +2 -0
  44. data/test/transcripts/wirehair-beetle.data.js +3 -0
  45. metadata +234 -0
@@ -0,0 +1,23 @@
1
+ # $Id: LICENSE.txt 192 2010-03-27 01:24:26Z pwilkins $
2
+
3
+ Copyright (c) 2010 Massachusetts Institute of Technology
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
22
+
23
+
@@ -0,0 +1,140 @@
1
+ $Id: README.txt 187 2010-03-22 19:32:07Z pwilkins $
2
+
3
+ sm-transcript reads results of SLS processing and produces transcripts for
4
+ the SpokenMedia browser. For each file in the source folder whose extension
5
+ matches the source type, a file of destination type is created in the
6
+ destination folder. All of these parameters have default values.
7
+
8
+ Requirements:
9
+ sm-transcript is written in Ruby and packaged as a RubyGem. Since Ruby is
10
+ not a compiled language, you will need to have Ruby installed on your
11
+ machine to run sm-transcript. You can determine if Ruby is installed by
12
+ typing "ruby -v" at a terminal prompt. It should return the version of
13
+ Ruby that is installed. If Ruby is not installed on your machine, contact
14
+ me (or your local Ruby wizard) for assistance.
15
+
16
+ Installation:
17
+ You can get sm-transcript as either a RubyGem or as source from svn.
18
+
19
+ The preferred way to install this package is as a Rubygem. You can
20
+ download the gem from this page:
21
+
22
+ http://TBD
23
+
24
+ and install it with this command:
25
+
26
+ sudo gem install [--verbose] <path to gem file>sm-transcript-n.n.n.gem
27
+
28
+ You must use "sudo" to properly install the gem. If you execute "gem
29
+ install" (omitting the "sudo") the gem is installed in your home gem
30
+ repository and it isn't in your path without additional configuration.
31
+
32
+ Note: You need sudo privileges to run the command as written. If you
33
+ can't sudo, then you can install it locally and will need some additional
34
+ configuration. Contact me (or your local Ruby wizard) for assistance.
35
+
36
+ The executable is now in your path.
37
+
38
+ You can cleanly uninstall the gem with this command:
39
+
40
+ sudo gem uninstall sm-transcript
41
+
42
+ If you have access to our svn repository, you are welcome to check out the
43
+ code. Be warned that the trunk tip is not necessarily stable. It changes
44
+ frequently as enhancements (and bug fixes) are added. (note that the
45
+ 'smb_transcript' in the command line below is not a typo. )
46
+
47
+ svn co svn+ssh://svn.mit.edu/oeit-tsa/SMB/smb_transcript/trunk sm_transcript
48
+
49
+ Build the gem by running this command from the directory where you checked out the
50
+ source.
51
+
52
+ rake gem
53
+
54
+ The gem will be built and put in ./pkg. You can now use the gem
55
+ installation instructions above.
56
+
57
+
58
+ Using the App:
59
+ Run with no command line parameters, the app reads *.wrd files out of
60
+ ./results and writes *t1.html files to ./transcripts. These directories
61
+ are relative to where sm_transcript is called.
62
+
63
+ Note: destination files are overwritten without a warning prompt. If you
64
+ want to preserve an existing output file, rename it before running the app
65
+ again.
66
+
67
+ For example, run the app by navigating to the bin folder and running
68
+
69
+ projects/sm_transcript/bin felix$ sm-transcript
70
+
71
+ This command run from this folder will read *.wrd files from bin/results
72
+ and write *-t1.html to bin/transcripts.
73
+
74
+ Usage: sm_transcript [options]
75
+ --srcdir PATH Read files from this folder (Default: ./results)
76
+ --destdir PATH Write files to this folder (Default: ./transcripts)
77
+ --srctype wrd | seg Kind of file to process (Default: wrd)
78
+ --desttype html | ttml Kind of file to output (Default: html)
79
+ -h, --help Show this message
80
+
81
+
82
+ Troubleshooting:
83
+ sm-transcript requires additional gems to operate. The RubyGem
84
+ installation instructions should install dependencies automatically, but
85
+ they may not. If you get an error that includes
86
+
87
+ ... no such file to load -- builder (LoadError)
88
+
89
+ in the first few lines when you run sm-transcript, the problem is a
90
+ missing dependent gem. (the error above indicates that the Builder
91
+ gem is missing.) Try installing the missing gem. For the error above, the
92
+ command looks like this:
93
+
94
+ sudo gem install builder
95
+
96
+ See "Required Gems" below for more information.
97
+
98
+
99
+ Upgrading:
100
+ You can easily upgrade by simply executing the same command you used to
101
+ install the gem. Running install again will add the newer version and make
102
+ it active. By default the most recent version is used, but older versions
103
+ are still available, simply inactive.
104
+
105
+ If you are using svn, you should already know what to do.
106
+
107
+
108
+ Required Gems:
109
+ builder - create structured data, such as XML
110
+ extensions - added for the 'require_relative' command. (To get this
111
+ command in Ruby 1.8 you need to install this gem, for Ruby 1.9
112
+ the command is already part of the core.)
113
+ htmlentities - html parsing
114
+ json - create JSON structured data
115
+ optparse - option parsing
116
+ ostruct - open data structures
117
+ ppcommand - pp is a pretty printer. It is used only for debugging
118
+ rake - make for Ruby
119
+ rubygems - support for gems
120
+ shoulda - enhancement for Test::Unit
121
+
122
+ This command installs gems on OSX and Linux:
123
+ felix$ sudo gem install <gem name>
124
+
125
+ Unit Tests:
126
+ You may run all unit tests by navigating to the test folder and running
127
+ rake with no parameters (the default rake task runs all tests):
128
+
129
+ projects/sm_transcript/test felix$ rake
130
+
131
+
132
+ Release Notes:
133
+ Initial Version - runs under Ruby 1.8.
134
+
135
+ To Do:
136
+ update code to run under Ruby 1.9
137
+
138
+ Make this a rubygem, making it available from an OEIT server, rather than
139
+ from a public gem repository like RubyForge.
140
+
@@ -0,0 +1,31 @@
1
+ # $Id: Rakefile 190 2010-03-26 22:00:32Z pwilkins $
2
+
3
+ require 'rake/gempackagetask'
4
+ require 'rake'
5
+
6
# Glob patterns for everything that ships in the gem, in packaging order.
gem_file_globs = [
  'lib/**/**',
  'bin/sm-transcript',
  'bin/results/PLACEHOLDER.txt',
  'bin/transcripts/PLACEHOLDER.txt',
  'test/**/**',
  'README.txt',
  'LICENSE.txt',
  'Rakefile'
]

# Expand the globs in order, then leave out every path containing "process_".
packaged_files = gem_file_globs.inject([]) { |found, glob| found + Dir[glob] }
packaged_files = packaged_files.reject { |path| path.include? "process_" }

# Gem specification for sm-transcript; the package task below builds from it.
spec = Gem::Specification.new do |gem|
  gem.name = "sm-transcript"
  gem.summary = "Convert word lists to transcripts"
  # The README sitting next to this Rakefile doubles as the gem description.
  gem.description = File.read(File.join(File.dirname(__FILE__), 'README.txt'))
  gem.requirements = [ 'TBD' ]
  gem.version = "0.0.3"
  gem.author = "Peter Wilkins"
  gem.email = "pwilkins@mit.edu"
  gem.homepage = "http://spokenmedia.mit.edu"
  gem.platform = Gem::Platform::RUBY
  gem.required_ruby_version = '>=1.8'
  gem.files = packaged_files
  gem.executables = [ 'sm-transcript' ]
  gem.test_files = Dir["test/test*.rb"]
  gem.has_rdoc = false
end

# Define the packaging tasks for the specification above.
Rake::GemPackageTask.new(spec).define
@@ -0,0 +1,8 @@
1
+ # $Id: PLACEHOLDER.txt 186 2010-03-20 14:21:04Z pwilkins $
2
+
3
+ The bin/results directory provides a default location for input files. The
4
+ directory would be empty, but Subversion and Rubygems won't version an empty
5
+ directory.
6
+
7
+ This file serves no purpose other than tricking Subversion and Rubygems into
8
+ creating what might otherwise be an empty directory.
@@ -0,0 +1,12 @@
1
+ #!/usr/bin/env ruby -KU
2
+ # $Id: sm-transcript 192 2010-03-27 01:24:26Z pwilkins $
3
+ # Copyright (c) 2010 Massachusetts Institute of Technology
4
+ # see LICENSE.txt for license text
5
+
6
+
7
+ require 'rubygems'
8
+ require 'extensions/kernel'
9
+ require_relative '../lib/sm_transcript/runner'
10
+
11
# Hand the command-line arguments to the runner and start processing.
SmTranscript::Runner.new(ARGV).run
@@ -0,0 +1,8 @@
1
+ # $Id: PLACEHOLDER.txt 186 2010-03-20 14:21:04Z pwilkins $
2
+
3
+ The bin/transcripts directory exists to receive transcript files created as
4
+ a result of processing. The directory would be empty, but Subversion and
5
+ Rubygems won't version an empty directory.
6
+
7
+ This file serves no purpose other than tricking Subversion and Rubygems into
8
+ creating what might otherwise be an empty directory.
@@ -0,0 +1,23 @@
1
+ # $Id$
2
+
3
+ Copyright (c) 2010 Massachusetts Institute of Technology
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
22
+
23
+
@@ -0,0 +1,69 @@
1
+ # $Id: metadata.rb 183 2010-03-15 19:07:50Z pwilkins $
2
+ # Copyright (c) 2010 Massachusetts Institute of Technology
3
+ # see LICENSE.txt for license text
4
+
5
+ require "rexml/document"
6
+ require 'extensions/kernel'
7
+ require 'json/ext'
8
+ require_relative 'word'
9
+
10
module SmTranscript
  # Serializes the metadata gathered for one recording into a data.js-like
  # JSON file.
  class Metadata

    # metadata - Hash of metadata values. This class reads the 'name',
    #            'title', 'speaker' and 'video' keys; values are expected to
    #            be Strings and may end in a newline. nil is accepted and is
    #            treated the same as an empty Hash.
    def initialize(metadata)
      @metadata = metadata
    end

    # While the format this method writes is JSON, it writes a
    # data.js-like file for compatibility with the IIHS demo player.
    #
    # dest_file - path of the file to write. An existing file is
    #             overwritten without warning.
    #
    # Nothing is written when the metadata is nil or empty. Keys that are
    # missing from the metadata are emitted as JSON null. Returns nil.
    def write_datajs(dest_file)
      # TODO: Do we want to notify user when overwriting existing file?
      return if @metadata.nil? || @metadata.empty?

      name = chomped('name')
      # The id is the recording name suffixed with a minute-resolution
      # timestamp; it stays nil when no name is available.
      id_val = name.nil? ? nil : "#{name}-#{id_from_time}"

      document = {
        'videos' => [
          {
            'id' => id_val,
            'title' => chomped('title'),
            'speaker' => chomped('speaker'),
            'video' => chomped('video'),
            # The remaining entries are hard-coded for every recording.
            'width' => 320,
            'height' => 240,
            'duration' => 523,
            'preview' => 'preview.png',
            'transcripts' => {'en-US' => 'transcript.trn'},
            'audio' => {'en-US' => '../audio/'},
            'defaultLocale' => 'en-US'
          }
        ],
        'locales' => {'en-US' => 'English (US)'}
      }

      # The block form closes (and therefore flushes) the file for us.
      File.open(dest_file, "w") do |f|
        f.puts JSON.pretty_generate(document)
      end
      nil
    end # write_datajs()

    # Returns the JSON module constant.
    # NOTE(review): this looks like an unfinished stub - confirm whether any
    # caller relies on it before removing.
    def boilerplate()
      JSON
    end

    # Returns the current local time formatted as "yymmddHHMM".
    def id_from_time()
      Time.now.strftime("%y%m%d%H%M")
    end

    private

    # Returns the metadata value stored under +key+ with its trailing line
    # terminator removed, or nil when the key is absent.
    def chomped(key)
      value = @metadata[key]
      value.nil? ? nil : value.chomp
    end

  end
end
@@ -0,0 +1,56 @@
1
+ # $Id: metadata_reader.rb 182 2010-03-12 22:07:34Z pwilkins $
2
+ # Copyright (c) 2010 Massachusetts Institute of Technology
3
+ # see LICENSE.txt for license text
4
+
5
+ require 'rubygems'
6
+ require 'extensions/kernel'
7
+ require_relative 'word'
8
+
9
module SmTranscript
  # Reads the header lines of a metadata file into a Hash.
  #
  # The first six lines of the source are stored, in order, under the keys
  # listed in FIELD_NAMES. Lines are stored exactly as read, including any
  # trailing newline. Everything after the sixth line is ignored.
  class MetadataReader
    # Metadata keys, in the order their values appear in the source
    # (line 1 is 'name', line 2 is 'email', and so on).
    FIELD_NAMES = %w[name email org title speaker video].freeze

    # A first line containing this text marks a PLACEHOLDER.txt file, which
    # is not processed.
    PLACEHOLDER_PATTERN = /PLACEHOLDER/

    attr_reader :metadata
    attr_reader :words

    # Builds a reader from the file at +file_name+.
    #
    # The file is opened in block form so the handle is closed as soon as
    # the header has been parsed, even if parsing raises.
    def self.from_file(file_name)
      File.open(file_name) { |src| new(src) }
    end

    # src_file - an open, line-oriented source that responds to #each and
    #            #lineno (for example a File).
    def initialize(src_file)
      @metadata = {}
      @words = []
      parse_metadata(src_file)
      # parse_words(src_file)
    end

    # Fills @metadata from the first lines of +src_file+.
    #
    # Leaves @metadata empty when the first line identifies a placeholder
    # file; stops reading once every field in FIELD_NAMES has been seen.
    def parse_metadata(src_file)
      src_file.each do |ln|
        line_no = src_file.lineno
        # we're only interested in the 1st 6 lines
        break unless (1..FIELD_NAMES.size).include?(line_no)
        # don't process PLACEHOLDER.txt
        return if line_no == 1 && PLACEHOLDER_PATTERN.match(ln)
        @metadata[FIELD_NAMES[line_no - 1]] = ln
      end
    end

    # Intentionally empty: there is currently no timed text in metadata
    # files, so @words stays empty.
    def parse_words(src_file)
    end
  end
end
@@ -0,0 +1,89 @@
1
+ # $Id: options.rb 183 2010-03-15 19:07:50Z pwilkins $
2
+ # Copyright (c) 2010 Massachusetts Institute of Technology
3
+ # see LICENSE.txt for license text
4
+
5
+ require "optparse"
6
+ require 'ostruct'
7
+
8
module SmTranscript
  # Command-line options for sm-transcript.
  #
  # Every option has a default, so an Options instance is always fully
  # populated. The parsed values are available both through the individual
  # readers (srcdir, destdir, srctype, desttype) and through the +options+
  # OpenStruct, which carries the same four values.
  class Options

    SEG_SRC_TYPE = 'seg'
    WRD_SRC_TYPE = 'wrd'
    TXT_SRC_TYPE = 'txt'
    TTML_DEST_TYPE = 'ttml'
    HTML_DEST_TYPE = 'html'
    DATAJS_DEST_TYPE = 'datajs'
    DEFAULT_SRC_DIR = "./results"
    DEFAULT_DEST_DIR = "./transcripts"
    DEFAULT_SRC_TYPE = WRD_SRC_TYPE
    DEFAULT_DEST_TYPE = HTML_DEST_TYPE

    attr_reader :srcdir
    attr_reader :destdir
    attr_reader :srctype
    attr_reader :desttype
    attr_reader :options

    # argv - Array of command-line arguments (typically ARGV). Recognized
    #        options are removed from it while parsing.
    def initialize(argv)
      @options = OpenStruct.new
      @srcdir = DEFAULT_SRC_DIR
      @destdir = DEFAULT_DEST_DIR
      @srctype = DEFAULT_SRC_TYPE
      @desttype = DEFAULT_DEST_TYPE
      parse(argv)
    end

    # Parses +argv+ and records the chosen values.
    #
    # An empty +argv+ is treated as "-h". Printing the help text ends
    # parsing immediately and returns nil; otherwise the OptionParser is
    # returned. An unrecognized or malformed option prints the error and the
    # usage text to STDERR and exits the process with status -1.
    def parse(argv)
      @options.srcdir = @srcdir
      @options.destdir = @destdir
      @options.srctype = @srctype
      @options.desttype = @desttype

      parser = OptionParser.new do |opts|
        opts.banner = "Usage: sm_transcript [options]"
        opts.separator " "
        opts.separator "Specific options:"

        # The defaults shown in the help text are taken from the constants
        # above so the two cannot drift apart.
        opts.on(
          "--srcdir PATH",
          "Read files from this folder (Default: #{DEFAULT_SRC_DIR})") do |sdir|
          @options.srcdir = @srcdir = sdir
        end

        opts.on(
          "--destdir PATH",
          String,
          "Write files to this folder (Default: #{DEFAULT_DEST_DIR})") do |ddir|
          @options.destdir = @destdir = ddir
        end

        opts.on("--srctype seg | wrd | txt",
          "Kind of file to process (Default: #{DEFAULT_SRC_TYPE})") do |stype|
          @options.srctype = @srctype = stype
        end

        opts.on("--desttype html | ttml | datajs",
          "Kind of format to output (Default: #{DEFAULT_DEST_TYPE})") do |dtype|
          @options.desttype = @desttype = dtype
        end

        opts.on("-h", "--help", "Show this message") do
          puts "\n#{opts}"
          # Leaves #parse itself: this block is only ever called while
          # parse! below is running, so the enclosing method is still active.
          return
        end
      end

      begin
        argv = ["-h"] if argv.empty?
        parser.parse!(argv)
      rescue OptionParser::ParseError => e
        STDERR.puts e.message, "\n", parser
        exit(-1)
      end

      parser
    end
  end
end
87
+
88
+
89
+