subtitle 0.2.6 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 99d9960e9d5fb5daa7ddc87e76db91c654ce68453a13fd9124611e7589cc7fe5
4
- data.tar.gz: '090e594747871afd7e3e423dab00154313d5b77423664a300f499df07dd895c0'
3
+ metadata.gz: 49e45fb2713aedd5d6d7d6d290fe4874a292df3249cfad7259913e90b0cb7fd8
4
+ data.tar.gz: 7f4535875a19028db4ec08de90903daba7b906b659571c5a921850071bf3154c
5
5
  SHA512:
6
- metadata.gz: 1545c3120f496fe10228473f45f5bb3a78135fd603d15eed6eee7a2aa85fc281f53f4fc5f612c3ab4d710baa9388abd6528c8f5aff52f3fb3bfe424e61c05e2c
7
- data.tar.gz: 38fc75d66ea829a7596d5a7235fd6d7f43d256a3dbb1b8ae136926b92ca83b7cbf041c290cf4e506e032d987e73b9a56761cfb89596d6131aafc5d5f87453732
6
+ metadata.gz: 0b53144f0a627a545c0a989d664f3611078a7afc11e9f00b479065a0d3a1b2bc9bf68e10706bd89b85c0e73ff53d7c4627a2c5e29d38867ec2882c99ea56eda0
7
+ data.tar.gz: 6999ae152b2f5904a2061944522b11387280df6e02cbf4d36d7d5ae27ba12eb3b6c36357bba791c5a7b4f06575531ce76df904dcbb76b0e5f79bd62b05988704
data/lib/allfather.rb CHANGED
@@ -1,3 +1,6 @@
1
+ require 'fileutils'
2
+ require_relative "engines/translator"
3
+
1
4
  #
2
5
  # A Module that kind of acts as an interface where the generic methods
3
6
  # that applies to each caption type can be defined
@@ -12,6 +15,15 @@ module AllFather
12
15
  #
13
16
  VALID_FILES = [".scc", ".srt", ".vtt", ".ttml", ".dfxp"]
14
17
 
18
+ #
19
+ # Caption type constants
20
+ #
21
+ TYPE_SCC = 1
22
+ TYPE_SRT = 2
23
+ TYPE_VTT = 3
24
+ TYPE_TTML = 4
25
+ TYPE_DFXP = 5
26
+
15
27
  #
16
28
  # Generic exception class that is raised for validation errors
17
29
  #
@@ -45,12 +57,23 @@ module AllFather
45
57
  raise "Not Implemented. Class #{self.class.name} doesn't implement infer_languages"
46
58
  end
47
59
 
60
+
61
+ #
62
+ # Method to set a translation engine
63
+ #
64
+ # * +translator+ - Instance of translation engine. Refer to `engines/aws` for example
65
+ #
66
+ def set_translator(translator)
67
+ if translator && !(translator.is_a? Translator)
68
+ raise "Argument is not an instance of Translator"
69
+ end
70
+ end
71
+
48
72
  #
49
73
  # Method to translate the caption from one language to another
50
74
  #
51
75
  # :args: src_lang, target_lang, output_file
52
76
  #
53
- # * +input_caption+ - A Valid input caption file. Refer to #is_valid?
54
77
  # * +src_lang+ - can be inferred using #infer_language method
55
78
  # * +target_lang+ - Target 2 letter ISO language code to which the source needs to be translated in to.
56
79
  # * +output_file+ - Output file. Can be a fully qualified path or just file name
@@ -80,4 +103,55 @@ module AllFather
80
103
  # Further checks can be done only in caption specific implementations
81
104
  # or translation engine specific implementation
82
105
  end
106
+
107
+ #
108
+ # Method to convert from one caption type to other types. If the src_lang is not provided
109
+ # then all source languages will be converted to target types. For example, if a ttml file
110
+ # has "en" and "es" and target_type is vtt and no src_lang is provided 2 vtt files would be
111
+ # created one per language in the source. If a target_lang is provided then one of the languages
112
+ # from source would be picked for creating the output file with target_lang
113
+ #
114
+ # If no target_lang is provided, no translations are applied. The output files are created
115
+ # without any need for any language translation services, and hence incur no cost.
116
+ #
117
+ # * +types+ - An array of valid target caption type(s). Refer to the +TYPE_+ constants defined in this module
118
+ # * +src_lang+ - can be inferred using #infer_languages method
119
+ # * +target_lang+ - Target 2 letter ISO language code into which the source needs to be translated.
120
+ # * +output_dir+ - Output Directory. Generated files would be dumped here
121
+ #
122
+ # ==== Raises
123
+ #
124
+ # InvalidInputException shall be raised if
125
+ # 1. The input file doesn't exist or is unreadable or is invalid caption
126
+ # 2. The output dir doesn't exist
127
+ # 3. Invalid lang codes for a given caption type
128
+ # 4. Unsupported type to which conversion is requested for
129
+ #
130
+ def transform_to(types, src_lang, target_lang, output_dir)
131
+ if (types - supported_transformations).size != 0
132
+ raise InvalidInputException.new("Unknown types provided for conversion in input #{types}")
133
+ end
134
+ unless File.directory?(output_dir)
135
+ FileUtils.mkdir_p(output_dir)
136
+ end
137
+ # Basic validations
138
+ if types.include?(TYPE_SCC)
139
+ if target_lang && !target_lang.eql?("en")
140
+ raise InvalidInputException.new("SCC can be generated only in en. #{target_lang} is unsupported")
141
+ end
142
+ end
143
+ if target_lang && !target_lang.empty?
144
+ raise InvalidInputException.new("Translation to other language as part of transform is yet to be implemented")
145
+ end
146
+ end
147
+
148
+ #
149
+ # Method to report on the supported transformations. Each implementor is free to return
150
+ # the types to which it can convert itself to
151
+ #
152
+ # Returns an array of one or more types defined as +TYPE_+ constants here
153
+ #
154
+ def supported_transformations
155
+ raise "Not Implemented. Class #{self.class.name} doesn't implement supported_transformations"
156
+ end
83
157
  end
data/lib/dfxp.rb CHANGED
@@ -10,10 +10,10 @@ require_relative "ttml"
10
10
  #
11
11
  class DFXP < TTML
12
12
 
13
- def initialize(cc_file, translator, opts={})
13
+ def initialize(cc_file)
14
14
  @cc_file = cc_file
15
- @translator = translator
16
- @force_detect = opts[:force_detect] || false
15
+ #@translator = translator
16
+ #@force_detect = opts[:force_detect] || false
17
17
  raise "Invalid TTML file provided" unless is_valid?
18
18
  end
19
19
 
data/lib/engines/aws.rb CHANGED
@@ -10,10 +10,9 @@ require_relative 'translator'
10
10
  # == Credential Referencing Order
11
11
  #
12
12
  # * [Arguments] - Pass the credentials access_key_id and secret_access_key as arguments
13
- # * [Environment route] - AWS_ACCESS_KEY_ID & AWS_SECRET_ACCESS_KEY can be exposed as
14
- # environment variables
13
+ # * [Environment route] - AWS_ACCESS_KEY_ID & AWS_SECRET_ACCESS_KEY can be exposed as environment variables
15
14
  # * [Profile Name] - The application uses the credentials of the system and picks the
16
- # credentials referred to by the profile
15
+ # credentials referred to by the profile
17
16
  #
18
17
  class AwsEngine
19
18
  include Translator
@@ -71,9 +70,6 @@ class AwsEngine
71
70
  # Invokes the language detection API of AWS and returns only the language
72
71
  # of the highest score and returns the ISO 639-1 code
73
72
  #
74
- # :args: text
75
- #
76
- # ===== Arguments
77
73
  # * +text+ - The text for which the language is to be inferred
78
74
  #
79
75
  def infer_language(text)
@@ -83,12 +79,10 @@ class AwsEngine
83
79
 
84
80
  #
85
81
  # Invokes the translation API of AWS and returns the translated text
86
- # as per the arguments provided
82
+ # as per the arguments provided.
87
83
  # Will Raise exception if a translation cannot be made between the source
88
84
  # and target language codes or if the lang code is invalid
89
85
  #
90
- # :args: input_text, src_lang, target_lang
91
- #
92
86
  # * +input_text+ - The text that needs to be translated
93
87
  # * +src_lang+ - The source language of the text
94
88
  # * +target_lang+ - The target language to which the input_text needs to be translated to
data/lib/scc.rb CHANGED
@@ -1,4 +1,6 @@
1
1
  require_relative "engines/translator"
2
+ require_relative "utils/common_utils"
3
+ require_relative "utils/cue_info"
2
4
  require_relative "allfather"
3
5
 
4
6
  #
@@ -10,10 +12,12 @@ require_relative "allfather"
10
12
  class SCC
11
13
 
12
14
  include AllFather
15
+ include CommonUtils
13
16
 
14
- def initialize(cc_file, translator)
17
+ SUPPORTED_TRANSFORMATIONS = [TYPE_SRT, TYPE_VTT, TYPE_TTML, TYPE_DFXP]
18
+
19
+ def initialize(cc_file)
15
20
  @cc_file = cc_file
16
- @translator = translator
17
21
  raise "Invalid SCC file provided" unless is_valid?
18
22
  end
19
23
 
@@ -25,6 +29,11 @@ class SCC
25
29
  return false
26
30
  end
27
31
 
32
+ def set_translator(translator)
33
+ super(translator)
34
+ @translator = translator
35
+ end
36
+
28
37
  def infer_languages
29
38
  lang = nil
30
39
  begin
@@ -40,6 +49,71 @@ class SCC
40
49
  raise "Not Implemented. Class #{self.class.name} doesn't implement translate yet !!"
41
50
  end
42
51
 
52
+ def supported_transformations
53
+ return SUPPORTED_TRANSFORMATIONS
54
+ end
55
+
56
+ def transform_to(types, src_lang, target_lang, output_dir)
57
+ # Let's start off with some validations
58
+ super(types, src_lang, target_lang, output_dir)
59
+
60
+ # Suffix output dir with File seperator
61
+ output_dir = "#{output_dir}#{File::Separator}" unless output_dir.end_with?(File::Separator)
62
+
63
+ # Prepare the output files for each type
64
+ file_map = {}
65
+ types.each do |type|
66
+ output_file = File.basename(@cc_file, File.extname(@cc_file)) + extension_from_type(type)
67
+ out_file = "#{output_dir}#{output_file}"
68
+ if create_file(TYPE_SCC, type, out_file, target_lang)
69
+ file_map[type] = out_file
70
+ else
71
+ raise StandardError.new("Failed to create output file for type #{type}")
72
+ end
73
+ end
74
+
75
+ # Read the file and prepare the cue model
76
+ prev_cue_info = cur_cue_info = nil
77
+ ccfile = File.open(@cc_file, 'r:UTF-8', &:read)
78
+ cue_index = 1
79
+ ccfile.each_line do | line |
80
+ time_point = line.scan(/(^\d\d:\d\d:\d\d:\d\d\s)(.*)/)
81
+ unless time_point.empty?
82
+ scc_text_code = time_point[0][1].strip
83
+ message = decode(scc_text_code)
84
+ # Replace \u0000 with empty as this causes the ttml / dfxp outputs
85
+ # to treat them as end and terminates the xml the moment this is encountered
86
+ # https://github.com/sparklemotion/nokogiri/issues/1535
87
+ message = message.gsub(/\u0000/, '')
88
+ if prev_cue_info.nil?
89
+ prev_cue_info = CueInfo.new(TYPE_SCC)
90
+ prev_cue_info.index = cue_index
91
+ prev_cue_info.message = message
92
+ prev_cue_info.start = time_point[0][0].strip
93
+ else
94
+ cur_cue_info = CueInfo.new(TYPE_SCC)
95
+ cur_cue_info.index = cue_index
96
+ cur_cue_info.message = message
97
+ cur_cue_info.start = time_point[0][0].strip
98
+ # Set the previous cue info's end time to current cue's start time
99
+ # TODO: Need to see if we need to reduce at least 1 fps or 1s
100
+ prev_cue_info.end = cur_cue_info.start
101
+ prev_cue_info.start_time_units = time_details(prev_cue_info.start, TYPE_SCC)
102
+ prev_cue_info.end_time_units = time_details(prev_cue_info.end, TYPE_SCC)
103
+ write_cue(prev_cue_info, file_map)
104
+ prev_cue_info = cur_cue_info
105
+ end
106
+ cue_index += 1
107
+ end
108
+ end
109
+ # we need to set some end time, but don't know the same !!
110
+ # for now setting the start time itself
111
+ cur_cue_info.end = cur_cue_info.start
112
+ cur_cue_info.start_time_units = time_details(cur_cue_info.start, TYPE_SCC)
113
+ cur_cue_info.end_time_units = time_details(cur_cue_info.end, TYPE_SCC)
114
+ write_cue(cur_cue_info, file_map, true)
115
+ end
116
+
43
117
  private
44
118
 
45
119
  def get_text(srt_file, num_chars)
@@ -78,20 +152,4 @@ class SCC
78
152
  end
79
153
  decoded_text
80
154
  end
81
-
82
- def encode(free_text)
83
- encoded_str = ""
84
- count = 0
85
- free_text.each_byte do |char|
86
- count += 1
87
- binval = char.to_s(2).count("1") % 2 == 0 ? (char.to_i | 128 ).to_s(2) : char.to_s(2)
88
- encode_char = binval.to_i(2).to_s(16)
89
- if ((count > 0) && (count % 2 == 0))
90
- encoded_str << encode_char << " "
91
- else
92
- encoded_str << encode_char
93
- end
94
- end
95
- encoded_str
96
- end
97
155
  end
data/lib/srt.rb CHANGED
@@ -1,4 +1,6 @@
1
1
  require_relative "engines/translator"
2
+ require_relative "utils/common_utils"
3
+ require_relative "utils/cue_info"
2
4
  require_relative "allfather"
3
5
 
4
6
  #
@@ -10,10 +12,12 @@ require_relative "allfather"
10
12
  class SRT
11
13
 
12
14
  include AllFather
15
+ include CommonUtils
13
16
 
14
- def initialize(cc_file, translator)
17
+ SUPPORTED_TRANSFORMATIONS = [TYPE_SCC, TYPE_VTT, TYPE_TTML, TYPE_DFXP]
18
+
19
+ def initialize(cc_file)
15
20
  @cc_file = cc_file
16
- @translator = translator
17
21
  raise "Invalid SRT file provided" unless is_valid?
18
22
  end
19
23
 
@@ -25,6 +29,11 @@ class SRT
25
29
  return false
26
30
  end
27
31
 
32
+ def set_translator(translator)
33
+ super(translator)
34
+ @translator = translator
35
+ end
36
+
28
37
  def translate(src_lang, dest_lang, out_file)
29
38
  super(src_lang, dest_lang, out_file)
30
39
  begin
@@ -60,7 +69,6 @@ class SRT
60
69
  outfile.puts
61
70
  end
62
71
  ensure
63
- ccfile.close rescue nil
64
72
  outfile.close
65
73
  end
66
74
  end
@@ -76,6 +84,66 @@ class SRT
76
84
  [lang]
77
85
  end
78
86
 
87
+ def supported_transformations
88
+ return SUPPORTED_TRANSFORMATIONS
89
+ end
90
+
91
+ def transform_to(types, src_lang, target_lang, output_dir)
92
+ # Let's start off with some validations
93
+ super(types, src_lang, target_lang, output_dir)
94
+
95
+ # Suffix output dir with File seperator
96
+ output_dir = "#{output_dir}#{File::Separator}" unless output_dir.end_with?(File::Separator)
97
+
98
+ # Prepare the output files for each type
99
+ file_map = {}
100
+ types.each do |type|
101
+ output_file = File.basename(@cc_file, File.extname(@cc_file)) + extension_from_type(type)
102
+ out_file = "#{output_dir}#{output_file}"
103
+ if create_file(TYPE_SRT, type, out_file, target_lang)
104
+ file_map[type] = out_file
105
+ else
106
+ raise StandardError.new("Failed to create output file for type #{type}")
107
+ end
108
+ end
109
+
110
+ # Read the file and prepare the cue model
111
+ cue_info = nil
112
+ ccfile = File.open(@cc_file, 'r:UTF-8', &:read)
113
+ message = ""
114
+ ccfile.each_line do | line |
115
+ # p line
116
+ next if line.strip.empty?
117
+ time_points = line.scan(/^((\d\d:)\d\d:\d\d[,.]\d\d\d.*)-->.*((\d\d:)\d\d:\d\d[,.]\d\d\d)/)
118
+ if time_points.empty?
119
+ # This is not a time point
120
+ seq = line.strip
121
+ if seq.to_i > 0
122
+ cue_info.message = message unless message.empty?
123
+ write_cue(cue_info, file_map) if cue_info
124
+ cue_info = CueInfo.new(TYPE_SRT)
125
+ cue_info.sequence = seq
126
+ # Reset the message
127
+ message = ""
128
+ else
129
+ # This is neither a sequence number nor a time point
130
+ # Grab the details until we find next cue point
131
+ message << line
132
+ end
133
+ else
134
+ # This is a cue point. Fetch timestamps
135
+ cue_info.start = time_points[0][0]
136
+ cue_info.end = time_points[0][2]
137
+ start_units = time_details(cue_info.start, TYPE_SRT)
138
+ end_units = time_details(cue_info.end, TYPE_SRT)
139
+ cue_info.start_time_units = start_units
140
+ cue_info.end_time_units = end_units
141
+ end
142
+ end
143
+ cue_info.message = message unless message.empty?
144
+ write_cue(cue_info, file_map, true)
145
+ end
146
+
79
147
  private
80
148
 
81
149
  #
@@ -103,4 +171,4 @@ class SRT
103
171
  end
104
172
  return text_sample[0, num_chars]
105
173
  end
106
- end
174
+ end
data/lib/subtitle.rb CHANGED
@@ -7,21 +7,29 @@ require_relative "allfather"
7
7
  require_relative "engines/translator"
8
8
  require_relative "engines/aws"
9
9
 
10
-
10
+ #
11
+ # Facade that wraps all the complexities surrounding which translation
12
+ # engine to use or which caption instances to be instantiated.
13
+ #
11
14
  class Subtitle
12
- def initialize(options={})
15
+
16
+ TYPE_MAP = {"scc" => AllFather::TYPE_SCC, "srt" => AllFather::TYPE_SRT, "vtt" => AllFather::TYPE_VTT,
17
+ "ttml" => AllFather::TYPE_TTML, "dfxp" => AllFather::TYPE_DFXP}
18
+
19
+ def initialize(file, options = nil)
13
20
  # Infer the caption handler from the extension
14
- @cc_file = options[:cc_file]
21
+ @cc_file = file
15
22
  raise "Input caption not provided. Please provide the same in :cc_file option" if @cc_file.nil?
16
- translator = get_translator(options)
17
- @handler = get_caption_handler(options, translator)
23
+ initialize_handler(options) unless options.nil?
18
24
  end
19
25
 
20
- def detect_language
26
+ def detect_language(options = nil)
27
+ initialize_handler(options) if @handler.nil?
21
28
  @handler.infer_languages
22
29
  end
23
30
 
24
- def translate(dest_lang, src_lang = nil, outfile = nil)
31
+ def translate(dest_lang, src_lang = nil, outfile = nil, options = nil)
32
+ initialize_handler(options) if @handler.nil?
25
33
  if outfile.nil?
26
34
  outfile = "#{@cc_file}_#{dest_lang}"
27
35
  end
@@ -33,13 +41,40 @@ class Subtitle
33
41
  outfile
34
42
  end
35
43
 
44
+ def transform(types, src_lang = nil, target_lang = nil, options = nil)
45
+ # A quick validation & translation to expected arguments
46
+ vals = []
47
+ invalid_vals = []
48
+ types.each do |type|
49
+ type_val = TYPE_MAP[type]
50
+ if type_val.nil?
51
+ invalid_vals << type
52
+ next
53
+ end
54
+ vals << type_val
55
+ end
56
+ unless invalid_vals.empty?
57
+ raise "Invalid types #{invalid_vals} provided"
58
+ end
59
+ # Translator not required if target_lang is nil
60
+ if @handler.nil?
61
+ if target_lang.nil?
62
+ @handler = get_caption_handler(options, nil)
63
+ else
64
+ initialize_handler(options)
65
+ end
66
+ end
67
+ output_dir = options[:outfile]
68
+ @handler.transform_to(vals, src_lang, target_lang, output_dir)
69
+ end
70
+
36
71
  def type
37
72
  type = nil
38
73
  ccfile = File.open(@cc_file, 'r:UTF-8', &:read)
39
74
  ccfile.each_line do | line |
40
75
  if line =~ /^(\d\d:)\d\d:\d\d[,]\d\d\d.*-->.*(\d\d:)\d\d:\d\d[,]\d\d\d/
41
76
  type = "srt"
42
- elsif line =~ /(^(\d\d:)\d\d:\d\d[.]\d\d\d.*-->.*(\d\d:)\d\d:\d\d[.]\d\d\d)|(^WEBVTT$)/
77
+ elsif line =~ /^((\d\d:)+\d\d[.,]\d\d\d)\s-->\s((\d\d:)+\d\d[.,]\d\d\d)|(^WEBVTT$)/
43
78
  type = "vtt"
44
79
  elsif line =~ /(^\d\d:\d\d:\d\d:\d\d\t(([0-9a-fA-F]{4})\s)*)+|(^Scenarist_SCC V(\d.\d)$)/
45
80
  type = "scc"
@@ -63,6 +98,11 @@ class Subtitle
63
98
 
64
99
  private
65
100
 
101
+ def initialize_handler(options)
102
+ translator = get_translator(options)
103
+ @handler = get_caption_handler(options, translator)
104
+ end
105
+
66
106
  def get_translator(options)
67
107
  translator = nil
68
108
  # Try to infer the engine based on the passed options
@@ -93,24 +133,26 @@ class Subtitle
93
133
  def get_caption_handler(options, translator)
94
134
  caption_file = options[:cc_file]
95
135
  extension = File.extname(caption_file)
136
+ extension = ".#{type}" if extension.nil?
96
137
  unless AllFather::VALID_FILES.include?(extension)
97
138
  raise "Caption support for #{caption_file} of type #{extension} is not supported yet"
98
139
  end
99
140
  handler = nil
100
141
  case extension.downcase
101
142
  when ".scc"
102
- handler = SCC.new(caption_file, translator)
143
+ handler = SCC.new(caption_file)
103
144
  when ".srt"
104
- handler = SRT.new(caption_file, translator)
145
+ handler = SRT.new(caption_file)
105
146
  when ".vtt"
106
- handler = VTT.new(caption_file, translator)
147
+ handler = VTT.new(caption_file)
107
148
  when ".ttml"
108
- handler = TTML.new(caption_file, translator, {:force_detect => options[:force_detect]})
149
+ handler = TTML.new(caption_file)
109
150
  when ".dfxp"
110
- handler = DFXP.new(caption_file, translator, {:force_detect => options[:force_detect]})
151
+ handler = DFXP.new(caption_file)
111
152
  else
112
153
  raise "Cannot handle file type .#{extension}"
113
154
  end
155
+ handler.set_translator(translator)
114
156
  handler
115
157
  end
116
158
  end
data/lib/ttml.rb CHANGED
@@ -13,10 +13,8 @@ class TTML
13
13
 
14
14
  include AllFather
15
15
 
16
- def initialize(cc_file, translator, opts={})
16
+ def initialize(cc_file)
17
17
  @cc_file = cc_file
18
- @translator = translator
19
- @force_detect = opts[:force_detect] || false
20
18
  raise "Invalid TTML file provided" unless is_valid?
21
19
  end
22
20
 
@@ -30,7 +28,12 @@ class TTML
30
28
  return false
31
29
  end
32
30
 
31
+ def set_translator(translator)
32
+ @translator = translator
33
+ end
34
+
33
35
  def infer_languages
36
+ force_detect = false
34
37
  lang = []
35
38
  begin
36
39
  xml_file = File.open(@cc_file)
@@ -43,9 +46,9 @@ class TTML
43
46
  if inferred_lang.nil?
44
47
  # If lang is not provided in the caption, then override
45
48
  # force detect for inferrence
46
- @force_detect = true
49
+ force_detect = true
47
50
  end
48
- if @force_detect
51
+ if force_detect
49
52
  sample_text = get_text(div, 100)
50
53
  inferred_lang = @translator.infer_language(sample_text) rescue nil
51
54
  if inferred_lang.nil?
@@ -0,0 +1,329 @@
1
+ require_relative "../allfather"
2
+ require "nokogiri"
3
+
4
+ module CommonUtils
5
+
6
+ CREDITS = "Credits: Autogenerated by subtitle Rubygem".freeze
7
+
8
+ SCC_DEFAULT_FRAME_RATE = ENV["SCC_DEFAULT_FRAME_RATE"] || 23.976
9
+
10
+ #
11
+ # Method to create the file with basic header information which can be
12
+ # further updated with the transformed caption details by respective
13
+ # implementations
14
+ #
15
+ # * +src_type+ - Source caption type. Refer to AllFather::TYPE_SCC type constants
16
+ # * +dest_type+ - Target caption type. Refer to AllFather::TYPE_SCC type constants
17
+ # * +output_file+ - Creates this output_file to which type specific
18
+ # information would be dumped into
19
+ # * +target_lang+ - Target lang of the output_file
20
+ #
21
+ # ==== Returns
22
+ # true if the file is created with right headers and false otherwise
23
+ #
24
+ def create_file(src_type, dest_type, output_file, target_lang)
25
+ file = nil
26
+ done = false
27
+ begin
28
+ # Create the file in overwrite mode
29
+ file = File.open(output_file, "w")
30
+
31
+ # Dump the initial info into the file to start off with
32
+ case dest_type
33
+ when AllFather::TYPE_SCC
34
+ file.write("Scenarist_SCC V1.0\n\n")
35
+
36
+ when AllFather::TYPE_SRT
37
+ file.write("NOTE #{CREDITS}\n\n")
38
+
39
+ when AllFather::TYPE_VTT
40
+ file.write("WEBVTT\n\n")
41
+ file.write("NOTE #{CREDITS}\n\n")
42
+
43
+ when AllFather::TYPE_TTML
44
+ target_lang ||= ""
45
+ # TODO: Move this to a template file and load from there !!
46
+ data = <<-EOF
47
+ <tt xml:lang="" xmlns="http://www.w3.org/ns/ttml">
48
+ <head>
49
+ <metadata xmlns:ttm="http://www.w3.org/ns/ttml#metadata">
50
+ <ttm:desc>#{CREDITS}</ttm:desc>
51
+ </metadata>
52
+ </head>
53
+ <body>
54
+ <div xml:lang=\"#{target_lang}\">
55
+ EOF
56
+ file.write(data)
57
+
58
+ when AllFather::TYPE_DFXP
59
+ target_lang ||= ""
60
+ data = <<-EOF
61
+ <tt xml:lang="" xmlns="http://www.w3.org/2004/11/ttaf1">
62
+ <head>
63
+ <meta xmlns:ttm="http://www.w3.org/2004/11/ttaf1#metadata">
64
+ <ttm:desc>#{CREDITS}</ttm:desc>
65
+ </meta>
66
+ </head>
67
+ <body>
68
+ <div xml:lang=\"#{target_lang}\">
69
+ EOF
70
+ file.write(data)
71
+ else
72
+ raise AllFather::InvalidInputException.new("Not a valid type; Failed to create output file for type #{type}")
73
+ end
74
+ done = true
75
+ ensure
76
+ file.close if file rescue nil
77
+ end
78
+ done
79
+ end
80
+
81
+ #
82
+ # Method to return a valid extension for a given caption type
83
+ # Refer to `AllFather#VALID_FILES`
84
+ #
85
+ # * +type+ - Must be one of the valid types defined in `AllFather`
86
+ #
87
+ # ====Raises
88
+ # InvalidInputException if a valid type is not provided
89
+ #
90
+ def extension_from_type(type)
91
+ case type
92
+ when AllFather::TYPE_SCC
93
+ return AllFather::VALID_FILES[0]
94
+ when AllFather::TYPE_SRT
95
+ return AllFather::VALID_FILES[1]
96
+ when AllFather::TYPE_VTT
97
+ return AllFather::VALID_FILES[2]
98
+ when AllFather::TYPE_TTML
99
+ return AllFather::VALID_FILES[3]
100
+ when AllFather::TYPE_DFXP
101
+ return AllFather::VALID_FILES[4]
102
+ else
103
+ raise AllFather::InvalidInputException.new("Not a valid type; Failed to create output file for type #{type}")
104
+ end
105
+ end
106
+
107
+ #
108
+ # Method to encode a text to SCC format
109
+ #
110
+ # * +free_text+ - Text that needs to be encoded
111
+ #
112
+ # ===== Returns
113
+ # The encoded string that can be added to SCC file
114
+ #
115
+ def scc_encode(free_text)
116
+ encoded_str = ""
117
+ count = 0
118
+ free_text.each_byte do |char|
119
+ count += 1
120
+ binval = char.to_s(2).count("1") % 2 == 0 ? (char.to_i | 128 ).to_s(2) : char.to_s(2)
121
+ encode_char = binval.to_i(2).to_s(16)
122
+ if ((count > 0) && (count % 2 == 0))
123
+ encoded_str << encode_char << " "
124
+ else
125
+ encoded_str << encode_char
126
+ end
127
+ end
128
+ encoded_str
129
+ end
130
+
131
+ #
132
+ # Method to return the cue info of the caption based on the model
133
+ # and target caption type which can be used by the caller's transformation routine
134
+ #
135
+ # * +model+ - `CueInfo` instance which is caption agnostic details of a cue
136
+ # * +target_type+ - The target type to which the new cue is to be generated
137
+ # * +last_cue+ - true for last cue and false otherwise.
138
+ #
139
+ def new_cue(model, target_type, last_cue = false)
140
+ message = nil
141
+ case target_type
142
+ when AllFather::TYPE_SCC
143
+ start_unit = model.start_time_units
144
+ h = start_unit[0].to_s.rjust(2, "0")
145
+ m = start_unit[1].to_s.rjust(2, "0")
146
+ s = start_unit[2].to_s.rjust(2, "0")
147
+ ms = start_unit[3]
148
+ # Convert milliseconds to frames using SCC_DEFAULT_FRAME_RATE (23.976 unless overridden via ENV)
149
+ # Pad 0 if frames is <= 9
150
+ frames = ((ms.to_f * SCC_DEFAULT_FRAME_RATE) / 1000.0).to_i.to_s.rjust(2, "0").to_i
151
+ # TODO: Might have to strip off non-english characters here
152
+ message = "#{h}:#{m}:#{s}:#{frames} " + scc_encode(model.message)
153
+ when AllFather::TYPE_VTT, AllFather::TYPE_SRT
154
+ start_unit = model.start_time_units
155
+ end_unit = model.end_time_units
156
+ message = ""
157
+ if model.sequence
158
+ message = model.sequence + "\n"
159
+ else
160
+ message = model.index.to_s + "\n"
161
+ end
162
+ delimiter_added = false
163
+ [start_unit, end_unit].each do |unit|
164
+ h = unit[0].to_s.rjust(2, "0")
165
+ m = unit[1].to_s.rjust(2, "0")
166
+ s = unit[2].to_s.rjust(2, "0")
167
+ ms = unit[3]
168
+ if ms < 100
169
+ ms = ms.to_s.rjust(3, "0")
170
+ end
171
+ if target_type == AllFather::TYPE_VTT
172
+ message << "#{h}:#{m}:#{s}:#{ms}"
173
+ else
174
+ message << "#{h}:#{m}:#{s},#{ms}"
175
+ end
176
+ unless delimiter_added
177
+ message << " --> "
178
+ delimiter_added = true
179
+ end
180
+ end
181
+ message << "\n"
182
+ message << model.message
183
+ message << "\n"
184
+ message << "\n" unless model.message.end_with?("\n")
185
+ when AllFather::TYPE_TTML, AllFather::TYPE_DFXP
186
+ start_unit = model.start_time_units
187
+ end_unit = model.end_time_units
188
+ h = start_unit[0].to_s.rjust(2, "0")
189
+ m = start_unit[1].to_s.rjust(2, "0")
190
+ s = start_unit[2].to_s.rjust(2, "0")
191
+ ms = start_unit[3]
192
+ begin_time = "#{h}:#{m}:#{s}"
193
+ begin_time << ".#{ms.to_s.rjust(3, "0")}" if ms > 0
194
+ h = end_unit[0].to_s.rjust(2, "0")
195
+ m = end_unit[1].to_s.rjust(2, "0")
196
+ s = end_unit[2].to_s.rjust(2, "0")
197
+ ms = end_unit[3]
198
+ end_time = "#{h}:#{m}:#{s}"
199
+ end_time << ".#{ms.to_s.rjust(3, "0")}" if ms > 0
200
+ message = "<p begin=\"#{begin_time}\" end=\"#{end_time}\">#{model.message.encode(:xml => :text)}</p>"
201
+ message << "</div>\n</body>\n</tt>" if last_cue
202
+ end
203
+ message
204
+ end
205
+
206
+ #
207
+ # Method that normalizes the timestamps from various different caption formats into
208
+ # a caption agnostic format
209
+ #
210
+ # * +time_stamp+ - The timestamp parsed from the caption file for a given caption type
211
+ # * +type+ - A valid caption type. Refer to `AllFather` for valid types
212
+ #
213
+ def time_details(time_stamp, type)
214
+ h = m = s = ms = nil
215
+ elapsed_seconds = nil
216
+ case type
217
+ when AllFather::TYPE_SCC
218
+ tokens = time_stamp.split(":")
219
+ h = tokens[0].to_i
220
+ m = tokens[1].to_i
221
+ s = tokens[2].to_i
222
+ frames = tokens[3].to_i
223
+ ms = (frames * 1000 / SCC_DEFAULT_FRAME_RATE).round(0).to_s.rjust(3, "0").to_i
224
+ if ms >= 1000
225
+ ms = 999
226
+ end
227
+ when AllFather::TYPE_SRT
228
+ tokens = time_stamp.split(",")
229
+ ms = tokens[1].to_i
230
+ tokens = tokens[0].split(":")
231
+ h = tokens[0].to_i
232
+ m = tokens[1].to_i
233
+ s = tokens[2].to_i
234
+ when AllFather::TYPE_VTT
235
+ tokens = time_stamp.split(".")
236
+ ms = tokens[1].to_i
237
+ tokens = tokens[0].split(":")
238
+ if tokens.size == 2
239
+ h = 0
240
+ m = tokens[0].to_i
241
+ s = tokens[1].to_i
242
+ else
243
+ h = tokens[0].to_i
244
+ m = tokens[1].to_i
245
+ s = tokens[2].to_i
246
+ end
247
+ when AllFather::TYPE_TTML, AllFather::TYPE_DFXP
248
+ # We support only clock-time without framerate / tickrate and only media timebase
249
+ # For offset-time we likewise don't support frames / ticks
250
+ tokens = time_stamp.split(":")
251
+ if tokens.size > 1
252
+ if tokens.size > 3
253
+ # This is specified with frames and/or subframes. Unsupported
254
+ raise AllFather::InvalidInputException.new("TTML file with clock-time referencing frames / ticks is unsupported")
255
+ end
256
+ h = tokens[0].to_i
257
+ m = tokens[1].to_i
258
+ ms_tokens = tokens[2].split(".")
259
+ if ms_tokens.size == 1
260
+ ms = 0
261
+ else
262
+ ms = ms_tokens[1].to_i
263
+ end
264
+ s = ms_tokens[0].to_i
265
+ else
266
+ # Parsing in offset mode
267
+ if time_stamp.end_with?("ms")
268
+ unit = "ms"
269
+ time_with_no_unit = time_stamp[0, time_stamp.size - 2]
270
+ else
271
+ unit = time_stamp[time_stamp.size - 1]
272
+ time_with_no_unit = time_stamp[0, time_stamp.size - 1]
273
+ end
274
+ case unit
275
+ when "m"
276
+ time_with_no_unit = time_with_no_unit.to_f * 60
277
+ when "h"
278
+ time_with_no_unit = time_with_no_unit.to_f * (60 * 60)
279
+ when "s"
280
+ # do nothing
281
+ when "ms"
282
+ time_with_no_unit = time_with_no_unit.to_f / 1000.0
283
+ else
284
+ # Fail on any other unit, e.g. frames (f) / ticks (t)
285
+ raise AllFather::InvalidInputException.new("TTML file with offset-time referencing frames / ticks is unsupported")
286
+ end
287
+ tokens = time_with_no_unit.to_s.split(".")
288
+ h = m = 0
289
+ if tokens.size == 1
290
+ s = time_with_no_unit
291
+ ms = 0
292
+ else
293
+ s = tokens[0].to_i
294
+ ms = tokens[1].to_i
295
+ end
296
+ h = s / 3600
297
+ m = (s / 60) % 60
298
+ s = s % 60
299
+ end
300
+ end
301
+ elapsed_seconds = (h * 60 * 60) + (m * 60) + s
302
+ return [h, m, s, ms, elapsed_seconds]
303
+ end
304
+
305
+
306
+ #
307
+ # Method to write the cue details to the output files
308
+ #
309
+ # * +model+ - Cue instance
310
+ # * +file_map+ - Hash of files for each caption type
311
+ # * +last_cue+ - true for last cue and false otherwise
312
+ #
313
+ def write_cue(model, file_map, last_cue = false)
314
+ file_map.each do |type, file_path|
315
+ File.open(file_path, "a") do |f|
316
+ f.puts new_cue(model, type, last_cue)
317
+ end
318
+ end
319
+ if last_cue
320
+ # Pretty print the output for ttml & dfxp
321
+ file_map.each do |type, file_path|
322
+ next unless [AllFather::TYPE_DFXP, AllFather::TYPE_TTML].include?(type)
323
+ file = File.open(file_path, "r")
324
+ xml_doc = Nokogiri::XML(file, &:noblanks)
325
+ File.write(file_path, xml_doc.to_s)
326
+ end
327
+ end
328
+ end
329
+ end
@@ -0,0 +1,40 @@
1
+ class CueInfo
2
+ def initialize(type)
3
+ @type = type
4
+ @start = @end = @sequence = nil
5
+ @message = ""
6
+ @start_time_units = []
7
+ @end_time_units = []
8
+ @index = 1
9
+ end
10
+
11
+ attr_reader :type, :start, :end, :sequence, :message, :start_time_units, :end_time_units, :index
12
+
13
+ def start=(start)
14
+ @start = start
15
+ end
16
+
17
+ def end=(end_point)
18
+ @end = end_point
19
+ end
20
+
21
+ def message=(msg)
22
+ @message = msg
23
+ end
24
+
25
+ def sequence=(seq)
26
+ @sequence = seq
27
+ end
28
+
29
+ def index=(index)
30
+ @index = index
31
+ end
32
+
33
+ def start_time_units=(units)
34
+ @start_time_units = units
35
+ end
36
+
37
+ def end_time_units=(units)
38
+ @end_time_units = units
39
+ end
40
+ end
data/lib/vtt.rb CHANGED
@@ -1,4 +1,6 @@
1
1
  require_relative "engines/translator"
2
+ require_relative "utils/common_utils"
3
+ require_relative "utils/cue_info"
2
4
  require_relative "allfather"
3
5
 
4
6
  #
@@ -10,13 +12,20 @@ require_relative "allfather"
10
12
  class VTT
11
13
 
12
14
  include AllFather
15
+ include CommonUtils
13
16
 
14
- def initialize(cc_file, translator)
17
+ SUPPORTED_TRANSFORMATIONS = [TYPE_SCC, TYPE_SRT, TYPE_TTML, TYPE_DFXP]
18
+
19
+ def initialize(cc_file)
15
20
  @cc_file = cc_file
16
- @translator = translator
17
21
  raise "Invalid VTT file provided" unless is_valid?
18
22
  end
19
23
 
24
def set_translator(translator)
  # Bare `super` forwards the same +translator+ argument to the inherited
  # implementation before the engine is remembered on this instance.
  super
  @translator = translator
end
28
+
20
29
  def translate(src_lang, dest_lang, out_file)
21
30
  super(src_lang, dest_lang, out_file)
22
31
  begin
@@ -53,7 +62,6 @@ class VTT
53
62
  outfile.puts
54
63
  end
55
64
  ensure
56
- ccfile.close rescue nil
57
65
  outfile.close
58
66
  end
59
67
  end
@@ -85,6 +93,69 @@ class VTT
85
93
  return false
86
94
  end
87
95
 
96
def supported_transformations
  # Caption types this class lists in its SUPPORTED_TRANSFORMATIONS constant.
  SUPPORTED_TRANSFORMATIONS
end
99
+
100
def transform_to(types, src_lang, target_lang, output_dir)
  # Run the inherited validations first (same arguments are forwarded).
  super

  # Make sure the output directory ends with the file separator.
  output_dir += File::Separator unless output_dir.end_with?(File::Separator)

  # Create one output file per requested type and remember its path.
  file_map = {}
  types.each do |type|
    target_name = File.basename(@cc_file, File.extname(@cc_file)) + extension_from_type(type)
    target_path = "#{output_dir}#{target_name}"
    unless create_file(TYPE_VTT, type, target_path, target_lang)
      raise StandardError, "Failed to create output file for type #{type}"
    end
    file_map[type] = target_path
  end

  # Walk the source line by line, building one CueInfo per timing line.
  current_cue = nil
  pending_text = ""
  in_payload = false
  next_index = 1
  File.open(@cc_file, 'r:UTF-8', &:read).each_line do |line|
    # A blank line ends the text of the cue being collected.
    if line.strip.empty?
      in_payload = false
      next
    end

    timing = line.scan(/^((\d\d:)+\d\d[.,]\d\d\d)\s-->\s((\d\d:)+\d\d[.,]\d\d\d)/).first
    if timing.nil?
      # Not a timing line: keep it only while inside a cue's text.
      pending_text << line if in_payload
      next
    end

    # A new timing line: flush the previous cue if it collected any text.
    unless pending_text.empty?
      current_cue.message = pending_text
      write_cue(current_cue, file_map)
      pending_text = ""
      next_index += 1
    end

    # Capture 0 is the start timestamp, capture 2 the end timestamp.
    current_cue = CueInfo.new(AllFather::TYPE_VTT)
    current_cue.index = next_index
    current_cue.start = timing[0]
    current_cue.end = timing[2]
    current_cue.start_time_units = time_details(current_cue.start, TYPE_VTT)
    current_cue.end_time_units = time_details(current_cue.end, TYPE_VTT)
    in_payload = true
  end

  # Flush the final cue, flagging it as the last one.
  current_cue.message = pending_text unless pending_text.empty?
  write_cue(current_cue, file_map, true)
end
158
+
88
159
  private
89
160
 
90
161
  #
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: subtitle
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.6
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Maheshwaran G
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2019-11-11 00:00:00.000000000 Z
12
+ date: 2019-11-13 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -67,6 +67,34 @@ dependencies:
67
67
  - - "~>"
68
68
  - !ruby/object:Gem::Version
69
69
  version: '10.0'
70
+ - !ruby/object:Gem::Dependency
71
+ name: minitest
72
+ requirement: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - ">="
75
+ - !ruby/object:Gem::Version
76
+ version: '0'
77
+ type: :development
78
+ prerelease: false
79
+ version_requirements: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ version: '0'
84
+ - !ruby/object:Gem::Dependency
85
+ name: optimist
86
+ requirement: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - ">="
89
+ - !ruby/object:Gem::Version
90
+ version: '0'
91
+ type: :development
92
+ prerelease: false
93
+ version_requirements: !ruby/object:Gem::Requirement
94
+ requirements:
95
+ - - ">="
96
+ - !ruby/object:Gem::Version
97
+ version: '0'
70
98
  description: Subtitle gem helps you to detect language and translate closed caption
71
99
  to required language.
72
100
  email:
@@ -85,6 +113,8 @@ files:
85
113
  - lib/srt.rb
86
114
  - lib/subtitle.rb
87
115
  - lib/ttml.rb
116
+ - lib/utils/common_utils.rb
117
+ - lib/utils/cue_info.rb
88
118
  - lib/vtt.rb
89
119
  homepage: https://github.com/cloudaffair/subtitle
90
120
  licenses: