subtitle 0.2.6 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 99d9960e9d5fb5daa7ddc87e76db91c654ce68453a13fd9124611e7589cc7fe5
4
- data.tar.gz: '090e594747871afd7e3e423dab00154313d5b77423664a300f499df07dd895c0'
3
+ metadata.gz: 49e45fb2713aedd5d6d7d6d290fe4874a292df3249cfad7259913e90b0cb7fd8
4
+ data.tar.gz: 7f4535875a19028db4ec08de90903daba7b906b659571c5a921850071bf3154c
5
5
  SHA512:
6
- metadata.gz: 1545c3120f496fe10228473f45f5bb3a78135fd603d15eed6eee7a2aa85fc281f53f4fc5f612c3ab4d710baa9388abd6528c8f5aff52f3fb3bfe424e61c05e2c
7
- data.tar.gz: 38fc75d66ea829a7596d5a7235fd6d7f43d256a3dbb1b8ae136926b92ca83b7cbf041c290cf4e506e032d987e73b9a56761cfb89596d6131aafc5d5f87453732
6
+ metadata.gz: 0b53144f0a627a545c0a989d664f3611078a7afc11e9f00b479065a0d3a1b2bc9bf68e10706bd89b85c0e73ff53d7c4627a2c5e29d38867ec2882c99ea56eda0
7
+ data.tar.gz: 6999ae152b2f5904a2061944522b11387280df6e02cbf4d36d7d5ae27ba12eb3b6c36357bba791c5a7b4f06575531ce76df904dcbb76b0e5f79bd62b05988704
data/lib/allfather.rb CHANGED
@@ -1,3 +1,6 @@
1
+ require 'fileutils'
2
+ require_relative "engines/translator"
3
+
1
4
  #
2
5
  # A Module that kind of acts as an interface where the generic methods
3
6
  # that applies to each caption type can be defined
@@ -12,6 +15,15 @@ module AllFather
12
15
  #
13
16
  VALID_FILES = [".scc", ".srt", ".vtt", ".ttml", ".dfxp"]
14
17
 
18
+ #
19
+ # Caption type constants
20
+ #
21
+ TYPE_SCC = 1
22
+ TYPE_SRT = 2
23
+ TYPE_VTT = 3
24
+ TYPE_TTML = 4
25
+ TYPE_DFXP = 5
26
+
15
27
  #
16
28
  # Generic exception class that is raised for validation errors
17
29
  #
@@ -45,12 +57,23 @@ module AllFather
45
57
  raise "Not Implemented. Class #{self.class.name} doesn't implement infer_languages"
46
58
  end
47
59
 
60
+
61
+ #
62
+ # Method to set a translation engine
63
+ #
64
+ # * +translator+ - Instance of translation engine. Refer to `engines/aws` for example
65
+ #
66
+ def set_translator(translator)
67
+ if translator && !(translator.is_a? Translator)
68
+ raise "Argument is not an instance of Translator"
69
+ end
70
+ end
71
+
48
72
  #
49
73
  # Method to translate the caption from one language to another
50
74
  #
51
75
  # :args: src_lang, target_lang, output_file
52
76
  #
53
- # * +input_caption+ - A Valid input caption file. Refer to #is_valid?
54
77
  # * +src_lang+ - can be inferred using #infer_language method
55
78
  # * +target_lang+ - Target 2 letter ISO language code to which the source needs to be translated in to.
56
79
  # * +output_file+ - Output file. Can be a fully qualified path or just file name
@@ -80,4 +103,55 @@ module AllFather
80
103
  # Further checks can be done only in caption specific implementations
81
104
  # or translation engine specific implementation
82
105
  end
106
+
107
+ #
108
+ # Method to convert from one caption type to other types. If the src_lang is not provided
109
+ # then all source languages will be converted to target types. For example, if a ttml file
110
+ # has "en" and "es" and target_type is vtt and no src_lang is provided 2 vtt files would be
111
+ # created one per language in the source. if a target_lang is provided then one of the lang
112
+ # from source would be picked for creating the output file with target_lang
113
+ #
114
+ # If no target_lang is provided, no translations are applied. The output files are created
115
+ # without any need for any language translation services. Hence doesn't incur any cost !!
116
+ #
117
+ # * +types+ - An array of valid target caption type(s). Refer to the +TYPE_+ constants
118
+ # * +src_lang+ - can be inferred using #infer_language method
119
+ # * +target_lang+ - Target 2 letter ISO language code to which the source needs to be translated in to.
120
+ # * +output_dir+ - Output Directory. Generated files would be dumped here
121
+ #
122
+ # ==== Raises
123
+ #
124
+ # InvalidInputException shall be raised if
125
+ # 1. The input file doesn't exist or is unreadable or is invalid caption
126
+ # 2. The output dir doesn't exist
127
+ # 3. Invalid lang codes for a given caption type
128
+ # 4. Unsupported type to which conversion is requested for
129
+ #
130
+ def transform_to(types, src_lang, target_lang, output_dir)
131
+ if (types - supported_transformations).size != 0
132
+ raise InvalidInputException.new("Unknown types provided for conversion in input #{types}")
133
+ end
134
+ unless File.directory?(output_dir)
135
+ FileUtils.mkdir_p(output_dir)
136
+ end
137
+ # Basic validations
138
+ if types.include?(TYPE_SCC)
139
+ if target_lang && !target_lang.eql?("en")
140
+ raise InvalidInputException.new("SCC can be generated only in en. #{target_lang} is unsupported")
141
+ end
142
+ end
143
+ if target_lang && !target_lang.empty?
144
+ raise InvalidInputException.new("Translation to other language as part of transform is yet to be implemented")
145
+ end
146
+ end
147
+
148
+ #
149
+ # Method to report on the supported transformations. Each implementor is free to return
150
+ # the types to which it can convert itself
151
+ #
152
+ # Returns an array of one or more types defined as +TYPE_+ constants here
153
+ #
154
+ def supported_transformations
155
+ raise "Not Implemented. Class #{self.class.name} doesn't implement supported_transformations"
156
+ end
83
157
  end
data/lib/dfxp.rb CHANGED
@@ -10,10 +10,10 @@ require_relative "ttml"
10
10
  #
11
11
  class DFXP < TTML
12
12
 
13
- def initialize(cc_file, translator, opts={})
13
+ def initialize(cc_file)
14
14
  @cc_file = cc_file
15
- @translator = translator
16
- @force_detect = opts[:force_detect] || false
15
+ #@translator = translator
16
+ #@force_detect = opts[:force_detect] || false
17
17
  raise "Invalid TTML file provided" unless is_valid?
18
18
  end
19
19
 
data/lib/engines/aws.rb CHANGED
@@ -10,10 +10,9 @@ require_relative 'translator'
10
10
  # == Credential Referencing Order
11
11
  #
12
12
  # * [Arguments] - Pass the credentials access_key_id and secret_access_key as arguments
13
- # * [Environment route] - AWS_ACCESS_KEY_ID & AWS_SECRET_ACCESS_KEY can be exposed as
14
- # environment variables
13
+ # * [Environment route] - AWS_ACCESS_KEY_ID & AWS_SECRET_ACCESS_KEY can be exposed as environment variables
15
14
  # * [Profile Name] - The application uses the credentials of the system and picks the
16
- # credentials referred to by the profile
15
+ # credentials referred to by the profile
17
16
  #
18
17
  class AwsEngine
19
18
  include Translator
@@ -71,9 +70,6 @@ class AwsEngine
71
70
  # Invokes the language detection API of AWS and returns only the language
72
71
  # of the highest score and returns the ISO 639-1 code
73
72
  #
74
- # :args: text
75
- #
76
- # ===== Arguments
77
73
  # * +text+ - The text for which the language is to be inferred
78
74
  #
79
75
  def infer_language(text)
@@ -83,12 +79,10 @@ class AwsEngine
83
79
 
84
80
  #
85
81
  # Invokes the translation API of AWS and returns the translated text
86
- # as per the arguments provided
82
+ # as per the arguments provided.
87
83
  # Will Raise exception if a translation cannot be made between the source
88
84
  # and target language codes or if the lang code is invalid
89
85
  #
90
- # :args: input_text, src_lang, target_lang
91
- #
92
86
  # * +input_text+ - The text that needs to be translated
93
87
  # * +src_lang+ - The source language of the text
94
88
  # * +target_lang+ - The target language to which the input_text needs to be translated to
data/lib/scc.rb CHANGED
@@ -1,4 +1,6 @@
1
1
  require_relative "engines/translator"
2
+ require_relative "utils/common_utils"
3
+ require_relative "utils/cue_info"
2
4
  require_relative "allfather"
3
5
 
4
6
  #
@@ -10,10 +12,12 @@ require_relative "allfather"
10
12
  class SCC
11
13
 
12
14
  include AllFather
15
+ include CommonUtils
13
16
 
14
- def initialize(cc_file, translator)
17
+ SUPPORTED_TRANSFORMATIONS = [TYPE_SRT, TYPE_VTT, TYPE_TTML, TYPE_DFXP]
18
+
19
+ def initialize(cc_file)
15
20
  @cc_file = cc_file
16
- @translator = translator
17
21
  raise "Invalid SCC file provided" unless is_valid?
18
22
  end
19
23
 
@@ -25,6 +29,11 @@ class SCC
25
29
  return false
26
30
  end
27
31
 
32
+ def set_translator(translator)
33
+ super(translator)
34
+ @translator = translator
35
+ end
36
+
28
37
  def infer_languages
29
38
  lang = nil
30
39
  begin
@@ -40,6 +49,71 @@ class SCC
40
49
  raise "Not Implemented. Class #{self.class.name} doesn't implement translate yet !!"
41
50
  end
42
51
 
52
+ def supported_transformations
53
+ return SUPPORTED_TRANSFORMATIONS
54
+ end
55
+
56
+ def transform_to(types, src_lang, target_lang, output_dir)
57
+ # Let's start off with some validations
58
+ super(types, src_lang, target_lang, output_dir)
59
+
60
+ # Suffix output dir with File separator
61
+ output_dir = "#{output_dir}#{File::Separator}" unless output_dir.end_with?(File::Separator)
62
+
63
+ # Prepare the output files for each type
64
+ file_map = {}
65
+ types.each do |type|
66
+ output_file = File.basename(@cc_file, File.extname(@cc_file)) + extension_from_type(type)
67
+ out_file = "#{output_dir}#{output_file}"
68
+ if create_file(TYPE_SCC, type, out_file, target_lang)
69
+ file_map[type] = out_file
70
+ else
71
+ raise StandardError.new("Failed to create output file for type #{type}")
72
+ end
73
+ end
74
+
75
+ # Read the file and prepare the cue model
76
+ prev_cue_info = cur_cue_info = nil
77
+ ccfile = File.open(@cc_file, 'r:UTF-8', &:read)
78
+ cue_index = 1
79
+ ccfile.each_line do | line |
80
+ time_point = line.scan(/(^\d\d:\d\d:\d\d:\d\d\s)(.*)/)
81
+ unless time_point.empty?
82
+ scc_text_code = time_point[0][1].strip
83
+ message = decode(scc_text_code)
84
+ # Replace \u0000 with empty as this causes the ttml / dfxp outputs
85
+ # to treat them as end and terminates the xml the moment this is encountered
86
+ # https://github.com/sparklemotion/nokogiri/issues/1535
87
+ message = message.gsub(/\u0000/, '')
88
+ if prev_cue_info.nil?
89
+ prev_cue_info = CueInfo.new(TYPE_SCC)
90
+ prev_cue_info.index = cue_index
91
+ prev_cue_info.message = message
92
+ prev_cue_info.start = time_point[0][0].strip
93
+ else
94
+ cur_cue_info = CueInfo.new(TYPE_SCC)
95
+ cur_cue_info.index = cue_index
96
+ cur_cue_info.message = message
97
+ cur_cue_info.start = time_point[0][0].strip
98
+ # Set the previous cue info's end time to current cue's start time
99
+ # TODO: Need to see if we need to reduce at least 1 fps or 1s
100
+ prev_cue_info.end = cur_cue_info.start
101
+ prev_cue_info.start_time_units = time_details(prev_cue_info.start, TYPE_SCC)
102
+ prev_cue_info.end_time_units = time_details(prev_cue_info.end, TYPE_SCC)
103
+ write_cue(prev_cue_info, file_map)
104
+ prev_cue_info = cur_cue_info
105
+ end
106
+ cue_index += 1
107
+ end
108
+ end
109
+ # we need to set some end time, but don't know the same !!
110
+ # for now setting the start time itself
111
+ cur_cue_info.end = cur_cue_info.start
112
+ cur_cue_info.start_time_units = time_details(cur_cue_info.start, TYPE_SCC)
113
+ cur_cue_info.end_time_units = time_details(cur_cue_info.end, TYPE_SCC)
114
+ write_cue(cur_cue_info, file_map, true)
115
+ end
116
+
43
117
  private
44
118
 
45
119
  def get_text(srt_file, num_chars)
@@ -78,20 +152,4 @@ class SCC
78
152
  end
79
153
  decoded_text
80
154
  end
81
-
82
- def encode(free_text)
83
- encoded_str = ""
84
- count = 0
85
- free_text.each_byte do |char|
86
- count += 1
87
- binval = char.to_s(2).count("1") % 2 == 0 ? (char.to_i | 128 ).to_s(2) : char.to_s(2)
88
- encode_char = binval.to_i(2).to_s(16)
89
- if ((count > 0) && (count % 2 == 0))
90
- encoded_str << encode_char << " "
91
- else
92
- encoded_str << encode_char
93
- end
94
- end
95
- encoded_str
96
- end
97
155
  end
data/lib/srt.rb CHANGED
@@ -1,4 +1,6 @@
1
1
  require_relative "engines/translator"
2
+ require_relative "utils/common_utils"
3
+ require_relative "utils/cue_info"
2
4
  require_relative "allfather"
3
5
 
4
6
  #
@@ -10,10 +12,12 @@ require_relative "allfather"
10
12
  class SRT
11
13
 
12
14
  include AllFather
15
+ include CommonUtils
13
16
 
14
- def initialize(cc_file, translator)
17
+ SUPPORTED_TRANSFORMATIONS = [TYPE_SCC, TYPE_VTT, TYPE_TTML, TYPE_DFXP]
18
+
19
+ def initialize(cc_file)
15
20
  @cc_file = cc_file
16
- @translator = translator
17
21
  raise "Invalid SRT file provided" unless is_valid?
18
22
  end
19
23
 
@@ -25,6 +29,11 @@ class SRT
25
29
  return false
26
30
  end
27
31
 
32
+ def set_translator(translator)
33
+ super(translator)
34
+ @translator = translator
35
+ end
36
+
28
37
  def translate(src_lang, dest_lang, out_file)
29
38
  super(src_lang, dest_lang, out_file)
30
39
  begin
@@ -60,7 +69,6 @@ class SRT
60
69
  outfile.puts
61
70
  end
62
71
  ensure
63
- ccfile.close rescue nil
64
72
  outfile.close
65
73
  end
66
74
  end
@@ -76,6 +84,66 @@ class SRT
76
84
  [lang]
77
85
  end
78
86
 
87
+ def supported_transformations
88
+ return SUPPORTED_TRANSFORMATIONS
89
+ end
90
+
91
+ def transform_to(types, src_lang, target_lang, output_dir)
92
+ # Let's start off with some validations
93
+ super(types, src_lang, target_lang, output_dir)
94
+
95
+ # Suffix output dir with File separator
96
+ output_dir = "#{output_dir}#{File::Separator}" unless output_dir.end_with?(File::Separator)
97
+
98
+ # Prepare the output files for each type
99
+ file_map = {}
100
+ types.each do |type|
101
+ output_file = File.basename(@cc_file, File.extname(@cc_file)) + extension_from_type(type)
102
+ out_file = "#{output_dir}#{output_file}"
103
+ if create_file(TYPE_SRT, type, out_file, target_lang)
104
+ file_map[type] = out_file
105
+ else
106
+ raise StandardError.new("Failed to create output file for type #{type}")
107
+ end
108
+ end
109
+
110
+ # Read the file and prepare the cue model
111
+ cue_info = nil
112
+ ccfile = File.open(@cc_file, 'r:UTF-8', &:read)
113
+ message = ""
114
+ ccfile.each_line do | line |
115
+ # p line
116
+ next if line.strip.empty?
117
+ time_points = line.scan(/^((\d\d:)\d\d:\d\d[,.]\d\d\d.*)-->.*((\d\d:)\d\d:\d\d[,.]\d\d\d)/)
118
+ if time_points.empty?
119
+ # This is not a time point
120
+ seq = line.strip
121
+ if seq.to_i > 0
122
+ cue_info.message = message unless message.empty?
123
+ write_cue(cue_info, file_map) if cue_info
124
+ cue_info = CueInfo.new(TYPE_SRT)
125
+ cue_info.sequence = seq
126
+ # Reset the message
127
+ message = ""
128
+ else
129
+ # This is not a sequence number nor it's timepoints
130
+ # Grab the details until we find next cue point
131
+ message << line
132
+ end
133
+ else
134
+ # This is a cue point. Fetch timestamps
135
+ cue_info.start = time_points[0][0]
136
+ cue_info.end = time_points[0][2]
137
+ start_units = time_details(cue_info.start, TYPE_SRT)
138
+ end_units = time_details(cue_info.end, TYPE_SRT)
139
+ cue_info.start_time_units = start_units
140
+ cue_info.end_time_units = end_units
141
+ end
142
+ end
143
+ cue_info.message = message unless message.empty?
144
+ write_cue(cue_info, file_map, true)
145
+ end
146
+
79
147
  private
80
148
 
81
149
  #
@@ -103,4 +171,4 @@ class SRT
103
171
  end
104
172
  return text_sample[0, num_chars]
105
173
  end
106
- end
174
+ end
data/lib/subtitle.rb CHANGED
@@ -7,21 +7,29 @@ require_relative "allfather"
7
7
  require_relative "engines/translator"
8
8
  require_relative "engines/aws"
9
9
 
10
-
10
+ #
11
+ # Facade that wraps all the complexities surrounding which translation
12
+ # engine to use or which caption instances to be instantiated.
13
+ #
11
14
  class Subtitle
12
- def initialize(options={})
15
+
16
+ TYPE_MAP = {"scc" => AllFather::TYPE_SCC, "srt" => AllFather::TYPE_SRT, "vtt" => AllFather::TYPE_VTT,
17
+ "ttml" => AllFather::TYPE_TTML, "dfxp" => AllFather::TYPE_DFXP}
18
+
19
+ def initialize(file, options = nil)
13
20
  # Infer the caption handler from the extension
14
- @cc_file = options[:cc_file]
21
+ @cc_file = file
15
22
  raise "Input caption not provided. Please provide the same in :cc_file option" if @cc_file.nil?
16
- translator = get_translator(options)
17
- @handler = get_caption_handler(options, translator)
23
+ initialize_handler(options) unless options.nil?
18
24
  end
19
25
 
20
- def detect_language
26
+ def detect_language(options = nil)
27
+ initialize_handler(options) if @handler.nil?
21
28
  @handler.infer_languages
22
29
  end
23
30
 
24
- def translate(dest_lang, src_lang = nil, outfile = nil)
31
+ def translate(dest_lang, src_lang = nil, outfile = nil, options = nil)
32
+ initialize_handler(options) if @handler.nil?
25
33
  if outfile.nil?
26
34
  outfile = "#{@cc_file}_#{dest_lang}"
27
35
  end
@@ -33,13 +41,40 @@ class Subtitle
33
41
  outfile
34
42
  end
35
43
 
44
+ def transform(types, src_lang = nil, target_lang = nil, options = nil)
45
+ # A quick validation & translation to expected arguments
46
+ vals = []
47
+ invalid_vals = []
48
+ types.each do |type|
49
+ type_val = TYPE_MAP[type]
50
+ if type_val.nil?
51
+ invalid_vals << type
52
+ next
53
+ end
54
+ vals << type_val
55
+ end
56
+ unless invalid_vals.empty?
57
+ raise "Invalid types #{invalid_vals} provided"
58
+ end
59
+ # Translator not required if target_lang is nil
60
+ if @handler.nil?
61
+ if target_lang.nil?
62
+ @handler = get_caption_handler(options, nil)
63
+ else
64
+ initialize_handler(options)
65
+ end
66
+ end
67
+ output_dir = options[:outfile]
68
+ @handler.transform_to(vals, src_lang, target_lang, output_dir)
69
+ end
70
+
36
71
  def type
37
72
  type = nil
38
73
  ccfile = File.open(@cc_file, 'r:UTF-8', &:read)
39
74
  ccfile.each_line do | line |
40
75
  if line =~ /^(\d\d:)\d\d:\d\d[,]\d\d\d.*-->.*(\d\d:)\d\d:\d\d[,]\d\d\d/
41
76
  type = "srt"
42
- elsif line =~ /(^(\d\d:)\d\d:\d\d[.]\d\d\d.*-->.*(\d\d:)\d\d:\d\d[.]\d\d\d)|(^WEBVTT$)/
77
+ elsif line =~ /^((\d\d:)+\d\d[.,]\d\d\d)\s-->\s((\d\d:)+\d\d[.,]\d\d\d)|(^WEBVTT$)/
43
78
  type = "vtt"
44
79
  elsif line =~ /(^\d\d:\d\d:\d\d:\d\d\t(([0-9a-fA-F]{4})\s)*)+|(^Scenarist_SCC V(\d.\d)$)/
45
80
  type = "scc"
@@ -63,6 +98,11 @@ class Subtitle
63
98
 
64
99
  private
65
100
 
101
+ def initialize_handler(options)
102
+ translator = get_translator(options)
103
+ @handler = get_caption_handler(options, translator)
104
+ end
105
+
66
106
  def get_translator(options)
67
107
  translator = nil
68
108
  # Try to infer the engine based on the passed options
@@ -93,24 +133,26 @@ class Subtitle
93
133
  def get_caption_handler(options, translator)
94
134
  caption_file = options[:cc_file]
95
135
  extension = File.extname(caption_file)
136
+ extension = ".#{type}" if extension.nil?
96
137
  unless AllFather::VALID_FILES.include?(extension)
97
138
  raise "Caption support for #{caption_file} of type #{extension} is not supported yet"
98
139
  end
99
140
  handler = nil
100
141
  case extension.downcase
101
142
  when ".scc"
102
- handler = SCC.new(caption_file, translator)
143
+ handler = SCC.new(caption_file)
103
144
  when ".srt"
104
- handler = SRT.new(caption_file, translator)
145
+ handler = SRT.new(caption_file)
105
146
  when ".vtt"
106
- handler = VTT.new(caption_file, translator)
147
+ handler = VTT.new(caption_file)
107
148
  when ".ttml"
108
- handler = TTML.new(caption_file, translator, {:force_detect => options[:force_detect]})
149
+ handler = TTML.new(caption_file)
109
150
  when ".dfxp"
110
- handler = DFXP.new(caption_file, translator, {:force_detect => options[:force_detect]})
151
+ handler = DFXP.new(caption_file)
111
152
  else
112
153
  raise "Cannot handle file type .#{extension}"
113
154
  end
155
+ handler.set_translator(translator)
114
156
  handler
115
157
  end
116
158
  end
data/lib/ttml.rb CHANGED
@@ -13,10 +13,8 @@ class TTML
13
13
 
14
14
  include AllFather
15
15
 
16
- def initialize(cc_file, translator, opts={})
16
+ def initialize(cc_file)
17
17
  @cc_file = cc_file
18
- @translator = translator
19
- @force_detect = opts[:force_detect] || false
20
18
  raise "Invalid TTML file provided" unless is_valid?
21
19
  end
22
20
 
@@ -30,7 +28,12 @@ class TTML
30
28
  return false
31
29
  end
32
30
 
31
+ def set_translator(translator)
32
+ @translator = translator
33
+ end
34
+
33
35
  def infer_languages
36
+ force_detect = false
34
37
  lang = []
35
38
  begin
36
39
  xml_file = File.open(@cc_file)
@@ -43,9 +46,9 @@ class TTML
43
46
  if inferred_lang.nil?
44
47
  # If lang is not provided in the caption, then override
45
48
  # force detect for inference
46
- @force_detect = true
49
+ force_detect = true
47
50
  end
48
- if @force_detect
51
+ if force_detect
49
52
  sample_text = get_text(div, 100)
50
53
  inferred_lang = @translator.infer_language(sample_text) rescue nil
51
54
  if inferred_lang.nil?
@@ -0,0 +1,329 @@
1
+ require_relative "../allfather"
2
+ require "nokogiri"
3
+
4
+ module CommonUtils
5
+
6
+ CREDITS = "Credits: Autogenerated by subtitle Rubygem".freeze
7
+
8
+ SCC_DEFAULT_FRAME_RATE = ENV["SCC_DEFAULT_FRAME_RATE"] || 23.976
9
+
10
+ #
11
+ # Method to create the file with basic header information which can be
12
+ # further updated with the transformed caption details by respective
13
+ # implementations
14
+ #
15
+ # * +src_type+ - Source caption type. Refer to AllFather::TYPE_SCC type constants
16
+ # * +dest_type+ - Target caption type. Refer to AllFather::TYPE_SCC type constants
17
+ # * +output_file+ - Creates this output_file to which type specific
18
+ # information would be dumped into
19
+ # * +target_lang+ - Target lang of the output_file
20
+ #
21
+ # ==== Returns
22
+ # true if the file is created with right headers and false otherwise
23
+ #
24
+ def create_file(src_type, dest_type, output_file, target_lang)
25
+ file = nil
26
+ done = false
27
+ begin
28
+ # Create the file in overwrite mode
29
+ file = File.open(output_file, "w")
30
+
31
+ # Dump the initial info into the file to start off with
32
+ case dest_type
33
+ when AllFather::TYPE_SCC
34
+ file.write("Scenarist_SCC V1.0\n\n")
35
+
36
+ when AllFather::TYPE_SRT
37
+ file.write("NOTE #{CREDITS}\n\n")
38
+
39
+ when AllFather::TYPE_VTT
40
+ file.write("WEBVTT\n\n")
41
+ file.write("NOTE #{CREDITS}\n\n")
42
+
43
+ when AllFather::TYPE_TTML
44
+ target_lang ||= ""
45
+ # TODO: Move this to a template file and load from there !!
46
+ data = <<-EOF
47
+ <tt xml:lang="" xmlns="http://www.w3.org/ns/ttml">
48
+ <head>
49
+ <metadata xmlns:ttm="http://www.w3.org/ns/ttml#metadata">
50
+ <ttm:desc>#{CREDITS}</ttm:desc>
51
+ </metadata>
52
+ </head>
53
+ <body>
54
+ <div xml:lang=\"#{target_lang}\">
55
+ EOF
56
+ file.write(data)
57
+
58
+ when AllFather::TYPE_DFXP
59
+ target_lang ||= ""
60
+ data = <<-EOF
61
+ <tt xml:lang="" xmlns="http://www.w3.org/2004/11/ttaf1">
62
+ <head>
63
+ <meta xmlns:ttm="http://www.w3.org/2004/11/ttaf1#metadata">
64
+ <ttm:desc>#{CREDITS}</ttm:desc>
65
+ </meta>
66
+ </head>
67
+ <body>
68
+ <div xml:lang=\"#{target_lang}\">
69
+ EOF
70
+ file.write(data)
71
+ else
72
+ raise AllFather::InvalidInputException.new("Not a valid type; Failed to create output file for type #{type}")
73
+ end
74
+ done = true
75
+ ensure
76
+ file.close if file rescue nil
77
+ end
78
+ done
79
+ end
80
+
81
+ #
82
+ # Method to return a valid extension for a given caption type
83
+ # Refer to `AllFather#VALID_FILES`
84
+ #
85
+ # * +type+ - Must be one of the valid type defined in `AllFather`
86
+ #
87
+ # ====Raises
88
+ # InvalidInputException if a valid type is not provided
89
+ #
90
+ def extension_from_type(type)
91
+ case type
92
+ when AllFather::TYPE_SCC
93
+ return AllFather::VALID_FILES[0]
94
+ when AllFather::TYPE_SRT
95
+ return AllFather::VALID_FILES[1]
96
+ when AllFather::TYPE_VTT
97
+ return AllFather::VALID_FILES[2]
98
+ when AllFather::TYPE_TTML
99
+ return AllFather::VALID_FILES[3]
100
+ when AllFather::TYPE_DFXP
101
+ return AllFather::VALID_FILES[4]
102
+ else
103
+ raise AllFather::InvalidInputException.new("Not a valid type; Failed to create output file for type #{type}")
104
+ end
105
+ end
106
+
107
+ #
108
+ # Method to encode a text to SCC format
109
+ #
110
+ # * +free_text+ - Text that needs to be encoded
111
+ #
112
+ # ===== Returns
113
+ # The encoded string that can be added to SCC file
114
+ #
115
+ def scc_encode(free_text)
116
+ encoded_str = ""
117
+ count = 0
118
+ free_text.each_byte do |char|
119
+ count += 1
120
+ binval = char.to_s(2).count("1") % 2 == 0 ? (char.to_i | 128 ).to_s(2) : char.to_s(2)
121
+ encode_char = binval.to_i(2).to_s(16)
122
+ if ((count > 0) && (count % 2 == 0))
123
+ encoded_str << encode_char << " "
124
+ else
125
+ encoded_str << encode_char
126
+ end
127
+ end
128
+ encoded_str
129
+ end
130
+
131
+ #
132
+ # Method to return the cue info of the caption based on the model
133
+ # and target caption type which can be used by the caller's transformation routine
134
+ #
135
+ # * +model+ - `CueInfo` instance which is caption agnostic details of a cue
136
+ # * +target_type+ - The target type to which the new cue is to be generated
137
+ # * +last_cue+ - true for last cue and false otherwise.
138
+ #
139
+ def new_cue(model, target_type, last_cue = false)
140
+ message = nil
141
+ case target_type
142
+ when AllFather::TYPE_SCC
143
+ start_unit = model.start_time_units
144
+ h = start_unit[0].to_s.rjust(2, "0")
145
+ m = start_unit[1].to_s.rjust(2, "0")
146
+ s = start_unit[2].to_s.rjust(2, "0")
147
+ ms = start_unit[3]
148
+ # Convert to Frames assuming a framerate of 23.976
149
+ # Pad 0 if frames is <= 9
150
+ frames = ((ms.to_f * SCC_DEFAULT_FRAME_RATE) / 1000.0).to_i.to_s.rjust(2, "0").to_i
151
+ # TODO: Might have to strip off non-english characters here
152
+ message = "#{h}:#{m}:#{s}:#{frames} " + scc_encode(model.message)
153
+ when AllFather::TYPE_VTT, AllFather::TYPE_SRT
154
+ start_unit = model.start_time_units
155
+ end_unit = model.end_time_units
156
+ message = ""
157
+ if model.sequence
158
+ message = model.sequence + "\n"
159
+ else
160
+ message = model.index.to_s + "\n"
161
+ end
162
+ delimiter_added = false
163
+ [start_unit, end_unit].each do |unit|
164
+ h = unit[0].to_s.rjust(2, "0")
165
+ m = unit[1].to_s.rjust(2, "0")
166
+ s = unit[2].to_s.rjust(2, "0")
167
+ ms = unit[3]
168
+ if ms < 100
169
+ ms = ms.to_s.rjust(3, "0")
170
+ end
171
+ if target_type == AllFather::TYPE_VTT
172
+ message << "#{h}:#{m}:#{s}:#{ms}"
173
+ else
174
+ message << "#{h}:#{m}:#{s},#{ms}"
175
+ end
176
+ unless delimiter_added
177
+ message << " --> "
178
+ delimiter_added = true
179
+ end
180
+ end
181
+ message << "\n"
182
+ message << model.message
183
+ message << "\n"
184
+ message << "\n" unless model.message.end_with?("\n")
185
+ when AllFather::TYPE_TTML, AllFather::TYPE_DFXP
186
+ start_unit = model.start_time_units
187
+ end_unit = model.end_time_units
188
+ h = start_unit[0].to_s.rjust(2, "0")
189
+ m = start_unit[1].to_s.rjust(2, "0")
190
+ s = start_unit[2].to_s.rjust(2, "0")
191
+ ms = start_unit[3]
192
+ begin_time = "#{h}:#{m}:#{s}"
193
+ begin_time << ".#{ms.to_s.rjust(3, "0")}" if ms > 0
194
+ h = end_unit[0].to_s.rjust(2, "0")
195
+ m = end_unit[1].to_s.rjust(2, "0")
196
+ s = end_unit[2].to_s.rjust(2, "0")
197
+ ms = end_unit[3]
198
+ end_time = "#{h}:#{m}:#{s}"
199
+ end_time << ".#{ms.to_s.rjust(3, "0")}" if ms > 0
200
+ message = "<p begin=\"#{begin_time}\" end=\"#{end_time}\">#{model.message.encode(:xml => :text)}</p>"
201
+ message << "</div>\n</body>\n</tt>" if last_cue
202
+ end
203
+ message
204
+ end
205
+
206
+ #
207
+ # Method that normalizes the timestamps from various different caption formats into
208
+ # a caption agnostic format
209
+ #
210
+ # * +time_stamp+ - The timestamp parsed from the caption file for a given caption type
211
+ # * +type+ - A valid caption type. Refer to `AllFather` for valid types
212
+ #
213
+ def time_details(time_stamp, type)
214
+ h = m = s = ms = nil
215
+ elapsed_seconds = nil
216
+ case type
217
+ when AllFather::TYPE_SCC
218
+ tokens = time_stamp.split(":")
219
+ h = tokens[0].to_i
220
+ m = tokens[1].to_i
221
+ s = tokens[2].to_i
222
+ frames = tokens[3].to_i
223
+ ms = (frames * 1000 / SCC_DEFAULT_FRAME_RATE).round(0).to_s.rjust(3, "0").to_i
224
+ if ms >= 1000
225
+ ms = 999
226
+ end
227
+ when AllFather::TYPE_SRT
228
+ tokens = time_stamp.split(",")
229
+ ms = tokens[1].to_i
230
+ tokens = tokens[0].split(":")
231
+ h = tokens[0].to_i
232
+ m = tokens[1].to_i
233
+ s = tokens[2].to_i
234
+ when AllFather::TYPE_VTT
235
+ tokens = time_stamp.split(".")
236
+ ms = tokens[1].to_i
237
+ tokens = tokens[0].split(":")
238
+ if tokens.size == 2
239
+ h = 0
240
+ m = tokens[0].to_i
241
+ s = tokens[1].to_i
242
+ else
243
+ h = tokens[0].to_i
244
+ m = tokens[1].to_i
245
+ s = tokens[2].to_i
246
+ end
247
+ when AllFather::TYPE_TTML, AllFather::TYPE_DFXP
248
+ # We support only clock-time without framerate / tickrate and only media timebase
249
+ # For offset hence we don't support frames / ticks
250
+ tokens = time_stamp.split(":")
251
+ if tokens.size > 1
252
+ if tokens.size > 3
253
+ # This is specified with frames and/or subframes. Unsupported
254
+ raise AllFather::InvalidInputException.new("TTML file with clock-time referencing frames / ticks is unsupported")
255
+ end
256
+ h = tokens[0].to_i
257
+ m = tokens[1].to_i
258
+ ms_tokens = tokens[2].split(".")
259
+ if ms_tokens.size == 1
260
+ ms = 0
261
+ else
262
+ ms = ms_tokens[1].to_i
263
+ end
264
+ s = ms_tokens[0].to_i
265
+ else
266
+ # Parsing in offset mode
267
+ if time_stamp.end_with?("ms")
268
+ unit = "ms"
269
+ time_with_no_unit = time_stamp[0, time_stamp.size - 2]
270
+ else
271
+ unit = time_stamp[time_stamp.size - 1]
272
+ time_with_no_unit = time_stamp[0, time_stamp.size - 1]
273
+ end
274
+ case unit
275
+ when "m"
276
+ time_with_no_unit = time_with_no_unit.to_f * 60
277
+ when "h"
278
+ time_with_no_unit = time_with_no_unit.to_f * (60 * 60)
279
+ when "s"
280
+ # do nothing
281
+ when "ms"
282
+ time_with_no_unit = time_with_no_unit.to_f / 1000.0
283
+ else
284
+ # Fail out f / t
285
+ raise AllFather::InvalidInputException.new("TTML file with offset-time referencing frames / ticks is unsupported")
286
+ end
287
+ tokens = time_with_no_unit.to_s.split(".")
288
+ h = m = 0
289
+ if tokens.size == 1
290
+ s = time_with_no_unit
291
+ ms = 0
292
+ else
293
+ s = tokens[0].to_i
294
+ ms = tokens[1].to_i
295
+ end
296
+ h = s / 3600
297
+ m = (s / 60) % 60
298
+ s = s % 60
299
+ end
300
+ end
301
+ elapsed_seconds = (h * 60 * 60) + (m * 60) + s
302
+ return [h, m, s, ms, elapsed_seconds]
303
+ end
304
+
305
+
306
+ #
307
+ # Method to write the cue details to the output files
308
+ #
309
+ # * +model+ - Cue instance
310
+ # * +file_map+ - Hash of files for each caption type
311
+ # * +last_cue+ - true for last cue and false otherwise
312
+ #
313
+ def write_cue(model, file_map, last_cue = false)
314
+ file_map.each do |type, file_path|
315
+ File.open(file_path, "a") do |f|
316
+ f.puts new_cue(model, type, last_cue)
317
+ end
318
+ end
319
+ if last_cue
320
+ # Pretty print the output for ttml & dfxp
321
+ file_map.each do |type, file_path|
322
+ next unless [AllFather::TYPE_DFXP, AllFather::TYPE_TTML].include?(type)
323
+ file = File.open(file_path, "r")
324
+ xml_doc = Nokogiri::XML(file, &:noblanks)
325
+ File.write(file_path, xml_doc.to_s)
326
+ end
327
+ end
328
+ end
329
+ end
@@ -0,0 +1,40 @@
1
+ class CueInfo
2
+ def initialize(type)
3
+ @type = type
4
+ @start = @end = @sequence = nil
5
+ @message = ""
6
+ @start_time_units = []
7
+ @end_time_units = []
8
+ @index = 1
9
+ end
10
+
11
+ attr_reader :type, :start, :end, :sequence, :message, :start_time_units, :end_time_units, :index
12
+
13
+ def start=(start)
14
+ @start = start
15
+ end
16
+
17
+ def end=(end_point)
18
+ @end = end_point
19
+ end
20
+
21
+ def message=(msg)
22
+ @message = msg
23
+ end
24
+
25
+ def sequence=(seq)
26
+ @sequence = seq
27
+ end
28
+
29
+ def index=(index)
30
+ @index = index
31
+ end
32
+
33
+ def start_time_units=(units)
34
+ @start_time_units = units
35
+ end
36
+
37
+ def end_time_units=(units)
38
+ @end_time_units = units
39
+ end
40
+ end
data/lib/vtt.rb CHANGED
@@ -1,4 +1,6 @@
1
1
  require_relative "engines/translator"
2
+ require_relative "utils/common_utils"
3
+ require_relative "utils/cue_info"
2
4
  require_relative "allfather"
3
5
 
4
6
  #
@@ -10,13 +12,20 @@ require_relative "allfather"
10
12
  class VTT
11
13
 
12
14
  include AllFather
15
+ include CommonUtils
13
16
 
14
- def initialize(cc_file, translator)
17
+ SUPPORTED_TRANSFORMATIONS = [TYPE_SCC, TYPE_SRT, TYPE_TTML, TYPE_DFXP]
18
+
19
+ def initialize(cc_file)
15
20
  @cc_file = cc_file
16
- @translator = translator
17
21
  raise "Invalid VTT file provided" unless is_valid?
18
22
  end
19
23
 
24
#
# Method to register the translator engine on this instance
#
# * +translator+ - engine handed to the inherited set_translator and
#                  then kept in @translator
#
def set_translator(translator)
  # Bare super forwards the same argument to the inherited implementation
  super
  @translator = translator
end
28
+
20
29
  def translate(src_lang, dest_lang, out_file)
21
30
  super(src_lang, dest_lang, out_file)
22
31
  begin
@@ -53,7 +62,6 @@ class VTT
53
62
  outfile.puts
54
63
  end
55
64
  ensure
56
- ccfile.close rescue nil
57
65
  outfile.close
58
66
  end
59
67
  end
@@ -85,6 +93,69 @@ class VTT
85
93
  return false
86
94
  end
87
95
 
96
#
# Method to report the caption types this class can be transformed to
#
# Returns the SUPPORTED_TRANSFORMATIONS constant itself (not a copy)
#
def supported_transformations
  SUPPORTED_TRANSFORMATIONS
end
99
+
100
#
# Method to transform the VTT caption file into other caption types
#
# * +types+       - caption type constants to generate
# * +src_lang+    - passed to the inherited validation
# * +target_lang+ - passed to the inherited validation and to create_file
# * +output_dir+  - directory that receives one output file per type
#
# Raises StandardError when an output file cannot be created and
# AllFather::InvalidInputException when no cue timing line is found
#
def transform_to(types, src_lang, target_lang, output_dir)
  # Let's start off with some validations
  super(types, src_lang, target_lang, output_dir)

  # Suffix output dir with File separator
  output_dir = "#{output_dir}#{File::Separator}" unless output_dir.end_with?(File::Separator)

  # Prepare the output files for each type. The base name does not
  # depend on the type, so work it out once.
  base_name = File.basename(@cc_file, File.extname(@cc_file))
  file_map = {}
  types.each do |type|
    out_file = "#{output_dir}#{base_name}#{extension_from_type(type)}"
    unless create_file(TYPE_VTT, type, out_file, target_lang)
      raise StandardError.new("Failed to create output file for type #{type}")
    end
    file_map[type] = out_file
  end

  # Read the file and prepare the cue model
  cue_info = nil
  message = ""
  collect_msg = false
  cue_index = 1
  content = File.open(@cc_file, 'r:UTF-8', &:read)
  content.each_line do |line|
    # A blank line ends the text of the current cue
    if line.strip.empty?
      collect_msg = false
      next
    end

    time_points = line.scan(/^((\d\d:)+\d\d[.,]\d\d\d)\s-->\s((\d\d:)+\d\d[.,]\d\d\d)/)
    if time_points.empty?
      # Not a timing line: it is cue text only while a cue is open
      message << line if collect_msg
      next
    end

    # A new timing line: flush the previous cue if it collected any text
    unless message.empty?
      cue_info.message = message
      write_cue(cue_info, file_map)
      message = ""
      cue_index += 1
    end

    # This is a cue point. Fetch timestamps
    cue_info = CueInfo.new(TYPE_VTT)
    cue_info.index = cue_index
    cue_info.start = time_points[0][0]
    cue_info.end = time_points[0][2]
    cue_info.start_time_units = time_details(cue_info.start, TYPE_VTT)
    cue_info.end_time_units = time_details(cue_info.end, TYPE_VTT)
    collect_msg = true
  end

  # Without a single timing line there is no cue to emit; fail with a
  # clear validation error instead of handing nil to write_cue.
  if cue_info.nil?
    raise AllFather::InvalidInputException.new("No cues found in VTT file #{@cc_file}")
  end

  cue_info.message = message unless message.empty?
  write_cue(cue_info, file_map, true)
end
158
+
88
159
  private
89
160
 
90
161
  #
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: subtitle
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.6
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Maheshwaran G
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2019-11-11 00:00:00.000000000 Z
12
+ date: 2019-11-13 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -67,6 +67,34 @@ dependencies:
67
67
  - - "~>"
68
68
  - !ruby/object:Gem::Version
69
69
  version: '10.0'
70
+ - !ruby/object:Gem::Dependency
71
+ name: minitest
72
+ requirement: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - ">="
75
+ - !ruby/object:Gem::Version
76
+ version: '0'
77
+ type: :development
78
+ prerelease: false
79
+ version_requirements: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ version: '0'
84
+ - !ruby/object:Gem::Dependency
85
+ name: optimist
86
+ requirement: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - ">="
89
+ - !ruby/object:Gem::Version
90
+ version: '0'
91
+ type: :development
92
+ prerelease: false
93
+ version_requirements: !ruby/object:Gem::Requirement
94
+ requirements:
95
+ - - ">="
96
+ - !ruby/object:Gem::Version
97
+ version: '0'
70
98
  description: Subtitle gem helps you to detect language and translate closed caption
71
99
  to required language.
72
100
  email:
@@ -85,6 +113,8 @@ files:
85
113
  - lib/srt.rb
86
114
  - lib/subtitle.rb
87
115
  - lib/ttml.rb
116
+ - lib/utils/common_utils.rb
117
+ - lib/utils/cue_info.rb
88
118
  - lib/vtt.rb
89
119
  homepage: https://github.com/cloudaffair/subtitle
90
120
  licenses: