subtitle 0.3.0 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 49e45fb2713aedd5d6d7d6d290fe4874a292df3249cfad7259913e90b0cb7fd8
4
- data.tar.gz: 7f4535875a19028db4ec08de90903daba7b906b659571c5a921850071bf3154c
3
+ metadata.gz: bd5a110529cb49076b699028d4aba501f92d1b9af517363ecf9d0066ca67bfcd
4
+ data.tar.gz: 6e499c8e56be6748699f1e9210eaa14c982eac3ea6c425460e75e45ee941a1dc
5
5
  SHA512:
6
- metadata.gz: 0b53144f0a627a545c0a989d664f3611078a7afc11e9f00b479065a0d3a1b2bc9bf68e10706bd89b85c0e73ff53d7c4627a2c5e29d38867ec2882c99ea56eda0
7
- data.tar.gz: 6999ae152b2f5904a2061944522b11387280df6e02cbf4d36d7d5ae27ba12eb3b6c36357bba791c5a7b4f06575531ce76df904dcbb76b0e5f79bd62b05988704
6
+ metadata.gz: 027ee941cfd582de1c98f7d7aae4a7673bcaf51b7af4cba9df0bef0f3fb16437ad90e760bd61adcea0a4487094d3f7ade675635aff69e40881b36240ec17253c
7
+ data.tar.gz: b0b3763969201e7f98883463292f59c9583b103251a14879853ae923a5a22259e2d677c175fefbfd25f45ecfe589eb157fec5d7b891364b30cb9714f8ec65e36
@@ -63,17 +63,18 @@ module AllFather
63
63
  #
64
64
  # * +translator+ - Instance of translation engine. Refer to `engines/aws` for example
65
65
  #
66
+ # ==== Raises
67
+ # * `InvalidInputException` when the argument `translator` is not an instance of Translator class
68
+ #
66
69
  def set_translator(translator)
67
70
  if translator && !(translator.is_a? Translator)
68
- raise "Argument is not an instance of Translator"
71
+ raise InvalidInputException.new("Argument is not an instance of Translator")
69
72
  end
70
73
  end
71
74
 
72
75
  #
73
76
  # Method to translate the caption from one language to another
74
77
  #
75
- # :args: src_lang, target_lang, output_file
76
- #
77
78
  # * +src_lang+ - can be inferred using #infer_language method
78
79
  # * +target_lang+ - Target 2 letter ISO language code to which the source needs to be translated into.
79
80
  # * +output_file+ - Output file. Can be a fully qualified path or just file name
@@ -114,6 +115,9 @@ module AllFather
114
115
  # If no target_lang is provided, no translations are applied. output_file is created using
115
116
  # without any need for any language translation services. Hence doesn't incur any cost !!
116
117
  #
118
+ # Note: +src_lang+ makes sense only for caption types that can hold multi lingual captions
119
+ # like dfxp and ttml. For other caption sources this field is ignored
120
+ #
117
121
  # * +types+ - An array of Valid input caption type(s). Refer to `#CaptionType`
118
122
  # * +src_lang+ - can be inferred using #infer_language method
119
123
  # * +target_lang+ - Target 2 letter ISO language code to which the source needs to be translated into.
@@ -140,18 +144,31 @@ module AllFather
140
144
  raise InvalidInputException.new("SCC can be generated only in en. #{target_lang} is unsupported")
141
145
  end
142
146
  end
143
- if target_lang && !target_lang.empty?
144
- raise InvalidInputException.new("Translation to other language as part of transform is yet to be implemented")
145
- end
146
147
  end
147
148
 
148
149
  #
149
150
  # Method to report on the supported transformations. Each implementor is free to return
150
151
  # the types to which it can convert itself to
151
152
  #
152
- # Returns an array of one or more types defined as +TYPE_+ constants here
153
+ # ==== Returns
154
+ #
155
+ # * An array of one or more types defined as +TYPE_+ constants here
153
156
  #
154
157
  def supported_transformations
155
158
  raise "Not Implemented. Class #{self.class.name} doesn't implement supported_transformations"
156
159
  end
160
+
161
+ #
162
+ # While the logic of abstracting stuff to callers has its benefits, sometimes it's required
163
+ # to identify which instance we are specifically operating on. This method returns the instance
164
+ # currently being operated on and returns one of the +TYPE_+ constants defined here
165
+ # Implement this only when it's absolutely necessary and there is no other easy way to do things
166
+ #
167
+ # ===== Returns
168
+ #
169
+ # * the call sign of the instance
170
+ #
171
+ def callsign
172
+ raise "Not Implemented. Class #{self.class.name} doesn't implement callsign"
173
+ end
157
174
  end
@@ -5,16 +5,17 @@ require_relative "ttml"
5
5
  #
6
6
  # Library to handle DFXP Files
7
7
  #
8
- # Uses the translator available to do the necessary language operations
9
- # as defined by the AllFather
8
+ # Extends the TTML Class as except for namespace differences there isn't
9
+ # much to call between ttml and dfxp
10
10
  #
11
11
  class DFXP < TTML
12
12
 
13
- def initialize(cc_file)
13
+ SUPPORTED_TRANSFORMATIONS = [TYPE_SCC, TYPE_SRT, TYPE_VTT, TYPE_TTML]
14
+
15
+ def initialize(cc_file, opts=nil)
14
16
  @cc_file = cc_file
15
- #@translator = translator
16
- #@force_detect = opts[:force_detect] || false
17
- raise "Invalid TTML file provided" unless is_valid?
17
+ @force_detect = opts ? (opts[:force_detect] || false) : false
18
+ raise "Invalid DFXP file provided" unless is_valid?
18
19
  end
19
20
 
20
21
  def is_valid?
@@ -26,5 +27,12 @@ class DFXP < TTML
26
27
  # a well-formed XML. Another is to see if lang is available in each div
27
28
  return false
28
29
  end
29
-
30
+
31
+ def callsign
32
+ TYPE_DFXP
33
+ end
34
+
35
+ def supported_transformations
36
+ return SUPPORTED_TRANSFORMATIONS
37
+ end
30
38
  end
@@ -11,8 +11,8 @@ require_relative 'translator'
11
11
  #
12
12
  # * [Arguments] - Pass the credentials access_key_id and secret_access_key as arguments
13
13
  # * [Environment route] - AWS_ACCESS_KEY_ID & AWS_SECRET_ACCESS_KEY can be exposed as environment variables
14
- # * [Profile Name] - The application uses the credentials of the system and picks the
15
- # credentials referred to by the profile
14
+ # * [Profile Name] - The application uses the credentials of the system and picks the credentials
15
+ # referred to by the profile
16
16
  #
17
17
  class AwsEngine
18
18
  include Translator
data/lib/srt.rb CHANGED
@@ -94,11 +94,21 @@ class SRT
94
94
 
95
95
  # Suffix output dir with File separator
96
96
  output_dir = "#{output_dir}#{File::Separator}" unless output_dir.end_with?(File::Separator)
97
+
98
+ translate = false
99
+ if target_lang && !target_lang.empty?
100
+ translate = true
101
+ if @translator.nil?
102
+ raise StandardError.new("Cannot infer language as engine options are not provided")
103
+ end
104
+ end
97
105
 
98
106
  # Prepare the output files for each type
99
107
  file_map = {}
100
108
  types.each do |type|
101
- output_file = File.basename(@cc_file, File.extname(@cc_file)) + extension_from_type(type)
109
+ output_file = File.basename(@cc_file, File.extname(@cc_file))
110
+ output_file << "_#{target_lang}" if translate
111
+ output_file << extension_from_type(type)
102
112
  out_file = "#{output_dir}#{output_file}"
103
113
  if create_file(TYPE_SRT, type, out_file, target_lang)
104
114
  file_map[type] = out_file
@@ -119,7 +129,7 @@ class SRT
119
129
  # This is not a time point
120
130
  seq = line.strip
121
131
  if seq.to_i > 0
122
- cue_info.message = message unless message.empty?
132
+ cue_info.message = translated_msg(translate, message, src_lang, target_lang) unless message.empty?
123
133
  write_cue(cue_info, file_map) if cue_info
124
134
  cue_info = CueInfo.new(TYPE_SRT)
125
135
  cue_info.sequence = seq
@@ -140,12 +150,42 @@ class SRT
140
150
  cue_info.end_time_units = end_units
141
151
  end
142
152
  end
143
- cue_info.message = message unless message.empty?
153
+ cue_info.message = translated_msg(translate, message, src_lang, target_lang) unless message.empty?
144
154
  write_cue(cue_info, file_map, true)
145
155
  end
146
156
 
147
157
  private
148
158
 
159
+ #
160
+ # Method to translate a given text message based on following conditions
161
+ #
162
+ # * If translate is false, the message is returned as is
163
+ # * If +src_lang+ and +target_lang+ are same then the message is returned as is
164
+ # * If +src_lang+ is nil or empty then this caption file will be inspected to infer language
165
+ # and if it's same as target_lang, then again the message shall be returned as is
166
+ # * Otherwise, returns a translated text
167
+ #
168
+ # ==== Raise
169
+ # * LangDetectionFailureException - If failed to infer the language
170
+ #
171
+ def translated_msg(translate, message, src_lang, target_lang)
172
+ return message unless translate
173
+ use_src = nil
174
+ if (src_lang.nil? || src_lang.empty?)
175
+ # We don't need to infer again and again
176
+ begin
177
+ @inferred_src_lang ||= infer_languages.first
178
+ rescue StandardError => e
179
+ raise LangDetectionFailureException.new("Failed to infer language due to #{e.message}")
180
+ end
181
+ use_src = @inferred_src_lang
182
+ else
183
+ use_src = src_lang
184
+ end
185
+ return message if use_src.eql?(target_lang)
186
+ @translator.translate(message, use_src, target_lang)
187
+ end
188
+
149
189
  #
150
190
  # Method to get a minimal amount of key text that excludes any tags
151
191
  # or control information for the engine to meaningfully and
@@ -58,7 +58,7 @@ class Subtitle
58
58
  end
59
59
  # Translator not required if target_lang is nil
60
60
  if @handler.nil?
61
- if target_lang.nil?
61
+ if target_lang.nil? && src_lang.nil?
62
62
  @handler = get_caption_handler(options, nil)
63
63
  else
64
64
  initialize_handler(options)
@@ -146,9 +146,9 @@ class Subtitle
146
146
  when ".vtt"
147
147
  handler = VTT.new(caption_file)
148
148
  when ".ttml"
149
- handler = TTML.new(caption_file)
149
+ handler = TTML.new(caption_file, options)
150
150
  when ".dfxp"
151
- handler = DFXP.new(caption_file)
151
+ handler = DFXP.new(caption_file, options)
152
152
  else
153
153
  raise "Cannot handle file type .#{extension}"
154
154
  end
@@ -1,4 +1,6 @@
1
1
  require_relative "engines/translator"
2
+ require_relative "utils/common_utils"
3
+ require_relative "utils/cue_info"
2
4
  require_relative "allfather"
3
5
 
4
6
  require "nokogiri"
@@ -12,12 +14,20 @@ require "nokogiri"
12
14
  class TTML
13
15
 
14
16
  include AllFather
17
+ include CommonUtils
15
18
 
16
- def initialize(cc_file)
19
+ SUPPORTED_TRANSFORMATIONS = [TYPE_SCC, TYPE_SRT, TYPE_VTT, TYPE_DFXP]
20
+
21
+ def initialize(cc_file, opts=nil)
17
22
  @cc_file = cc_file
23
+ @force_detect = opts ? (opts[:force_detect] || false) : false
18
24
  raise "Invalid TTML file provided" unless is_valid?
19
25
  end
20
26
 
27
+ def callsign
28
+ TYPE_TTML
29
+ end
30
+
21
31
  def is_valid?
22
32
  # Do any TTML specific validations here
23
33
  if @cc_file =~ /^.*\.(ttml)$/
@@ -33,12 +43,12 @@ class TTML
33
43
  end
34
44
 
35
45
  def infer_languages
36
- force_detect = false
37
46
  lang = []
38
47
  begin
39
48
  xml_file = File.open(@cc_file)
40
49
  xml_doc = Nokogiri::XML(xml_file)
41
50
  div_objects = xml_doc.css("/tt/body/div")
51
+ local_force_detect = false
42
52
  div_objects.each_with_index do |div, index|
43
53
  # By default, return the lang if specified in the div and
44
54
  # force detect is false
@@ -46,9 +56,10 @@ class TTML
46
56
  if inferred_lang.nil?
47
57
  # If lang is not provided in the caption, then override
48
58
  # force detect for inferrence
49
- force_detect = true
59
+ local_force_detect = true
50
60
  end
51
- if force_detect
61
+ if @force_detect || local_force_detect
62
+ local_force_detect = false
52
63
  sample_text = get_text(div, 100)
53
64
  inferred_lang = @translator.infer_language(sample_text) rescue nil
54
65
  if inferred_lang.nil?
@@ -115,8 +126,165 @@ class TTML
115
126
  out_file
116
127
  end
117
128
 
129
+ def supported_transformations
130
+ return SUPPORTED_TRANSFORMATIONS
131
+ end
132
+
133
+ def transform_to(types, src_lang, target_lang, output_dir)
134
+ # Let's start off with some validations
135
+ super(types, src_lang, target_lang, output_dir)
136
+
137
+ # Suffix output dir with File separator
138
+ output_dir = "#{output_dir}#{File::Separator}" unless output_dir.end_with?(File::Separator)
139
+
140
+ begin
141
+ xml_file = File.open(@cc_file, 'r')
142
+ xml_doc = Nokogiri::XML(xml_file)
143
+ div_objects = xml_doc.css("/tt/body/div")
144
+ langs = div_objects.map {|div| div.attributes['lang'].value rescue nil}
145
+ translate = false
146
+ matching_divs = []
147
+ inferred_src_lang = nil
148
+ if src_lang.nil? || src_lang.empty?
149
+ if target_lang && !target_lang.empty?
150
+ # Find if any of our div matches this. Else pick first and translate to target lang
151
+ div_objects.each_with_index do |div, j|
152
+ if matching_lang?(div, target_lang)
153
+ matching_divs << div
154
+ break
155
+ end
156
+ end
157
+ if matching_divs.empty?
158
+ # Let's pick the first div for target translation
159
+ selected_div = div_objects.first
160
+ inferred_src_lang = selected_div.lang
161
+ matching_divs << selected_div
162
+ translate = true
163
+ end
164
+ else
165
+ # Then we will have to create output files for each lang
166
+ matching_divs = div_objects
167
+ end
168
+ else
169
+ # Find the matching lang div and create the outputs
170
+ available_divs = langs.select { |lang| lang.eql?(src_lang) }
171
+ if available_divs.length > 1
172
+ raise InvalidInputException.new("More than one section in Caption file specifies lang as #{src_lang}. This file is unsupported")
173
+ end
174
+ div_objects.each_with_index do |div, j|
175
+ if matching_lang?(div, src_lang)
176
+ matching_divs << div
177
+ break
178
+ end
179
+ end
180
+ if matching_divs.empty?
181
+ raise InvalidInputException.new("Given Caption file #{@cc_file} doesn't contain #{src_lang} lang. Available langs are #{langs}")
182
+ end
183
+ if matching_divs.length > 1
184
+ raise InvalidInputException.new("More than one section in Caption file specifies lang as #{src_lang}. This file is unsupported")
185
+ end
186
+ if target_lang && !target_lang.empty? && !src_lang.eql?(target_lang)
187
+ translate = true
188
+ end
189
+ end
190
+
191
+ div_index = 1
192
+ multiple_outputs = matching_divs.size > 1
193
+ matching_divs.each do |div|
194
+ div_lang = div.attributes['lang'].value rescue nil
195
+ # Override div lang if translate is required
196
+ div_lang = target_lang if translate
197
+ file_map = {}
198
+ # Prepare the output files for each type and for each lang in the file
199
+ types.each do |type|
200
+ output_file = File.basename(@cc_file, File.extname(@cc_file))
201
+ # Suffix div index when multiple outputs are created
202
+ output_file << "_#{div_index}" if multiple_outputs
203
+ if target_lang.nil? && !src_lang.nil?
204
+ output_file << "_#{src_lang}"
205
+ end
206
+ # Suffix lang to filename if provided
207
+ if target_lang && !target_lang.empty?
208
+ output_file << "_#{target_lang}"
209
+ end
210
+ output_file << extension_from_type(type)
211
+ out_file = "#{output_dir}#{output_file}"
212
+ if create_file(TYPE_TTML, type, out_file, div_lang)
213
+ file_map[type] = out_file
214
+ else
215
+ raise StandardError.new("Failed to create output file for type #{type}")
216
+ end
217
+ end
218
+ blocks = div.css("p")
219
+ cue_index = 1
220
+ total_blocks = blocks.size
221
+ blocks.each_with_index do |block, index|
222
+ start_time = block.attributes['begin'].value
223
+ end_time = block.attributes['end'].value
224
+ text = block.inner_html.strip.gsub(/(\s){2,}/, '')
225
+ message = ""
226
+ text_blocks = get_block_text(text)
227
+ text_blocks.each do |text_block|
228
+ next if text_block.start_with?('<') || text_block.empty?
229
+ message << text_block
230
+ end
231
+ cue_info = CueInfo.new(callsign)
232
+ cue_info.index = cue_index
233
+ cue_index += 1
234
+ cue_info.message = translated_msg(translate, message, src_lang, inferred_src_lang, target_lang)
235
+ cue_info.start = start_time
236
+ cue_info.end = end_time
237
+ cue_info.start_time_units = time_details(start_time, callsign)
238
+ cue_info.end_time_units = time_details(end_time, callsign)
239
+ write_cue(cue_info, file_map, index == (total_blocks - 1))
240
+ end
241
+ div_index += 1
242
+ end
243
+ ensure
244
+ xml_file.close if xml_file
245
+ end
246
+ end
247
+
118
248
  private
119
249
 
250
+ def translated_msg(translate, message, src_lang, inferred_src_lang, target_lang)
251
+ return message unless translate
252
+ use_src = nil
253
+ if (src_lang.nil? || src_lang.empty?)
254
+ if inferred_src_lang.nil?
255
+ raise LangDetectionFailureException.new("Unable to deduce source lang for translation")
256
+ end
257
+ use_src = inferred_src_lang
258
+ else
259
+ use_src = src_lang
260
+ end
261
+ return message if use_src.eql?(target_lang)
262
+ @translator.translate(message, use_src, target_lang)
263
+ end
264
+
265
+ def matching_lang?(div, target_lang)
266
+ lang = div.attributes['lang'].value rescue nil
267
+ if lang.nil?
268
+ # Let's infer the lang
269
+ if @translator.nil?
270
+ raise StandardError.new("Cannot infer language as engine options are not provided")
271
+ end
272
+ reference_text = get_text(div, 100)
273
+ inferred_lang = @translator.infer_language(reference_text) rescue nil
274
+ if inferred_lang.nil?
275
+ raise LangDetectionFailureException.new("Failed to infer language for div block #{j} of caption file")
276
+ end
277
+ # Store this lang in the div
278
+ div.lang = inferred_lang
279
+ if inferred_lang.eql?(target_lang)
280
+ return true
281
+ end
282
+ elsif lang.eql?(target_lang)
283
+ return true
284
+ end
285
+ return false
286
+ end
287
+
120
288
  #
121
289
  # Method to segregate the data from markups as markups don't need
122
290
  # translations.
data/lib/vtt.rb CHANGED
@@ -104,10 +104,19 @@ class VTT
104
104
  # Suffix output dir with File separator
105
105
  output_dir = "#{output_dir}#{File::Separator}" unless output_dir.end_with?(File::Separator)
106
106
 
107
+ translate = false
108
+ if target_lang && !target_lang.empty?
109
+ translate = true
110
+ if @translator.nil?
111
+ raise StandardError.new("Cannot infer language as engine options are not provided")
112
+ end
113
+ end
107
114
  # Prepare the output files for each type
108
115
  file_map = {}
109
116
  types.each do |type|
110
- output_file = File.basename(@cc_file, File.extname(@cc_file)) + extension_from_type(type)
117
+ output_file = File.basename(@cc_file, File.extname(@cc_file))
118
+ output_file << "_#{target_lang}" if translate
119
+ output_file << extension_from_type(type)
111
120
  out_file = "#{output_dir}#{output_file}"
112
121
  if create_file(TYPE_VTT, type, out_file, target_lang)
113
122
  file_map[type] = out_file
@@ -135,7 +144,7 @@ class VTT
135
144
  else
136
145
  collect_msg = false
137
146
  unless message.empty?
138
- cue_info.message = message
147
+ cue_info.message = translated_msg(translate, message, src_lang, target_lang)
139
148
  write_cue(cue_info, file_map)
140
149
  message = ""
141
150
  cue_index += 1
@@ -152,11 +161,41 @@ class VTT
152
161
  collect_msg = true
153
162
  end
154
163
  end
155
- cue_info.message = message unless message.empty?
164
+ cue_info.message = translated_msg(translate, message, src_lang, target_lang) unless message.empty?
156
165
  write_cue(cue_info, file_map, true)
157
166
  end
158
167
 
159
- private
168
+ private
169
+
170
+ #
171
+ # Method to translate a given text message based on following conditions
172
+ #
173
+ # * If translate is false, the message is returned as is
174
+ # * If +src_lang+ and +target_lang+ are same then the message is returned as is
175
+ # * If +src_lang+ is nil or empty then this caption file will be inspected to infer language
176
+ # and if it's same as target_lang, then again the message shall be returned as is
177
+ # * Otherwise, returns a translated text
178
+ #
179
+ # ==== Raise
180
+ # * LangDetectionFailureException - If failed to infer the language
181
+ #
182
+ def translated_msg(translate, message, src_lang, target_lang)
183
+ return message unless translate
184
+ use_src = nil
185
+ if (src_lang.nil? || src_lang.empty?)
186
+ # We don't need to infer again and again
187
+ begin
188
+ @inferred_src_lang ||= infer_languages.first
189
+ rescue StandardError => e
190
+ raise LangDetectionFailureException.new("Failed to infer language due to #{e.message}")
191
+ end
192
+ use_src = @inferred_src_lang
193
+ else
194
+ use_src = src_lang
195
+ end
196
+ return message if use_src.eql?(target_lang)
197
+ @translator.translate(message, use_src, target_lang)
198
+ end
160
199
 
161
200
  #
162
201
  # Method to get a minimal amount of key text that excludes any tags
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: subtitle
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Maheshwaran G
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2019-11-13 00:00:00.000000000 Z
12
+ date: 2019-11-17 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -95,8 +95,10 @@ dependencies:
95
95
  - - ">="
96
96
  - !ruby/object:Gem::Version
97
97
  version: '0'
98
- description: Subtitle gem helps you to detect language and translate closed caption
99
- to required language.
98
+ description: " Subtitle gem helps you to detect the language(s)
99
+ of the caption file, translate closed caption \n to another
100
+ language and also supports transforming from one format to another. \n Say
101
+ for example from dfxp to srt or vtt or to all supported formats.\"\n"
100
102
  email:
101
103
  - pgmaheshwaran@gmail.com
102
104
  - arunjeyaprasad@gmail.com