subtitle 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 49e45fb2713aedd5d6d7d6d290fe4874a292df3249cfad7259913e90b0cb7fd8
4
- data.tar.gz: 7f4535875a19028db4ec08de90903daba7b906b659571c5a921850071bf3154c
3
+ metadata.gz: bd5a110529cb49076b699028d4aba501f92d1b9af517363ecf9d0066ca67bfcd
4
+ data.tar.gz: 6e499c8e56be6748699f1e9210eaa14c982eac3ea6c425460e75e45ee941a1dc
5
5
  SHA512:
6
- metadata.gz: 0b53144f0a627a545c0a989d664f3611078a7afc11e9f00b479065a0d3a1b2bc9bf68e10706bd89b85c0e73ff53d7c4627a2c5e29d38867ec2882c99ea56eda0
7
- data.tar.gz: 6999ae152b2f5904a2061944522b11387280df6e02cbf4d36d7d5ae27ba12eb3b6c36357bba791c5a7b4f06575531ce76df904dcbb76b0e5f79bd62b05988704
6
+ metadata.gz: 027ee941cfd582de1c98f7d7aae4a7673bcaf51b7af4cba9df0bef0f3fb16437ad90e760bd61adcea0a4487094d3f7ade675635aff69e40881b36240ec17253c
7
+ data.tar.gz: b0b3763969201e7f98883463292f59c9583b103251a14879853ae923a5a22259e2d677c175fefbfd25f45ecfe589eb157fec5d7b891364b30cb9714f8ec65e36
@@ -63,17 +63,18 @@ module AllFather
63
63
  #
64
64
  # * +translator+ - Instance of translation engine. Refer to `engines/aws` for example
65
65
  #
66
+ # ==== Raises
67
+ # * `InvalidInputException` when the argument `translator` is not an instance of Translator class
68
+ #
66
69
  def set_translator(translator)
67
70
  if translator && !(translator.is_a? Translator)
68
- raise "Argument is not an instance of Translator"
71
+ raise InvalidInputException.new("Argument is not an instance of Translator")
69
72
  end
70
73
  end
71
74
 
72
75
  #
73
76
  # Method to translate the caption from one language to another
74
77
  #
75
- # :args: src_lang, target_lang, output_file
76
- #
77
78
  # * +src_lang+ - can be inferred using #infer_language method
78
79
  # * +target_lang+ - Target 2 letter ISO language code to which the source needs to be translated in to.
79
80
  # * +output_file+ - Output file. Can be a fully qualified path or just file name
@@ -114,6 +115,9 @@ module AllFather
114
115
  # If no target_lang is provided, no translations are applied. output_file is created using
115
116
  # without any need for any language translation services. Hence doesn't incur any cost !!
116
117
  #
118
+ # Note: +src_lang+ makes sense only for caption types that can hold multi lingual captions
119
+ # like dfxp and ttml. For other caption sources this field is ignored
120
+ #
117
121
  # * +types+ - An array of Valid input caption type(s). Refer to `#CaptionType`
118
122
  # * +src_lang+ - can be inferred using #infer_language method
119
123
  # * +target_lang+ - Target 2 letter ISO language code to which the source needs to be translated in to.
@@ -140,18 +144,31 @@ module AllFather
140
144
  raise InvalidInputException.new("SCC can be generated only in en. #{target_lang} is unsupported")
141
145
  end
142
146
  end
143
- if target_lang && !target_lang.empty?
144
- raise InvalidInputException.new("Translation to other language as part of transform is yet to be implemented")
145
- end
146
147
  end
147
148
 
148
149
  #
149
150
  # Method to report on the supported transformations. Each implementor is free to return
150
151
  # the types to which it can convert itself to
151
152
  #
152
- # Returns an array of one or more types defined as +TYPE_+ constants here
153
+ # ==== Returns
154
+ #
155
+ # * An array of one or more types defined as +TYPE_+ constants here
153
156
  #
154
157
  def supported_transformations
155
158
  raise "Not Implemented. Class #{self.class.name} doesn't implement supported_transformations"
156
159
  end
160
+
161
+ #
162
+ # While abstracting the concrete type away from callers has its benefits, sometimes it's required
163
+ # to identify which specific instance we are operating on. This method identifies the instance
164
+ # currently being operated on by returning one of the +TYPE_+ constants defined here.
165
+ # Use this only when it's absolutely necessary and there is no other easy way to do things
166
+ #
167
+ # ===== Returns
168
+ #
169
+ # * the call sign of the instance
170
+ #
171
+ def callsign
172
+ raise "Not Implemented. Class #{self.class.name} doesn't implement callsign"
173
+ end
157
174
  end
@@ -5,16 +5,17 @@ require_relative "ttml"
5
5
  #
6
6
  # Library to handle DFXP Files
7
7
  #
8
- # Uses the translator available to do the necessary language operations
9
- # as defined by the AllFather
8
+ # Extends the TTML class since, apart from namespace differences, there is
9
+ # little to distinguish between ttml and dfxp
10
10
  #
11
11
  class DFXP < TTML
12
12
 
13
- def initialize(cc_file)
13
+ SUPPORTED_TRANSFORMATIONS = [TYPE_SCC, TYPE_SRT, TYPE_VTT, TYPE_TTML]
14
+
15
+ def initialize(cc_file, opts=nil)
14
16
  @cc_file = cc_file
15
- #@translator = translator
16
- #@force_detect = opts[:force_detect] || false
17
- raise "Invalid TTML file provided" unless is_valid?
17
+ @force_detect = opts ? (opts[:force_detect] || false) : false
18
+ raise "Invalid DFXP file provided" unless is_valid?
18
19
  end
19
20
 
20
21
  def is_valid?
@@ -26,5 +27,12 @@ class DFXP < TTML
26
27
  # a well-formed XML. Another is to see if lang is available in each div
27
28
  return false
28
29
  end
29
-
30
+
31
+ def callsign
32
+ TYPE_DFXP
33
+ end
34
+
35
+ def supported_transformations
36
+ return SUPPORTED_TRANSFORMATIONS
37
+ end
30
38
  end
@@ -11,8 +11,8 @@ require_relative 'translator'
11
11
  #
12
12
  # * [Arguments] - Pass the credentials access_key_id and secret_access_key as arguments
13
13
  # * [Environment route] - AWS_ACCESS_KEY_ID & AWS_SECRET_ACCESS_KEY can be exposed as environment variables
14
- # * [Profile Name] - The application uses the credentials of the system and picks the
15
- # credentials referred to by the profile
14
+ # * [Profile Name] - The application uses the credentials of the system and picks the credentials
15
+ # referred to by the profile
16
16
  #
17
17
  class AwsEngine
18
18
  include Translator
data/lib/srt.rb CHANGED
@@ -94,11 +94,21 @@ class SRT
94
94
 
95
95
  # Suffix output dir with File separator
96
96
  output_dir = "#{output_dir}#{File::Separator}" unless output_dir.end_with?(File::Separator)
97
+
98
+ translate = false
99
+ if target_lang && !target_lang.empty?
100
+ translate = true
101
+ if @translator.nil?
102
+ raise StandardError.new("Cannot infer language as engine options are not provided")
103
+ end
104
+ end
97
105
 
98
106
  # Prepare the output files for each type
99
107
  file_map = {}
100
108
  types.each do |type|
101
- output_file = File.basename(@cc_file, File.extname(@cc_file)) + extension_from_type(type)
109
+ output_file = File.basename(@cc_file, File.extname(@cc_file))
110
+ output_file << "_#{target_lang}" if translate
111
+ output_file << extension_from_type(type)
102
112
  out_file = "#{output_dir}#{output_file}"
103
113
  if create_file(TYPE_SRT, type, out_file, target_lang)
104
114
  file_map[type] = out_file
@@ -119,7 +129,7 @@ class SRT
119
129
  # This is not a time point
120
130
  seq = line.strip
121
131
  if seq.to_i > 0
122
- cue_info.message = message unless message.empty?
132
+ cue_info.message = translated_msg(translate, message, src_lang, target_lang) unless message.empty?
123
133
  write_cue(cue_info, file_map) if cue_info
124
134
  cue_info = CueInfo.new(TYPE_SRT)
125
135
  cue_info.sequence = seq
@@ -140,12 +150,42 @@ class SRT
140
150
  cue_info.end_time_units = end_units
141
151
  end
142
152
  end
143
- cue_info.message = message unless message.empty?
153
+ cue_info.message = translated_msg(translate, message, src_lang, target_lang) unless message.empty?
144
154
  write_cue(cue_info, file_map, true)
145
155
  end
146
156
 
147
157
  private
148
158
 
159
+ #
160
+ # Method to translate a given text message based on following conditions
161
+ #
162
+ # * If translate is false, the message is returned as is
163
+ # * If +src_lang+ and +target_lang+ are same then the message is returned as is
164
+ # * If +src_lang+ is nil or empty then this caption file will be inspected to infer language
165
+ # and if it's same as target_lang, then again the message shall be returned as is
166
+ # * Otherwise, returns a translated text
167
+ #
168
+ # ==== Raise
169
+ # * LangDetectionFailureException - If failed to infer the language
170
+ #
171
+ def translated_msg(translate, message, src_lang, target_lang)
172
+ return message unless translate
173
+ use_src = nil
174
+ if (src_lang.nil? || src_lang.empty?)
175
+ # We don't need to infer again and again
176
+ begin
177
+ @inferred_src_lang ||= infer_languages.first
178
+ rescue StandardError => e
179
+ raise LangDetectionFailureException.new("Failed to infer language due to #{e.message}")
180
+ end
181
+ use_src = @inferred_src_lang
182
+ else
183
+ use_src = src_lang
184
+ end
185
+ return message if use_src.eql?(target_lang)
186
+ @translator.translate(message, use_src, target_lang)
187
+ end
188
+
149
189
  #
150
190
  # Method to get a minimal amount of key text that excludes any tags
151
191
  # or control information for the engine to meaningfully and
@@ -58,7 +58,7 @@ class Subtitle
58
58
  end
59
59
  # Translator not required if target_lang is nil
60
60
  if @handler.nil?
61
- if target_lang.nil?
61
+ if target_lang.nil? && src_lang.nil?
62
62
  @handler = get_caption_handler(options, nil)
63
63
  else
64
64
  initialize_handler(options)
@@ -146,9 +146,9 @@ class Subtitle
146
146
  when ".vtt"
147
147
  handler = VTT.new(caption_file)
148
148
  when ".ttml"
149
- handler = TTML.new(caption_file)
149
+ handler = TTML.new(caption_file, options)
150
150
  when ".dfxp"
151
- handler = DFXP.new(caption_file)
151
+ handler = DFXP.new(caption_file, options)
152
152
  else
153
153
  raise "Cannot handle file type .#{extension}"
154
154
  end
@@ -1,4 +1,6 @@
1
1
  require_relative "engines/translator"
2
+ require_relative "utils/common_utils"
3
+ require_relative "utils/cue_info"
2
4
  require_relative "allfather"
3
5
 
4
6
  require "nokogiri"
@@ -12,12 +14,20 @@ require "nokogiri"
12
14
  class TTML
13
15
 
14
16
  include AllFather
17
+ include CommonUtils
15
18
 
16
- def initialize(cc_file)
19
+ SUPPORTED_TRANSFORMATIONS = [TYPE_SCC, TYPE_SRT, TYPE_VTT, TYPE_DFXP]
20
+
21
+ def initialize(cc_file, opts=nil)
17
22
  @cc_file = cc_file
23
+ @force_detect = opts ? (opts[:force_detect] || false) : false
18
24
  raise "Invalid TTML file provided" unless is_valid?
19
25
  end
20
26
 
27
+ def callsign
28
+ TYPE_TTML
29
+ end
30
+
21
31
  def is_valid?
22
32
  # Do any TTML specific validations here
23
33
  if @cc_file =~ /^.*\.(ttml)$/
@@ -33,12 +43,12 @@ class TTML
33
43
  end
34
44
 
35
45
  def infer_languages
36
- force_detect = false
37
46
  lang = []
38
47
  begin
39
48
  xml_file = File.open(@cc_file)
40
49
  xml_doc = Nokogiri::XML(xml_file)
41
50
  div_objects = xml_doc.css("/tt/body/div")
51
+ local_force_detect = false
42
52
  div_objects.each_with_index do |div, index|
43
53
  # By default, return the lang if specified in the div and
44
54
  # force detect is false
@@ -46,9 +56,10 @@ class TTML
46
56
  if inferred_lang.nil?
47
57
  # If lang is not provided in the caption, then override
48
58
  # force detect for inferrence
49
- force_detect = true
59
+ local_force_detect = true
50
60
  end
51
- if force_detect
61
+ if @force_detect || local_force_detect
62
+ local_force_detect = false
52
63
  sample_text = get_text(div, 100)
53
64
  inferred_lang = @translator.infer_language(sample_text) rescue nil
54
65
  if inferred_lang.nil?
@@ -115,8 +126,165 @@ class TTML
115
126
  out_file
116
127
  end
117
128
 
129
+ def supported_transformations
130
+ return SUPPORTED_TRANSFORMATIONS
131
+ end
132
+
133
+ def transform_to(types, src_lang, target_lang, output_dir)
134
+ # Let's start off with some validations
135
+ super(types, src_lang, target_lang, output_dir)
136
+
137
+ # Suffix output dir with File separator
138
+ output_dir = "#{output_dir}#{File::Separator}" unless output_dir.end_with?(File::Separator)
139
+
140
+ begin
141
+ xml_file = File.open(@cc_file, 'r')
142
+ xml_doc = Nokogiri::XML(xml_file)
143
+ div_objects = xml_doc.css("/tt/body/div")
144
+ langs = div_objects.map {|div| div.attributes['lang'].value rescue nil}
145
+ translate = false
146
+ matching_divs = []
147
+ inferred_src_lang = nil
148
+ if src_lang.nil? || src_lang.empty?
149
+ if target_lang && !target_lang.empty?
150
+ # Find if any of our div matches this. Else pick first and translate to target lang
151
+ div_objects.each_with_index do |div, j|
152
+ if matching_lang?(div, target_lang)
153
+ matching_divs << div
154
+ break
155
+ end
156
+ end
157
+ if matching_divs.empty?
158
+ # Let's pick the first div for target translation
159
+ selected_div = div_objects.first
160
+ inferred_src_lang = selected_div.lang
161
+ matching_divs << selected_div
162
+ translate = true
163
+ end
164
+ else
165
+ # Then we will have to create output files for each lang
166
+ matching_divs = div_objects
167
+ end
168
+ else
169
+ # Find the matching lang div and create the outputs
170
+ available_divs = langs.select { |lang| lang.eql?(src_lang) }
171
+ if available_divs.length > 1
172
+ raise InvalidInputException.new("More than one section in Caption file specifies lang as #{src_lang}. This file is unsupported")
173
+ end
174
+ div_objects.each_with_index do |div, j|
175
+ if matching_lang?(div, src_lang)
176
+ matching_divs << div
177
+ break
178
+ end
179
+ end
180
+ if matching_divs.empty?
181
+ raise InvalidInputException.new("Given Caption file #{@cc_file} doesn't contain #{src_lang} lang. Available langs are #{langs}")
182
+ end
183
+ if matching_divs.length > 1
184
+ raise InvalidInputException.new("More than one section in Caption file specifies lang as #{src_lang}. This file is unsupported")
185
+ end
186
+ if target_lang && !target_lang.empty? && !src_lang.eql?(target_lang)
187
+ translate = true
188
+ end
189
+ end
190
+
191
+ div_index = 1
192
+ multiple_outputs = matching_divs.size > 1
193
+ matching_divs.each do |div|
194
+ div_lang = div.attributes['lang'].value rescue nil
195
+ # Override div lang if translate is required
196
+ div_lang = target_lang if translate
197
+ file_map = {}
198
+ # Prepare the output files for each type and for each lang in the file
199
+ types.each do |type|
200
+ output_file = File.basename(@cc_file, File.extname(@cc_file))
201
+ # Suffix div index when multiple outputs are created
202
+ output_file << "_#{div_index}" if multiple_outputs
203
+ if target_lang.nil? && !src_lang.nil?
204
+ output_file << "_#{src_lang}"
205
+ end
206
+ # Suffix lang to filename if provided
207
+ if target_lang && !target_lang.empty?
208
+ output_file << "_#{target_lang}"
209
+ end
210
+ output_file << extension_from_type(type)
211
+ out_file = "#{output_dir}#{output_file}"
212
+ if create_file(TYPE_TTML, type, out_file, div_lang)
213
+ file_map[type] = out_file
214
+ else
215
+ raise StandardError.new("Failed to create output file for type #{type}")
216
+ end
217
+ end
218
+ blocks = div.css("p")
219
+ cue_index = 1
220
+ total_blocks = blocks.size
221
+ blocks.each_with_index do |block, index|
222
+ start_time = block.attributes['begin'].value
223
+ end_time = block.attributes['end'].value
224
+ text = block.inner_html.strip.gsub(/(\s){2,}/, '')
225
+ message = ""
226
+ text_blocks = get_block_text(text)
227
+ text_blocks.each do |text_block|
228
+ next if text_block.start_with?('<') || text_block.empty?
229
+ message << text_block
230
+ end
231
+ cue_info = CueInfo.new(callsign)
232
+ cue_info.index = cue_index
233
+ cue_index += 1
234
+ cue_info.message = translated_msg(translate, message, src_lang, inferred_src_lang, target_lang)
235
+ cue_info.start = start_time
236
+ cue_info.end = end_time
237
+ cue_info.start_time_units = time_details(start_time, callsign)
238
+ cue_info.end_time_units = time_details(end_time, callsign)
239
+ write_cue(cue_info, file_map, index == (total_blocks - 1))
240
+ end
241
+ div_index += 1
242
+ end
243
+ ensure
244
+ xml_file.close if xml_file
245
+ end
246
+ end
247
+
118
248
  private
119
249
 
250
+ def translated_msg(translate, message, src_lang, inferred_src_lang, target_lang)
251
+ return message unless translate
252
+ use_src = nil
253
+ if (src_lang.nil? || src_lang.empty?)
254
+ if inferred_src_lang.nil?
255
+ raise LangDetectionFailureException.new("Unable to deduce source lang for translation")
256
+ end
257
+ use_src = inferred_src_lang
258
+ else
259
+ use_src = src_lang
260
+ end
261
+ return message if use_src.eql?(target_lang)
262
+ @translator.translate(message, use_src, target_lang)
263
+ end
264
+
265
+ def matching_lang?(div, target_lang)
266
+ lang = div.attributes['lang'].value rescue nil
267
+ if lang.nil?
268
+ # Let's infer the lang
269
+ if @translator.nil?
270
+ raise StandardError.new("Cannot infer language as engine options are not provided")
271
+ end
272
+ reference_text = get_text(div, 100)
273
+ inferred_lang = @translator.infer_language(reference_text) rescue nil
274
+ if inferred_lang.nil?
275
+ raise LangDetectionFailureException.new("Failed to infer language for div block #{j} of caption file")
276
+ end
277
+ # Store this lang in the div
278
+ div.lang = inferred_lang
279
+ if inferred_lang.eql?(target_lang)
280
+ return true
281
+ end
282
+ elsif lang.eql?(target_lang)
283
+ return true
284
+ end
285
+ return false
286
+ end
287
+
120
288
  #
121
289
  # Method to segregate the data from markups as markups don't need
122
290
  # translations.
data/lib/vtt.rb CHANGED
@@ -104,10 +104,19 @@ class VTT
104
104
  # Suffix output dir with File separator
105
105
  output_dir = "#{output_dir}#{File::Separator}" unless output_dir.end_with?(File::Separator)
106
106
 
107
+ translate = false
108
+ if target_lang && !target_lang.empty?
109
+ translate = true
110
+ if @translator.nil?
111
+ raise StandardError.new("Cannot infer language as engine options are not provided")
112
+ end
113
+ end
107
114
  # Prepare the output files for each type
108
115
  file_map = {}
109
116
  types.each do |type|
110
- output_file = File.basename(@cc_file, File.extname(@cc_file)) + extension_from_type(type)
117
+ output_file = File.basename(@cc_file, File.extname(@cc_file))
118
+ output_file << "_#{target_lang}" if translate
119
+ output_file << extension_from_type(type)
111
120
  out_file = "#{output_dir}#{output_file}"
112
121
  if create_file(TYPE_VTT, type, out_file, target_lang)
113
122
  file_map[type] = out_file
@@ -135,7 +144,7 @@ class VTT
135
144
  else
136
145
  collect_msg = false
137
146
  unless message.empty?
138
- cue_info.message = message
147
+ cue_info.message = translated_msg(translate, message, src_lang, target_lang)
139
148
  write_cue(cue_info, file_map)
140
149
  message = ""
141
150
  cue_index += 1
@@ -152,11 +161,41 @@ class VTT
152
161
  collect_msg = true
153
162
  end
154
163
  end
155
- cue_info.message = message unless message.empty?
164
+ cue_info.message = translated_msg(translate, message, src_lang, target_lang) unless message.empty?
156
165
  write_cue(cue_info, file_map, true)
157
166
  end
158
167
 
159
- private
168
+ private
169
+
170
+ #
171
+ # Method to translate a given text message based on following conditions
172
+ #
173
+ # * If translate is false, the message is returned as is
174
+ # * If +src_lang+ and +target_lang+ are same then the message is returned as is
175
+ # * If +src_lang+ is nil or empty then this caption file will be inspected to infer language
176
+ # and if it's same as target_lang, then again the message shall be returned as is
177
+ # * Otherwise, returns a translated text
178
+ #
179
+ # ==== Raise
180
+ # * LangDetectionFailureException - If failed to infer the language
181
+ #
182
+ def translated_msg(translate, message, src_lang, target_lang)
183
+ return message unless translate
184
+ use_src = nil
185
+ if (src_lang.nil? || src_lang.empty?)
186
+ # We don't need to infer again and again
187
+ begin
188
+ @inferred_src_lang ||= infer_languages.first
189
+ rescue StandardError => e
190
+ raise LangDetectionFailureException.new("Failed to infer language due to #{e.message}")
191
+ end
192
+ use_src = @inferred_src_lang
193
+ else
194
+ use_src = src_lang
195
+ end
196
+ return message if use_src.eql?(target_lang)
197
+ @translator.translate(message, use_src, target_lang)
198
+ end
160
199
 
161
200
  #
162
201
  # Method to get a minimal amount of key text that excludes any tags
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: subtitle
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Maheshwaran G
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2019-11-13 00:00:00.000000000 Z
12
+ date: 2019-11-17 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
@@ -95,8 +95,10 @@ dependencies:
95
95
  - - ">="
96
96
  - !ruby/object:Gem::Version
97
97
  version: '0'
98
- description: Subtitle gem helps you to detect language and translate closed caption
99
- to required language.
98
+ description: " Subtitle gem helps you to detect the language(s)
99
+ of the caption file, translate closed caption \n to another
100
+ language and also supports transforming from one format to another. \n Say
101
+ for example from dfxp to srt or vtt or to all supported formats.\"\n"
100
102
  email:
101
103
  - pgmaheshwaran@gmail.com
102
104
  - arunjeyaprasad@gmail.com