subtitle 0.1.8 → 0.2.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA256:
3
- metadata.gz: aba6a9f70a40bf96fd797a6a68816ca2e3070f93573d26dbbb1df4b12a47d691
4
- data.tar.gz: 1f61c3bb5a4ec42ca6fe694c9e1bac4d805d21d7f057b6f45a4c51ac0671e287
2
+ SHA1:
3
+ metadata.gz: f603ac76acbb145807944c0f948d6550eee197cc
4
+ data.tar.gz: 7d06b0e8ee047ab1790237fca1c478da03725541
5
5
  SHA512:
6
- metadata.gz: e996202bdd3ee2c8860b51aa6c5e414b9eea4735355c5b8f9d58f90ce67105fb1fafd781ae3e7375215ae8d364c9dfd0caa6dc3d73fd9f0257bb2748b945badc
7
- data.tar.gz: 5bfdf76e39dfc65a4ef5e5e288b74aa3e098fe358c402214300710b7b614d8c444053ed7dcdc3e012327d6c7b33d6d91db8701e06c39d6232dfdde42197d3e17
6
+ metadata.gz: 204c3af4231e25e6caaa198e9a8b7d46b4f917afcc8abdbce27bccbb94908d28d21c1b0318aa36a1ba63c83c064963e24c7398a99a0cd183c3ca10568fb6fe34
7
+ data.tar.gz: fb867912d76f039abf21fd1495c0f9b9139a594cde028f1d50184bd15d007a59113366507b7a85f1039ce0c638d423fbd7cff99d22792f83af59ccede48167dc
data/lib/allfather.rb ADDED
@@ -0,0 +1,83 @@
1
+ #
2
+ # A Module that kind of acts as an interface where the generic methods
3
+ # that apply to each caption type can be defined
4
+ #
5
+ # To use for a new caption type, simply include this module and provide
6
+ # caption specific implementations
7
+ #
8
+ module AllFather
9
+
10
+ #
11
+ # Valid file extensions that we support; Keep expanding as we grow
12
+ #
13
+ VALID_FILES = [".scc", ".srt", ".vtt", ".ttml", ".dfxp"]
14
+
15
+ #
16
+ # Generic exception class that is raised for validation errors
17
+ #
18
+ class InvalidInputException < StandardError; end
19
+
20
+ #
21
+ # Lang inference failure exception
22
+ #
23
+ class LangDetectionFailureException < StandardError; end
24
+
25
+ #
26
+ # Method to do basic validations like is this a valid file to even
27
+ # accept for any future transactions
28
+ #
29
+ # ==== Returns:
30
+ # true if the file is valid and false otherwise
31
+ #
32
+ def is_valid?
33
+ raise "Not Implemented. Class #{self.class.name} doesn't implement is_valid?"
34
+ end
35
+
36
+ #
37
+ # Method to infer the language(s) of the caption by inspecting the file
38
+ # depending on the type of the caption file
39
+ #
40
+ # ==== Returns
41
+ #
42
+ # * The ISO 639-1 Letter Language codes
43
+ #
44
+ def infer_languages
45
+ raise "Not Implemented. Class #{self.class.name} doesn't implement infer_languages"
46
+ end
47
+
48
+ #
49
+ # Method to translate the caption from one language to another
50
+ #
51
+ # :args: src_lang, target_lang, output_file
52
+ #
53
+ # * +input_caption+ - A Valid input caption file. Refer to #is_valid?
54
+ # * +src_lang+ - can be inferred using #infer_languages method
55
+ # * +target_lang+ - Target 2 letter ISO language code to which the source needs to be translated in to.
56
+ # * +output_file+ - Output file. Can be a fully qualified path or just file name
57
+ #
58
+ # ==== Raises
59
+ #
60
+ # InvalidInputException shall be raised if
61
+ # 1. The input file doesn't exist or is unreadable or is invalid caption
62
+ # 2. The output file can't be written
63
+ # 3. The target_lang is not a valid ISO 639-1 Letter Language code
64
+ #
65
+ def translate(src_lang, target_lang, output_file)
66
+ # Check if a non empty output file is present and error out to avoid
67
+ # the danger of overwriting some important file !!
68
+ if File.exists?(output_file) && File.size(output_file) > 0
69
+ raise InvalidInputException.new("Output file #{output_file} is not empty.")
70
+ else
71
+ # Just open the file in writable mode and close it just to ensure that
72
+ # we can write the output file
73
+ File.open(output_file, "w") {|f|
74
+ }
75
+ end
76
+ # Check if the file is writable ?
77
+ unless File.writable?(output_file)
78
+ raise InvalidInputException.new("Output file #{output_file} not writable.")
79
+ end
80
+ # Further checks can be done only in caption specific implementations
81
+ # or translation engine specific implementation
82
+ end
83
+ end
data/lib/dfxp.rb ADDED
@@ -0,0 +1,30 @@
1
+ require_relative "engines/translator"
2
+ require_relative "allfather"
3
+ require_relative "ttml"
4
+
5
+ #
6
+ # Library to handle DFXP Files
7
+ #
8
+ # Uses the translator available to do the necessary language operations
9
+ # as defined by the AllFather
10
+ #
11
+ class DFXP < TTML
12
+
13
+ def initialize(cc_file, translator, opts={})
14
+ @cc_file = cc_file
15
+ @translator = translator
16
+ @force_detect = opts[:force_detect] || false
17
+ raise "Invalid TTML file provided" unless is_valid?
18
+ end
19
+
20
+ def is_valid?
21
+ # Do any DFXP specific validations here
22
+ if @cc_file =~ /^.*\.(dfxp)$/
23
+ return true
24
+ end
25
+ # TODO: Check if it's required to do a File read to see if this
26
+ # is a well-formed XML. Another is to see if lang is available in each div
27
+ return false
28
+ end
29
+
30
+ end
@@ -0,0 +1,102 @@
1
+ require 'aws-sdk'
2
+ require 'aws-sdk'
3
+ require_relative 'translator'
4
+
5
+ #
6
+ # Provides Language services using Amazon Translate
7
+ #
8
+ # Module can be initialized using multiple options
9
+ #
10
+ # == Credential Referencing Order
11
+ #
12
+ # * [Arguments] - Pass the credentials access_key_id and secret_access_key as arguments
13
+ # * [Environment route] - AWS_ACCESS_KEY_ID & AWS_SECRET_ACCESS_KEY can be exposed as
14
+ # environment variables
15
+ # * [Profile Name] - The application uses the credentials of the system and picks the
16
+ # credentials referred to by the profile
17
+ #
18
+ class AwsEngine
19
+ include Translator
20
+
21
+ DEFAULT_REGION = ENV["AWS_DEFAULT_REGION"] || "us-east-1"
22
+
23
+ #
24
+ # :args: options
25
+ #
26
+ # ==== Arguments
27
+ # options can carry the following details
28
+ #
29
+ # * [:access_key_id] - access key id
30
+ # * [:secret_access_key] - Secret access key
31
+ # * [:env] - true for using credentials from environment variables
32
+ # * [:profile] - profile name for using shared credentials setup
33
+ # * [:region] - If not provided defaults to us-east-1
34
+ #
35
+ # ==== raises
36
+ #
37
+ # * EngineInitializationException if credentials cannot be setup due to lack of details
38
+ # * Aws Exceptions if profile name is invalid or invalid credentials are passed
39
+ #
40
+ def initialize(options)
41
+ access_key_id = nil
42
+ secret_access_key = nil
43
+ @region = options[:region] || DEFAULT_REGION
44
+ if options[:env]
45
+ access_key_id = ENV["AWS_ACCESS_KEY_ID"]
46
+ secret_access_key = ENV["AWS_SECRET_ACCESS_KEY"]
47
+ elsif options[:access_key_id] && options[:secret_access_key]
48
+ access_key_id = options[:access_key_id]
49
+ secret_access_key = options[:secret_access_key]
50
+ end
51
+ if access_key_id && secret_access_key
52
+ Aws.config.update({
53
+ region: options[:region] || DEFAULT_REGION,
54
+ credentials: Aws::Credentials.new(access_key_id, secret_access_key)
55
+ })
56
+ elsif options[:profile]
57
+ credentials = Aws::SharedCredentials.new(profile_name: options[:profile])
58
+ Aws.config.update({
59
+ region: @region,
60
+ credentials: credentials.credentials
61
+ })
62
+ else
63
+ raise Translator::EngineInitializationException.new(
64
+ "Failed to initialize Aws Engine. Credentials are missing / not provided")
65
+ end
66
+ @translate_service = Aws::Translate::Client.new(region: @region)
67
+ @comprehend_service = Aws::Comprehend::Client.new(region: @region)
68
+ end
69
+
70
+ #
71
+ # Invokes the language detection API of AWS and returns only the language
72
+ # of the highest score and returns the ISO 639-1 code
73
+ #
74
+ # :args: text
75
+ #
76
+ # ===== Arguments
77
+ # * +text+ - The text for which the language is to be inferred
78
+ #
79
+ def infer_language(text)
80
+ response = @comprehend_service.detect_dominant_language({ text: "#{text}" })
81
+ response[:languages][0][:language_code]
82
+ end
83
+
84
+ #
85
+ # Invokes the translation API of AWS and returns the translated text
86
+ # as per the arguments provided
87
+ # Will Raise exception if a translation cannot be made between the source
88
+ # and target language codes or if the lang code is invalid
89
+ #
90
+ # :args: input_text, src_lang, target_lang
91
+ #
92
+ # * +input_text+ - The text that needs to be translated
93
+ # * +src_lang+ - The source language of the text
94
+ # * +target_lang+ - The target language to which the input_text needs to be translated to
95
+ #
96
+ def translate(input_text, src_lang, target_lang)
97
+ response = @translate_service.translate_text({ :text => "#{input_text}" ,
98
+ :source_language_code => "#{src_lang}", :target_language_code => "#{target_lang}"})
99
+ response.translated_text
100
+ end
101
+ end
102
+
File without changes
@@ -0,0 +1,58 @@
1
+ #
2
+ # A Module that kind of acts as an interface where the methods
3
+ # expected out of each vendor are encapsulated
4
+ #
5
+ # To use for a new vendor, simply include this module and provide
6
+ # vendor specific implementations
7
+ #
8
+ module Translator
9
+
10
+ #
11
+ # Constants For Engines
12
+ ENGINE_AWS = 1
13
+ ENGINE_GCP = 2
14
+
15
+ #
16
+ # Keys for each Engine
17
+ AWS_KEYS = [:access_key_id, :secret_access_key, :profile]
18
+ GCP_KEYS = [:api_key, :project_id, :creds_path]
19
+
20
+ ENGINE_KEYS = {ENGINE_AWS => AWS_KEYS, ENGINE_GCP => GCP_KEYS}
21
+ #
22
+ # This exception shall be raised when we fail to initialize an
23
+ # engine for the purposes of language detection / translation
24
+ #
25
+ # ==== Example
26
+ # * When credentials are not passed
27
+ #
28
+ class EngineInitializationException < StandardError; end
29
+
30
+ #
31
+ # Method to infer the language by inspecting the text
32
+ # passed as argument
33
+ #
34
+ # :args: text
35
+ #
36
+ # * +text+ - String whose language needs to be inferred
37
+ #
38
+ # ==== Returns
39
+ #
40
+ # * The ISO 639-1 Letter Language code
41
+ #
42
+ def infer_language(text)
43
+ raise "Not Implemented. Class #{self.class.name} doesn't implement infer_language"
44
+ end
45
+
46
+ #
47
+ # Method to translate from given language to another
48
+ #
49
+ # :args: input_text, src_lang, target_lang
50
+ #
51
+ # * +input_text+ - Text which needs to be translated
52
+ # * +src_lang+ - can be inferred using #infer_language method
53
+ # * +target_lang+ - Target 2 letter ISO language code to which the source needs to be translated in to.
54
+ #
55
+ def translate(input_text, src_lang, target_lang)
56
+ raise "Not Implemented. Class #{self.class.name} doesn't implement translate"
57
+ end
58
+ end
data/lib/scc.rb CHANGED
@@ -1,13 +1,47 @@
1
- require 'aws-sdk-translate'
2
- require 'aws-sdk-comprehend'
1
+ require_relative "engines/translator"
2
+ require_relative "allfather"
3
3
 
4
+ #
5
+ # Library to handle SCC Files
6
+ #
7
+ # Uses the translator available to do the necessary language operations
8
+ # as defined by the AllFather
9
+ #
4
10
  class SCC
5
11
 
6
- def initialize(awskey, awssecret)
7
- @translate = Aws::Translate::Client.new(:access_key_id => "#{awskey}", :secret_access_key => "#{awssecret}")
8
- @comp = Aws::Comprehend::Client.new(:access_key_id => "#{awskey}", :secret_access_key => "#{awssecret}")
12
+ include AllFather
13
+
14
+ def initialize(cc_file, translator)
15
+ @cc_file = cc_file
16
+ @translator = translator
17
+ raise "Invalid SCC file provided" unless is_valid?
18
+ end
19
+
20
+ def is_valid?
21
+ # Do any SCC specific validations here
22
+ if @cc_file =~ /^.*\.(scc)$/
23
+ return true
24
+ end
25
+ return false
26
+ end
27
+
28
+ def infer_languages
29
+ lang = nil
30
+ begin
31
+ sample_text = get_text(@cc_file, 100)
32
+ lang = @translator.infer_language(sample_text)
33
+ rescue StandardError => e
34
+ puts "Error while detecting the language due to #{e.message}"
35
+ end
36
+ lang
9
37
  end
10
38
 
39
+ def translate(src_lang, dest_lang, out_file)
40
+ raise "Not Implemented. Class #{self.class.name} doesn't implement translate yet !!"
41
+ end
42
+
43
+ private
44
+
11
45
  def get_text(srt_file, num_chars)
12
46
  ccfile = File.open(srt_file, 'r:UTF-8', &:read)
13
47
  text_sample = ""
@@ -15,12 +49,12 @@ class SCC
15
49
  if line =~ /^\d\d:\d\d:\d\d:\d\d\s/
16
50
  scc_text_code = line.gsub(/^\d\d:\d\d:\d\d:\d\d\s/, '')
17
51
  text_sample << decode(scc_text_code)
18
- if text_sample.length > (num_chars+1)
52
+ if text_sample.length > (num_chars + 1)
19
53
  break
20
54
  end
21
55
  end
22
56
  end
23
- return text_sample[0,num_chars]
57
+ return text_sample[0, num_chars]
24
58
  end
25
59
 
26
60
  def decode(scc_code_text)
@@ -31,7 +65,7 @@ class SCC
31
65
  hex_codes.each do | code |
32
66
  if ["94", "91", "92", "97", "15", "16", "10", "13"].include?(code)
33
67
  skip_next = true
34
- skip_count = skip_count +1
68
+ skip_count = skip_count + 1
35
69
  next
36
70
  end
37
71
  if skip_count == 1 && skip_next
@@ -60,18 +94,4 @@ class SCC
60
94
  end
61
95
  encoded_str
62
96
  end
63
-
64
- def detect_lang(scc_file)
65
- lang = nil
66
- begin
67
- sample_text = get_text(scc_file, 100)
68
- response = @comp.detect_dominant_language( {
69
- text: "#{sample_text}"
70
- })
71
- lang = response[:languages][0][:language_code] rescue nil
72
- rescue => error
73
- puts "Error while detecting the language!!"
74
- end
75
- lang
76
- end
77
- end
97
+ end
data/lib/srt.rb CHANGED
@@ -1,81 +1,106 @@
1
- require 'aws-sdk-translate'
2
- require 'aws-sdk-comprehend'
1
+ require_relative "engines/translator"
2
+ require_relative "allfather"
3
3
 
4
+ #
5
+ # Library to handle SRT Files
6
+ #
7
+ # Uses the translator available to do the necessary language operations
8
+ # as defined by the AllFather
9
+ #
4
10
  class SRT
5
- def initialize(awskey, awssecret)
6
- @translate = Aws::Translate::Client.new(:access_key_id => "#{awskey}", :secret_access_key => "#{awssecret}")
7
- @comp = Aws::Comprehend::Client.new(:access_key_id => "#{awskey}", :secret_access_key => "#{awssecret}")
11
+
12
+ include AllFather
13
+
14
+ def initialize(cc_file, translator)
15
+ @cc_file = cc_file
16
+ @translator = translator
17
+ raise "Invalid SRT file provided" unless is_valid?
18
+ end
19
+
20
+ def is_valid?
21
+ # Do any SRT specific validations here
22
+ if @cc_file =~ /^.*\.(srt)$/
23
+ return true
24
+ end
25
+ return false
8
26
  end
9
27
 
10
- def translate_text(srt_file, src_lang, dest_lang, out_file)
11
- ccfile = File.open(srt_file, 'r:UTF-8', &:read)
12
- outfile = File.open(out_file, "w")
13
- text_collection = false
14
- text_sample = ""
15
- ccfile.each_line do | line |
16
- if line =~ /^(\d\d:)\d\d:\d\d[,.]\d\d\d.*-->.*(\d\d:)\d\d:\d\d[,.]\d\d\d/
17
- text_collection = true
18
- outfile.puts line
19
- elsif line.strip.empty? && !text_sample.empty?
20
- json_text = JSON.parse(text_sample) rescue nil
21
- if json_text.nil?
22
- trans_resp = @translate.translate_text({ :text => "#{text_sample}" , :source_language_code => "#{src_lang}", :target_language_code => "#{dest_lang}"})
23
- outfile.puts trans_resp.translated_text
28
+ def translate(src_lang, dest_lang, out_file)
29
+ super(src_lang, dest_lang, out_file)
30
+ begin
31
+ ccfile = File.open(@cc_file, 'r:UTF-8', &:read)
32
+ outfile = File.open(out_file, "w")
33
+ text_collection = false
34
+ text_sample = ""
35
+ ccfile.each_line do | line |
36
+ if line =~ /^(\d\d:)\d\d:\d\d[,.]\d\d\d.*-->.*(\d\d:)\d\d:\d\d[,.]\d\d\d/
37
+ text_collection = true
38
+ outfile.puts line
39
+ elsif line.strip.empty? && !text_sample.empty?
40
+ json_text = JSON.parse(text_sample) rescue nil
41
+ if json_text.nil?
42
+ trans_resp = @translator.translate(text_sample, src_lang, dest_lang)
43
+ outfile.puts trans_resp
44
+ else
45
+ outfile.puts text_sample
46
+ end
24
47
  outfile.puts
48
+ text_sample = ""
49
+ text_collection = false
50
+ elsif text_collection
51
+ text_sample << line
25
52
  else
26
- outfile.puts text_sample
27
- outfile.puts
53
+ outfile.puts line
28
54
  end
29
- text_sample = ""
30
- text_collection = false
31
- elsif text_collection
32
- text_sample << line
33
- else
34
- outfile.puts line
35
55
  end
36
- next
37
- end
38
56
 
39
- if !text_sample.empty?
40
- trans_resp = @translate.translate_text({ :text => "#{text_sample}" , :source_language_code => "#{src_lang}", :target_language_code => "#{dest_lang}"})
41
- outfile.puts trans_resp.translated_text
42
- outfile.puts
57
+ if !text_sample.empty?
58
+ trans_resp = @translator.translate(text_sample, src_lang, dest_lang)
59
+ outfile.puts trans_resp
60
+ outfile.puts
61
+ end
62
+ ensure
63
+ ccfile.close rescue nil
43
64
  outfile.close
44
65
  end
45
66
  end
46
67
 
47
-
48
- def get_text(srt_file, num_chars)
49
- ccfile = File.open(srt_file, 'r:UTF-8', &:read)
50
- text_collection = false
51
- text_sample = ""
52
- ccfile.each_line do | line |
53
- line = line
54
- if line =~ /^(\d\d:)\d\d:\d\d[,.]\d\d\d.*-->.*(\d\d:)\d\d:\d\d[,.]\d\d\d/
55
- text_collection = true
56
- elsif line.strip.empty?
57
- text_collection = false
58
- elsif text_collection && text_sample.length < (num_chars+1)
59
- text_sample << line
60
- end
61
- break if text_sample.length > (num_chars+1)
62
- next
68
+ def infer_languages
69
+ lang = nil
70
+ begin
71
+ sample_text = get_text(@cc_file, 100)
72
+ lang = @translator.infer_language(sample_text)
73
+ rescue StandardError => e
74
+ puts "Error while detecting the language due to #{e.message}"
63
75
  end
64
- return text_sample[0,num_chars]
76
+ [lang]
65
77
  end
66
78
 
67
- def detect_lang(srt_file)
68
- lang = nil
79
+ private
80
+
81
+ #
82
+ # Method to get a minimal amount of key text that excludes any tags
83
+ # or control information for the engine to meaningfully and
84
+ # correctly infer the language being referred to in this SRT
85
+ #
86
+ def get_text(srt_file, num_chars)
69
87
  begin
70
- sample_text = get_text(srt_file, 100)
71
- response = @comp.detect_dominant_language( {
72
- text: "#{sample_text}"
73
- })
74
- lang = response[:languages][0][:language_code] rescue nil
75
- rescue => error
76
- puts "Error while detecting the language!!"
88
+ ccfile = File.open(srt_file, 'r:UTF-8', &:read)
89
+ text_collection = false
90
+ text_sample = ""
91
+ ccfile.each_line do |line|
92
+ if line =~ /^(\d\d:)\d\d:\d\d[,.]\d\d\d.*-->.*(\d\d:)\d\d:\d\d[,.]\d\d\d/
93
+ text_collection = true
94
+ elsif line.strip.empty?
95
+ text_collection = false
96
+ elsif text_collection && text_sample.length < (num_chars + 1)
97
+ text_sample << line
98
+ end
99
+ break if text_sample.length > (num_chars + 1)
100
+ end
101
+ ensure
102
+ ccfile.close rescue nil
77
103
  end
78
- lang
104
+ return text_sample[0, num_chars]
79
105
  end
80
-
81
106
  end
data/lib/subtitle.rb CHANGED
@@ -1,43 +1,88 @@
1
- require "srt"
1
+ require_relative "srt"
2
+ require_relative "vtt"
3
+ require_relative "scc"
4
+ require_relative "ttml"
5
+ require_relative "dfxp"
6
+ require_relative "allfather"
7
+ require_relative "engines/translator"
8
+ require_relative "engines/aws"
9
+
2
10
 
3
11
  class Subtitle
4
- def initialize(awskey, awssecret, ccfile)
5
- if awskey.nil? || awssecret.nil? || ccfile.nil?
6
- raise "Invalid Arguments, please check"
7
- end
8
- @ccfile = ccfile
9
- unless file_valid
10
- raise "Incorrect File extension"
11
- end
12
- begin
13
- @srt_parser = SRT.new(awskey, awssecret)
14
- rescue
15
- raise "Could not initialize Parser!!. Check the Keys supplied."
16
- end
12
+ def initialize(options={})
13
+ # Infer the caption handler from the extension
14
+ @cc_file = options[:cc_file]
15
+ raise "Input caption not provided. Please provide the same in :cc_file option" if @cc_file.nil?
16
+ translator = get_translator(options)
17
+ @handler = get_caption_handler(options, translator)
17
18
  end
18
19
 
19
20
  def detect_language
20
- detected_lang = @srt_parser.detect_lang(@ccfile)
21
- detected_lang
21
+ @handler.infer_languages
22
22
  end
23
23
 
24
- def translate_cc( dest_lang, src_lang = nil, outfile = nil)
24
+ def translate(dest_lang, src_lang = nil, outfile = nil)
25
25
  if outfile.nil?
26
- outfile = "#{@ccfile}_#{dest_lang}"
26
+ outfile = "#{@cc_file}_#{dest_lang}"
27
27
  end
28
28
  if src_lang.nil?
29
- src_lang = detect_language
30
- raise "could not detect Source Language!!" if src_lang.nil?
29
+ src_lang = detect_language[0] rescue nil
30
+ raise "Could not detect Source Language!!" if src_lang.nil?
31
31
  end
32
- @srt_parser.translate_text(@ccfile, src_lang, dest_lang, outfile)
32
+ @handler.translate(src_lang, dest_lang, outfile)
33
33
  outfile
34
34
  end
35
35
 
36
- def file_valid
37
- valid = false
38
- if @ccfile =~ /^.*\.(srt|vtt)$/
39
- valid = true
36
+ private
37
+
38
+ def get_translator(options)
39
+ translator = nil
40
+ # Try to infer the engine based on the passed options
41
+ engine = options[:engine]
42
+ unless engine
43
+ engine_props = Translator::ENGINE_KEYS
44
+ engine_props.each do |k, values|
45
+ original_size = values.size
46
+ diff = values - options.keys
47
+ if diff.size < original_size
48
+ # We have some keys for this engine in options
49
+ engine = k
50
+ break
51
+ end
52
+ end
53
+ end
54
+ case engine
55
+ when Translator::ENGINE_AWS
56
+ translator = AwsEngine.new(options)
57
+ when Translator::ENGINE_GCP
58
+ raise "GCP is yet to be implemented"
59
+ else
60
+ raise "Unable to infer the Translation Engine. Options missing key credential params"
61
+ end
62
+ translator
63
+ end
64
+
65
+ def get_caption_handler(options, translator)
66
+ caption_file = options[:cc_file]
67
+ extension = File.extname(caption_file)
68
+ unless AllFather::VALID_FILES.include?(extension)
69
+ raise "Caption support for #{caption_file} of type #{extension} is not supported yet"
70
+ end
71
+ handler = nil
72
+ case extension.downcase
73
+ when ".scc"
74
+ handler = SCC.new(caption_file, translator)
75
+ when ".srt"
76
+ handler = SRT.new(caption_file, translator)
77
+ when ".vtt"
78
+ handler = VTT.new(caption_file, translator)
79
+ when ".ttml"
80
+ handler = TTML.new(caption_file, translator, {:force_detect => options[:force_detect]})
81
+ when ".dfxp"
82
+ handler = DFXP.new(caption_file, translator, {:force_detect => options[:force_detect]})
83
+ else
84
+ raise "Cannot handle file type .#{extension}"
40
85
  end
41
- valid
86
+ handler
42
87
  end
43
- end
88
+ end
data/lib/ttml.rb ADDED
@@ -0,0 +1,180 @@
1
+ require_relative "engines/translator"
2
+ require_relative "allfather"
3
+
4
+ require "nokogiri"
5
+
6
+ #
7
+ # Library to handle TTML Files
8
+ #
9
+ # Uses the translator available to do the necessary language operations
10
+ # as defined by the AllFather
11
+ #
12
+ class TTML
13
+
14
+ include AllFather
15
+
16
+ def initialize(cc_file, translator, opts={})
17
+ @cc_file = cc_file
18
+ @translator = translator
19
+ @force_detect = opts[:force_detect] || false
20
+ raise "Invalid TTML file provided" unless is_valid?
21
+ end
22
+
23
+ def is_valid?
24
+ # Do any TTML specific validations here
25
+ if @cc_file =~ /^.*\.(ttml)$/
26
+ return true
27
+ end
28
+ # TODO: Check if it's required to do a File read to see if this
29
+ # is a well-formed XML. Another is to see if lang is available in each div
30
+ return false
31
+ end
32
+
33
+ def infer_languages
34
+ lang = []
35
+ begin
36
+ xml_file = File.open(@cc_file)
37
+ xml_doc = Nokogiri::XML(xml_file)
38
+ div_objects = xml_doc.css("/tt/body/div")
39
+ div_objects.each_with_index do |div, index|
40
+ # By default, return the lang if specified in the div and
41
+ # force detect is false
42
+ inferred_lang = div.attributes['lang'].value rescue nil
43
+ if inferred_lang.nil?
44
+ # If lang is not provided in the caption, then override
45
+ # force detect for inference
46
+ @force_detect = true
47
+ end
48
+ if @force_detect
49
+ sample_text = get_text(div, 100)
50
+ inferred_lang = @translator.infer_language(sample_text) rescue nil
51
+ if inferred_lang.nil?
52
+ err_msg = "Failed to detect lang for div block number #{index + 1}"
53
+ unless lang.empty?
54
+ err_msg += "; Detected languages before failure are #{lang}"
55
+ end
56
+ raise AllFather::LangDetectionFailureException.new(err_msg)
57
+ end
58
+ end
59
+ lang << inferred_lang
60
+ end
61
+ rescue StandardError => e
62
+ puts "Error while detecting the language due to #{e.message}"
63
+ ensure
64
+ xml_file.close rescue nil
65
+ end
66
+ return nil if lang.empty?
67
+ lang
68
+ end
69
+
70
+ def translate(src_lang, dest_lang, out_file)
71
+ super(src_lang, dest_lang, out_file)
72
+ xml_file = File.open(@cc_file, 'r:UTF-8', &:read)
73
+ xml_doc = Nokogiri::XML(xml_file)
74
+ div_objects = xml_doc.css("/tt/body/div")
75
+ # Irrespective of what lang the div xml:lang says, infer the lang and then
76
+ # check to see if it matches src_lang
77
+ matched_div = nil
78
+ div_objects.each do |div|
79
+ sample_text = get_text(div, 100)
80
+ inferred_lang = @translator.infer_language(sample_text) rescue nil
81
+ next if inferred_lang.nil?
82
+ if inferred_lang.eql?(src_lang)
83
+ matched_div = div
84
+ break
85
+ end
86
+ end
87
+ if matched_div.nil?
88
+ FileUtils.remove_file(out_file)
89
+ raise AllFather::InvalidInputException.new("Unable to find #{src_lang} language section in TTML")
90
+ end
91
+ # Update the Lang in the Div
92
+ matched_div.lang = dest_lang
93
+
94
+ blocks = matched_div.css("p")
95
+ blocks.each do |block|
96
+ # Multiple spaces being stripped off
97
+ text = block.inner_html.strip.gsub(/(\s){2,}/, '')
98
+ text_blocks = get_block_text(text)
99
+ translated_text = ""
100
+ text_blocks.each do |text_block|
101
+ if text_block.start_with?('<') || text_block.empty?
102
+ translated_text << text_block
103
+ next
104
+ end
105
+ translated_resp = @translator.translate(text_block, src_lang, dest_lang)
106
+ translated_text << translated_resp
107
+ end
108
+ block.inner_html = translated_text
109
+ end
110
+ xml_file.close rescue nil
111
+ File.write(out_file, xml_doc)
112
+ out_file
113
+ end
114
+
115
+ private
116
+
117
+ #
118
+ # Method to segregate the data from markups as markups don't need
119
+ # translations.
120
+ # For example, if the cue block is of the form
121
+ # This is a test caption with <span id="1">a test span </span> within a block
122
+ # This method returns
123
+ # ["This is a test caption with ", "<span id=\"1\">", "a test span ", "</span>", " within a block"]
124
+ # as we can infer the markups can be retained as is to avoid translation
125
+ #
126
+ def get_block_text(text)
127
+ data = []
128
+ tag_start = tag_end = false
129
+ str_length = text.size
130
+ text_block = ""
131
+ markup_block = ""
132
+ for i in 0...text.size do
133
+ if text[i] == '<'
134
+ tag_end = false
135
+ tag_start = true
136
+ markup_block << text[i]
137
+ data << text_block
138
+ text_block = ""
139
+ next
140
+ elsif text[i] == '>'
141
+ tag_end = true
142
+ tag_start = false
143
+ markup_block << text[i]
144
+ data << markup_block
145
+ markup_block = ""
146
+ next
147
+ end
148
+ if tag_start && !tag_end
149
+ markup_block << text[i]
150
+ else
151
+ text_block << text[i]
152
+ end
153
+ end
154
+ unless text_block.empty?
155
+ data << text_block
156
+ end
157
+ data
158
+ end
159
+
160
+ #
161
+ # Method to get a minimal amount of key text that excludes any tags
162
+ # or control information for the engine to meaningfully and
163
+ # correctly infer the language being referred to in this TTML
164
+ #
165
+ def get_text(div, num_chars)
166
+ text_sample = ""
167
+ blocks = div.css("p")
168
+ blocks.each do |block|
169
+ # Multiple spaces being stripped off
170
+ text = block.inner_html.strip.gsub(/(\s){2,}/, '')
171
+ # Strip off html tags (if any)
172
+ text = text.gsub(/(<.*?>)/, ' ')
173
+ text_sample << text
174
+ if text_sample.length > (num_chars + 1)
175
+ break
176
+ end
177
+ end
178
+ return text_sample[0, num_chars]
179
+ end
180
+ end
data/lib/vtt.rb CHANGED
@@ -1,81 +1,115 @@
1
- require 'aws-sdk-translate'
2
- require 'aws-sdk-comprehend'
1
+ require_relative "engines/translator"
2
+ require_relative "allfather"
3
3
 
4
+ #
5
+ # Library to handle VTT Files
6
+ #
7
+ # Uses the translator available to do the necessary language operations
8
+ # as defined by the AllFather
9
+ #
4
10
  class VTT
5
- def initialize(awskey, awssecret)
6
- @translate = Aws::Translate::Client.new(:access_key_id => "#{awskey}", :secret_access_key => "#{awssecret}")
7
- @comp = Aws::Comprehend::Client.new(:access_key_id => "#{awskey}", :secret_access_key => "#{awssecret}")
11
+
12
+ include AllFather
13
+
14
+ def initialize(cc_file, translator)
15
+ @cc_file = cc_file
16
+ @translator = translator
17
+ raise "Invalid VTT file provided" unless is_valid?
8
18
  end
9
19
 
10
- def translate_text(srt_file, src_lang, dest_lang, out_file)
11
- ccfile = File.open(srt_file, 'r:UTF-8', &:read)
12
- outfile = File.open(out_file, "w")
13
- text_collection = false
14
- text_sample = ""
15
- ccfile.each_line do | line |
16
- if line =~ /^(\d\d:)\d\d:\d\d[,.]\d\d\d.*-->.*(\d\d:)\d\d:\d\d[,.]\d\d\d/
17
- text_collection = true
18
- outfile.puts line
19
- elsif line.strip.empty? && !text_sample.empty?
20
- json_text = JSON.parse(text_sample) rescue nil
21
- if json_text.nil?
22
- trans_resp = @translate.translate_text({ :text => "#{text_sample}" , :source_language_code => "#{src_lang}", :target_language_code => "#{dest_lang}"})
23
- outfile.puts trans_resp.translated_text
24
- outfile.puts
20
+ def translate(src_lang, dest_lang, out_file)
21
+ super(src_lang, dest_lang, out_file)
22
+ begin
23
+ ccfile = File.open(@cc_file, 'r:UTF-8', &:read)
24
+ outfile = File.open(out_file, "w")
25
+ text_collection = false
26
+ text_sample = ""
27
+ ccfile.each_line do | line |
28
+ if line =~ /^(\d\d:)\d\d:\d\d[,.]\d\d\d.*-->.*(\d\d:)\d\d:\d\d[,.]\d\d\d/
29
+ text_collection = true
30
+ outfile.puts line
31
+ elsif line.strip.empty? && !text_sample.empty?
32
+ json_text = JSON.parse(text_sample) rescue nil
33
+ if json_text.nil?
34
+ trans_resp = @translator.translate(text_sample, src_lang, dest_lang)
35
+ outfile.puts trans_resp
36
+ outfile.puts
37
+ else
38
+ outfile.puts text_sample
39
+ outfile.puts
40
+ end
41
+ text_sample = ""
42
+ text_collection = false
43
+ elsif text_collection
44
+ text_sample << line
25
45
  else
26
- outfile.puts text_sample
27
- outfile.puts
46
+ outfile.puts line
28
47
  end
29
- text_sample = ""
30
- text_collection = false
31
- elsif text_collection
32
- text_sample << line
33
- else
34
- outfile.puts line
35
48
  end
36
- next
37
- end
38
49
 
39
- if !text_sample.empty?
40
- trans_resp = @translate.translate_text({ :text => "#{text_sample}" , :source_language_code => "#{src_lang}", :target_language_code => "#{dest_lang}"})
41
- outfile.puts trans_resp.translated_text
42
- outfile.puts
50
+ if !text_sample.empty?
51
+ trans_resp = @translator.translate(text_sample, src_lang, dest_lang)
52
+ outfile.puts trans_resp
53
+ outfile.puts
54
+ end
55
+ ensure
56
+ ccfile.close rescue nil
43
57
  outfile.close
44
58
  end
45
59
  end
46
60
 
61
+ #
62
+ # Returns the inferred language in an array
63
+ #
64
+ def infer_languages
65
+ lang = nil
66
+ begin
67
+ sample_text = get_text(@cc_file, 100)
68
+ lang = @translator.infer_language(sample_text)
69
+ rescue StandardError => e
70
+ puts "Error while detecting the language due to #{e.message}"
71
+ end
72
+ [lang]
73
+ end
47
74
 
48
- def get_text(srt_file, num_chars)
49
- ccfile = File.open(srt_file, 'r:UTF-8', &:read)
50
- text_collection = false
51
- text_sample = ""
52
- ccfile.each_line do | line |
53
- line = line
54
- if line =~ /^(\d\d:)\d\d:\d\d[,.]\d\d\d.*-->.*(\d\d:)\d\d:\d\d[,.]\d\d\d/
55
- text_collection = true
56
- elsif line.strip.empty?
57
- text_collection = false
58
- elsif text_collection && text_sample.length < (num_chars+1)
59
- text_sample << line
60
- end
61
- break if text_sample.length > (num_chars+1)
62
- next
75
+ #
76
+ # Method to add required set of validations specific to caption type
77
+ #
78
+ def is_valid?
79
+ # Do any VTT specific validations here
80
+ if @cc_file =~ /^.*\.(vtt)$/
81
+ return true
63
82
  end
64
- return text_sample[0,num_chars]
83
+ # TODO: Check if it's required to do a File read to see if the 1st line is WEBVTT
84
+ # to handle cases where invalid file is named with vtt extension
85
+ return false
65
86
  end
66
87
 
67
- def detect_lang(srt_file)
68
- lang = nil
88
+ private
89
+
90
+ #
91
+ # Method to get a minimal amount of key text that excludes any tags
92
+ # or control information for the engine to meaningfully and
93
+ # correctly infer the language being referred to in this VTT
94
+ #
95
+ def get_text(vtt_file, num_chars)
69
96
  begin
70
- sample_text = get_text(srt_file, 100)
71
- response = @comp.detect_dominant_language( {
72
- text: "#{sample_text}"
73
- })
74
- lang = response[:languages][0][:language_code] rescue nil
75
- rescue => error
76
- puts "Error while detecting the language!!"
97
+ ccfile = File.open(vtt_file, 'r:UTF-8', &:read)
98
+ text_collection = false
99
+ text_sample = ""
100
+ ccfile.each_line do |line|
101
+ if line =~ /^(\d\d:)\d\d:\d\d[,.]\d\d\d.*-->.*(\d\d:)\d\d:\d\d[,.]\d\d\d/
102
+ text_collection = true
103
+ elsif line.strip.empty?
104
+ text_collection = false
105
+ elsif text_collection && text_sample.length < (num_chars + 1)
106
+ text_sample << line
107
+ end
108
+ break if text_sample.length > (num_chars + 1)
109
+ end
110
+ ensure
111
+ ccfile.close rescue nil
77
112
  end
78
- lang
113
+ return text_sample[0, num_chars]
79
114
  end
80
-
81
115
  end
metadata CHANGED
@@ -1,14 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: subtitle
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.8
4
+ version: 0.2.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Maheshwaran G
8
+ - Arunjeyaprasad A J
8
9
  autorequire:
9
10
  bindir: bin
10
11
  cert_chain: []
11
- date: 2019-10-21 00:00:00.000000000 Z
12
+ date: 2019-10-31 00:00:00.000000000 Z
12
13
  dependencies:
13
14
  - !ruby/object:Gem::Dependency
14
15
  name: bundler
@@ -25,43 +26,37 @@ dependencies:
25
26
  - !ruby/object:Gem::Version
26
27
  version: '2.0'
27
28
  - !ruby/object:Gem::Dependency
28
- name: aws-sdk-comprehend
29
+ name: aws-sdk
29
30
  requirement: !ruby/object:Gem::Requirement
30
31
  requirements:
31
- - - ">="
32
- - !ruby/object:Gem::Version
33
- version: '0'
34
- type: :runtime
35
- prerelease: false
36
- version_requirements: !ruby/object:Gem::Requirement
37
- requirements:
38
- - - ">="
39
- - !ruby/object:Gem::Version
40
- version: '0'
41
- - !ruby/object:Gem::Dependency
42
- name: aws-sdk-translate
43
- requirement: !ruby/object:Gem::Requirement
44
- requirements:
45
- - - ">="
32
+ - - "~>"
46
33
  - !ruby/object:Gem::Version
47
- version: '0'
48
- type: :runtime
34
+ version: '2.11'
35
+ type: :development
49
36
  prerelease: false
50
37
  version_requirements: !ruby/object:Gem::Requirement
51
38
  requirements:
52
- - - ">="
39
+ - - "~>"
53
40
  - !ruby/object:Gem::Version
54
- version: '0'
55
- description: subtitle gem to detect and translate closed caption for SubRip and WebVTT
41
+ version: '2.11'
42
+ description: Subtitle gem helps you to detect language and translate closed caption
43
+ to required language.
56
44
  email:
57
45
  - pgmaheshwaran@gmail.com
46
+ - arunjeyaprasad@gmail.com
58
47
  executables: []
59
48
  extensions: []
60
49
  extra_rdoc_files: []
61
50
  files:
51
+ - lib/allfather.rb
52
+ - lib/dfxp.rb
53
+ - lib/engines/aws.rb
54
+ - lib/engines/gcp.rb
55
+ - lib/engines/translator.rb
62
56
  - lib/scc.rb
63
57
  - lib/srt.rb
64
58
  - lib/subtitle.rb
59
+ - lib/ttml.rb
65
60
  - lib/vtt.rb
66
61
  homepage: https://github.com/cloudaffair/subtitle
67
62
  licenses:
@@ -85,9 +80,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
85
80
  version: '0'
86
81
  requirements: []
87
82
  rubyforge_project:
88
- rubygems_version: 2.7.3
83
+ rubygems_version: 2.5.1
89
84
  signing_key:
90
85
  specification_version: 4
91
- summary: subtitle helps you to detect language and translate closed caption to required
92
- language
86
+ summary: Subtitle gem helps you to detect language and translate closed caption to
87
+ required language
93
88
  test_files: []