subtitle 0.1.8 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA256:
3
- metadata.gz: aba6a9f70a40bf96fd797a6a68816ca2e3070f93573d26dbbb1df4b12a47d691
4
- data.tar.gz: 1f61c3bb5a4ec42ca6fe694c9e1bac4d805d21d7f057b6f45a4c51ac0671e287
2
+ SHA1:
3
+ metadata.gz: f603ac76acbb145807944c0f948d6550eee197cc
4
+ data.tar.gz: 7d06b0e8ee047ab1790237fca1c478da03725541
5
5
  SHA512:
6
- metadata.gz: e996202bdd3ee2c8860b51aa6c5e414b9eea4735355c5b8f9d58f90ce67105fb1fafd781ae3e7375215ae8d364c9dfd0caa6dc3d73fd9f0257bb2748b945badc
7
- data.tar.gz: 5bfdf76e39dfc65a4ef5e5e288b74aa3e098fe358c402214300710b7b614d8c444053ed7dcdc3e012327d6c7b33d6d91db8701e06c39d6232dfdde42197d3e17
6
+ metadata.gz: 204c3af4231e25e6caaa198e9a8b7d46b4f917afcc8abdbce27bccbb94908d28d21c1b0318aa36a1ba63c83c064963e24c7398a99a0cd183c3ca10568fb6fe34
7
+ data.tar.gz: fb867912d76f039abf21fd1495c0f9b9139a594cde028f1d50184bd15d007a59113366507b7a85f1039ce0c638d423fbd7cff99d22792f83af59ccede48167dc
data/lib/allfather.rb ADDED
@@ -0,0 +1,83 @@
1
+ #
2
+ # A Module that kind of acts as an interface where the generic methods
3
+ # that apply to each caption type can be defined
4
+ #
5
+ # To use for a new caption type, simply include this module and provide
6
+ # caption specific implementations
7
+ #
8
+ module AllFather
9
+
10
+ #
11
+ # Valid file extensions that we support; Keep expanding as we grow
12
+ #
13
+ VALID_FILES = [".scc", ".srt", ".vtt", ".ttml", ".dfxp"]
14
+
15
+ #
16
+ # Generic exception class that is raised for validation errors
17
+ #
18
+ class InvalidInputException < StandardError; end
19
+
20
+ #
21
+ # Lang inference failure exception
22
+ #
23
+ class LangDetectionFailureException < StandardError; end
24
+
25
+ #
26
+ # Method to do basic validations like is this a valid file to even
27
+ # accept for any future transactions
28
+ #
29
+ # ==== Returns:
30
+ # true if the file is valid and false otherwise
31
+ #
32
+ def is_valid?
33
+ raise "Not Implemented. Class #{self.class.name} doesn't implement is_valid?"
34
+ end
35
+
36
+ #
37
+ # Method to infer the language(s) of the caption by inspecting the file
38
+ # depending on the type of the caption file
39
+ #
40
+ # ==== Returns
41
+ #
42
+ # * The ISO 639-1 Letter Language codes
43
+ #
44
+ def infer_languages
45
+ raise "Not Implemented. Class #{self.class.name} doesn't implement infer_languages"
46
+ end
47
+
48
+ #
49
+ # Method to translate the caption from one language to another
50
+ #
51
+ # :args: src_lang, target_lang, output_file
52
+ #
53
+ # * +input_caption+ - A Valid input caption file. Refer to #is_valid?
54
+ # * +src_lang+ - can be inferred using #infer_languages method
55
+ # * +target_lang+ - Target 2 letter ISO language code to which the source needs to be translated in to.
56
+ # * +output_file+ - Output file. Can be a fully qualified path or just file name
57
+ #
58
+ # ==== Raises
59
+ #
60
+ # InvalidInputException shall be raised if
61
+ # 1. The input file doesn't exist or is unreadable or is invalid caption
62
+ # 2. The output file can't be written
63
+ # 3. The target_lang is not a valid ISO 639-1 Letter Language code
64
+ #
65
+ def translate(src_lang, target_lang, output_file)
66
+ # Check if a non empty output file is present and error out to avoid
67
+ # the danger of overwriting some important file !!
68
+ if File.exists?(output_file) && File.size(output_file) > 0
69
+ raise InvalidInputException.new("Output file #{output_file} is not empty.")
70
+ else
71
+ # Just open the file in writable mode and close it just to ensure that
72
+ # we can write the output file
73
+ File.open(output_file, "w") {|f|
74
+ }
75
+ end
76
+ # Check if the file is writable ?
77
+ unless File.writable?(output_file)
78
+ raise InvalidInputException.new("Output file #{output_file} not writable.")
79
+ end
80
+ # Further checks can be done only in caption specific implementations
81
+ # or translation engine specific implementation
82
+ end
83
+ end
data/lib/dfxp.rb ADDED
@@ -0,0 +1,30 @@
1
+ require_relative "engines/translator"
2
+ require_relative "allfather"
3
+ require_relative "ttml"
4
+
5
+ #
6
+ # Library to handle DFXP Files
7
+ #
8
+ # Uses the translator available to do the necessary language operations
9
+ # as defined by the AllFather
10
+ #
11
+ class DFXP < TTML
12
+
13
+ def initialize(cc_file, translator, opts={})
14
+ @cc_file = cc_file
15
+ @translator = translator
16
+ @force_detect = opts[:force_detect] || false
17
+ raise "Invalid TTML file provided" unless is_valid?
18
+ end
19
+
20
+ def is_valid?
21
+ # Do any DFXP specific validations here
22
+ if @cc_file =~ /^.*\.(dfxp)$/
23
+ return true
24
+ end
25
+ # TODO: Check if it's required to do a File read to see if this is
26
+ # a well-formed XML. Another is to see if lang is available in each div
27
+ return false
28
+ end
29
+
30
+ end
@@ -0,0 +1,102 @@
1
+ require 'aws-sdk'
2
+ require 'aws-sdk'
3
+ require_relative 'translator'
4
+
5
+ #
6
+ # Provides Language services using Amazon Translate
7
+ #
8
+ # Module can be initialized using multiple options
9
+ #
10
+ # == Credential Referencing Order
11
+ #
12
+ # * [Arguments] - Pass the credentials access_key_id and secret_access_key as arguments
13
+ # * [Environment route] - AWS_ACCESS_KEY_ID & AWS_SECRET_ACCESS_KEY can be exposed as
14
+ # environment variables
15
+ # * [Profile Name] - The application uses the credentials of the system and picks the
16
+ # credentials referred to by the profile
17
+ #
18
+ class AwsEngine
19
+ include Translator
20
+
21
+ DEFAULT_REGION = ENV["AWS_DEFAULT_REGION"] || "us-east-1"
22
+
23
+ #
24
+ # :args: options
25
+ #
26
+ # ==== Arguments
27
+ # options can carry the following details
28
+ #
29
+ # * [:access_key_id] - access key id
30
+ # * [:secret_access_key] - Secret access key
31
+ # * [:env] - true for using credentials from environment variables
32
+ # * [:profile] - profile name for using shared credentials setup
33
+ # * [:region] - If not provided defaults to the AWS_DEFAULT_REGION environment variable, or us-east-1 when that is unset
34
+ #
35
+ # ==== raises
36
+ #
37
+ # * EngineInitializationException if credentials cannot be setup due to lack of details
38
+ # * Aws Exceptions if profile name is invalid or invalid credentials are passed
39
+ #
40
+ def initialize(options)
41
+ access_key_id = nil
42
+ secret_access_key = nil
43
+ @region = options[:region] || DEFAULT_REGION
44
+ if options[:env]
45
+ access_key_id = ENV["AWS_ACCESS_KEY_ID"]
46
+ secret_access_key = ENV["AWS_SECRET_ACCESS_KEY"]
47
+ elsif options[:access_key_id] && options[:secret_access_key]
48
+ access_key_id = options[:access_key_id]
49
+ secret_access_key = options[:secret_access_key]
50
+ end
51
+ if access_key_id && secret_access_key
52
+ Aws.config.update({
53
+ region: options[:region] || DEFAULT_REGION,
54
+ credentials: Aws::Credentials.new(access_key_id, secret_access_key)
55
+ })
56
+ elsif options[:profile]
57
+ credentials = Aws::SharedCredentials.new(profile_name: options[:profile])
58
+ Aws.config.update({
59
+ region: @region,
60
+ credentials: credentials.credentials
61
+ })
62
+ else
63
+ raise Translator::EngineInitializationException.new(
64
+ "Failed to initialize Aws Engine. Credentials are missing / not provided")
65
+ end
66
+ @translate_service = Aws::Translate::Client.new(region: @region)
67
+ @comprehend_service = Aws::Comprehend::Client.new(region: @region)
68
+ end
69
+
70
+ #
71
+ # Invokes the language detection API of AWS and returns only the
72
+ # ISO 639-1 code of the language with the highest score
73
+ #
74
+ # :args: text
75
+ #
76
+ # ===== Arguments
77
+ # * +text+ - The text for which the language is to be inferred
78
+ #
79
+ def infer_language(text)
80
+ response = @comprehend_service.detect_dominant_language({ text: "#{text}" })
81
+ response[:languages][0][:language_code]
82
+ end
83
+
84
+ #
85
+ # Invokes the translation API of AWS and returns the translated text
86
+ # as per the arguments provided
87
+ # Will Raise exception if a translation cannot be made between the source
88
+ # and target language codes or if the lang code is invalid
89
+ #
90
+ # :args: input_text, src_lang, target_lang
91
+ #
92
+ # * +input_text+ - The text that needs to be translated
93
+ # * +src_lang+ - The source language of the text
94
+ # * +target_lang+ - The target language to which the input_text needs to be translated to
95
+ #
96
+ def translate(input_text, src_lang, target_lang)
97
+ response = @translate_service.translate_text({ :text => "#{input_text}" ,
98
+ :source_language_code => "#{src_lang}", :target_language_code => "#{target_lang}"})
99
+ response.translated_text
100
+ end
101
+ end
102
+
File without changes
@@ -0,0 +1,58 @@
1
+ #
2
+ # A Module that kind of acts as an interface where the methods
3
+ # expected out of each vendor are encapsulated
4
+ #
5
+ # To use for a new vendor, simply include this module and provide
6
+ # vendor specific implementations
7
+ #
8
+ module Translator
9
+
10
+ #
11
+ # Constants For Engines
12
+ ENGINE_AWS = 1
13
+ ENGINE_GCP = 2
14
+
15
+ #
16
+ # Keys for each Engine
17
+ AWS_KEYS = [:access_key_id, :secret_access_key, :profile]
18
+ GCP_KEYS = [:api_key, :project_id, :creds_path]
19
+
20
+ ENGINE_KEYS = {ENGINE_AWS => AWS_KEYS, ENGINE_GCP => GCP_KEYS}
21
+ #
22
+ # This exception shall be raised when we fail to initialize an
23
+ # engine for the purposes of language detection / translation
24
+ #
25
+ # ==== Example
26
+ # * When credentials are not passed
27
+ #
28
+ class EngineInitializationException < StandardError; end
29
+
30
+ #
31
+ # Method to infer the language by inspecting the text
32
+ # passed as argument
33
+ #
34
+ # :args: text
35
+ #
36
+ # * +text+ - String whose language needs to be inferred
37
+ #
38
+ # ==== Returns
39
+ #
40
+ # * The ISO 639-1 Letter Language code
41
+ #
42
+ def infer_language(text)
43
+ raise "Not Implemented. Class #{self.class.name} doesn't implement infer_language"
44
+ end
45
+
46
+ #
47
+ # Method to translate from given language to another
48
+ #
49
+ # :args: input_text, src_lang, target_lang
50
+ #
51
+ # * +input_text+ - Text which needs to be translated
52
+ # * +src_lang+ - can be inferred using #infer_language method
53
+ # * +target_lang+ - Target 2 letter ISO language code to which the source needs to be translated in to.
54
+ #
55
+ def translate(input_text, src_lang, target_lang)
56
+ raise "Not Implemented. Class #{self.class.name} doesn't implement translate"
57
+ end
58
+ end
data/lib/scc.rb CHANGED
@@ -1,13 +1,47 @@
1
- require 'aws-sdk-translate'
2
- require 'aws-sdk-comprehend'
1
+ require_relative "engines/translator"
2
+ require_relative "allfather"
3
3
 
4
+ #
5
+ # Library to handle SCC Files
6
+ #
7
+ # Uses the translator available to do the necessary language operations
8
+ # as defined by the AllFather
9
+ #
4
10
  class SCC
5
11
 
6
- def initialize(awskey, awssecret)
7
- @translate = Aws::Translate::Client.new(:access_key_id => "#{awskey}", :secret_access_key => "#{awssecret}")
8
- @comp = Aws::Comprehend::Client.new(:access_key_id => "#{awskey}", :secret_access_key => "#{awssecret}")
12
+ include AllFather
13
+
14
+ def initialize(cc_file, translator)
15
+ @cc_file = cc_file
16
+ @translator = translator
17
+ raise "Invalid SCC file provided" unless is_valid?
18
+ end
19
+
20
+ def is_valid?
21
+ # Do any SCC specific validations here
22
+ if @cc_file =~ /^.*\.(scc)$/
23
+ return true
24
+ end
25
+ return false
26
+ end
27
+
28
+ def infer_languages
29
+ lang = nil
30
+ begin
31
+ sample_text = get_text(@cc_file, 100)
32
+ lang = @translator.infer_language(sample_text)
33
+ rescue StandardError => e
34
+ puts "Error while detecting the language due to #{e.message}"
35
+ end
36
+ lang
9
37
  end
10
38
 
39
+ def translate(src_lang, dest_lang, out_file)
40
+ raise "Not Implemented. Class #{self.class.name} doesn't implement translate yet !!"
41
+ end
42
+
43
+ private
44
+
11
45
  def get_text(srt_file, num_chars)
12
46
  ccfile = File.open(srt_file, 'r:UTF-8', &:read)
13
47
  text_sample = ""
@@ -15,12 +49,12 @@ class SCC
15
49
  if line =~ /^\d\d:\d\d:\d\d:\d\d\s/
16
50
  scc_text_code = line.gsub(/^\d\d:\d\d:\d\d:\d\d\s/, '')
17
51
  text_sample << decode(scc_text_code)
18
- if text_sample.length > (num_chars+1)
52
+ if text_sample.length > (num_chars + 1)
19
53
  break
20
54
  end
21
55
  end
22
56
  end
23
- return text_sample[0,num_chars]
57
+ return text_sample[0, num_chars]
24
58
  end
25
59
 
26
60
  def decode(scc_code_text)
@@ -31,7 +65,7 @@ class SCC
31
65
  hex_codes.each do | code |
32
66
  if ["94", "91", "92", "97", "15", "16", "10", "13"].include?(code)
33
67
  skip_next = true
34
- skip_count = skip_count +1
68
+ skip_count = skip_count + 1
35
69
  next
36
70
  end
37
71
  if skip_count == 1 && skip_next
@@ -60,18 +94,4 @@ class SCC
60
94
  end
61
95
  encoded_str
62
96
  end
63
-
64
- def detect_lang(scc_file)
65
- lang = nil
66
- begin
67
- sample_text = get_text(scc_file, 100)
68
- response = @comp.detect_dominant_language( {
69
- text: "#{sample_text}"
70
- })
71
- lang = response[:languages][0][:language_code] rescue nil
72
- rescue => error
73
- puts "Error while detecting the language!!"
74
- end
75
- lang
76
- end
77
- end
97
+ end
data/lib/srt.rb CHANGED
@@ -1,81 +1,106 @@
1
- require 'aws-sdk-translate'
2
- require 'aws-sdk-comprehend'
1
+ require_relative "engines/translator"
2
+ require_relative "allfather"
3
3
 
4
+ #
5
+ # Library to handle SRT Files
6
+ #
7
+ # Uses the translator available to do the necessary language operations
8
+ # as defined by the AllFather
9
+ #
4
10
  class SRT
5
- def initialize(awskey, awssecret)
6
- @translate = Aws::Translate::Client.new(:access_key_id => "#{awskey}", :secret_access_key => "#{awssecret}")
7
- @comp = Aws::Comprehend::Client.new(:access_key_id => "#{awskey}", :secret_access_key => "#{awssecret}")
11
+
12
+ include AllFather
13
+
14
+ def initialize(cc_file, translator)
15
+ @cc_file = cc_file
16
+ @translator = translator
17
+ raise "Invalid SRT file provided" unless is_valid?
18
+ end
19
+
20
+ def is_valid?
21
+ # Do any SRT specific validations here
22
+ if @cc_file =~ /^.*\.(srt)$/
23
+ return true
24
+ end
25
+ return false
8
26
  end
9
27
 
10
- def translate_text(srt_file, src_lang, dest_lang, out_file)
11
- ccfile = File.open(srt_file, 'r:UTF-8', &:read)
12
- outfile = File.open(out_file, "w")
13
- text_collection = false
14
- text_sample = ""
15
- ccfile.each_line do | line |
16
- if line =~ /^(\d\d:)\d\d:\d\d[,.]\d\d\d.*-->.*(\d\d:)\d\d:\d\d[,.]\d\d\d/
17
- text_collection = true
18
- outfile.puts line
19
- elsif line.strip.empty? && !text_sample.empty?
20
- json_text = JSON.parse(text_sample) rescue nil
21
- if json_text.nil?
22
- trans_resp = @translate.translate_text({ :text => "#{text_sample}" , :source_language_code => "#{src_lang}", :target_language_code => "#{dest_lang}"})
23
- outfile.puts trans_resp.translated_text
28
+ def translate(src_lang, dest_lang, out_file)
29
+ super(src_lang, dest_lang, out_file)
30
+ begin
31
+ ccfile = File.open(@cc_file, 'r:UTF-8', &:read)
32
+ outfile = File.open(out_file, "w")
33
+ text_collection = false
34
+ text_sample = ""
35
+ ccfile.each_line do | line |
36
+ if line =~ /^(\d\d:)\d\d:\d\d[,.]\d\d\d.*-->.*(\d\d:)\d\d:\d\d[,.]\d\d\d/
37
+ text_collection = true
38
+ outfile.puts line
39
+ elsif line.strip.empty? && !text_sample.empty?
40
+ json_text = JSON.parse(text_sample) rescue nil
41
+ if json_text.nil?
42
+ trans_resp = @translator.translate(text_sample, src_lang, dest_lang)
43
+ outfile.puts trans_resp
44
+ else
45
+ outfile.puts text_sample
46
+ end
24
47
  outfile.puts
48
+ text_sample = ""
49
+ text_collection = false
50
+ elsif text_collection
51
+ text_sample << line
25
52
  else
26
- outfile.puts text_sample
27
- outfile.puts
53
+ outfile.puts line
28
54
  end
29
- text_sample = ""
30
- text_collection = false
31
- elsif text_collection
32
- text_sample << line
33
- else
34
- outfile.puts line
35
55
  end
36
- next
37
- end
38
56
 
39
- if !text_sample.empty?
40
- trans_resp = @translate.translate_text({ :text => "#{text_sample}" , :source_language_code => "#{src_lang}", :target_language_code => "#{dest_lang}"})
41
- outfile.puts trans_resp.translated_text
42
- outfile.puts
57
+ if !text_sample.empty?
58
+ trans_resp = @translator.translate(text_sample, src_lang, dest_lang)
59
+ outfile.puts trans_resp
60
+ outfile.puts
61
+ end
62
+ ensure
63
+ ccfile.close rescue nil
43
64
  outfile.close
44
65
  end
45
66
  end
46
67
 
47
-
48
- def get_text(srt_file, num_chars)
49
- ccfile = File.open(srt_file, 'r:UTF-8', &:read)
50
- text_collection = false
51
- text_sample = ""
52
- ccfile.each_line do | line |
53
- line = line
54
- if line =~ /^(\d\d:)\d\d:\d\d[,.]\d\d\d.*-->.*(\d\d:)\d\d:\d\d[,.]\d\d\d/
55
- text_collection = true
56
- elsif line.strip.empty?
57
- text_collection = false
58
- elsif text_collection && text_sample.length < (num_chars+1)
59
- text_sample << line
60
- end
61
- break if text_sample.length > (num_chars+1)
62
- next
68
+ def infer_languages
69
+ lang = nil
70
+ begin
71
+ sample_text = get_text(@cc_file, 100)
72
+ lang = @translator.infer_language(sample_text)
73
+ rescue StandardError => e
74
+ puts "Error while detecting the language due to #{e.message}"
63
75
  end
64
- return text_sample[0,num_chars]
76
+ [lang]
65
77
  end
66
78
 
67
- def detect_lang(srt_file)
68
- lang = nil
79
+ private
80
+
81
+ #
82
+ # Method to get a minimal amount of key text that excludes any tags
83
+ # or control information for the engine to meaningfully and
84
+ # correctly infer the language being referred to in this SRT
85
+ #
86
+ def get_text(srt_file, num_chars)
69
87
  begin
70
- sample_text = get_text(srt_file, 100)
71
- response = @comp.detect_dominant_language( {
72
- text: "#{sample_text}"
73
- })
74
- lang = response[:languages][0][:language_code] rescue nil
75
- rescue => error
76
- puts "Error while detecting the language!!"
88
+ ccfile = File.open(srt_file, 'r:UTF-8', &:read)
89
+ text_collection = false
90
+ text_sample = ""
91
+ ccfile.each_line do |line|
92
+ if line =~ /^(\d\d:)\d\d:\d\d[,.]\d\d\d.*-->.*(\d\d:)\d\d:\d\d[,.]\d\d\d/
93
+ text_collection = true
94
+ elsif line.strip.empty?
95
+ text_collection = false
96
+ elsif text_collection && text_sample.length < (num_chars + 1)
97
+ text_sample << line
98
+ end
99
+ break if text_sample.length > (num_chars + 1)
100
+ end
101
+ ensure
102
+ ccfile.close rescue nil
77
103
  end
78
- lang
104
+ return text_sample[0, num_chars]
79
105
  end
80
-
81
106
  end
data/lib/subtitle.rb CHANGED
@@ -1,43 +1,88 @@
1
- require "srt"
1
+ require_relative "srt"
2
+ require_relative "vtt"
3
+ require_relative "scc"
4
+ require_relative "ttml"
5
+ require_relative "dfxp"
6
+ require_relative "allfather"
7
+ require_relative "engines/translator"
8
+ require_relative "engines/aws"
9
+
2
10
 
3
11
  class Subtitle
4
- def initialize(awskey, awssecret, ccfile)
5
- if awskey.nil? || awssecret.nil? || ccfile.nil?
6
- raise "Invalid Arguments, please check"
7
- end
8
- @ccfile = ccfile
9
- unless file_valid
10
- raise "Incorrect File extension"
11
- end
12
- begin
13
- @srt_parser = SRT.new(awskey, awssecret)
14
- rescue
15
- raise "Could not initialize Parser!!. Check the Keys supplied."
16
- end
12
+ def initialize(options={})
13
+ # Infer the caption handler from the extension
14
+ @cc_file = options[:cc_file]
15
+ raise "Input caption not provided. Please provide the same in :cc_file option" if @cc_file.nil?
16
+ translator = get_translator(options)
17
+ @handler = get_caption_handler(options, translator)
17
18
  end
18
19
 
19
20
  def detect_language
20
- detected_lang = @srt_parser.detect_lang(@ccfile)
21
- detected_lang
21
+ @handler.infer_languages
22
22
  end
23
23
 
24
- def translate_cc( dest_lang, src_lang = nil, outfile = nil)
24
+ def translate(dest_lang, src_lang = nil, outfile = nil)
25
25
  if outfile.nil?
26
- outfile = "#{@ccfile}_#{dest_lang}"
26
+ outfile = "#{@cc_file}_#{dest_lang}"
27
27
  end
28
28
  if src_lang.nil?
29
- src_lang = detect_language
30
- raise "could not detect Source Language!!" if src_lang.nil?
29
+ src_lang = detect_language[0] rescue nil
30
+ raise "Could not detect Source Language!!" if src_lang.nil?
31
31
  end
32
- @srt_parser.translate_text(@ccfile, src_lang, dest_lang, outfile)
32
+ @handler.translate(src_lang, dest_lang, outfile)
33
33
  outfile
34
34
  end
35
35
 
36
- def file_valid
37
- valid = false
38
- if @ccfile =~ /^.*\.(srt|vtt)$/
39
- valid = true
36
+ private
37
+
38
+ def get_translator(options)
39
+ translator = nil
40
+ # Try to infer the engine based on the passed options
41
+ engine = options[:engine]
42
+ unless engine
43
+ engine_props = Translator::ENGINE_KEYS
44
+ engine_props.each do |k, values|
45
+ original_size = values.size
46
+ diff = values - options.keys
47
+ if diff.size < original_size
48
+ # We have some keys for this engine in options
49
+ engine = k
50
+ break
51
+ end
52
+ end
53
+ end
54
+ case engine
55
+ when Translator::ENGINE_AWS
56
+ translator = AwsEngine.new(options)
57
+ when Translator::ENGINE_GCP
58
+ raise "GCP is yet to be implemented"
59
+ else
60
+ raise "Unable to infer the Translation Engine. Options missing key credential params"
61
+ end
62
+ translator
63
+ end
64
+
65
+ def get_caption_handler(options, translator)
66
+ caption_file = options[:cc_file]
67
+ extension = File.extname(caption_file)
68
+ unless AllFather::VALID_FILES.include?(extension)
69
+ raise "Caption support for #{caption_file} of type #{extension} is not supported yet"
70
+ end
71
+ handler = nil
72
+ case extension.downcase
73
+ when ".scc"
74
+ handler = SCC.new(caption_file, translator)
75
+ when ".srt"
76
+ handler = SRT.new(caption_file, translator)
77
+ when ".vtt"
78
+ handler = VTT.new(caption_file, translator)
79
+ when ".ttml"
80
+ handler = TTML.new(caption_file, translator, {:force_detect => options[:force_detect]})
81
+ when ".dfxp"
82
+ handler = DFXP.new(caption_file, translator, {:force_detect => options[:force_detect]})
83
+ else
84
+ raise "Cannot handle file type .#{extension}"
40
85
  end
41
- valid
86
+ handler
42
87
  end
43
- end
88
+ end
data/lib/ttml.rb ADDED
@@ -0,0 +1,180 @@
1
+ require_relative "engines/translator"
2
+ require_relative "allfather"
3
+
4
+ require "nokogiri"
5
+
6
+ #
7
+ # Library to handle TTML Files
8
+ #
9
+ # Uses the translator available to do the necessary language operations
10
+ # as defined by the AllFather
11
+ #
12
+ class TTML
13
+
14
+ include AllFather
15
+
16
+ def initialize(cc_file, translator, opts={})
17
+ @cc_file = cc_file
18
+ @translator = translator
19
+ @force_detect = opts[:force_detect] || false
20
+ raise "Invalid TTML file provided" unless is_valid?
21
+ end
22
+
23
+ def is_valid?
24
+ # Do any TTML specific validations here
25
+ if @cc_file =~ /^.*\.(ttml)$/
26
+ return true
27
+ end
28
+ # TODO: Check if it's required to do a File read to see if this is
29
+ # a well-formed XML. Another is to see if lang is available in each div
30
+ return false
31
+ end
32
+
33
+ def infer_languages
34
+ lang = []
35
+ begin
36
+ xml_file = File.open(@cc_file)
37
+ xml_doc = Nokogiri::XML(xml_file)
38
+ div_objects = xml_doc.css("/tt/body/div")
39
+ div_objects.each_with_index do |div, index|
40
+ # By default, return the lang if specified in the div and
41
+ # force detect is false
42
+ inferred_lang = div.attributes['lang'].value rescue nil
43
+ if inferred_lang.nil?
44
+ # If lang is not provided in the caption, then override
45
+ # force detect for inference
46
+ @force_detect = true
47
+ end
48
+ if @force_detect
49
+ sample_text = get_text(div, 100)
50
+ inferred_lang = @translator.infer_language(sample_text) rescue nil
51
+ if inferred_lang.nil?
52
+ err_msg = "Failed to detect lang for div block number #{index + 1}"
53
+ unless lang.empty?
54
+ err_msg += "; Detected languages before failure are #{lang}"
55
+ end
56
+ raise AllFather::LangDetectionFailureException.new(err_msg)
57
+ end
58
+ end
59
+ lang << inferred_lang
60
+ end
61
+ rescue StandardError => e
62
+ puts "Error while detecting the language due to #{e.message}"
63
+ ensure
64
+ xml_file.close rescue nil
65
+ end
66
+ return nil if lang.empty?
67
+ lang
68
+ end
69
+
70
+ def translate(src_lang, dest_lang, out_file)
71
+ super(src_lang, dest_lang, out_file)
72
+ xml_file = File.open(@cc_file, 'r:UTF-8', &:read)
73
+ xml_doc = Nokogiri::XML(xml_file)
74
+ div_objects = xml_doc.css("/tt/body/div")
75
+ # Irrespective of what lang the div xml:lang says, infer the lang and then
76
+ # check to see if it matches src_lang
77
+ matched_div = nil
78
+ div_objects.each do |div|
79
+ sample_text = get_text(div, 100)
80
+ inferred_lang = @translator.infer_language(sample_text) rescue nil
81
+ next if inferred_lang.nil?
82
+ if inferred_lang.eql?(src_lang)
83
+ matched_div = div
84
+ break
85
+ end
86
+ end
87
+ if matched_div.nil?
88
+ FileUtils.remove_file(out_file)
89
+ raise AllFather::InvalidInputException.new("Unable to find #{src_lang} language section in TTML")
90
+ end
91
+ # Update the Lang in the Div
92
+ matched_div.lang = dest_lang
93
+
94
+ blocks = matched_div.css("p")
95
+ blocks.each do |block|
96
+ # Multiple spaces being stripped off
97
+ text = block.inner_html.strip.gsub(/(\s){2,}/, '')
98
+ text_blocks = get_block_text(text)
99
+ translated_text = ""
100
+ text_blocks.each do |text_block|
101
+ if text_block.start_with?('<') || text_block.empty?
102
+ translated_text << text_block
103
+ next
104
+ end
105
+ translated_resp = @translator.translate(text_block, src_lang, dest_lang)
106
+ translated_text << translated_resp
107
+ end
108
+ block.inner_html = translated_text
109
+ end
110
+ xml_file.close rescue nil
111
+ File.write(out_file, xml_doc)
112
+ out_file
113
+ end
114
+
115
+ private
116
+
117
+ #
118
+ # Method to segregate the data from markups as markups don't need
119
+ # translations.
120
+ # For example, if the cue block is of the form
121
+ # This is a test caption with <span id="1">a test span </span> within a block
122
+ # This method returns
123
+ # ["This is a test caption with ", "<span id=\"1\">", "a test span ", "</span>", " within a block"]
124
+ # as we can infer the markups can be retained as is to avoid translation
125
+ #
126
+ def get_block_text(text)
127
+ data = []
128
+ tag_start = tag_end = false
129
+ str_length = text.size
130
+ text_block = ""
131
+ markup_block = ""
132
+ for i in 0...text.size do
133
+ if text[i] == '<'
134
+ tag_end = false
135
+ tag_start = true
136
+ markup_block << text[i]
137
+ data << text_block
138
+ text_block = ""
139
+ next
140
+ elsif text[i] == '>'
141
+ tag_end = true
142
+ tag_start = false
143
+ markup_block << text[i]
144
+ data << markup_block
145
+ markup_block = ""
146
+ next
147
+ end
148
+ if tag_start && !tag_end
149
+ markup_block << text[i]
150
+ else
151
+ text_block << text[i]
152
+ end
153
+ end
154
+ unless text_block.empty?
155
+ data << text_block
156
+ end
157
+ data
158
+ end
159
+
160
+ #
161
+ # Method to get a minimal amount of key text that excludes any tags
162
+ # or control information for the engine to meaningfully and
163
+ # correctly infer the language being referred to in this TTML
164
+ #
165
+ def get_text(div, num_chars)
166
+ text_sample = ""
167
+ blocks = div.css("p")
168
+ blocks.each do |block|
169
+ # Multiple spaces being stripped off
170
+ text = block.inner_html.strip.gsub(/(\s){2,}/, '')
171
+ # Strip off html tags (if any)
172
+ text = text.gsub(/(<.*?>)/, ' ')
173
+ text_sample << text
174
+ if text_sample.length > (num_chars + 1)
175
+ break
176
+ end
177
+ end
178
+ return text_sample[0, num_chars]
179
+ end
180
+ end
data/lib/vtt.rb CHANGED
@@ -1,81 +1,115 @@
1
- require 'aws-sdk-translate'
2
- require 'aws-sdk-comprehend'
1
+ require_relative "engines/translator"
2
+ require_relative "allfather"
3
3
 
4
+ #
5
+ # Library to handle VTT Files
6
+ #
7
+ # Uses the translator available to do the necessary language operations
8
+ # as defined by the AllFather
9
+ #
4
10
  class VTT
5
- def initialize(awskey, awssecret)
6
- @translate = Aws::Translate::Client.new(:access_key_id => "#{awskey}", :secret_access_key => "#{awssecret}")
7
- @comp = Aws::Comprehend::Client.new(:access_key_id => "#{awskey}", :secret_access_key => "#{awssecret}")
11
+
12
+ include AllFather
13
+
14
+ def initialize(cc_file, translator)
15
+ @cc_file = cc_file
16
+ @translator = translator
17
+ raise "Invalid VTT file provided" unless is_valid?
8
18
  end
9
19
 
10
- def translate_text(srt_file, src_lang, dest_lang, out_file)
11
- ccfile = File.open(srt_file, 'r:UTF-8', &:read)
12
- outfile = File.open(out_file, "w")
13
- text_collection = false
14
- text_sample = ""
15
- ccfile.each_line do | line |
16
- if line =~ /^(\d\d:)\d\d:\d\d[,.]\d\d\d.*-->.*(\d\d:)\d\d:\d\d[,.]\d\d\d/
17
- text_collection = true
18
- outfile.puts line
19
- elsif line.strip.empty? && !text_sample.empty?
20
- json_text = JSON.parse(text_sample) rescue nil
21
- if json_text.nil?
22
- trans_resp = @translate.translate_text({ :text => "#{text_sample}" , :source_language_code => "#{src_lang}", :target_language_code => "#{dest_lang}"})
23
- outfile.puts trans_resp.translated_text
24
- outfile.puts
20
+ def translate(src_lang, dest_lang, out_file)
21
+ super(src_lang, dest_lang, out_file)
22
+ begin
23
+ ccfile = File.open(@cc_file, 'r:UTF-8', &:read)
24
+ outfile = File.open(out_file, "w")
25
+ text_collection = false
26
+ text_sample = ""
27
+ ccfile.each_line do | line |
28
+ if line =~ /^(\d\d:)\d\d:\d\d[,.]\d\d\d.*-->.*(\d\d:)\d\d:\d\d[,.]\d\d\d/
29
+ text_collection = true
30
+ outfile.puts line
31
+ elsif line.strip.empty? && !text_sample.empty?
32
+ json_text = JSON.parse(text_sample) rescue nil
33
+ if json_text.nil?
34
+ trans_resp = @translator.translate(text_sample, src_lang, dest_lang)
35
+ outfile.puts trans_resp
36
+ outfile.puts
37
+ else
38
+ outfile.puts text_sample
39
+ outfile.puts
40
+ end
41
+ text_sample = ""
42
+ text_collection = false
43
+ elsif text_collection
44
+ text_sample << line
25
45
  else
26
- outfile.puts text_sample
27
- outfile.puts
46
+ outfile.puts line
28
47
  end
29
- text_sample = ""
30
- text_collection = false
31
- elsif text_collection
32
- text_sample << line
33
- else
34
- outfile.puts line
35
48
  end
36
- next
37
- end
38
49
 
39
- if !text_sample.empty?
40
- trans_resp = @translate.translate_text({ :text => "#{text_sample}" , :source_language_code => "#{src_lang}", :target_language_code => "#{dest_lang}"})
41
- outfile.puts trans_resp.translated_text
42
- outfile.puts
50
+ if !text_sample.empty?
51
+ trans_resp = @translator.translate(text_sample, src_lang, dest_lang)
52
+ outfile.puts trans_resp
53
+ outfile.puts
54
+ end
55
+ ensure
56
+ ccfile.close rescue nil
43
57
  outfile.close
44
58
  end
45
59
  end
46
60
 
61
+ #
62
+ # Returns the inferred language in an array
63
+ #
64
+ def infer_languages
65
+ lang = nil
66
+ begin
67
+ sample_text = get_text(@cc_file, 100)
68
+ lang = @translator.infer_language(sample_text)
69
+ rescue StandardError => e
70
+ puts "Error while detecting the language due to #{e.message}"
71
+ end
72
+ [lang]
73
+ end
47
74
 
48
- def get_text(srt_file, num_chars)
49
- ccfile = File.open(srt_file, 'r:UTF-8', &:read)
50
- text_collection = false
51
- text_sample = ""
52
- ccfile.each_line do | line |
53
- line = line
54
- if line =~ /^(\d\d:)\d\d:\d\d[,.]\d\d\d.*-->.*(\d\d:)\d\d:\d\d[,.]\d\d\d/
55
- text_collection = true
56
- elsif line.strip.empty?
57
- text_collection = false
58
- elsif text_collection && text_sample.length < (num_chars+1)
59
- text_sample << line
60
- end
61
- break if text_sample.length > (num_chars+1)
62
- next
75
+ #
76
+ # Method to add required set of validations specific to caption type
77
+ #
78
+ def is_valid?
79
+ # Do any VTT specific validations here
80
+ if @cc_file =~ /^.*\.(vtt)$/
81
+ return true
63
82
  end
64
- return text_sample[0,num_chars]
83
+ # TODO: Check if it's required to do a File read to see if the 1st line is WEBVTT
84
+ # to handle cases where invalid file is named with vtt extension
85
+ return false
65
86
  end
66
87
 
67
- def detect_lang(srt_file)
68
- lang = nil
88
+ private
89
+
90
+ #
91
+ # Method to get a minimal amount of key text that excludes any tags
92
+ # or control information for the engine to meaningfully and
93
+ # correctly infer the language being referred to in this VTT
94
+ #
95
+ def get_text(vtt_file, num_chars)
69
96
  begin
70
- sample_text = get_text(srt_file, 100)
71
- response = @comp.detect_dominant_language( {
72
- text: "#{sample_text}"
73
- })
74
- lang = response[:languages][0][:language_code] rescue nil
75
- rescue => error
76
- puts "Error while detecting the language!!"
97
+ ccfile = File.open(vtt_file, 'r:UTF-8', &:read)
98
+ text_collection = false
99
+ text_sample = ""
100
+ ccfile.each_line do |line|
101
+ if line =~ /^(\d\d:)\d\d:\d\d[,.]\d\d\d.*-->.*(\d\d:)\d\d:\d\d[,.]\d\d\d/
102
+ text_collection = true
103
+ elsif line.strip.empty?
104
+ text_collection = false
105
+ elsif text_collection && text_sample.length < (num_chars + 1)
106
+ text_sample << line
107
+ end
108
+ break if text_sample.length > (num_chars + 1)
109
+ end
110
+ ensure
111
+ ccfile.close rescue nil
77
112
  end
78
- lang
113
+ return text_sample[0, num_chars]
79
114
  end
80
-
81
115
  end
metadata CHANGED
@@ -1,14 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: subtitle
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.8
4
+ version: 0.2.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Maheshwaran G
8
+ - Arunjeyaprasad A J
8
9
  autorequire:
9
10
  bindir: bin
10
11
  cert_chain: []
11
- date: 2019-10-21 00:00:00.000000000 Z
12
+ date: 2019-10-31 00:00:00.000000000 Z
12
13
  dependencies:
13
14
  - !ruby/object:Gem::Dependency
14
15
  name: bundler
@@ -25,43 +26,37 @@ dependencies:
25
26
  - !ruby/object:Gem::Version
26
27
  version: '2.0'
27
28
  - !ruby/object:Gem::Dependency
28
- name: aws-sdk-comprehend
29
+ name: aws-sdk
29
30
  requirement: !ruby/object:Gem::Requirement
30
31
  requirements:
31
- - - ">="
32
- - !ruby/object:Gem::Version
33
- version: '0'
34
- type: :runtime
35
- prerelease: false
36
- version_requirements: !ruby/object:Gem::Requirement
37
- requirements:
38
- - - ">="
39
- - !ruby/object:Gem::Version
40
- version: '0'
41
- - !ruby/object:Gem::Dependency
42
- name: aws-sdk-translate
43
- requirement: !ruby/object:Gem::Requirement
44
- requirements:
45
- - - ">="
32
+ - - "~>"
46
33
  - !ruby/object:Gem::Version
47
- version: '0'
48
- type: :runtime
34
+ version: '2.11'
35
+ type: :development
49
36
  prerelease: false
50
37
  version_requirements: !ruby/object:Gem::Requirement
51
38
  requirements:
52
- - - ">="
39
+ - - "~>"
53
40
  - !ruby/object:Gem::Version
54
- version: '0'
55
- description: subtitle gem to detect and translate closed caption for SubRip and WebVTT
41
+ version: '2.11'
42
+ description: Subtitle gem helps you to detect language and translate closed caption
43
+ to required language.
56
44
  email:
57
45
  - pgmaheshwaran@gmail.com
46
+ - arunjeyaprasad@gmail.com
58
47
  executables: []
59
48
  extensions: []
60
49
  extra_rdoc_files: []
61
50
  files:
51
+ - lib/allfather.rb
52
+ - lib/dfxp.rb
53
+ - lib/engines/aws.rb
54
+ - lib/engines/gcp.rb
55
+ - lib/engines/translator.rb
62
56
  - lib/scc.rb
63
57
  - lib/srt.rb
64
58
  - lib/subtitle.rb
59
+ - lib/ttml.rb
65
60
  - lib/vtt.rb
66
61
  homepage: https://github.com/cloudaffair/subtitle
67
62
  licenses:
@@ -85,9 +80,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
85
80
  version: '0'
86
81
  requirements: []
87
82
  rubyforge_project:
88
- rubygems_version: 2.7.3
83
+ rubygems_version: 2.5.1
89
84
  signing_key:
90
85
  specification_version: 4
91
- summary: subtitle helps you to detect language and translate closed caption to required
92
- language
86
+ summary: Subtitle gem helps you to detect language and translate closed caption to
87
+ required language
93
88
  test_files: []