subtitle 0.1.8 → 0.2.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/lib/allfather.rb +83 -0
- data/lib/dfxp.rb +30 -0
- data/lib/engines/aws.rb +102 -0
- data/lib/engines/gcp.rb +0 -0
- data/lib/engines/translator.rb +58 -0
- data/lib/scc.rb +43 -23
- data/lib/srt.rb +86 -61
- data/lib/subtitle.rb +72 -27
- data/lib/ttml.rb +180 -0
- data/lib/vtt.rb +95 -61
- metadata +21 -26
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: f603ac76acbb145807944c0f948d6550eee197cc
|
4
|
+
data.tar.gz: 7d06b0e8ee047ab1790237fca1c478da03725541
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 204c3af4231e25e6caaa198e9a8b7d46b4f917afcc8abdbce27bccbb94908d28d21c1b0318aa36a1ba63c83c064963e24c7398a99a0cd183c3ca10568fb6fe34
|
7
|
+
data.tar.gz: fb867912d76f039abf21fd1495c0f9b9139a594cde028f1d50184bd15d007a59113366507b7a85f1039ce0c638d423fbd7cff99d22792f83af59ccede48167dc
|
data/lib/allfather.rb
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
#
|
2
|
+
# A Module that kind of acts as an interface where the generic methods
|
3
|
+
# that apply to each caption type can be defined
|
4
|
+
#
|
5
|
+
# To use for a new caption type, simply include this module and provide
|
6
|
+
# caption specific implementations
|
7
|
+
#
|
8
|
+
module AllFather
|
9
|
+
|
10
|
+
#
|
11
|
+
# Valid file extensions that we support; Keep expanding as we grow
|
12
|
+
#
|
13
|
+
VALID_FILES = [".scc", ".srt", ".vtt", ".ttml", ".dfxp"]
|
14
|
+
|
15
|
+
#
|
16
|
+
# Generic exception class that is raised for validation errors
|
17
|
+
#
|
18
|
+
class InvalidInputException < StandardError; end
|
19
|
+
|
20
|
+
#
|
21
|
+
# Lang inference failure exception
|
22
|
+
#
|
23
|
+
class LangDetectionFailureException < StandardError; end
|
24
|
+
|
25
|
+
#
|
26
|
+
# Method to do basic validations like is this a valid file to even
|
27
|
+
# accept for any future transactions
|
28
|
+
#
|
29
|
+
# ==== Returns:
|
30
|
+
# true if the file is valid and false otherwise
|
31
|
+
#
|
32
|
+
def is_valid?
|
33
|
+
raise "Not Implemented. Class #{self.class.name} doesn't implement is_valid?"
|
34
|
+
end
|
35
|
+
|
36
|
+
#
|
37
|
+
# Method to infer the language(s) of the caption by inspecting the file
|
38
|
+
# depending on the type of the caption file
|
39
|
+
#
|
40
|
+
# ==== Returns
|
41
|
+
#
|
42
|
+
# * The ISO 639-1 Letter Language codes
|
43
|
+
#
|
44
|
+
def infer_languages
|
45
|
+
raise "Not Implemented. Class #{self.class.name} doesn't implement infer_languages"
|
46
|
+
end
|
47
|
+
|
48
|
+
#
|
49
|
+
# Method to translate the caption from one language to another
|
50
|
+
#
|
51
|
+
# :args: src_lang, target_lang, output_file
|
52
|
+
#
|
53
|
+
# * +input_caption+ - A Valid input caption file. Refer to #is_valid?
|
54
|
+
# * +src_lang+ - can be inferred using #infer_languages method
|
55
|
+
# * +target_lang+ - Target 2 letter ISO language code to which the source needs to be translated in to.
|
56
|
+
# * +output_file+ - Output file. Can be a fully qualified path or just file name
|
57
|
+
#
|
58
|
+
# ==== Raises
|
59
|
+
#
|
60
|
+
# InvalidInputException shall be raised if
|
61
|
+
# 1. The input file doesn't exist or is unreadable or is invalid caption
|
62
|
+
# 2. The output file can't be written
|
63
|
+
# 3. The target_lang is not a valid ISO 639-1 Letter Language code
|
64
|
+
#
|
65
|
+
def translate(src_lang, target_lang, output_file)
|
66
|
+
# Check if a non empty output file is present and error out to avoid
|
67
|
+
# the danger of overwriting some important file !!
|
68
|
+
if File.exists?(output_file) && File.size(output_file) > 0
|
69
|
+
raise InvalidInputException.new("Output file #{output_file} is not empty.")
|
70
|
+
else
|
71
|
+
# Just open the file in writable mode and close it just to ensure that
|
72
|
+
# we can write the output file
|
73
|
+
File.open(output_file, "w") {|f|
|
74
|
+
}
|
75
|
+
end
|
76
|
+
# Check if the file is writable ?
|
77
|
+
unless File.writable?(output_file)
|
78
|
+
raise InvalidInputException.new("Output file #{output_file} not writable.")
|
79
|
+
end
|
80
|
+
# Further checks can be done only in caption specific implementations
|
81
|
+
# or translation engine specific implementation
|
82
|
+
end
|
83
|
+
end
|
data/lib/dfxp.rb
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
require_relative "engines/translator"
|
2
|
+
require_relative "allfather"
|
3
|
+
require_relative "ttml"
|
4
|
+
|
5
|
+
#
|
6
|
+
# Library to handle DFXP Files
|
7
|
+
#
|
8
|
+
# Uses the translator available to do the necessary language operations
|
9
|
+
# as defined by the AllFather
|
10
|
+
#
|
11
|
+
class DFXP < TTML
|
12
|
+
|
13
|
+
def initialize(cc_file, translator, opts={})
|
14
|
+
@cc_file = cc_file
|
15
|
+
@translator = translator
|
16
|
+
@force_detect = opts[:force_detect] || false
|
17
|
+
raise "Invalid TTML file provided" unless is_valid?
|
18
|
+
end
|
19
|
+
|
20
|
+
def is_valid?
|
21
|
+
# Do any DFXP specific validations here
|
22
|
+
if @cc_file =~ /^.*\.(dfxp)$/
|
23
|
+
return true
|
24
|
+
end
|
25
|
+
# TODO: Check if it's required to do a File read to see if this
|
26
|
+
# a well-formed XML. Another is to see if lang is available in each div
|
27
|
+
return false
|
28
|
+
end
|
29
|
+
|
30
|
+
end
|
data/lib/engines/aws.rb
ADDED
@@ -0,0 +1,102 @@
|
|
1
|
+
require 'aws-sdk'
|
2
|
+
require 'aws-sdk'
|
3
|
+
require_relative 'translator'
|
4
|
+
|
5
|
+
#
|
6
|
+
# Provides Language services using Amazon Translate
|
7
|
+
#
|
8
|
+
# Module can be initialized using multiple options
|
9
|
+
#
|
10
|
+
# == Credential Referencing Order
|
11
|
+
#
|
12
|
+
# * [Arguments] - Pass the credentials access_key_id and secret_access_key as arguments
|
13
|
+
# * [Environment route] - AWS_ACCESS_KEY_ID & AWS_SECRET_ACCESS_KEY can be exposed as
|
14
|
+
# environment variables
|
15
|
+
# * [Profile Name] - The application uses the credentials of the system and picks the
|
16
|
+
# credentials referred to by the profile
|
17
|
+
#
|
18
|
+
class AwsEngine
|
19
|
+
include Translator
|
20
|
+
|
21
|
+
DEFAULT_REGION = ENV["AWS_DEFAULT_REGION"] || "us-east-1"
|
22
|
+
|
23
|
+
#
|
24
|
+
# :args: options
|
25
|
+
#
|
26
|
+
# ==== Arguments
|
27
|
+
# options can carry the following details
|
28
|
+
#
|
29
|
+
# * [:access_key_id] - access key id
|
30
|
+
# * [:secret_access_key] - Secret access key
|
31
|
+
# * [:env] - true for using credentials from environment variables
|
32
|
+
# * [:profile] - profile name for using shared credentials setup
|
33
|
+
# * [:region] - If not provided defaults to us-east-1
|
34
|
+
#
|
35
|
+
# ==== raises
|
36
|
+
#
|
37
|
+
# * EngineInitializationException if credentials cannot be setup due to lack of details
|
38
|
+
# * Aws Exceptions if profile name is invalid or invalid credentials are passed
|
39
|
+
#
|
40
|
+
def initialize(options)
|
41
|
+
access_key_id = nil
|
42
|
+
secret_access_key = nil
|
43
|
+
@region = options[:region] || DEFAULT_REGION
|
44
|
+
if options[:env]
|
45
|
+
access_key_id = ENV["AWS_ACCESS_KEY_ID"]
|
46
|
+
secret_access_key = ENV["AWS_SECRET_ACCESS_KEY"]
|
47
|
+
elsif options[:access_key_id] && options[:secret_access_key]
|
48
|
+
access_key_id = options[:access_key_id]
|
49
|
+
secret_access_key = options[:secret_access_key]
|
50
|
+
end
|
51
|
+
if access_key_id && secret_access_key
|
52
|
+
Aws.config.update({
|
53
|
+
region: options[:region] || DEFAULT_REGION,
|
54
|
+
credentials: Aws::Credentials.new(access_key_id, secret_access_key)
|
55
|
+
})
|
56
|
+
elsif options[:profile]
|
57
|
+
credentials = Aws::SharedCredentials.new(profile_name: options[:profile])
|
58
|
+
Aws.config.update({
|
59
|
+
region: @region,
|
60
|
+
credentials: credentials.credentials
|
61
|
+
})
|
62
|
+
else
|
63
|
+
raise Translator::EngineInitializationException.new(
|
64
|
+
"Failed to initialize Aws Engine. Credentials are missing / not provided")
|
65
|
+
end
|
66
|
+
@translate_service = Aws::Translate::Client.new(region: @region)
|
67
|
+
@comprehend_service = Aws::Comprehend::Client.new(region: @region)
|
68
|
+
end
|
69
|
+
|
70
|
+
#
|
71
|
+
# Invokes the language detection API of AWS and returns only the language
|
72
|
+
# of the highest score and returns the ISO 639-1 code
|
73
|
+
#
|
74
|
+
# :args: text
|
75
|
+
#
|
76
|
+
# ===== Arguments
|
77
|
+
# * +text+ - The text for which the language is to be inferred
|
78
|
+
#
|
79
|
+
def infer_language(text)
|
80
|
+
response = @comprehend_service.detect_dominant_language({ text: "#{text}" })
|
81
|
+
response[:languages][0][:language_code]
|
82
|
+
end
|
83
|
+
|
84
|
+
#
|
85
|
+
# Invokes the translation API of AWS and returns the translated text
|
86
|
+
# as per the arguments provided
|
87
|
+
# Will Raise exception if a translation cannot be made between the source
|
88
|
+
# and target language codes or if the lang code is invalid
|
89
|
+
#
|
90
|
+
# :args: input_text, src_lang, target_lang
|
91
|
+
#
|
92
|
+
# * +input_text+ - The text that needs to be translated
|
93
|
+
# * +src_lang+ - The source language of the text
|
94
|
+
# * +target_lang+ - The target language to which the input_text needs to be translated to
|
95
|
+
#
|
96
|
+
def translate(input_text, src_lang, target_lang)
|
97
|
+
response = @translate_service.translate_text({ :text => "#{input_text}" ,
|
98
|
+
:source_language_code => "#{src_lang}", :target_language_code => "#{target_lang}"})
|
99
|
+
response.translated_text
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
data/lib/engines/gcp.rb
ADDED
File without changes
|
data/lib/engines/translator.rb
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
#
|
2
|
+
# A Module that kind of acts as an interface where the methods
|
3
|
+
# expected out of each vendor are encapsulated
|
4
|
+
#
|
5
|
+
# To use for a new vendor, simply include this module and provide
|
6
|
+
# vendor specific implementations
|
7
|
+
#
|
8
|
+
module Translator
|
9
|
+
|
10
|
+
#
|
11
|
+
# Constants For Engines
|
12
|
+
ENGINE_AWS = 1
|
13
|
+
ENGINE_GCP = 2
|
14
|
+
|
15
|
+
#
|
16
|
+
# Keys for each Engine
|
17
|
+
AWS_KEYS = [:access_key_id, :secret_access_key, :profile]
|
18
|
+
GCP_KEYS = [:api_key, :project_id, :creds_path]
|
19
|
+
|
20
|
+
ENGINE_KEYS = {ENGINE_AWS => AWS_KEYS, ENGINE_GCP => GCP_KEYS}
|
21
|
+
#
|
22
|
+
# This exception shall be raised when we fail to initialize an
|
23
|
+
# engine for the purposes of language detection / translation
|
24
|
+
#
|
25
|
+
# ==== Example
|
26
|
+
# * When credentials are not passed
|
27
|
+
#
|
28
|
+
class EngineInitializationException < StandardError; end
|
29
|
+
|
30
|
+
#
|
31
|
+
# Method to infer the language by inspecting the text
|
32
|
+
# passed as argument
|
33
|
+
#
|
34
|
+
# :args: text
|
35
|
+
#
|
36
|
+
# * +text+ - String whose language needs to be inferred
|
37
|
+
#
|
38
|
+
# ==== Returns
|
39
|
+
#
|
40
|
+
# * The ISO 639-1 Letter Language code
|
41
|
+
#
|
42
|
+
def infer_language(text)
|
43
|
+
raise "Not Implemented. Class #{self.class.name} doesn't implement infer_language"
|
44
|
+
end
|
45
|
+
|
46
|
+
#
|
47
|
+
# Method to translate from given language to another
|
48
|
+
#
|
49
|
+
# :args: input_text, src_lang, target_lang
|
50
|
+
#
|
51
|
+
# * +input_text+ - Text which needs to be translated
|
52
|
+
# * +src_lang+ - can be inferred using #infer_language method
|
53
|
+
# * +target_lang+ - Target 2 letter ISO language code to which the source needs to be translated in to.
|
54
|
+
#
|
55
|
+
def translate(input_text, src_lang, target_lang)
|
56
|
+
raise "Not Implemented. Class #{self.class.name} doesn't implement translate"
|
57
|
+
end
|
58
|
+
end
|
data/lib/scc.rb
CHANGED
@@ -1,13 +1,47 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
require_relative "engines/translator"
|
2
|
+
require_relative "allfather"
|
3
3
|
|
4
|
+
#
|
5
|
+
# Library to handle SCC Files
|
6
|
+
#
|
7
|
+
# Uses the translator available to do the necessary language operations
|
8
|
+
# as defined by the AllFather
|
9
|
+
#
|
4
10
|
class SCC
|
5
11
|
|
6
|
-
|
7
|
-
|
8
|
-
|
12
|
+
include AllFather
|
13
|
+
|
14
|
+
def initialize(cc_file, translator)
|
15
|
+
@cc_file = cc_file
|
16
|
+
@translator = translator
|
17
|
+
raise "Invalid SCC file provided" unless is_valid?
|
18
|
+
end
|
19
|
+
|
20
|
+
def is_valid?
|
21
|
+
# Do any SCC specific validations here
|
22
|
+
if @cc_file =~ /^.*\.(scc)$/
|
23
|
+
return true
|
24
|
+
end
|
25
|
+
return false
|
26
|
+
end
|
27
|
+
|
28
|
+
def infer_languages
|
29
|
+
lang = nil
|
30
|
+
begin
|
31
|
+
sample_text = get_text(@cc_file, 100)
|
32
|
+
lang = @translator.infer_language(sample_text)
|
33
|
+
rescue StandardError => e
|
34
|
+
puts "Error while detecting the language due to #{e.message}"
|
35
|
+
end
|
36
|
+
lang
|
9
37
|
end
|
10
38
|
|
39
|
+
def translate(src_lang, dest_lang, out_file)
|
40
|
+
raise "Not Implemented. Class #{self.class.name} doesn't implement translate yet !!"
|
41
|
+
end
|
42
|
+
|
43
|
+
private
|
44
|
+
|
11
45
|
def get_text(srt_file, num_chars)
|
12
46
|
ccfile = File.open(srt_file, 'r:UTF-8', &:read)
|
13
47
|
text_sample = ""
|
@@ -15,12 +49,12 @@ class SCC
|
|
15
49
|
if line =~ /^\d\d:\d\d:\d\d:\d\d\s/
|
16
50
|
scc_text_code = line.gsub(/^\d\d:\d\d:\d\d:\d\d\s/, '')
|
17
51
|
text_sample << decode(scc_text_code)
|
18
|
-
if text_sample.length > (num_chars+1)
|
52
|
+
if text_sample.length > (num_chars + 1)
|
19
53
|
break
|
20
54
|
end
|
21
55
|
end
|
22
56
|
end
|
23
|
-
return text_sample[0,num_chars]
|
57
|
+
return text_sample[0, num_chars]
|
24
58
|
end
|
25
59
|
|
26
60
|
def decode(scc_code_text)
|
@@ -31,7 +65,7 @@ class SCC
|
|
31
65
|
hex_codes.each do | code |
|
32
66
|
if ["94", "91", "92", "97", "15", "16", "10", "13"].include?(code)
|
33
67
|
skip_next = true
|
34
|
-
skip_count = skip_count +1
|
68
|
+
skip_count = skip_count + 1
|
35
69
|
next
|
36
70
|
end
|
37
71
|
if skip_count == 1 && skip_next
|
@@ -60,18 +94,4 @@ class SCC
|
|
60
94
|
end
|
61
95
|
encoded_str
|
62
96
|
end
|
63
|
-
|
64
|
-
def detect_lang(scc_file)
|
65
|
-
lang = nil
|
66
|
-
begin
|
67
|
-
sample_text = get_text(scc_file, 100)
|
68
|
-
response = @comp.detect_dominant_language( {
|
69
|
-
text: "#{sample_text}"
|
70
|
-
})
|
71
|
-
lang = response[:languages][0][:language_code] rescue nil
|
72
|
-
rescue => error
|
73
|
-
puts "Error while detecting the language!!"
|
74
|
-
end
|
75
|
-
lang
|
76
|
-
end
|
77
|
-
end
|
97
|
+
end
|
data/lib/srt.rb
CHANGED
@@ -1,81 +1,106 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
require_relative "engines/translator"
|
2
|
+
require_relative "allfather"
|
3
3
|
|
4
|
+
#
|
5
|
+
# Library to handle SRT Files
|
6
|
+
#
|
7
|
+
# Uses the translator available to do the necessary language operations
|
8
|
+
# as defined by the AllFather
|
9
|
+
#
|
4
10
|
class SRT
|
5
|
-
|
6
|
-
|
7
|
-
|
11
|
+
|
12
|
+
include AllFather
|
13
|
+
|
14
|
+
def initialize(cc_file, translator)
|
15
|
+
@cc_file = cc_file
|
16
|
+
@translator = translator
|
17
|
+
raise "Invalid SRT file provided" unless is_valid?
|
18
|
+
end
|
19
|
+
|
20
|
+
def is_valid?
|
21
|
+
# Do any SRT specific validations here
|
22
|
+
if @cc_file =~ /^.*\.(srt)$/
|
23
|
+
return true
|
24
|
+
end
|
25
|
+
return false
|
8
26
|
end
|
9
27
|
|
10
|
-
def
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
28
|
+
def translate(src_lang, dest_lang, out_file)
|
29
|
+
super(src_lang, dest_lang, out_file)
|
30
|
+
begin
|
31
|
+
ccfile = File.open(@cc_file, 'r:UTF-8', &:read)
|
32
|
+
outfile = File.open(out_file, "w")
|
33
|
+
text_collection = false
|
34
|
+
text_sample = ""
|
35
|
+
ccfile.each_line do | line |
|
36
|
+
if line =~ /^(\d\d:)\d\d:\d\d[,.]\d\d\d.*-->.*(\d\d:)\d\d:\d\d[,.]\d\d\d/
|
37
|
+
text_collection = true
|
38
|
+
outfile.puts line
|
39
|
+
elsif line.strip.empty? && !text_sample.empty?
|
40
|
+
json_text = JSON.parse(text_sample) rescue nil
|
41
|
+
if json_text.nil?
|
42
|
+
trans_resp = @translator.translate(text_sample, src_lang, dest_lang)
|
43
|
+
outfile.puts trans_resp
|
44
|
+
else
|
45
|
+
outfile.puts text_sample
|
46
|
+
end
|
24
47
|
outfile.puts
|
48
|
+
text_sample = ""
|
49
|
+
text_collection = false
|
50
|
+
elsif text_collection
|
51
|
+
text_sample << line
|
25
52
|
else
|
26
|
-
outfile.puts
|
27
|
-
outfile.puts
|
53
|
+
outfile.puts line
|
28
54
|
end
|
29
|
-
text_sample = ""
|
30
|
-
text_collection = false
|
31
|
-
elsif text_collection
|
32
|
-
text_sample << line
|
33
|
-
else
|
34
|
-
outfile.puts line
|
35
55
|
end
|
36
|
-
next
|
37
|
-
end
|
38
56
|
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
57
|
+
if !text_sample.empty?
|
58
|
+
trans_resp = @translator.translate(text_sample, src_lang, dest_lang)
|
59
|
+
outfile.puts trans_resp
|
60
|
+
outfile.puts
|
61
|
+
end
|
62
|
+
ensure
|
63
|
+
ccfile.close rescue nil
|
43
64
|
outfile.close
|
44
65
|
end
|
45
66
|
end
|
46
67
|
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
if line =~ /^(\d\d:)\d\d:\d\d[,.]\d\d\d.*-->.*(\d\d:)\d\d:\d\d[,.]\d\d\d/
|
55
|
-
text_collection = true
|
56
|
-
elsif line.strip.empty?
|
57
|
-
text_collection = false
|
58
|
-
elsif text_collection && text_sample.length < (num_chars+1)
|
59
|
-
text_sample << line
|
60
|
-
end
|
61
|
-
break if text_sample.length > (num_chars+1)
|
62
|
-
next
|
68
|
+
def infer_languages
|
69
|
+
lang = nil
|
70
|
+
begin
|
71
|
+
sample_text = get_text(@cc_file, 100)
|
72
|
+
lang = @translator.infer_language(sample_text)
|
73
|
+
rescue StandardError => e
|
74
|
+
puts "Error while detecting the language due to #{e.message}"
|
63
75
|
end
|
64
|
-
|
76
|
+
[lang]
|
65
77
|
end
|
66
78
|
|
67
|
-
|
68
|
-
|
79
|
+
private
|
80
|
+
|
81
|
+
#
|
82
|
+
# Method to get a minimal amount of key text that excludes any tags
|
83
|
+
# or control information for the engine to meaningfully and
|
84
|
+
# correctly infer the language being referred to in this SRT
|
85
|
+
#
|
86
|
+
def get_text(srt_file, num_chars)
|
69
87
|
begin
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
88
|
+
ccfile = File.open(srt_file, 'r:UTF-8', &:read)
|
89
|
+
text_collection = false
|
90
|
+
text_sample = ""
|
91
|
+
ccfile.each_line do |line|
|
92
|
+
if line =~ /^(\d\d:)\d\d:\d\d[,.]\d\d\d.*-->.*(\d\d:)\d\d:\d\d[,.]\d\d\d/
|
93
|
+
text_collection = true
|
94
|
+
elsif line.strip.empty?
|
95
|
+
text_collection = false
|
96
|
+
elsif text_collection && text_sample.length < (num_chars + 1)
|
97
|
+
text_sample << line
|
98
|
+
end
|
99
|
+
break if text_sample.length > (num_chars + 1)
|
100
|
+
end
|
101
|
+
ensure
|
102
|
+
ccfile.close rescue nil
|
77
103
|
end
|
78
|
-
|
104
|
+
return text_sample[0, num_chars]
|
79
105
|
end
|
80
|
-
|
81
106
|
end
|
data/lib/subtitle.rb
CHANGED
@@ -1,43 +1,88 @@
|
|
1
|
-
|
1
|
+
require_relative "srt"
|
2
|
+
require_relative "vtt"
|
3
|
+
require_relative "scc"
|
4
|
+
require_relative "ttml"
|
5
|
+
require_relative "dfxp"
|
6
|
+
require_relative "allfather"
|
7
|
+
require_relative "engines/translator"
|
8
|
+
require_relative "engines/aws"
|
9
|
+
|
2
10
|
|
3
11
|
class Subtitle
|
4
|
-
def initialize(
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
raise "Incorrect File extension"
|
11
|
-
end
|
12
|
-
begin
|
13
|
-
@srt_parser = SRT.new(awskey, awssecret)
|
14
|
-
rescue
|
15
|
-
raise "Could not initialize Parser!!. Check the Keys supplied."
|
16
|
-
end
|
12
|
+
def initialize(options={})
|
13
|
+
# Infer the caption handler from the extension
|
14
|
+
@cc_file = options[:cc_file]
|
15
|
+
raise "Input caption not provided. Please provide the same in :cc_file option" if @cc_file.nil?
|
16
|
+
translator = get_translator(options)
|
17
|
+
@handler = get_caption_handler(options, translator)
|
17
18
|
end
|
18
19
|
|
19
20
|
def detect_language
|
20
|
-
|
21
|
-
detected_lang
|
21
|
+
@handler.infer_languages
|
22
22
|
end
|
23
23
|
|
24
|
-
def
|
24
|
+
def translate(dest_lang, src_lang = nil, outfile = nil)
|
25
25
|
if outfile.nil?
|
26
|
-
outfile = "#{@
|
26
|
+
outfile = "#{@cc_file}_#{dest_lang}"
|
27
27
|
end
|
28
28
|
if src_lang.nil?
|
29
|
-
src_lang = detect_language
|
30
|
-
raise "
|
29
|
+
src_lang = detect_language[0] rescue nil
|
30
|
+
raise "Could not detect Source Language!!" if src_lang.nil?
|
31
31
|
end
|
32
|
-
@
|
32
|
+
@handler.translate(src_lang, dest_lang, outfile)
|
33
33
|
outfile
|
34
34
|
end
|
35
35
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
36
|
+
private
|
37
|
+
|
38
|
+
def get_translator(options)
|
39
|
+
translator = nil
|
40
|
+
# Try to infer the engine based on the passed options
|
41
|
+
engine = options[:engine]
|
42
|
+
unless engine
|
43
|
+
engine_props = Translator::ENGINE_KEYS
|
44
|
+
engine_props.each do |k, values|
|
45
|
+
original_size = values.size
|
46
|
+
diff = values - options.keys
|
47
|
+
if diff.size < original_size
|
48
|
+
# We have some keys for this engine in options
|
49
|
+
engine = k
|
50
|
+
break
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
case engine
|
55
|
+
when Translator::ENGINE_AWS
|
56
|
+
translator = AwsEngine.new(options)
|
57
|
+
when Translator::ENGINE_GCP
|
58
|
+
raise "GCP is yet to be implemented"
|
59
|
+
else
|
60
|
+
raise "Unable to infer the Translation Engine. Options missing key credential params"
|
61
|
+
end
|
62
|
+
translator
|
63
|
+
end
|
64
|
+
|
65
|
+
def get_caption_handler(options, translator)
|
66
|
+
caption_file = options[:cc_file]
|
67
|
+
extension = File.extname(caption_file)
|
68
|
+
unless AllFather::VALID_FILES.include?(extension)
|
69
|
+
raise "Caption support for #{caption_file} of type #{extension} is not supported yet"
|
70
|
+
end
|
71
|
+
handler = nil
|
72
|
+
case extension.downcase
|
73
|
+
when ".scc"
|
74
|
+
handler = SCC.new(caption_file, translator)
|
75
|
+
when ".srt"
|
76
|
+
handler = SRT.new(caption_file, translator)
|
77
|
+
when ".vtt"
|
78
|
+
handler = VTT.new(caption_file, translator)
|
79
|
+
when ".ttml"
|
80
|
+
handler = TTML.new(caption_file, translator, {:force_detect => options[:force_detect]})
|
81
|
+
when ".dfxp"
|
82
|
+
handler = DFXP.new(caption_file, translator, {:force_detect => options[:force_detect]})
|
83
|
+
else
|
84
|
+
raise "Cannot handle file type .#{extension}"
|
40
85
|
end
|
41
|
-
|
86
|
+
handler
|
42
87
|
end
|
43
|
-
end
|
88
|
+
end
|
data/lib/ttml.rb
ADDED
@@ -0,0 +1,180 @@
|
|
1
|
+
require_relative "engines/translator"
|
2
|
+
require_relative "allfather"
|
3
|
+
|
4
|
+
require "nokogiri"
|
5
|
+
|
6
|
+
#
|
7
|
+
# Library to handle TTML Files
|
8
|
+
#
|
9
|
+
# Uses the translator available to do the necessary language operations
|
10
|
+
# as defined by the AllFather
|
11
|
+
#
|
12
|
+
class TTML
|
13
|
+
|
14
|
+
include AllFather
|
15
|
+
|
16
|
+
def initialize(cc_file, translator, opts={})
|
17
|
+
@cc_file = cc_file
|
18
|
+
@translator = translator
|
19
|
+
@force_detect = opts[:force_detect] || false
|
20
|
+
raise "Invalid TTML file provided" unless is_valid?
|
21
|
+
end
|
22
|
+
|
23
|
+
def is_valid?
|
24
|
+
# Do any TTML specific validations here
|
25
|
+
if @cc_file =~ /^.*\.(ttml)$/
|
26
|
+
return true
|
27
|
+
end
|
28
|
+
# TODO: Check if it's required to do a File read to see if this
|
29
|
+
# a well-formed XML. Another is to see if lang is available in each div
|
30
|
+
return false
|
31
|
+
end
|
32
|
+
|
33
|
+
def infer_languages
|
34
|
+
lang = []
|
35
|
+
begin
|
36
|
+
xml_file = File.open(@cc_file)
|
37
|
+
xml_doc = Nokogiri::XML(xml_file)
|
38
|
+
div_objects = xml_doc.css("/tt/body/div")
|
39
|
+
div_objects.each_with_index do |div, index|
|
40
|
+
# By default, return the lang if specified in the div and
|
41
|
+
# force detect is false
|
42
|
+
inferred_lang = div.attributes['lang'].value rescue nil
|
43
|
+
if inferred_lang.nil?
|
44
|
+
# If lang is not provided in the caption, then override
|
45
|
+
# force detect for inference
|
46
|
+
@force_detect = true
|
47
|
+
end
|
48
|
+
if @force_detect
|
49
|
+
sample_text = get_text(div, 100)
|
50
|
+
inferred_lang = @translator.infer_language(sample_text) rescue nil
|
51
|
+
if inferred_lang.nil?
|
52
|
+
err_msg = "Failed to detect lang for div block number #{index + 1}"
|
53
|
+
unless lang.empty?
|
54
|
+
err_msg += "; Detected languages before failure are #{lang}"
|
55
|
+
end
|
56
|
+
raise AllFather::LangDetectionFailureException.new(err_msg)
|
57
|
+
end
|
58
|
+
end
|
59
|
+
lang << inferred_lang
|
60
|
+
end
|
61
|
+
rescue StandardError => e
|
62
|
+
puts "Error while detecting the language due to #{e.message}"
|
63
|
+
ensure
|
64
|
+
xml_file.close rescue nil
|
65
|
+
end
|
66
|
+
return nil if lang.empty?
|
67
|
+
lang
|
68
|
+
end
|
69
|
+
|
70
|
+
def translate(src_lang, dest_lang, out_file)
|
71
|
+
super(src_lang, dest_lang, out_file)
|
72
|
+
xml_file = File.open(@cc_file, 'r:UTF-8', &:read)
|
73
|
+
xml_doc = Nokogiri::XML(xml_file)
|
74
|
+
div_objects = xml_doc.css("/tt/body/div")
|
75
|
+
# Irrespective of what lang the div xml:lang says, infer the lang and then
|
76
|
+
# check to see if it matches src_lang
|
77
|
+
matched_div = nil
|
78
|
+
div_objects.each do |div|
|
79
|
+
sample_text = get_text(div, 100)
|
80
|
+
inferred_lang = @translator.infer_language(sample_text) rescue nil
|
81
|
+
next if inferred_lang.nil?
|
82
|
+
if inferred_lang.eql?(src_lang)
|
83
|
+
matched_div = div
|
84
|
+
break
|
85
|
+
end
|
86
|
+
end
|
87
|
+
if matched_div.nil?
|
88
|
+
FileUtils.remove_file(out_file)
|
89
|
+
raise AllFather::InvalidInputException.new("Unable to find #{src_lang} language section in TTML")
|
90
|
+
end
|
91
|
+
# Update the Lang in the Div
|
92
|
+
matched_div.lang = dest_lang
|
93
|
+
|
94
|
+
blocks = matched_div.css("p")
|
95
|
+
blocks.each do |block|
|
96
|
+
# Multiple spaces being stripped off
|
97
|
+
text = block.inner_html.strip.gsub(/(\s){2,}/, '')
|
98
|
+
text_blocks = get_block_text(text)
|
99
|
+
translated_text = ""
|
100
|
+
text_blocks.each do |text_block|
|
101
|
+
if text_block.start_with?('<') || text_block.empty?
|
102
|
+
translated_text << text_block
|
103
|
+
next
|
104
|
+
end
|
105
|
+
translated_resp = @translator.translate(text_block, src_lang, dest_lang)
|
106
|
+
translated_text << translated_resp
|
107
|
+
end
|
108
|
+
block.inner_html = translated_text
|
109
|
+
end
|
110
|
+
xml_file.close rescue nil
|
111
|
+
File.write(out_file, xml_doc)
|
112
|
+
out_file
|
113
|
+
end
|
114
|
+
|
115
|
+
private
|
116
|
+
|
117
|
+
#
|
118
|
+
# Method to segregate the data from markups as markups don't need
|
119
|
+
# translations.
|
120
|
+
# For example, if the cue block is of the form
|
121
|
+
# This is a test caption with <span id="1">a test span </span> within a block
|
122
|
+
# This method returns
|
123
|
+
# ["This is a test caption with ", "<span id=\"1\">", "a test span ", "</span>", " within a block"]
|
124
|
+
# as we can infer the markups can be retained as is to avoid translation
|
125
|
+
#
|
126
|
+
def get_block_text(text)
|
127
|
+
data = []
|
128
|
+
tag_start = tag_end = false
|
129
|
+
str_length = text.size
|
130
|
+
text_block = ""
|
131
|
+
markup_block = ""
|
132
|
+
for i in 0...text.size do
|
133
|
+
if text[i] == '<'
|
134
|
+
tag_end = false
|
135
|
+
tag_start = true
|
136
|
+
markup_block << text[i]
|
137
|
+
data << text_block
|
138
|
+
text_block = ""
|
139
|
+
next
|
140
|
+
elsif text[i] == '>'
|
141
|
+
tag_end = true
|
142
|
+
tag_start = false
|
143
|
+
markup_block << text[i]
|
144
|
+
data << markup_block
|
145
|
+
markup_block = ""
|
146
|
+
next
|
147
|
+
end
|
148
|
+
if tag_start && !tag_end
|
149
|
+
markup_block << text[i]
|
150
|
+
else
|
151
|
+
text_block << text[i]
|
152
|
+
end
|
153
|
+
end
|
154
|
+
unless text_block.empty?
|
155
|
+
data << text_block
|
156
|
+
end
|
157
|
+
data
|
158
|
+
end
|
159
|
+
|
160
|
+
#
|
161
|
+
# Method to get a minimal amount of key text that excludes any tags
|
162
|
+
# or control information for the engine to meaningfully and
|
163
|
+
# correctly infer the language being referred to in this TTML
|
164
|
+
#
|
165
|
+
def get_text(div, num_chars)
|
166
|
+
text_sample = ""
|
167
|
+
blocks = div.css("p")
|
168
|
+
blocks.each do |block|
|
169
|
+
# Multiple spaces being stripped off
|
170
|
+
text = block.inner_html.strip.gsub(/(\s){2,}/, '')
|
171
|
+
# Strip off html tags (if any)
|
172
|
+
text = text.gsub(/(<.*?>)/, ' ')
|
173
|
+
text_sample << text
|
174
|
+
if text_sample.length > (num_chars + 1)
|
175
|
+
break
|
176
|
+
end
|
177
|
+
end
|
178
|
+
return text_sample[0, num_chars]
|
179
|
+
end
|
180
|
+
end
|
data/lib/vtt.rb
CHANGED
@@ -1,81 +1,115 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
require_relative "engines/translator"
|
2
|
+
require_relative "allfather"
|
3
3
|
|
4
|
+
#
|
5
|
+
# Library to handle VTT Files
|
6
|
+
#
|
7
|
+
# Uses the translator available to do the necessary language operations
|
8
|
+
# as defined by the AllFather
|
9
|
+
#
|
4
10
|
class VTT
|
5
|
-
|
6
|
-
|
7
|
-
|
11
|
+
|
12
|
+
include AllFather
|
13
|
+
|
14
|
+
def initialize(cc_file, translator)
|
15
|
+
@cc_file = cc_file
|
16
|
+
@translator = translator
|
17
|
+
raise "Invalid VTT file provided" unless is_valid?
|
8
18
|
end
|
9
19
|
|
10
|
-
def
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
20
|
+
def translate(src_lang, dest_lang, out_file)
|
21
|
+
super(src_lang, dest_lang, out_file)
|
22
|
+
begin
|
23
|
+
ccfile = File.open(@cc_file, 'r:UTF-8', &:read)
|
24
|
+
outfile = File.open(out_file, "w")
|
25
|
+
text_collection = false
|
26
|
+
text_sample = ""
|
27
|
+
ccfile.each_line do | line |
|
28
|
+
if line =~ /^(\d\d:)\d\d:\d\d[,.]\d\d\d.*-->.*(\d\d:)\d\d:\d\d[,.]\d\d\d/
|
29
|
+
text_collection = true
|
30
|
+
outfile.puts line
|
31
|
+
elsif line.strip.empty? && !text_sample.empty?
|
32
|
+
json_text = JSON.parse(text_sample) rescue nil
|
33
|
+
if json_text.nil?
|
34
|
+
trans_resp = @translator.translate(text_sample, src_lang, dest_lang)
|
35
|
+
outfile.puts trans_resp
|
36
|
+
outfile.puts
|
37
|
+
else
|
38
|
+
outfile.puts text_sample
|
39
|
+
outfile.puts
|
40
|
+
end
|
41
|
+
text_sample = ""
|
42
|
+
text_collection = false
|
43
|
+
elsif text_collection
|
44
|
+
text_sample << line
|
25
45
|
else
|
26
|
-
outfile.puts
|
27
|
-
outfile.puts
|
46
|
+
outfile.puts line
|
28
47
|
end
|
29
|
-
text_sample = ""
|
30
|
-
text_collection = false
|
31
|
-
elsif text_collection
|
32
|
-
text_sample << line
|
33
|
-
else
|
34
|
-
outfile.puts line
|
35
48
|
end
|
36
|
-
next
|
37
|
-
end
|
38
49
|
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
50
|
+
if !text_sample.empty?
|
51
|
+
trans_resp = @translator.translate(text_sample, src_lang, dest_lang)
|
52
|
+
outfile.puts trans_resp
|
53
|
+
outfile.puts
|
54
|
+
end
|
55
|
+
ensure
|
56
|
+
ccfile.close rescue nil
|
43
57
|
outfile.close
|
44
58
|
end
|
45
59
|
end
|
46
60
|
|
61
|
+
#
|
62
|
+
# Returns the inferred language in an array
|
63
|
+
#
|
64
|
+
def infer_languages
|
65
|
+
lang = nil
|
66
|
+
begin
|
67
|
+
sample_text = get_text(@cc_file, 100)
|
68
|
+
lang = @translator.infer_language(sample_text)
|
69
|
+
rescue StandardError => e
|
70
|
+
puts "Error while detecting the language due to #{e.message}"
|
71
|
+
end
|
72
|
+
[lang]
|
73
|
+
end
|
47
74
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
text_collection = true
|
56
|
-
elsif line.strip.empty?
|
57
|
-
text_collection = false
|
58
|
-
elsif text_collection && text_sample.length < (num_chars+1)
|
59
|
-
text_sample << line
|
60
|
-
end
|
61
|
-
break if text_sample.length > (num_chars+1)
|
62
|
-
next
|
75
|
+
#
|
76
|
+
# Method to add required set of validations specific to caption type
|
77
|
+
#
|
78
|
+
def is_valid?
|
79
|
+
# Do any VTT specific validations here
|
80
|
+
if @cc_file =~ /^.*\.(vtt)$/
|
81
|
+
return true
|
63
82
|
end
|
64
|
-
|
83
|
+
# TODO: Check if it's required to do a File read to see if the 1st line is WEBVTT
|
84
|
+
# to handle cases where invalid file is named with vtt extension
|
85
|
+
return false
|
65
86
|
end
|
66
87
|
|
67
|
-
|
68
|
-
|
88
|
+
private
|
89
|
+
|
90
|
+
#
|
91
|
+
# Method to get a minimal amount of key text that excludes any tags
|
92
|
+
# or control information for the engine to meaningfully and
|
93
|
+
# correctly infer the language being referred to in this VTT
|
94
|
+
#
|
95
|
+
def get_text(vtt_file, num_chars)
|
69
96
|
begin
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
97
|
+
ccfile = File.open(vtt_file, 'r:UTF-8', &:read)
|
98
|
+
text_collection = false
|
99
|
+
text_sample = ""
|
100
|
+
ccfile.each_line do |line|
|
101
|
+
if line =~ /^(\d\d:)\d\d:\d\d[,.]\d\d\d.*-->.*(\d\d:)\d\d:\d\d[,.]\d\d\d/
|
102
|
+
text_collection = true
|
103
|
+
elsif line.strip.empty?
|
104
|
+
text_collection = false
|
105
|
+
elsif text_collection && text_sample.length < (num_chars + 1)
|
106
|
+
text_sample << line
|
107
|
+
end
|
108
|
+
break if text_sample.length > (num_chars + 1)
|
109
|
+
end
|
110
|
+
ensure
|
111
|
+
ccfile.close rescue nil
|
77
112
|
end
|
78
|
-
|
113
|
+
return text_sample[0, num_chars]
|
79
114
|
end
|
80
|
-
|
81
115
|
end
|
metadata
CHANGED
@@ -1,14 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: subtitle
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Maheshwaran G
|
8
|
+
- Arunjeyaprasad A J
|
8
9
|
autorequire:
|
9
10
|
bindir: bin
|
10
11
|
cert_chain: []
|
11
|
-
date: 2019-10-
|
12
|
+
date: 2019-10-31 00:00:00.000000000 Z
|
12
13
|
dependencies:
|
13
14
|
- !ruby/object:Gem::Dependency
|
14
15
|
name: bundler
|
@@ -25,43 +26,37 @@ dependencies:
|
|
25
26
|
- !ruby/object:Gem::Version
|
26
27
|
version: '2.0'
|
27
28
|
- !ruby/object:Gem::Dependency
|
28
|
-
name: aws-sdk
|
29
|
+
name: aws-sdk
|
29
30
|
requirement: !ruby/object:Gem::Requirement
|
30
31
|
requirements:
|
31
|
-
- - "
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: '0'
|
34
|
-
type: :runtime
|
35
|
-
prerelease: false
|
36
|
-
version_requirements: !ruby/object:Gem::Requirement
|
37
|
-
requirements:
|
38
|
-
- - ">="
|
39
|
-
- !ruby/object:Gem::Version
|
40
|
-
version: '0'
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: aws-sdk-translate
|
43
|
-
requirement: !ruby/object:Gem::Requirement
|
44
|
-
requirements:
|
45
|
-
- - ">="
|
32
|
+
- - "~>"
|
46
33
|
- !ruby/object:Gem::Version
|
47
|
-
version: '
|
48
|
-
type: :
|
34
|
+
version: '2.11'
|
35
|
+
type: :development
|
49
36
|
prerelease: false
|
50
37
|
version_requirements: !ruby/object:Gem::Requirement
|
51
38
|
requirements:
|
52
|
-
- - "
|
39
|
+
- - "~>"
|
53
40
|
- !ruby/object:Gem::Version
|
54
|
-
version: '
|
55
|
-
description:
|
41
|
+
version: '2.11'
|
42
|
+
description: Subtitle gem helps you to detect language and translate closed caption
|
43
|
+
to required language.
|
56
44
|
email:
|
57
45
|
- pgmaheshwaran@gmail.com
|
46
|
+
- arunjeyaprasad@gmail.com
|
58
47
|
executables: []
|
59
48
|
extensions: []
|
60
49
|
extra_rdoc_files: []
|
61
50
|
files:
|
51
|
+
- lib/allfather.rb
|
52
|
+
- lib/dfxp.rb
|
53
|
+
- lib/engines/aws.rb
|
54
|
+
- lib/engines/gcp.rb
|
55
|
+
- lib/engines/translator.rb
|
62
56
|
- lib/scc.rb
|
63
57
|
- lib/srt.rb
|
64
58
|
- lib/subtitle.rb
|
59
|
+
- lib/ttml.rb
|
65
60
|
- lib/vtt.rb
|
66
61
|
homepage: https://github.com/cloudaffair/subtitle
|
67
62
|
licenses:
|
@@ -85,9 +80,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
85
80
|
version: '0'
|
86
81
|
requirements: []
|
87
82
|
rubyforge_project:
|
88
|
-
rubygems_version: 2.
|
83
|
+
rubygems_version: 2.5.1
|
89
84
|
signing_key:
|
90
85
|
specification_version: 4
|
91
|
-
summary:
|
92
|
-
language
|
86
|
+
summary: Subtitle gem helps you to detect language and translate closed caption to
|
87
|
+
required language
|
93
88
|
test_files: []
|