subtitle 0.1.8 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/lib/allfather.rb +83 -0
- data/lib/dfxp.rb +30 -0
- data/lib/engines/aws.rb +102 -0
- data/lib/engines/gcp.rb +0 -0
- data/lib/engines/translator.rb +58 -0
- data/lib/scc.rb +43 -23
- data/lib/srt.rb +86 -61
- data/lib/subtitle.rb +72 -27
- data/lib/ttml.rb +180 -0
- data/lib/vtt.rb +95 -61
- metadata +21 -26
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: f603ac76acbb145807944c0f948d6550eee197cc
|
4
|
+
data.tar.gz: 7d06b0e8ee047ab1790237fca1c478da03725541
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 204c3af4231e25e6caaa198e9a8b7d46b4f917afcc8abdbce27bccbb94908d28d21c1b0318aa36a1ba63c83c064963e24c7398a99a0cd183c3ca10568fb6fe34
|
7
|
+
data.tar.gz: fb867912d76f039abf21fd1495c0f9b9139a594cde028f1d50184bd15d007a59113366507b7a85f1039ce0c638d423fbd7cff99d22792f83af59ccede48167dc
|
data/lib/allfather.rb
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
#
|
2
|
+
# A Module that kind of acts as an interface where the generic methods
|
3
|
+
# that applies to each caption type can be defined
|
4
|
+
#
|
5
|
+
# To use for a new caption type, simply include this module and provide
|
6
|
+
# caption specific implementations
|
7
|
+
#
|
8
|
+
module AllFather
|
9
|
+
|
10
|
+
#
|
11
|
+
# Valid file extensions that we support; Keep expanding as we grow
|
12
|
+
#
|
13
|
+
VALID_FILES = [".scc", ".srt", ".vtt", ".ttml", ".dfxp"]
|
14
|
+
|
15
|
+
#
|
16
|
+
# Generic exception class that is raised for validation errors
|
17
|
+
#
|
18
|
+
class InvalidInputException < StandardError; end
|
19
|
+
|
20
|
+
#
|
21
|
+
# Lang inference failure exception
|
22
|
+
#
|
23
|
+
class LangDetectionFailureException < StandardError; end
|
24
|
+
|
25
|
+
#
|
26
|
+
# Method to do basic validations like is this a valid file to even
|
27
|
+
# accept for any future transactions
|
28
|
+
#
|
29
|
+
# ==== Returns:
|
30
|
+
# true if the file is valid and false otherwise
|
31
|
+
#
|
32
|
+
def is_valid?
|
33
|
+
raise "Not Implemented. Class #{self.class.name} doesn't implement is_valid?"
|
34
|
+
end
|
35
|
+
|
36
|
+
#
|
37
|
+
# Method to infer the language(s) of the caption by inspecting the file
|
38
|
+
# depending on the type of the caption file
|
39
|
+
#
|
40
|
+
# ==== Returns
|
41
|
+
#
|
42
|
+
# * The ISO 639-1 Letter Language codes
|
43
|
+
#
|
44
|
+
def infer_languages
|
45
|
+
raise "Not Implemented. Class #{self.class.name} doesn't implement infer_languages"
|
46
|
+
end
|
47
|
+
|
48
|
+
#
|
49
|
+
# Method to translate the caption from one language to another
|
50
|
+
#
|
51
|
+
# :args: src_lang, target_lang, output_file
|
52
|
+
#
|
53
|
+
# * +input_caption+ - A Valid input caption file. Refer to #is_valid?
|
54
|
+
# * +src_lang+ - can be inferred using #infer_languages method
|
55
|
+
# * +target_lang+ - Target 2 letter ISO language code to which the source needs to be translated in to.
|
56
|
+
# * +output_file+ - Output file. Can be a fully qualified path or just file name
|
57
|
+
#
|
58
|
+
# ==== Raises
|
59
|
+
#
|
60
|
+
# InvalidInputException shall be raised if
|
61
|
+
# 1. The input file doesn't exist or is unreadable or is invalid caption
|
62
|
+
# 2. The output file can't be written
|
63
|
+
# 3. The target_lang is not a valid ISO 639-1 Letter Language code
|
64
|
+
#
|
65
|
+
def translate(src_lang, target_lang, output_file)
|
66
|
+
# Check if a non empty output file is present and error out to avoid
|
67
|
+
# the danger of overwriting some important file !!
|
68
|
+
if File.exists?(output_file) && File.size(output_file) > 0
|
69
|
+
raise InvalidInputException.new("Output file #{output_file} is not empty.")
|
70
|
+
else
|
71
|
+
# Just open the file in writable mode and close it just to ensure that
|
72
|
+
# we can write the output file
|
73
|
+
File.open(output_file, "w") {|f|
|
74
|
+
}
|
75
|
+
end
|
76
|
+
# Check if the file is writable ?
|
77
|
+
unless File.writable?(output_file)
|
78
|
+
raise InvalidInputException.new("Output file #{output_file} not writable.")
|
79
|
+
end
|
80
|
+
# Further checks can be done only in caption specific implementations
|
81
|
+
# or translation engine specific implementation
|
82
|
+
end
|
83
|
+
end
|
data/lib/dfxp.rb
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
require_relative "engines/translator"
|
2
|
+
require_relative "allfather"
|
3
|
+
require_relative "ttml"
|
4
|
+
|
5
|
+
#
|
6
|
+
# Library to handle DFXP Files
|
7
|
+
#
|
8
|
+
# Uses the translator available to do the necessary language operations
|
9
|
+
# as defined by the AllFather
|
10
|
+
#
|
11
|
+
class DFXP < TTML
|
12
|
+
|
13
|
+
def initialize(cc_file, translator, opts={})
|
14
|
+
@cc_file = cc_file
|
15
|
+
@translator = translator
|
16
|
+
@force_detect = opts[:force_detect] || false
|
17
|
+
raise "Invalid TTML file provided" unless is_valid?
|
18
|
+
end
|
19
|
+
|
20
|
+
def is_valid?
|
21
|
+
# Do any DFXP specific validations here
|
22
|
+
if @cc_file =~ /^.*\.(dfxp)$/
|
23
|
+
return true
|
24
|
+
end
|
25
|
+
# TODO: Check if it's required to do a File read to see if this
|
26
|
+
# is a well-formed XML. Another is to see if lang is available in each div
|
27
|
+
return false
|
28
|
+
end
|
29
|
+
|
30
|
+
end
|
data/lib/engines/aws.rb
ADDED
@@ -0,0 +1,102 @@
|
|
1
|
+
require 'aws-sdk'
|
2
|
+
require 'aws-sdk'
|
3
|
+
require_relative 'translator'
|
4
|
+
|
5
|
+
#
|
6
|
+
# Provides Language services using Amazon Translate
|
7
|
+
#
|
8
|
+
# Module can be initialized using multiple options
|
9
|
+
#
|
10
|
+
# == Credential Referencing Order
|
11
|
+
#
|
12
|
+
# * [Arguments] - Pass the credentials access_key_id and secret_access_key as arguments
|
13
|
+
# * [Environment route] - AWS_ACCESS_KEY_ID & AWS_SECRET_ACCESS_KEY can be exposed as
|
14
|
+
# environment variables
|
15
|
+
# * [Profile Name] - The application uses the credentials of the system and picks the
|
16
|
+
# credentials referred to by the profile
|
17
|
+
#
|
18
|
+
class AwsEngine
|
19
|
+
include Translator
|
20
|
+
|
21
|
+
DEFAULT_REGION = ENV["AWS_DEFAULT_REGION"] || "us-east-1"
|
22
|
+
|
23
|
+
#
|
24
|
+
# :args: options
|
25
|
+
#
|
26
|
+
# ==== Arguments
|
27
|
+
# options can carry the following details
|
28
|
+
#
|
29
|
+
# * [:access_key_id] - access key id
|
30
|
+
# * [:secret_access_key] - Secret access key
|
31
|
+
# * [:env] - true for using credentials from environment variables
|
32
|
+
# * [:profile] - profile name for using shared credentials setup
|
33
|
+
# * [:region] - If not provided defaults to us-east-1
|
34
|
+
#
|
35
|
+
# ==== raises
|
36
|
+
#
|
37
|
+
# * EngineInitializationException if credentials cannot be setup due to lack of details
|
38
|
+
# * Aws Exceptions if profile name is invalid or invalid credentials are passed
|
39
|
+
#
|
40
|
+
def initialize(options)
|
41
|
+
access_key_id = nil
|
42
|
+
secret_access_key = nil
|
43
|
+
@region = options[:region] || DEFAULT_REGION
|
44
|
+
if options[:env]
|
45
|
+
access_key_id = ENV["AWS_ACCESS_KEY_ID"]
|
46
|
+
secret_access_key = ENV["AWS_SECRET_ACCESS_KEY"]
|
47
|
+
elsif options[:access_key_id] && options[:secret_access_key]
|
48
|
+
access_key_id = options[:access_key_id]
|
49
|
+
secret_access_key = options[:secret_access_key]
|
50
|
+
end
|
51
|
+
if access_key_id && secret_access_key
|
52
|
+
Aws.config.update({
|
53
|
+
region: options[:region] || DEFAULT_REGION,
|
54
|
+
credentials: Aws::Credentials.new(access_key_id, secret_access_key)
|
55
|
+
})
|
56
|
+
elsif options[:profile]
|
57
|
+
credentials = Aws::SharedCredentials.new(profile_name: options[:profile])
|
58
|
+
Aws.config.update({
|
59
|
+
region: @region,
|
60
|
+
credentials: credentials.credentials
|
61
|
+
})
|
62
|
+
else
|
63
|
+
raise Translator::EngineInitializationException.new(
|
64
|
+
"Failed to initialize Aws Engine. Credentials are missing / not provided")
|
65
|
+
end
|
66
|
+
@translate_service = Aws::Translate::Client.new(region: @region)
|
67
|
+
@comprehend_service = Aws::Comprehend::Client.new(region: @region)
|
68
|
+
end
|
69
|
+
|
70
|
+
#
|
71
|
+
# Invokes the language detection API of AWS and returns only the language
|
72
|
+
# of the highest score and returns the ISO 639-1 code
|
73
|
+
#
|
74
|
+
# :args: text
|
75
|
+
#
|
76
|
+
# ===== Arguments
|
77
|
+
# * +text+ - The text for which the language is to be inferred
|
78
|
+
#
|
79
|
+
def infer_language(text)
|
80
|
+
response = @comprehend_service.detect_dominant_language({ text: "#{text}" })
|
81
|
+
response[:languages][0][:language_code]
|
82
|
+
end
|
83
|
+
|
84
|
+
#
|
85
|
+
# Invokes the translation API of AWS and returns the translated text
|
86
|
+
# as per the arguments provided
|
87
|
+
# Will Raise exception if a translation cannot be made between the source
|
88
|
+
# and target language codes or if the lang code is invalid
|
89
|
+
#
|
90
|
+
# :args: input_text, src_lang, target_lang
|
91
|
+
#
|
92
|
+
# * +input_text+ - The text that needs to be translated
|
93
|
+
# * +src_lang+ - The source language of the text
|
94
|
+
# * +target_lang+ - The target language to which the input_text needs to be translated to
|
95
|
+
#
|
96
|
+
def translate(input_text, src_lang, target_lang)
|
97
|
+
response = @translate_service.translate_text({ :text => "#{input_text}" ,
|
98
|
+
:source_language_code => "#{src_lang}", :target_language_code => "#{target_lang}"})
|
99
|
+
response.translated_text
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
data/lib/engines/gcp.rb
ADDED
File without changes
|
@@ -0,0 +1,58 @@
|
|
1
|
+
#
|
2
|
+
# A Module that kind of acts as an interface where the methods
|
3
|
+
# expected out of each vendor is encapsulated into
|
4
|
+
#
|
5
|
+
# To use for a new vendor, simply include this module and provide
|
6
|
+
# vendor specific implementations
|
7
|
+
#
|
8
|
+
module Translator
|
9
|
+
|
10
|
+
#
|
11
|
+
# Constants For Engines
|
12
|
+
ENGINE_AWS = 1
|
13
|
+
ENGINE_GCP = 2
|
14
|
+
|
15
|
+
#
|
16
|
+
# Keys for each Engine
|
17
|
+
AWS_KEYS = [:access_key_id, :secret_access_key, :profile]
|
18
|
+
GCP_KEYS = [:api_key, :project_id, :creds_path]
|
19
|
+
|
20
|
+
ENGINE_KEYS = {ENGINE_AWS => AWS_KEYS, ENGINE_GCP => GCP_KEYS}
|
21
|
+
#
|
22
|
+
# This exception shall be raised when we fail to initialize an
|
23
|
+
# engine for the purposes of language detection / translation
|
24
|
+
#
|
25
|
+
# ==== Example
|
26
|
+
# * When credentials are not passed
|
27
|
+
#
|
28
|
+
class EngineInitializationException < StandardError; end
|
29
|
+
|
30
|
+
#
|
31
|
+
# Method to infer the language by inspecting the text
|
32
|
+
# passed as argument
|
33
|
+
#
|
34
|
+
# :args: text
|
35
|
+
#
|
36
|
+
# * +text+ - String whose language needs to be inferred
|
37
|
+
#
|
38
|
+
# ==== Returns
|
39
|
+
#
|
40
|
+
# * The ISO 639-1 Letter Language code
|
41
|
+
#
|
42
|
+
def infer_language(text)
|
43
|
+
raise "Not Implemented. Class #{self.class.name} doesn't implement infer_language"
|
44
|
+
end
|
45
|
+
|
46
|
+
#
|
47
|
+
# Method to translate from given language to another
|
48
|
+
#
|
49
|
+
# :args: input_text, src_lang, target_lang
|
50
|
+
#
|
51
|
+
# * +input_text+ - Text which needs to be translated
|
52
|
+
# * +src_lang+ - can be inferred using #infer_language method
|
53
|
+
# * +target_lang+ - Target 2 letter ISO language code to which the source needs to be translated in to.
|
54
|
+
#
|
55
|
+
def translate(input_text, src_lang, target_lang)
|
56
|
+
raise "Not Implemented. Class #{self.class.name} doesn't implement translate"
|
57
|
+
end
|
58
|
+
end
|
data/lib/scc.rb
CHANGED
@@ -1,13 +1,47 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
require_relative "engines/translator"
|
2
|
+
require_relative "allfather"
|
3
3
|
|
4
|
+
#
|
5
|
+
# Library to handle SCC Files
|
6
|
+
#
|
7
|
+
# Uses the translator available to do the necessary language operations
|
8
|
+
# as defined by the AllFather
|
9
|
+
#
|
4
10
|
class SCC
|
5
11
|
|
6
|
-
|
7
|
-
|
8
|
-
|
12
|
+
include AllFather
|
13
|
+
|
14
|
+
def initialize(cc_file, translator)
|
15
|
+
@cc_file = cc_file
|
16
|
+
@translator = translator
|
17
|
+
raise "Invalid SCC file provided" unless is_valid?
|
18
|
+
end
|
19
|
+
|
20
|
+
def is_valid?
|
21
|
+
# Do any SCC specific validations here
|
22
|
+
if @cc_file =~ /^.*\.(scc)$/
|
23
|
+
return true
|
24
|
+
end
|
25
|
+
return false
|
26
|
+
end
|
27
|
+
|
28
|
+
def infer_languages
|
29
|
+
lang = nil
|
30
|
+
begin
|
31
|
+
sample_text = get_text(@cc_file, 100)
|
32
|
+
lang = @translator.infer_language(sample_text)
|
33
|
+
rescue StandardError => e
|
34
|
+
puts "Error while detecting the language due to #{e.message}"
|
35
|
+
end
|
36
|
+
lang
|
9
37
|
end
|
10
38
|
|
39
|
+
def translate(src_lang, dest_lang, out_file)
|
40
|
+
raise "Not Implemented. Class #{self.class.name} doesn't implement translate yet !!"
|
41
|
+
end
|
42
|
+
|
43
|
+
private
|
44
|
+
|
11
45
|
def get_text(srt_file, num_chars)
|
12
46
|
ccfile = File.open(srt_file, 'r:UTF-8', &:read)
|
13
47
|
text_sample = ""
|
@@ -15,12 +49,12 @@ class SCC
|
|
15
49
|
if line =~ /^\d\d:\d\d:\d\d:\d\d\s/
|
16
50
|
scc_text_code = line.gsub(/^\d\d:\d\d:\d\d:\d\d\s/, '')
|
17
51
|
text_sample << decode(scc_text_code)
|
18
|
-
if text_sample.length > (num_chars+1)
|
52
|
+
if text_sample.length > (num_chars + 1)
|
19
53
|
break
|
20
54
|
end
|
21
55
|
end
|
22
56
|
end
|
23
|
-
return text_sample[0,num_chars]
|
57
|
+
return text_sample[0, num_chars]
|
24
58
|
end
|
25
59
|
|
26
60
|
def decode(scc_code_text)
|
@@ -31,7 +65,7 @@ class SCC
|
|
31
65
|
hex_codes.each do | code |
|
32
66
|
if ["94", "91", "92", "97", "15", "16", "10", "13"].include?(code)
|
33
67
|
skip_next = true
|
34
|
-
skip_count = skip_count +1
|
68
|
+
skip_count = skip_count + 1
|
35
69
|
next
|
36
70
|
end
|
37
71
|
if skip_count == 1 && skip_next
|
@@ -60,18 +94,4 @@ class SCC
|
|
60
94
|
end
|
61
95
|
encoded_str
|
62
96
|
end
|
63
|
-
|
64
|
-
def detect_lang(scc_file)
|
65
|
-
lang = nil
|
66
|
-
begin
|
67
|
-
sample_text = get_text(scc_file, 100)
|
68
|
-
response = @comp.detect_dominant_language( {
|
69
|
-
text: "#{sample_text}"
|
70
|
-
})
|
71
|
-
lang = response[:languages][0][:language_code] rescue nil
|
72
|
-
rescue => error
|
73
|
-
puts "Error while detecting the language!!"
|
74
|
-
end
|
75
|
-
lang
|
76
|
-
end
|
77
|
-
end
|
97
|
+
end
|
data/lib/srt.rb
CHANGED
@@ -1,81 +1,106 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
require_relative "engines/translator"
|
2
|
+
require_relative "allfather"
|
3
3
|
|
4
|
+
#
|
5
|
+
# Library to handle SRT Files
|
6
|
+
#
|
7
|
+
# Uses the translator available to do the necessary language operations
|
8
|
+
# as defined by the AllFather
|
9
|
+
#
|
4
10
|
class SRT
|
5
|
-
|
6
|
-
|
7
|
-
|
11
|
+
|
12
|
+
include AllFather
|
13
|
+
|
14
|
+
def initialize(cc_file, translator)
|
15
|
+
@cc_file = cc_file
|
16
|
+
@translator = translator
|
17
|
+
raise "Invalid SRT file provided" unless is_valid?
|
18
|
+
end
|
19
|
+
|
20
|
+
def is_valid?
|
21
|
+
# Do any SRT specific validations here
|
22
|
+
if @cc_file =~ /^.*\.(srt)$/
|
23
|
+
return true
|
24
|
+
end
|
25
|
+
return false
|
8
26
|
end
|
9
27
|
|
10
|
-
def
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
28
|
+
def translate(src_lang, dest_lang, out_file)
|
29
|
+
super(src_lang, dest_lang, out_file)
|
30
|
+
begin
|
31
|
+
ccfile = File.open(@cc_file, 'r:UTF-8', &:read)
|
32
|
+
outfile = File.open(out_file, "w")
|
33
|
+
text_collection = false
|
34
|
+
text_sample = ""
|
35
|
+
ccfile.each_line do | line |
|
36
|
+
if line =~ /^(\d\d:)\d\d:\d\d[,.]\d\d\d.*-->.*(\d\d:)\d\d:\d\d[,.]\d\d\d/
|
37
|
+
text_collection = true
|
38
|
+
outfile.puts line
|
39
|
+
elsif line.strip.empty? && !text_sample.empty?
|
40
|
+
json_text = JSON.parse(text_sample) rescue nil
|
41
|
+
if json_text.nil?
|
42
|
+
trans_resp = @translator.translate(text_sample, src_lang, dest_lang)
|
43
|
+
outfile.puts trans_resp
|
44
|
+
else
|
45
|
+
outfile.puts text_sample
|
46
|
+
end
|
24
47
|
outfile.puts
|
48
|
+
text_sample = ""
|
49
|
+
text_collection = false
|
50
|
+
elsif text_collection
|
51
|
+
text_sample << line
|
25
52
|
else
|
26
|
-
outfile.puts
|
27
|
-
outfile.puts
|
53
|
+
outfile.puts line
|
28
54
|
end
|
29
|
-
text_sample = ""
|
30
|
-
text_collection = false
|
31
|
-
elsif text_collection
|
32
|
-
text_sample << line
|
33
|
-
else
|
34
|
-
outfile.puts line
|
35
55
|
end
|
36
|
-
next
|
37
|
-
end
|
38
56
|
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
57
|
+
if !text_sample.empty?
|
58
|
+
trans_resp = @translator.translate(text_sample, src_lang, dest_lang)
|
59
|
+
outfile.puts trans_resp
|
60
|
+
outfile.puts
|
61
|
+
end
|
62
|
+
ensure
|
63
|
+
ccfile.close rescue nil
|
43
64
|
outfile.close
|
44
65
|
end
|
45
66
|
end
|
46
67
|
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
if line =~ /^(\d\d:)\d\d:\d\d[,.]\d\d\d.*-->.*(\d\d:)\d\d:\d\d[,.]\d\d\d/
|
55
|
-
text_collection = true
|
56
|
-
elsif line.strip.empty?
|
57
|
-
text_collection = false
|
58
|
-
elsif text_collection && text_sample.length < (num_chars+1)
|
59
|
-
text_sample << line
|
60
|
-
end
|
61
|
-
break if text_sample.length > (num_chars+1)
|
62
|
-
next
|
68
|
+
def infer_languages
|
69
|
+
lang = nil
|
70
|
+
begin
|
71
|
+
sample_text = get_text(@cc_file, 100)
|
72
|
+
lang = @translator.infer_language(sample_text)
|
73
|
+
rescue StandardError => e
|
74
|
+
puts "Error while detecting the language due to #{e.message}"
|
63
75
|
end
|
64
|
-
|
76
|
+
[lang]
|
65
77
|
end
|
66
78
|
|
67
|
-
|
68
|
-
|
79
|
+
private
|
80
|
+
|
81
|
+
#
|
82
|
+
# Method to get a minimal amount of key text that excludes any tags
|
83
|
+
# or control information for the engine to meaningfully and
|
84
|
+
# correctly infer the language being referred to in this SRT
|
85
|
+
#
|
86
|
+
def get_text(srt_file, num_chars)
|
69
87
|
begin
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
88
|
+
ccfile = File.open(srt_file, 'r:UTF-8', &:read)
|
89
|
+
text_collection = false
|
90
|
+
text_sample = ""
|
91
|
+
ccfile.each_line do |line|
|
92
|
+
if line =~ /^(\d\d:)\d\d:\d\d[,.]\d\d\d.*-->.*(\d\d:)\d\d:\d\d[,.]\d\d\d/
|
93
|
+
text_collection = true
|
94
|
+
elsif line.strip.empty?
|
95
|
+
text_collection = false
|
96
|
+
elsif text_collection && text_sample.length < (num_chars + 1)
|
97
|
+
text_sample << line
|
98
|
+
end
|
99
|
+
break if text_sample.length > (num_chars + 1)
|
100
|
+
end
|
101
|
+
ensure
|
102
|
+
ccfile.close rescue nil
|
77
103
|
end
|
78
|
-
|
104
|
+
return text_sample[0, num_chars]
|
79
105
|
end
|
80
|
-
|
81
106
|
end
|
data/lib/subtitle.rb
CHANGED
@@ -1,43 +1,88 @@
|
|
1
|
-
|
1
|
+
require_relative "srt"
|
2
|
+
require_relative "vtt"
|
3
|
+
require_relative "scc"
|
4
|
+
require_relative "ttml"
|
5
|
+
require_relative "dfxp"
|
6
|
+
require_relative "allfather"
|
7
|
+
require_relative "engines/translator"
|
8
|
+
require_relative "engines/aws"
|
9
|
+
|
2
10
|
|
3
11
|
class Subtitle
|
4
|
-
def initialize(
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
raise "Incorrect File extension"
|
11
|
-
end
|
12
|
-
begin
|
13
|
-
@srt_parser = SRT.new(awskey, awssecret)
|
14
|
-
rescue
|
15
|
-
raise "Could not initialize Parser!!. Check the Keys supplied."
|
16
|
-
end
|
12
|
+
def initialize(options={})
|
13
|
+
# Infer the caption handler from the extension
|
14
|
+
@cc_file = options[:cc_file]
|
15
|
+
raise "Input caption not provided. Please provide the same in :cc_file option" if @cc_file.nil?
|
16
|
+
translator = get_translator(options)
|
17
|
+
@handler = get_caption_handler(options, translator)
|
17
18
|
end
|
18
19
|
|
19
20
|
def detect_language
|
20
|
-
|
21
|
-
detected_lang
|
21
|
+
@handler.infer_languages
|
22
22
|
end
|
23
23
|
|
24
|
-
def
|
24
|
+
def translate(dest_lang, src_lang = nil, outfile = nil)
|
25
25
|
if outfile.nil?
|
26
|
-
outfile = "#{@
|
26
|
+
outfile = "#{@cc_file}_#{dest_lang}"
|
27
27
|
end
|
28
28
|
if src_lang.nil?
|
29
|
-
src_lang = detect_language
|
30
|
-
raise "
|
29
|
+
src_lang = detect_language[0] rescue nil
|
30
|
+
raise "Could not detect Source Language!!" if src_lang.nil?
|
31
31
|
end
|
32
|
-
@
|
32
|
+
@handler.translate(src_lang, dest_lang, outfile)
|
33
33
|
outfile
|
34
34
|
end
|
35
35
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
36
|
+
private
|
37
|
+
|
38
|
+
def get_translator(options)
|
39
|
+
translator = nil
|
40
|
+
# Try to infer the engine based on the passed options
|
41
|
+
engine = options[:engine]
|
42
|
+
unless engine
|
43
|
+
engine_props = Translator::ENGINE_KEYS
|
44
|
+
engine_props.each do |k, values|
|
45
|
+
original_size = values.size
|
46
|
+
diff = values - options.keys
|
47
|
+
if diff.size < original_size
|
48
|
+
# We have some keys for this engine in options
|
49
|
+
engine = k
|
50
|
+
break
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
case engine
|
55
|
+
when Translator::ENGINE_AWS
|
56
|
+
translator = AwsEngine.new(options)
|
57
|
+
when Translator::ENGINE_GCP
|
58
|
+
raise "GCP is yet to be implemented"
|
59
|
+
else
|
60
|
+
raise "Unable to infer the Translation Engine. Options missing key credential params"
|
61
|
+
end
|
62
|
+
translator
|
63
|
+
end
|
64
|
+
|
65
|
+
def get_caption_handler(options, translator)
|
66
|
+
caption_file = options[:cc_file]
|
67
|
+
extension = File.extname(caption_file)
|
68
|
+
unless AllFather::VALID_FILES.include?(extension)
|
69
|
+
raise "Caption support for #{caption_file} of type #{extension} is not supported yet"
|
70
|
+
end
|
71
|
+
handler = nil
|
72
|
+
case extension.downcase
|
73
|
+
when ".scc"
|
74
|
+
handler = SCC.new(caption_file, translator)
|
75
|
+
when ".srt"
|
76
|
+
handler = SRT.new(caption_file, translator)
|
77
|
+
when ".vtt"
|
78
|
+
handler = VTT.new(caption_file, translator)
|
79
|
+
when ".ttml"
|
80
|
+
handler = TTML.new(caption_file, translator, {:force_detect => options[:force_detect]})
|
81
|
+
when ".dfxp"
|
82
|
+
handler = DFXP.new(caption_file, translator, {:force_detect => options[:force_detect]})
|
83
|
+
else
|
84
|
+
raise "Cannot handle file type .#{extension}"
|
40
85
|
end
|
41
|
-
|
86
|
+
handler
|
42
87
|
end
|
43
|
-
end
|
88
|
+
end
|
data/lib/ttml.rb
ADDED
@@ -0,0 +1,180 @@
|
|
1
|
+
require_relative "engines/translator"
|
2
|
+
require_relative "allfather"
|
3
|
+
|
4
|
+
require "nokogiri"
|
5
|
+
|
6
|
+
#
|
7
|
+
# Library to handle TTML Files
|
8
|
+
#
|
9
|
+
# Uses the translator available to do the necessary language operations
|
10
|
+
# as defined by the AllFather
|
11
|
+
#
|
12
|
+
class TTML
|
13
|
+
|
14
|
+
include AllFather
|
15
|
+
|
16
|
+
def initialize(cc_file, translator, opts={})
|
17
|
+
@cc_file = cc_file
|
18
|
+
@translator = translator
|
19
|
+
@force_detect = opts[:force_detect] || false
|
20
|
+
raise "Invalid TTML file provided" unless is_valid?
|
21
|
+
end
|
22
|
+
|
23
|
+
def is_valid?
|
24
|
+
# Do any TTML specific validations here
|
25
|
+
if @cc_file =~ /^.*\.(ttml)$/
|
26
|
+
return true
|
27
|
+
end
|
28
|
+
# TODO: Check if it's required to do a File read to see if this
|
29
|
+
# is a well-formed XML. Another is to see if lang is available in each div
|
30
|
+
return false
|
31
|
+
end
|
32
|
+
|
33
|
+
def infer_languages
|
34
|
+
lang = []
|
35
|
+
begin
|
36
|
+
xml_file = File.open(@cc_file)
|
37
|
+
xml_doc = Nokogiri::XML(xml_file)
|
38
|
+
div_objects = xml_doc.css("/tt/body/div")
|
39
|
+
div_objects.each_with_index do |div, index|
|
40
|
+
# By default, return the lang if specified in the div and
|
41
|
+
# force detect is false
|
42
|
+
inferred_lang = div.attributes['lang'].value rescue nil
|
43
|
+
if inferred_lang.nil?
|
44
|
+
# If lang is not provided in the caption, then override
|
45
|
+
# force detect for inference
|
46
|
+
@force_detect = true
|
47
|
+
end
|
48
|
+
if @force_detect
|
49
|
+
sample_text = get_text(div, 100)
|
50
|
+
inferred_lang = @translator.infer_language(sample_text) rescue nil
|
51
|
+
if inferred_lang.nil?
|
52
|
+
err_msg = "Failed to detect lang for div block number #{index + 1}"
|
53
|
+
unless lang.empty?
|
54
|
+
err_msg += "; Detected languages before failure are #{lang}"
|
55
|
+
end
|
56
|
+
raise AllFather::LangDetectionFailureException.new(err_msg)
|
57
|
+
end
|
58
|
+
end
|
59
|
+
lang << inferred_lang
|
60
|
+
end
|
61
|
+
rescue StandardError => e
|
62
|
+
puts "Error while detecting the language due to #{e.message}"
|
63
|
+
ensure
|
64
|
+
xml_file.close rescue nil
|
65
|
+
end
|
66
|
+
return nil if lang.empty?
|
67
|
+
lang
|
68
|
+
end
|
69
|
+
|
70
|
+
def translate(src_lang, dest_lang, out_file)
|
71
|
+
super(src_lang, dest_lang, out_file)
|
72
|
+
xml_file = File.open(@cc_file, 'r:UTF-8', &:read)
|
73
|
+
xml_doc = Nokogiri::XML(xml_file)
|
74
|
+
div_objects = xml_doc.css("/tt/body/div")
|
75
|
+
# Irrespective of what lang the div xml:lang says, infer the lang and then
|
76
|
+
# check to see if it matches src_lang
|
77
|
+
matched_div = nil
|
78
|
+
div_objects.each do |div|
|
79
|
+
sample_text = get_text(div, 100)
|
80
|
+
inferred_lang = @translator.infer_language(sample_text) rescue nil
|
81
|
+
next if inferred_lang.nil?
|
82
|
+
if inferred_lang.eql?(src_lang)
|
83
|
+
matched_div = div
|
84
|
+
break
|
85
|
+
end
|
86
|
+
end
|
87
|
+
if matched_div.nil?
|
88
|
+
FileUtils.remove_file(out_file)
|
89
|
+
raise AllFather::InvalidInputException.new("Unable to find #{src_lang} language section in TTML")
|
90
|
+
end
|
91
|
+
# Update the Lang in the Div
|
92
|
+
matched_div.lang = dest_lang
|
93
|
+
|
94
|
+
blocks = matched_div.css("p")
|
95
|
+
blocks.each do |block|
|
96
|
+
# Multiple spaces being stripped off
|
97
|
+
text = block.inner_html.strip.gsub(/(\s){2,}/, '')
|
98
|
+
text_blocks = get_block_text(text)
|
99
|
+
translated_text = ""
|
100
|
+
text_blocks.each do |text_block|
|
101
|
+
if text_block.start_with?('<') || text_block.empty?
|
102
|
+
translated_text << text_block
|
103
|
+
next
|
104
|
+
end
|
105
|
+
translated_resp = @translator.translate(text_block, src_lang, dest_lang)
|
106
|
+
translated_text << translated_resp
|
107
|
+
end
|
108
|
+
block.inner_html = translated_text
|
109
|
+
end
|
110
|
+
xml_file.close rescue nil
|
111
|
+
File.write(out_file, xml_doc)
|
112
|
+
out_file
|
113
|
+
end
|
114
|
+
|
115
|
+
private
|
116
|
+
|
117
|
+
#
|
118
|
+
# Method to segregate the data from markups as markups don't need
|
119
|
+
# translations.
|
120
|
+
# For example, if the cue block is of the form
|
121
|
+
# This is a test caption with <span id="1">a test span </span> within a block
|
122
|
+
# This method returns
|
123
|
+
# ["This is a test caption with ", "<span id=\"1\">", "a test span ", "</span>", " within a block"]
|
124
|
+
# as we can infer the markups can be retained as is to avoid translation
|
125
|
+
#
|
126
|
+
def get_block_text(text)
|
127
|
+
data = []
|
128
|
+
tag_start = tag_end = false
|
129
|
+
str_length = text.size
|
130
|
+
text_block = ""
|
131
|
+
markup_block = ""
|
132
|
+
for i in 0...text.size do
|
133
|
+
if text[i] == '<'
|
134
|
+
tag_end = false
|
135
|
+
tag_start = true
|
136
|
+
markup_block << text[i]
|
137
|
+
data << text_block
|
138
|
+
text_block = ""
|
139
|
+
next
|
140
|
+
elsif text[i] == '>'
|
141
|
+
tag_end = true
|
142
|
+
tag_start = false
|
143
|
+
markup_block << text[i]
|
144
|
+
data << markup_block
|
145
|
+
markup_block = ""
|
146
|
+
next
|
147
|
+
end
|
148
|
+
if tag_start && !tag_end
|
149
|
+
markup_block << text[i]
|
150
|
+
else
|
151
|
+
text_block << text[i]
|
152
|
+
end
|
153
|
+
end
|
154
|
+
unless text_block.empty?
|
155
|
+
data << text_block
|
156
|
+
end
|
157
|
+
data
|
158
|
+
end
|
159
|
+
|
160
|
+
#
|
161
|
+
# Method to get a minimal amount of key text that excludes any tags
|
162
|
+
# or control information for the engine to meaningfully and
|
163
|
+
# correctly infer the language being referred to in this TTML
|
164
|
+
#
|
165
|
+
def get_text(div, num_chars)
|
166
|
+
text_sample = ""
|
167
|
+
blocks = div.css("p")
|
168
|
+
blocks.each do |block|
|
169
|
+
# Multiple spaces being stripped off
|
170
|
+
text = block.inner_html.strip.gsub(/(\s){2,}/, '')
|
171
|
+
# Strip off html tags (if any)
|
172
|
+
text = text.gsub(/(<.*?>)/, ' ')
|
173
|
+
text_sample << text
|
174
|
+
if text_sample.length > (num_chars + 1)
|
175
|
+
break
|
176
|
+
end
|
177
|
+
end
|
178
|
+
return text_sample[0, num_chars]
|
179
|
+
end
|
180
|
+
end
|
data/lib/vtt.rb
CHANGED
@@ -1,81 +1,115 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
require_relative "engines/translator"
|
2
|
+
require_relative "allfather"
|
3
3
|
|
4
|
+
#
|
5
|
+
# Library to handle VTT Files
|
6
|
+
#
|
7
|
+
# Uses the translator available to do the necessary language operations
|
8
|
+
# as defined by the AllFather
|
9
|
+
#
|
4
10
|
class VTT
|
5
|
-
|
6
|
-
|
7
|
-
|
11
|
+
|
12
|
+
include AllFather
|
13
|
+
|
14
|
+
def initialize(cc_file, translator)
|
15
|
+
@cc_file = cc_file
|
16
|
+
@translator = translator
|
17
|
+
raise "Invalid VTT file provided" unless is_valid?
|
8
18
|
end
|
9
19
|
|
10
|
-
def
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
20
|
+
def translate(src_lang, dest_lang, out_file)
|
21
|
+
super(src_lang, dest_lang, out_file)
|
22
|
+
begin
|
23
|
+
ccfile = File.open(@cc_file, 'r:UTF-8', &:read)
|
24
|
+
outfile = File.open(out_file, "w")
|
25
|
+
text_collection = false
|
26
|
+
text_sample = ""
|
27
|
+
ccfile.each_line do | line |
|
28
|
+
if line =~ /^(\d\d:)\d\d:\d\d[,.]\d\d\d.*-->.*(\d\d:)\d\d:\d\d[,.]\d\d\d/
|
29
|
+
text_collection = true
|
30
|
+
outfile.puts line
|
31
|
+
elsif line.strip.empty? && !text_sample.empty?
|
32
|
+
json_text = JSON.parse(text_sample) rescue nil
|
33
|
+
if json_text.nil?
|
34
|
+
trans_resp = @translator.translate(text_sample, src_lang, dest_lang)
|
35
|
+
outfile.puts trans_resp
|
36
|
+
outfile.puts
|
37
|
+
else
|
38
|
+
outfile.puts text_sample
|
39
|
+
outfile.puts
|
40
|
+
end
|
41
|
+
text_sample = ""
|
42
|
+
text_collection = false
|
43
|
+
elsif text_collection
|
44
|
+
text_sample << line
|
25
45
|
else
|
26
|
-
outfile.puts
|
27
|
-
outfile.puts
|
46
|
+
outfile.puts line
|
28
47
|
end
|
29
|
-
text_sample = ""
|
30
|
-
text_collection = false
|
31
|
-
elsif text_collection
|
32
|
-
text_sample << line
|
33
|
-
else
|
34
|
-
outfile.puts line
|
35
48
|
end
|
36
|
-
next
|
37
|
-
end
|
38
49
|
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
50
|
+
if !text_sample.empty?
|
51
|
+
trans_resp = @translator.translate(text_sample, src_lang, dest_lang)
|
52
|
+
outfile.puts trans_resp
|
53
|
+
outfile.puts
|
54
|
+
end
|
55
|
+
ensure
|
56
|
+
ccfile.close rescue nil
|
43
57
|
outfile.close
|
44
58
|
end
|
45
59
|
end
|
46
60
|
|
61
|
+
#
|
62
|
+
# Returns the inferred language in an array
|
63
|
+
#
|
64
|
+
def infer_languages
|
65
|
+
lang = nil
|
66
|
+
begin
|
67
|
+
sample_text = get_text(@cc_file, 100)
|
68
|
+
lang = @translator.infer_language(sample_text)
|
69
|
+
rescue StandardError => e
|
70
|
+
puts "Error while detecting the language due to #{e.message}"
|
71
|
+
end
|
72
|
+
[lang]
|
73
|
+
end
|
47
74
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
text_collection = true
|
56
|
-
elsif line.strip.empty?
|
57
|
-
text_collection = false
|
58
|
-
elsif text_collection && text_sample.length < (num_chars+1)
|
59
|
-
text_sample << line
|
60
|
-
end
|
61
|
-
break if text_sample.length > (num_chars+1)
|
62
|
-
next
|
75
|
+
#
|
76
|
+
# Method to add required set of validations specific to caption type
|
77
|
+
#
|
78
|
+
def is_valid?
|
79
|
+
# Do any VTT specific validations here
|
80
|
+
if @cc_file =~ /^.*\.(vtt)$/
|
81
|
+
return true
|
63
82
|
end
|
64
|
-
|
83
|
+
# TODO: Check if it's required to do a File read to see if the 1st line is WEBVTT
|
84
|
+
# to handle cases where invalid file is named with vtt extension
|
85
|
+
return false
|
65
86
|
end
|
66
87
|
|
67
|
-
|
68
|
-
|
88
|
+
private
|
89
|
+
|
90
|
+
#
|
91
|
+
# Method to get a minimal amount of key text that excludes any tags
|
92
|
+
# or control information for the engine to meaningfully and
|
93
|
+
# correctly infer the language being referred to in this VTT
|
94
|
+
#
|
95
|
+
def get_text(vtt_file, num_chars)
|
69
96
|
begin
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
97
|
+
ccfile = File.open(vtt_file, 'r:UTF-8', &:read)
|
98
|
+
text_collection = false
|
99
|
+
text_sample = ""
|
100
|
+
ccfile.each_line do |line|
|
101
|
+
if line =~ /^(\d\d:)\d\d:\d\d[,.]\d\d\d.*-->.*(\d\d:)\d\d:\d\d[,.]\d\d\d/
|
102
|
+
text_collection = true
|
103
|
+
elsif line.strip.empty?
|
104
|
+
text_collection = false
|
105
|
+
elsif text_collection && text_sample.length < (num_chars + 1)
|
106
|
+
text_sample << line
|
107
|
+
end
|
108
|
+
break if text_sample.length > (num_chars + 1)
|
109
|
+
end
|
110
|
+
ensure
|
111
|
+
ccfile.close rescue nil
|
77
112
|
end
|
78
|
-
|
113
|
+
return text_sample[0, num_chars]
|
79
114
|
end
|
80
|
-
|
81
115
|
end
|
metadata
CHANGED
@@ -1,14 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: subtitle
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Maheshwaran G
|
8
|
+
- Arunjeyaprasad A J
|
8
9
|
autorequire:
|
9
10
|
bindir: bin
|
10
11
|
cert_chain: []
|
11
|
-
date: 2019-10-
|
12
|
+
date: 2019-10-31 00:00:00.000000000 Z
|
12
13
|
dependencies:
|
13
14
|
- !ruby/object:Gem::Dependency
|
14
15
|
name: bundler
|
@@ -25,43 +26,37 @@ dependencies:
|
|
25
26
|
- !ruby/object:Gem::Version
|
26
27
|
version: '2.0'
|
27
28
|
- !ruby/object:Gem::Dependency
|
28
|
-
name: aws-sdk
|
29
|
+
name: aws-sdk
|
29
30
|
requirement: !ruby/object:Gem::Requirement
|
30
31
|
requirements:
|
31
|
-
- - "
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: '0'
|
34
|
-
type: :runtime
|
35
|
-
prerelease: false
|
36
|
-
version_requirements: !ruby/object:Gem::Requirement
|
37
|
-
requirements:
|
38
|
-
- - ">="
|
39
|
-
- !ruby/object:Gem::Version
|
40
|
-
version: '0'
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: aws-sdk-translate
|
43
|
-
requirement: !ruby/object:Gem::Requirement
|
44
|
-
requirements:
|
45
|
-
- - ">="
|
32
|
+
- - "~>"
|
46
33
|
- !ruby/object:Gem::Version
|
47
|
-
version: '
|
48
|
-
type: :
|
34
|
+
version: '2.11'
|
35
|
+
type: :development
|
49
36
|
prerelease: false
|
50
37
|
version_requirements: !ruby/object:Gem::Requirement
|
51
38
|
requirements:
|
52
|
-
- - "
|
39
|
+
- - "~>"
|
53
40
|
- !ruby/object:Gem::Version
|
54
|
-
version: '
|
55
|
-
description:
|
41
|
+
version: '2.11'
|
42
|
+
description: Subtitle gem helps you to detect language and translate closed caption
|
43
|
+
to required language.
|
56
44
|
email:
|
57
45
|
- pgmaheshwaran@gmail.com
|
46
|
+
- arunjeyaprasad@gmail.com
|
58
47
|
executables: []
|
59
48
|
extensions: []
|
60
49
|
extra_rdoc_files: []
|
61
50
|
files:
|
51
|
+
- lib/allfather.rb
|
52
|
+
- lib/dfxp.rb
|
53
|
+
- lib/engines/aws.rb
|
54
|
+
- lib/engines/gcp.rb
|
55
|
+
- lib/engines/translator.rb
|
62
56
|
- lib/scc.rb
|
63
57
|
- lib/srt.rb
|
64
58
|
- lib/subtitle.rb
|
59
|
+
- lib/ttml.rb
|
65
60
|
- lib/vtt.rb
|
66
61
|
homepage: https://github.com/cloudaffair/subtitle
|
67
62
|
licenses:
|
@@ -85,9 +80,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
85
80
|
version: '0'
|
86
81
|
requirements: []
|
87
82
|
rubyforge_project:
|
88
|
-
rubygems_version: 2.
|
83
|
+
rubygems_version: 2.5.1
|
89
84
|
signing_key:
|
90
85
|
specification_version: 4
|
91
|
-
summary:
|
92
|
-
language
|
86
|
+
summary: Subtitle gem helps you to detect language and translate closed caption to
|
87
|
+
required language
|
93
88
|
test_files: []
|