full_lengther_next 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +4 -0
- data/Manifest.txt +27 -0
- data/PostInstall.txt +6 -0
- data/README.rdoc +147 -0
- data/Rakefile +37 -0
- data/bin/download_fln_dbs.rb +197 -0
- data/bin/full_lengther_next +173 -0
- data/bin/make_user_db.rb +144 -0
- data/lib/full_lengther_next.rb +13 -0
- data/lib/full_lengther_next/classes/common_functions.rb +94 -0
- data/lib/full_lengther_next/classes/fl2_stats.rb +222 -0
- data/lib/full_lengther_next/classes/fl_analysis.rb +688 -0
- data/lib/full_lengther_next/classes/fl_string_utils.rb +139 -0
- data/lib/full_lengther_next/classes/lcs.rb +33 -0
- data/lib/full_lengther_next/classes/my_worker.rb +122 -0
- data/lib/full_lengther_next/classes/my_worker_manager.rb +167 -0
- data/lib/full_lengther_next/classes/orf.rb +32 -0
- data/lib/full_lengther_next/classes/sequence.rb +111 -0
- data/lib/full_lengther_next/classes/test_code.rb +877 -0
- data/lib/full_lengther_next/classes/une_los_hit.rb +287 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/test/test_full_lengther_next.rb +11 -0
- data/test/test_helper.rb +3 -0
- metadata +150 -0
@@ -0,0 +1,139 @@
|
|
1
|
+
|
2
|
+
class String

  # Standard genetic code: upper-case DNA codon => one-letter amino acid.
  # '*' marks a stop codon. Only A/C/G/T codons are listed, so any codon that
  # contains an ambiguity code (N, R, W, ...) is simply absent from the table.
  FLN_CODON_TABLE = {
    'GCT'=>'A', 'GCC'=>'A', 'GCA'=>'A', 'GCG'=>'A',
    'CGT'=>'R', 'CGC'=>'R', 'CGA'=>'R', 'CGG'=>'R', 'AGA'=>'R', 'AGG'=>'R',
    'AAT'=>'N', 'AAC'=>'N',
    'GAT'=>'D', 'GAC'=>'D',
    'TGT'=>'C', 'TGC'=>'C',
    'CAA'=>'Q', 'CAG'=>'Q',
    'GAA'=>'E', 'GAG'=>'E',
    'GGT'=>'G', 'GGC'=>'G', 'GGA'=>'G', 'GGG'=>'G',
    'CAT'=>'H', 'CAC'=>'H',
    'ATT'=>'I', 'ATC'=>'I', 'ATA'=>'I',
    'TTA'=>'L', 'TTG'=>'L', 'CTT'=>'L', 'CTC'=>'L', 'CTA'=>'L', 'CTG'=>'L',
    'ATG'=>'M',
    'AAA'=>'K', 'AAG'=>'K',
    'TTT'=>'F', 'TTC'=>'F',
    'CCT'=>'P', 'CCC'=>'P', 'CCA'=>'P', 'CCG'=>'P',
    'TCT'=>'S', 'TCC'=>'S', 'TCA'=>'S', 'TCG'=>'S', 'AGT'=>'S', 'AGC'=>'S',
    'ACT'=>'T', 'ACC'=>'T', 'ACA'=>'T', 'ACG'=>'T',
    'TGG'=>'W',
    'TAT'=>'Y', 'TAC'=>'Y',
    'GTT'=>'V', 'GTC'=>'V', 'GTA'=>'V', 'GTG'=>'V',
    'TAG'=>'*', 'TGA'=>'*', 'TAA'=>'*'
  }.freeze unless const_defined?(:FLN_CODON_TABLE)

  # Codons that terminate an open reading frame.
  FLN_STOP_CODONS = %w[TAG TGA TAA].freeze unless const_defined?(:FLN_STOP_CODONS)

  # Translates the string, read as DNA from its first character, into a
  # protein string (case-insensitive).
  #
  # A complete triplet that is not in the codon table (for example one that
  # contains an ambiguity code) is translated as a lower-case 'x'. A trailing
  # incomplete codon (1 or 2 characters) is ignored.
  #
  # Returns a new String with one character per complete codon.
  def translate
    codons = upcase.each_char.each_slice(3).map(&:join)
    codons.select { |codon| codon.length == 3 }
          .map { |codon| FLN_CODON_TABLE.fetch(codon, 'x') }
          .join
  end

  # Scans a list of codons for open reading frames (ATG ... stop codon).
  #
  # a          - Array of upper-case codon Strings, in reading order.
  # frame      - frame label stored verbatim in every result entry.
  # min_length - minimum ORF length in nucleotides, stop codon included
  #              (defaults to 200, the threshold that used to be hard-coded).
  #
  # Returns an Array (possibly empty) of entries of the form
  #   [orf_sequence, t_start, t_end, frame, upstream_stop]
  # where t_start is the 1-based position of the first nucleotide of the ATG
  # and t_end the position of the last nucleotide of the stop codon, both
  # relative to the start of +a+. upstream_stop is true when a stop codon was
  # seen earlier in +a+, before this ORF began. An ORF that is still open when
  # the codons run out is not reported.
  def generate_orf_old(a, frame, min_length = 200)
    all_orfs = []
    in_orf = false
    stop_seen = false
    orf = ''
    t_start = 0
    t_end = 0

    a.each do |codon|
      t_end += 3
      is_stop = FLN_STOP_CODONS.include?(codon)

      if in_orf
        orf += codon
        next unless is_stop

        all_orfs << [orf, t_start, t_end, frame, stop_seen] if orf.length >= min_length
        orf = ''
        stop_seen = true
        in_orf = false
        t_start = t_end
      elsif codon == 'ATG'
        in_orf = true
        orf += codon
        # t_start counted the nucleotides before this codon; +1 turns it into
        # the 1-based position of the A of the ATG.
        t_start += 1
      else
        stop_seen = true if is_stop
        t_start += 3
      end
    end

    all_orfs
  end

  # Collects the ORFs of all six reading frames of the string.
  #
  # Frames 1, 2 and 3 are read on the upper-cased sequence, dropping one
  # leading character per frame; frames -1, -2 and -3 are read the same way on
  # its reverse complement (see #complementary_dna).
  #
  # Returns the concatenation of the per-frame results, in the order
  # 1, 2, 3, -1, -2, -3.
  def orf_finder_old
    forward = upcase
    res = []

    [[forward, 1], [forward.complementary_dna, -1]].each do |strand, sign|
      (1..3).each do |n|
        # Each further frame starts one character later than the previous one.
        strand = strand.sub(/^./, '') if n > 1
        codons = strand.each_char.each_slice(3).map(&:join)
        res += fln_orfs_for_frame(codons, sign * n)
      end
    end

    res
  end

  # Returns the reverse complement of the string.
  #
  # A/C/G/T are complemented keeping their case. Every other character
  # (ambiguity codes in either case, gaps, anything unexpected) becomes an
  # upper-case 'N', so the result always has the same length as the receiver.
  # Earlier, characters that were not in the lookup table were silently
  # dropped, which shortened the sequence and shifted every coordinate
  # computed on it.
  def complementary_dna
    reverse.tr('ACGTacgt', 'TGCAtgca').tr('^ACGTacgt', 'N')
  end

  private

  # Runs the ORF scan for one frame. #orf_finder_old used to call
  # +generate_orf+, which this class does not define; when some other part of
  # the program provides it, it is still used, otherwise the scan falls back
  # to #generate_orf_old instead of raising NoMethodError.
  def fln_orfs_for_frame(codons, frame)
    if respond_to?(:generate_orf, true)
      generate_orf(codons, frame)
    else
      generate_orf_old(codons, frame)
    end
  end

end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
class String

  # Returns the longest common substring (a contiguous run of characters)
  # shared by the receiver and +s2+.
  #
  # s2 - the String to compare against.
  #
  # When several common substrings have the same maximal length, the one that
  # ends first in the receiver is returned. Returns an empty String when the
  # two strings share no character.
  #
  # The strings are walked character by character (newlines included), so the
  # positions used to cut the result always match the receiver; the previous
  # implementation split with scan(/./), which skips newlines and therefore
  # returned the wrong slice for multi-line input.
  def lcs(s2)
    left = chars.to_a
    right = s2.chars.to_a

    best_len = 0
    best_end = 0
    # prev_row[j] = length of the common run ending at the previous character
    # of the receiver and at right[j]; only one row of the table is needed.
    prev_row = Array.new(right.size, 0)

    left.each_with_index do |l1, i|
      row = Array.new(right.size, 0)
      right.each_with_index do |l2, j|
        next unless l1 == l2

        run = j.zero? ? 1 : prev_row[j - 1] + 1
        row[j] = run
        if run > best_len
          best_len = run
          best_end = i
        end
      end
      prev_row = row
    end

    best_len.zero? ? '' : self[best_end - best_len + 1, best_len]
  end

end
|
@@ -0,0 +1,122 @@
|
|
1
|
+
$: << File.expand_path(File.join(File.dirname(__FILE__)))
|
2
|
+
|
3
|
+
require 'scbi_mapreduce'
|
4
|
+
require 'scbi_blast'
|
5
|
+
require 'json'
|
6
|
+
require 'sequence'
|
7
|
+
require 'fl_string_utils'
|
8
|
+
require "lcs" # like the class simliar of seqtrim, return the longest common sequence
|
9
|
+
require "test_code"
|
10
|
+
|
11
|
+
require 'fl_analysis'
|
12
|
+
include FlAnalysis
|
13
|
+
|
14
|
+
|
15
|
+
# Worker side of the map-reduce run: receives the options from the manager and
# annotates each batch of sequences with BLASTx searches followed by TestCode.
class MyWorker < ScbiMapreduce::Worker

  # Start-up hook; nothing needs to be initialised at the moment.
  def starting_worker
    # $WORKER_LOG.info "Loading actions"
  rescue StandardError => e
    puts(e.message + e.backtrace.join("\n"))
  end

  # Stores the configuration object handed to the worker; it is read as
  # @options by the other methods.
  def receive_initial_config(obj)
    # $WORKER_LOG.info "Params received: #{obj.to_json}"
    @options = obj
  end

  # Runs the analysis on +obj+ and returns that same object, so that the
  # annotations added to the sequences travel back with it.
  def process_object(obj)
    full_lenghter2(obj)
    obj
  end

  # Shutdown hook; nothing to release.
  def closing_worker
  end

  # Runs blastx for +input+ against +database+ and returns the result object
  # produced by BatchBlast#do_blast_seqs (XML output requested).
  #
  # input        - the sequences to search with.
  # database     - path of the BLAST database, passed after "-db".
  # user_db_name - label of the database; currently unused, kept so that
  #                existing callers do not break.
  def run_blastx(input, database, user_db_name)
    # puts "\n#{user_db_name} ..... executing BLASTx"
    blast = BatchBlast.new("-db #{database}", 'blastx', "-evalue 1e-6 -num_alignments 1 -num_descriptions 1")
    blast_result = blast.do_blast_seqs(input, :xml)
    # puts "#{user_db_name} ..... BLASTx finished"
    blast_result
  end

  # Annotates +seqs+ in up to four stages:
  #
  # 1. the user database, when @options[:user_db] is set;
  # 2. UniProt "sp_<tax_group>", for the sequences that have no :complete
  #    annotation yet;
  # 3. UniProt "tr_<tax_group>", again for the still incomplete ones;
  # 4. TestCode, for every sequence carrying a :tcode annotation.
  #
  # The process exits when the user database file "<user_db>.psq" is missing.
  def full_lenghter2(seqs)
    pending = seqs

    # -------------------------------------------- User database
    if @options[:user_db]
      user_db = @options[:user_db]
      # Strip the directory part. Applied unconditionally so that a database
      # given without any "/" keeps its own name instead of a nil label.
      user_db_name = user_db.sub(/.+\//, '')

      # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
      unless File.exist?("#{user_db}.psq")
        puts "user database: #{user_db} was not found"
        exit
      end

      blast_and_analyse(seqs, "#{user_db}", user_db_name)
      pending = incomplete_seqs(seqs)
    end

    # -------------------------------------------- UniProt (sp)
    sp_name = "sp_#{@options[:tax_group]}"
    blast_and_analyse(pending, "#{sp_name}/#{sp_name}.fasta", sp_name)
    pending = incomplete_seqs(seqs)

    # -------------------------------------------- UniProt (tr)
    tr_name = "tr_#{@options[:tax_group]}"
    blast_and_analyse(pending, "#{tr_name}/#{tr_name}.fasta", tr_name)

    # -------------------------------------------- Test Code
    # The sequences without a reliable similarity with an orthologue are
    # processed with Test Code.
    testcode_input = seqs.select { |s| !s.get_annotations(:tcode).empty? }

    # Enable this line to test tcode; every line above it in this method has
    # to be commented out.
    # testcode_input=seqs

    testcode_input.each { |seq| TestCode.new(seq) }
  end

  private

  # Blasts +seqs+ against +database+ and hands every sequence, together with
  # the query result found at the same index, to analiza_orf_y_fl.
  def blast_and_analyse(seqs, database, db_name)
    my_blast = run_blastx(seqs, database, db_name)
    seqs.each_with_index do |seq, i|
      analiza_orf_y_fl(seq, my_blast.querys[i], @options, db_name)
    end
  end

  # Sequences of +seqs+ that have no :complete annotation yet.
  def incomplete_seqs(seqs)
    seqs.select { |s| s.get_annotations(:complete).empty? }
  end

end
|
122
|
+
|
@@ -0,0 +1,167 @@
|
|
1
|
+
require 'json'
|
2
|
+
require 'scbi_fasta'
|
3
|
+
require 'sequence'
|
4
|
+
|
5
|
+
require 'fl2_stats'
|
6
|
+
include Fl2Stats
|
7
|
+
|
8
|
+
# Manager side of the map-reduce run: reads the input FASTA file, hands out
# one sequence per work request and writes the annotations that come back
# into the report files under fl2_results/.
class MyWorkerManager < ScbiMapreduce::WorkManager

  # Column header shared by annotations.txt and tcode_result.txt.
  REPORT_HEADER = "Query_id\tfasta_length\tSubject_id\tdb_name\tStatus\tt_code\te_value\tp_ident\tprotein_length\ts_length\tWarning_msgs\tframe\tORF_start\tORF_end\ts_start\ts_end\tDescription\tProtein_sequence"

  # Opens the input file and the output files and stores the global data.
  #
  # options    - option hash; options[:fasta] is the input FASTA path. The
  #              whole hash is later returned by #worker_initial_config.
  # chunk_size - stored in @@chunk_size; not used anywhere else in this class.
  def self.init_work_manager(options, chunk_size=100)
    input_file = options[:fasta]

    # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
    Dir.mkdir('fl2_results') unless File.exist?('fl2_results')

    @@fasta_file = FastaQualFile.new(input_file, '')
    @@chunk_size = chunk_size
    @@options = options

    @@annotation_file = File.open("fl2_results/annotations.txt", 'w')
    @@annotation_file.puts REPORT_HEADER

    @@alignment_file = File.open("fl2_results/alignments.txt", 'w')
    @@prot_file = File.open("fl2_results/proteins.fasta", 'w')
    @@nts_file = File.open("fl2_results/nt_seq.txt", 'w')
    @@tcode_file = File.open("fl2_results/tcode_result.txt", 'w')
    @@tcode_file.puts REPORT_HEADER

    # Error reports are currently disabled:
    # @@error_fasta_file = File.open("fl2_results/error_seqs.fasta", 'w')
    # @@error_file = File.open("fl2_results/errors_info.txt", 'w')
  end

  # Closes every file opened by init_work_manager, then calls summary_stats
  # (not defined in this class; presumably provided by Fl2Stats -- confirm).
  def self.end_work_manager
    @@fasta_file.close

    @@annotation_file.close
    @@alignment_file.close
    @@prot_file.close
    @@nts_file.close
    @@tcode_file.close

    # @@error_fasta_file.close
    # @@error_file.close

    summary_stats
  end

  # Prints the object that failed together with the message and backtrace of
  # the original exception.
  def error_received(worker_error, obj)
    puts "Error while processing object #{obj.inspect}\n" + worker_error.original_exception.message + ":\n" + worker_error.original_exception.backtrace.join("\n")
  end

  # Logs that the run is being aborted.
  # NOTE(review): @@error_count and @@count are never assigned in this class;
  # presumably the superclass maintains them -- confirm.
  def too_many_errors_received
    $LOG.error "Too many errors: #{@@error_count} errors on #{@@count} executed sequences, exiting before finishing"
  end

  # Configuration sent to the workers: the options given to init_work_manager.
  def worker_initial_config
    @@options
  end

  # Called every time a worker needs new work.
  #
  # Returns a Sequence built from the next record of the FASTA file, or nil
  # when the file yields no further name (no more data available).
  def next_work
    n, f, q = @@fasta_file.next_seq
    return nil if n.nil?

    Sequence.new(n, f, q)
  end

  # Called each time an object is finished: writes every sequence contained
  # in +obj+ to the report files.
  def work_received(obj)
    obj.each { |seq| write_seq(seq) }
  end

  # Writes the annotations of one sequence to the report files. Exactly one
  # of three cases applies, checked in this order:
  #
  # * :complete annotation       -> annotations.txt, plus alignment, protein
  #                                 and nucleotide output;
  # * :tmp_annotation annotation -> first element of its message goes to
  #                                 annotations.txt; alignment and protein
  #                                 output only when their message is not
  #                                 empty; nucleotide output as above;
  # * :tcode annotation          -> tcode_result.txt.
  #
  # Any StandardError raised while writing is reported on stdout, now with
  # the exception message, and the sequence is skipped.
  def write_seq(seq)
    if (complete = seq.get_annotations(:complete).first)
      # ------------------------------------------------------ Complete Seqs
      @@annotation_file.puts complete[:message]
      write_annotation(seq, :alignment, @@alignment_file)
      write_annotation(seq, :protein, @@prot_file)
      write_annotation(seq, :nucleotide, @@nts_file)
    elsif (partial = seq.get_annotations(:tmp_annotation).first)
      # ------------------------------------------------------ Non Complete Seqs
      @@annotation_file.puts partial[:message][0]
      write_annotation(seq, :alignment, @@alignment_file, true)
      write_annotation(seq, :protein, @@prot_file, true)
      write_annotation(seq, :nucleotide, @@nts_file)
    elsif (tcode = seq.get_annotations(:tcode).first)
      # ------------------------------------------------------ Test Code
      @@tcode_file.puts tcode[:message]
    end
    # ---------------------------------------------------------- Errors
    # Error output is currently disabled:
    # if e=seq.get_annotations(:error).first
    #   if !e[:message].empty?
    #     @@error_fasta_file.puts ">#{seq.seq_name}\n#{seq.seq_fasta}"
    #     @@error_file.puts e[:message]
    #   end
    # end
  rescue StandardError => e
    puts "Error printing #{seq.seq_name}: #{e.message}"
  end

  private

  # Writes the message of the first +type+ annotation of +seq+ to +file+.
  # Does nothing when there is no such annotation, or when +skip_empty+ is
  # true and the message is empty.
  def write_annotation(seq, type, file, skip_empty = false)
    annotation = seq.get_annotations(type).first
    return unless annotation
    return if skip_empty && annotation[:message].empty?

    file.puts annotation[:message]
  end

end
|
167
|
+
|