full_lengther_next 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +4 -0
- data/Manifest.txt +27 -0
- data/PostInstall.txt +6 -0
- data/README.rdoc +147 -0
- data/Rakefile +37 -0
- data/bin/download_fln_dbs.rb +197 -0
- data/bin/full_lengther_next +173 -0
- data/bin/make_user_db.rb +144 -0
- data/lib/full_lengther_next.rb +13 -0
- data/lib/full_lengther_next/classes/common_functions.rb +94 -0
- data/lib/full_lengther_next/classes/fl2_stats.rb +222 -0
- data/lib/full_lengther_next/classes/fl_analysis.rb +688 -0
- data/lib/full_lengther_next/classes/fl_string_utils.rb +139 -0
- data/lib/full_lengther_next/classes/lcs.rb +33 -0
- data/lib/full_lengther_next/classes/my_worker.rb +122 -0
- data/lib/full_lengther_next/classes/my_worker_manager.rb +167 -0
- data/lib/full_lengther_next/classes/orf.rb +32 -0
- data/lib/full_lengther_next/classes/sequence.rb +111 -0
- data/lib/full_lengther_next/classes/test_code.rb +877 -0
- data/lib/full_lengther_next/classes/une_los_hit.rb +287 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/test/test_full_lengther_next.rb +11 -0
- data/test/test_helper.rb +3 -0
- metadata +150 -0
@@ -0,0 +1,139 @@
|
|
1
|
+
|
2
|
+
# DNA helpers added to String: translation to protein, (legacy) ORF search
# and reverse-complement.
class String

  # Translates the receiver, read as DNA from its first character, into a
  # one-letter amino-acid string.
  #
  # The sequence is upcased and cut into consecutive triplets. Each complete
  # triplet is looked up in the codon table below; stop codons become '*'.
  # If the triplet is not found (e.g. it contains an ambiguity code such as
  # N, R, Y ...) an 'x' is emitted instead. A trailing group of fewer than
  # three characters is ignored.
  #
  # Returns a new String.
  def translate
    codon_table = {
      'GCT'=>'A', 'GCC'=>'A', 'GCA'=>'A', 'GCG'=>'A',
      'CGT'=>'R', 'CGC'=>'R', 'CGA'=>'R', 'CGG'=>'R', 'AGA'=>'R', 'AGG'=>'R',
      'AAT'=>'N', 'AAC'=>'N',
      'GAT'=>'D', 'GAC'=>'D',
      'TGT'=>'C', 'TGC'=>'C',
      'CAA'=>'Q', 'CAG'=>'Q',
      'GAA'=>'E', 'GAG'=>'E',
      'GGT'=>'G', 'GGC'=>'G', 'GGA'=>'G', 'GGG'=>'G',
      'CAT'=>'H', 'CAC'=>'H',
      'ATT'=>'I', 'ATC'=>'I', 'ATA'=>'I',
      'TTA'=>'L', 'TTG'=>'L', 'CTT'=>'L', 'CTC'=>'L', 'CTA'=>'L', 'CTG'=>'L',
      'ATG'=>'M',
      'AAA'=>'K', 'AAG'=>'K',
      'TTT'=>'F', 'TTC'=>'F',
      'CCT'=>'P', 'CCC'=>'P', 'CCA'=>'P', 'CCG'=>'P',
      'TCT'=>'S', 'TCC'=>'S', 'TCA'=>'S', 'TCG'=>'S', 'AGT'=>'S', 'AGC'=>'S',
      'ACT'=>'T', 'ACC'=>'T', 'ACA'=>'T', 'ACG'=>'T',
      'TGG'=>'W',
      'TAT'=>'Y', 'TAC'=>'Y',
      'GTT'=>'V', 'GTC'=>'V', 'GTA'=>'V', 'GTG'=>'V',
      'TAG'=>'*', 'TGA'=>'*', 'TAA'=>'*'
    }

    residues = upcase.each_char.each_slice(3).map do |triplet|
      # an incomplete trailing codon yields nil and is dropped by compact
      next unless triplet.length == 3

      # the table only holds A/C/G/T codons, so any ambiguous triplet
      # falls through to 'x'
      codon_table[triplet.join] || 'x'
    end

    residues.compact.join
  end

  # Scans an array of codons (3-letter uppercase strings) for ORFs that
  # start at an 'ATG' and end at the next stop codon (TAG, TGA or TAA).
  #
  # a     - Array of codon strings, in reading order.
  # frame - value stored verbatim in every reported ORF.
  #
  # Only ORFs whose nucleotide string (start and stop codon included) is at
  # least 200 characters long are reported. Each reported ORF is an Array
  #   [orf_string, t_start, t_end, frame, stop_seen_before]
  # where t_end is 3 * (number of codons consumed so far) and
  # stop_seen_before tells whether any stop codon had been met before this
  # ORF was closed.
  #
  # NOTE(review): t_start is advanced by 1 (not 3) on the ATG codon and by 3
  # on every other codon outside an ORF; this bookkeeping is kept exactly as
  # it was - confirm the intended coordinate system before reusing it.
  #
  # Returns an Array of ORF records (empty when none qualifies).
  def generate_orf_old(a, frame)
    stops = %w[TAG TGA TAA]
    all_orfs = []

    in_orf = false
    stop_codon = false
    orf = ''
    t_start = 0
    t_end = 0

    a.each do |e|
      t_end += 3

      if in_orf
        orf += e
        next unless stops.include?(e)

        # closing stop codon reached: keep the ORF only if long enough
        all_orfs.push([orf, t_start, t_end, frame, stop_codon]) if orf.length >= 200

        orf = ''
        stop_codon = true
        in_orf = false
        t_start = t_end
      elsif e == 'ATG'
        in_orf = true
        orf += e
        t_start += 1
      else
        stop_codon = true if stops.include?(e)
        t_start += 3
      end
    end

    # all_orfs is always an Array, so it is returned unconditionally
    all_orfs
  end

  # Collects the ORFs of the six reading frames of the receiver: frames
  # 1, 2, 3 on the upcased sequence and -1, -2, -3 on its reverse
  # complement (see complementary_dna).
  #
  # NOTE(review): this calls generate_orf, which is not defined in this
  # class (only generate_orf_old is); presumably it is provided elsewhere -
  # verify, otherwise this method raises NoMethodError.
  #
  # Returns the concatenation of the six per-frame results.
  def orf_finder_old
    forward = upcase
    # now the ORFs of the complementary strand
    backward = forward.complementary_dna

    res = []
    [[forward, 1], [backward, -1]].each do |strand, sign|
      3.times do |offset|
        # drop the first `offset` nucleotides to shift the reading frame
        shifted = strand[offset..-1] || ''
        codons = shifted.each_char.each_slice(3).map(&:join)
        res += generate_orf(codons, sign * (offset + 1))
      end
    end

    res
  end

  # Returns the reverse complement of the receiver.
  #
  # A/T and C/G are swapped preserving case; every IUPAC ambiguity code
  # listed in the table (upper or lower case) becomes an uppercase 'N'.
  # Any character that is not in the table is silently dropped, so the
  # result can be shorter than the receiver.
  def complementary_dna
    pairs = {
      'A' => 'T', 'a' => 't', 'T' => 'A', 't' => 'a',
      'C' => 'G', 'c' => 'g', 'G' => 'C', 'g' => 'c',
      'N' => 'N', 'n' => 'N', 'R' => 'N', 'r' => 'N',
      'W' => 'N', 'w' => 'N', 'M' => 'N', 'm' => 'N',
      'K' => 'N', 'k' => 'N', 'S' => 'N', 's' => 'N',
      'Y' => 'N', 'y' => 'N', 'H' => 'N', 'h' => 'N',
      'B' => 'N', 'b' => 'N', 'D' => 'N', 'd' => 'N',
      'V' => 'N', 'v' => 'N'
    }

    # unmapped characters give nil, which join renders as nothing
    reverse.each_char.map { |base| pairs[base] }.join
  end

end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
class String

  # Longest common substring (contiguous) between the receiver and s2.
  #
  # s2 - the String to compare against.
  #
  # Uses the classic dynamic-programming recurrence: the length of the
  # common run ending at self[i] / s2[j] is 1 + the run ending at
  # self[i-1] / s2[j-1] when both characters match, 0 otherwise. Only the
  # previous row of the table is kept.
  #
  # When several substrings share the maximum length, the one found first
  # while scanning the receiver left to right is returned.
  #
  # Returns a new String ('' when there is no common character).
  def lcs(s2)
    # each_char / chars (not scan(/./)) so that newlines are counted too and
    # the loop index always matches the real position in the string
    other = s2.chars.to_a

    best_len = 0
    best_end = 0 # index in self of the last character of the best match
    prev_row = Array.new(other.size, 0)

    each_char.with_index do |l1, i|
      row = Array.new(other.size, 0)

      other.each_with_index do |l2, j|
        next unless l1 == l2

        row[j] = j.zero? ? 1 : prev_row[j - 1] + 1
        next unless row[j] > best_len

        best_len = row[j]
        best_end = i
      end

      prev_row = row
    end

    best_len.zero? ? '' : self[best_end - best_len + 1, best_len]
  end

end
|
@@ -0,0 +1,122 @@
|
|
1
|
+
$: << File.expand_path(File.join(File.dirname(__FILE__)))
|
2
|
+
|
3
|
+
require 'scbi_mapreduce'
|
4
|
+
require 'scbi_blast'
|
5
|
+
require 'json'
|
6
|
+
require 'sequence'
|
7
|
+
require 'fl_string_utils'
|
8
|
+
require "lcs" # like the class simliar of seqtrim, return the longest common sequence
|
9
|
+
require "test_code"
|
10
|
+
|
11
|
+
require 'fl_analysis'
|
12
|
+
include FlAnalysis
|
13
|
+
|
14
|
+
|
15
|
+
# Worker side of the map-reduce run: receives sequences, runs BLASTx
# against the configured databases and annotates each sequence.
class MyWorker < ScbiMapreduce::Worker

  # Called when the worker starts. Nothing is set up at present.
  def starting_worker
    # $WORKER_LOG.info "Loading actions"
  rescue StandardError => e
    puts(e.message + e.backtrace.join("\n"))
  end

  # Stores the options object sent by the manager; it is read later by
  # full_lenghter2.
  def receive_initial_config(obj)
    # Reads the parameters
    # $WORKER_LOG.info "Params received: #{obj.to_json}"
    @options = obj
  end

  # Annotates the received sequences in place and hands the same object back.
  def process_object(obj)
    full_lenghter2(obj)
    obj
  end

  # Called when the worker finishes. Nothing to release.
  def closing_worker
  end

  # Runs blastx for the given input sequences against the given database.
  #
  # input        - sequences passed to BatchBlast#do_blast_seqs.
  # database     - value used for the blast "-db" option.
  # user_db_name - display name of the database (only used by the
  #                commented-out progress messages).
  #
  # Returns whatever do_blast_seqs returns for the :xml format.
  def run_blastx(input, database, user_db_name)
    # puts "\n#{user_db_name} ..... executing BLASTx"

    blast = BatchBlast.new("-db #{database}", 'blastx', "-evalue 1e-6 -num_alignments 1 -num_descriptions 1")
    blast_result = blast.do_blast_seqs(input, :xml)

    # puts "#{user_db_name} ..... BLASTx finished"

    blast_result
  end

  # Annotation pipeline for one batch of sequences:
  #   1. optional user database (options[:user_db]),
  #   2. UniProt "sp_<tax_group>" database,
  #   3. UniProt "tr_<tax_group>" database,
  #   4. TestCode for the sequences carrying a :tcode annotation.
  # After each database step only the sequences that still have no
  # :complete annotation go on to the next database.
  #
  # seqs - collection of sequence objects responding to get_annotations.
  def full_lenghter2(seqs)
    tax_group = @options[:tax_group]

    # -------------------------------------------- User database
    # if the user has included his own database in the parameters entry,
    # the location of the database is tested, and blast and the results analysis is done
    if @options[:user_db]
      # basename also works for a database given without any directory part
      # (previously the name stayed nil in that case)
      user_db_name = File.basename(@options[:user_db])

      unless File.exist?("#{@options[:user_db]}.psq")
        puts "user database: #{@options[:user_db]} was not found"
        exit
      end

      blast_and_annotate(seqs, "#{@options[:user_db]}", user_db_name)

      new_seqs = seqs.select { |s| s.get_annotations(:complete).empty? }
    else
      new_seqs = seqs
    end

    # -------------------------------------------- UniProt (sp)
    blast_and_annotate(new_seqs, "sp_#{tax_group}/sp_#{tax_group}.fasta", "sp_#{tax_group}")

    new_seqs = seqs.select { |s| s.get_annotations(:complete).empty? }

    # -------------------------------------------- UniProt (tr)
    blast_and_annotate(new_seqs, "tr_#{tax_group}/tr_#{tax_group}.fasta", "tr_#{tax_group}")

    # -------------------------------------------- Test Code
    # the sequences without a reliable similarity with an orthologue are processed with Test Code
    testcode_input = seqs.select { |s| !s.get_annotations(:tcode).empty? }

    # enable this line to test tcode; every line above it in this method
    # has to be commented out
    # testcode_input=seqs

    testcode_input.each do |seq|
      TestCode.new(seq)
    end
  end

  private

  # Blasts seqs against database, then lets analiza_orf_y_fl analyse each
  # sequence together with the blast query result at the same position.
  def blast_and_annotate(seqs, database, db_name)
    my_blast = run_blastx(seqs, database, db_name)

    # split and parse blast
    seqs.each_with_index do |seq, i|
      analiza_orf_y_fl(seq, my_blast.querys[i], @options, db_name)
    end
  end

end
|
122
|
+
|
@@ -0,0 +1,167 @@
|
|
1
|
+
require 'json'
|
2
|
+
require 'scbi_fasta'
|
3
|
+
require 'sequence'
|
4
|
+
|
5
|
+
require 'fl2_stats'
|
6
|
+
include Fl2Stats
|
7
|
+
|
8
|
+
# Manager side of the map-reduce run: reads the input fasta, hands
# sequences to the workers and writes their annotations to the result
# files under fl2_results/.
class MyWorkerManager < ScbiMapreduce::WorkManager

  # Column header shared by annotations.txt and tcode_result.txt.
  RESULT_HEADER = "Query_id\tfasta_length\tSubject_id\tdb_name\tStatus\tt_code\te_value\tp_ident\tprotein_length\ts_length\tWarning_msgs\tframe\tORF_start\tORF_end\ts_start\ts_end\tDescription\tProtein_sequence"

  # open files and prepare global data
  #
  # options    - options object; options[:fasta] is the input file.
  # chunk_size - stored in @@chunk_size (not otherwise used in this class).
  def self.init_work_manager(options, chunk_size=100)
    input_file = options[:fasta]

    # File.exist? (File.exists? no longer exists in Ruby >= 3.2)
    Dir.mkdir('fl2_results') unless File.exist?('fl2_results')

    @@fasta_file = FastaQualFile.new(input_file, '')
    @@chunk_size = chunk_size
    @@options = options

    @@annotation_file = File.open("fl2_results/annotations.txt", 'w')
    @@annotation_file.puts RESULT_HEADER

    @@alignment_file = File.open("fl2_results/alignments.txt", 'w')
    @@prot_file = File.open("fl2_results/proteins.fasta", 'w')
    @@nts_file = File.open("fl2_results/nt_seq.txt", 'w')
    @@tcode_file = File.open("fl2_results/tcode_result.txt", 'w')
    @@tcode_file.puts RESULT_HEADER

    # @@error_fasta_file = File.open("fl2_results/error_seqs.fasta", 'w')
    # @@error_file = File.open("fl2_results/errors_info.txt", 'w')
  end

  # close files, then run summary_stats
  def self.end_work_manager
    @@fasta_file.close

    @@annotation_file.close
    @@alignment_file.close
    @@prot_file.close
    @@nts_file.close
    @@tcode_file.close

    # @@error_fasta_file.close
    # @@error_file.close

    summary_stats
  end

  # Prints the failing object together with the message and backtrace of
  # the original worker exception.
  def error_received(worker_error, obj)
    puts "Error while processing object #{obj.inspect}\n" + worker_error.original_exception.message + ":\n" + worker_error.original_exception.backtrace.join("\n")
  end

  # Logs that the run is being aborted because of too many errors.
  def too_many_errors_received
    $LOG.error "Too many errors: #{@@error_count} errors on #{@@count} executed sequences, exiting before finishing"
  end

  # send initial config
  def worker_initial_config
    @@options
  end

  # this method is called every time a worker needs a new work
  # Return the work data or nil if no more data is available
  def next_work
    n, f, q = @@fasta_file.next_seq

    # a nil name means the fasta file is exhausted
    n.nil? ? nil : Sequence.new(n, f, q)
  end

  # this method is executed each time an obj is finished
  #
  # obj - collection of processed sequences; each one is written out.
  def work_received(obj)
    obj.each do |seq|
      # puts seq.seq_name
      write_seq(seq)
    end
  end

  # Writes the annotations of one sequence to the result files.
  #
  # Exactly one of three cases is handled, in this order of precedence:
  #   * :complete annotation       -> annotations.txt plus alignment,
  #                                   protein and nucleotide files;
  #   * :tmp_annotation annotation -> same files, but the first element of
  #                                   the message goes to annotations.txt
  #                                   and empty alignment / protein
  #                                   messages are skipped;
  #   * :tcode annotation          -> tcode_result.txt.
  # Any StandardError raised while writing is reported on stdout and
  # swallowed so that the remaining sequences are still written.
  def write_seq(seq)
    begin
      # -------------------------------------------------------- Complete Seqs
      if (e = seq.get_annotations(:complete).first)
        @@annotation_file.puts e[:message]

        write_first_annotation(seq, :alignment, @@alignment_file, false)
        write_first_annotation(seq, :protein, @@prot_file, false)
        write_first_annotation(seq, :nucleotide, @@nts_file, false)
      # -------------------------------------------------------- Non Complete Seqs
      elsif (e = seq.get_annotations(:tmp_annotation).first)
        @@annotation_file.puts e[:message][0]

        write_first_annotation(seq, :alignment, @@alignment_file, true)
        write_first_annotation(seq, :protein, @@prot_file, true)
        write_first_annotation(seq, :nucleotide, @@nts_file, false)
      # -------------------------------------------------------- Test Code
      elsif (t = seq.get_annotations(:tcode).first)
        @@tcode_file.puts t[:message]
      end
      # -------------------------------------------------------- Errors
      # if e=seq.get_annotations(:error).first
      #   if !e[:message].empty?
      #     @@error_fasta_file.puts ">#{seq.seq_name}\n#{seq.seq_fasta}"
      #     @@error_file.puts e[:message]
      #   end
      # end
    rescue
      puts "Error printing #{seq.seq_name}"
    end
  end

  private

  # Writes the message of the first annotation of the given type (if any)
  # to file. With skip_empty the message is only written when it is not
  # empty.
  def write_first_annotation(seq, type, file, skip_empty)
    annotation = seq.get_annotations(type).first
    return unless annotation
    return if skip_empty && annotation[:message].empty?

    file.puts annotation[:message]
  end

end
|
167
|
+
|