full_lengther_next 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,139 @@
1
+
2
# Nucleotide-sequence helpers added directly to String: codon translation,
# open-reading-frame (ORF) discovery and reverse-complement computation.
class String

  # Standard genetic code: DNA codon => one-letter amino acid ('*' is a stop codon).
  # Built once instead of on every #translate call.
  FLN_CODON_TABLE = {
    'GCT'=>'A','GCC'=>'A','GCA'=>'A','GCG'=>'A',
    'CGT'=>'R','CGC'=>'R','CGA'=>'R','CGG'=>'R','AGA'=>'R','AGG'=>'R',
    'AAT'=>'N','AAC'=>'N',
    'GAT'=>'D','GAC'=>'D',
    'TGT'=>'C','TGC'=>'C',
    'CAA'=>'Q','CAG'=>'Q',
    'GAA'=>'E','GAG'=>'E',
    'GGT'=>'G','GGC'=>'G','GGA'=>'G','GGG'=>'G',
    'CAT'=>'H','CAC'=>'H',
    'ATT'=>'I','ATC'=>'I','ATA'=>'I',
    'TTA'=>'L','TTG'=>'L','CTT'=>'L','CTC'=>'L','CTA'=>'L','CTG'=>'L',
    'ATG'=>'M',
    'AAA'=>'K','AAG'=>'K',
    'TTT'=>'F','TTC'=>'F',
    'CCT'=>'P','CCC'=>'P','CCA'=>'P','CCG'=>'P',
    'TCT'=>'S','TCC'=>'S','TCA'=>'S','TCG'=>'S','AGT'=>'S','AGC'=>'S',
    'ACT'=>'T','ACC'=>'T','ACA'=>'T','ACG'=>'T',
    'TGG'=>'W',
    'TAT'=>'Y','TAC'=>'Y',
    'GTT'=>'V','GTC'=>'V','GTA'=>'V','GTG'=>'V',
    'TAG'=>'*','TGA'=>'*','TAA'=>'*'
  }.freeze

  # Codons that terminate an open reading frame.
  FLN_STOP_CODONS = %w[TAG TGA TAA].freeze

  # Base => complementary base. A/T/C/G keep their case; N and the ambiguity
  # codes R W M K S Y H B D V (either case) all become an upper-case 'N'.
  FLN_COMPLEMENT = begin
    table = { 'A'=>'T', 'a'=>'t', 'T'=>'A', 't'=>'a', 'C'=>'G', 'c'=>'g', 'G'=>'C', 'g'=>'c' }
    %w[N R W M K S Y H B D V].each do |code|
      table[code] = 'N'
      table[code.downcase] = 'N'
    end
    table.freeze
  end

  # Translates the string, read as DNA from its first character, into a
  # protein string of one-letter amino-acid codes.
  #
  # The input is upper-cased and cut into consecutive triplets. A triplet that
  # is not in the codon table (ambiguous or unexpected characters) yields 'x';
  # a trailing fragment shorter than three characters is ignored.
  #
  # Returns a new String.
  def translate
    residues = upcase.each_char.each_slice(3).map do |bases|
      # The table holds all 64 A/C/G/T codons, so any other triplet falls back to 'x'.
      FLN_CODON_TABLE.fetch(bases.join, 'x') if bases.length == 3
    end
    residues.compact.join
  end

  # Collects the ORFs found in an array of codons belonging to one reading frame.
  #
  # a     - Array of codon Strings (expected upper-case).
  # frame - frame label stored verbatim in every result entry.
  #
  # An ORF starts at an 'ATG' and ends at the next stop codon (included); it is
  # kept only when it is at least 200 nucleotides long. An ORF that is still
  # open when the codons run out is discarded.
  #
  # Returns an Array (possibly empty) of entries
  #   [orf_sequence, t_start, t_end, frame, stop_seen_before]
  # where t_end is three times the number of codons consumed through the stop
  # codon, t_start is the running start counter kept by the loop below, and
  # stop_seen_before tells whether any stop codon preceded this ORF.
  def generate_orf_old(a,frame)
    all_orfs = []
    in_orf = false
    stop_codon = false
    orf = ''
    t_start = 0
    t_end = 0

    a.each do |e|
      t_end += 3
      is_stop = FLN_STOP_CODONS.include?(e)

      if in_orf
        orf += e
        if is_stop
          all_orfs.push [orf, t_start, t_end, frame, stop_codon] if orf.length >= 200
          orf = ''
          stop_codon = true
          in_orf = false
          t_start = t_end
        end
      elsif e == 'ATG'
        in_orf = true
        orf += e
        t_start += 1
      else
        # Outside an ORF every codon advances the start counter; a stop codon
        # is additionally remembered for the next ORF's flag.
        stop_codon = true if is_stop
        t_start += 3
      end
    end

    all_orfs
  end

  # Gathers ORFs from the six reading frames: frames 1, 2, 3 of the upper-cased
  # string and frames -1, -2, -3 of its #complementary_dna.
  #
  # NOTE(review): this calls `generate_orf`, which is not defined in this class
  # as shown here (only `generate_orf_old` is) - presumably it is defined
  # elsewhere; confirm before relying on this method.
  #
  # Returns the concatenation of the per-frame results.
  def orf_finder_old
    res = []
    forward = upcase
    strands = [[forward, 1], [forward.complementary_dna, -1]]

    strands.each do |strand, sign|
      1.upto(3) do |offset|
        # Dropping the leading base shifts the reading frame by one.
        strand.sub!(/^./,'') if offset > 1
        codons = strand.each_char.each_slice(3).map(&:join)
        res += generate_orf(codons, sign * offset)
      end
    end

    res
  end

  # Returns the reverse complement of the string.
  #
  # Characters missing from the complement table contribute nothing to the
  # result, so the output can be shorter than the input.
  def complementary_dna
    reverse.each_char.map { |base| FLN_COMPLEMENT[base] }.join
  end

end
@@ -0,0 +1,33 @@
1
+
2
+
3
class String

  # Returns the longest common substring (contiguous run of characters) shared
  # by this string and +s2+.
  #
  # s2 - the String to compare against.
  #
  # When several common substrings have the maximal length, the one that ends
  # earliest in the receiver is returned. Returns "" when there is no common
  # character or when either string is empty.
  #
  # Characters are enumerated with #chars so that newlines count as ordinary
  # characters and indices stay aligned with the receiver when slicing.
  def lcs(s2)
    mine = chars
    theirs = s2.chars
    return '' if mine.empty? || theirs.empty?

    best_len = 0
    best_end = 0
    # prev_row[j] = length of the common run ending at the previous receiver
    # character and at theirs[j]; only one earlier row is ever needed.
    prev_row = Array.new(theirs.size, 0)

    mine.each_with_index do |c1, i|
      row = Array.new(theirs.size, 0)
      theirs.each_with_index do |c2, j|
        next unless c1 == c2
        run = j.zero? ? 1 : prev_row[j - 1] + 1
        row[j] = run
        # Strict comparison keeps the first maximal run that was found.
        if run > best_len
          best_len = run
          best_end = i
        end
      end
      prev_row = row
    end

    best_len.zero? ? '' : self[best_end - best_len + 1, best_len]
  end
end
@@ -0,0 +1,122 @@
1
+ $: << File.expand_path(File.join(File.dirname(__FILE__)))
2
+
3
+ require 'scbi_mapreduce'
4
+ require 'scbi_blast'
5
+ require 'json'
6
+ require 'sequence'
7
+ require 'fl_string_utils'
8
+ require "lcs" # like the class simliar of seqtrim, return the longest common sequence
9
+ require "test_code"
10
+
11
+ require 'fl_analysis'
12
+ include FlAnalysis
13
+
14
+
15
# Worker side of the map/reduce job: receives sequences, runs BLASTx against
# the configured databases, hands each hit to analiza_orf_y_fl and finally
# runs TestCode on the sequences carrying a :tcode annotation.
class MyWorker < ScbiMapreduce::Worker

  # Start-up hook; currently does nothing.
  def starting_worker
    # $WORKER_LOG.info "Loading actions"
  rescue StandardError => e
    puts(e.message + e.backtrace.join("\n"))
  end

  # Stores the options object sent by the manager for later use.
  def receive_initial_config(obj)
    # $WORKER_LOG.info "Params received: #{obj.to_json}"
    @options = obj
  end

  # Processes one work unit in place and returns the same object.
  def process_object(obj)
    full_lenghter2(obj)
    obj
  end

  # Shutdown hook; currently does nothing.
  def closing_worker
  end

  # Runs blastx for +input+ against +database+ and returns the result
  # requested in :xml format from BatchBlast#do_blast_seqs.
  #
  # user_db_name is accepted for interface compatibility; it was only used by
  # progress messages that are now disabled.
  def run_blastx(input, database, user_db_name)
    blast = BatchBlast.new("-db #{database}", 'blastx', "-evalue 1e-6 -num_alignments 1 -num_descriptions 1")
    blast.do_blast_seqs(input, :xml)
  end

  # Annotates +seqs+ in successive stages:
  #
  # 1. user database (only when @options[:user_db] is set),
  # 2. the "sp_<tax_group>" database,
  # 3. the "tr_<tax_group>" database,
  # 4. TestCode for every sequence that has a :tcode annotation.
  #
  # Stages 2 and 3 only receive the sequences whose :complete annotation list
  # is still empty after the preceding stage. If the user database file
  # "<user_db>.psq" is missing, a message is printed and the process exits.
  def full_lenghter2(seqs)
    # -------------------------------------------- User database
    if @options[:user_db]
      # Strip any leading directories. Applied unconditionally so that a bare
      # database name (no '/') is also used as the name instead of nil.
      user_db_name = @options[:user_db].sub(/.+\//,'')

      # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
      if !File.exist?("#{@options[:user_db]}.psq")
        puts "user database: #{@options[:user_db]} was not found"
        exit
      end

      blast_and_analyze(seqs, "#{@options[:user_db]}", user_db_name)
      new_seqs = seqs.select{|s| s.get_annotations(:complete).empty?}
    else
      new_seqs = seqs
    end

    # -------------------------------------------- UniProt (sp)
    sp_name = "sp_#{@options[:tax_group]}"
    blast_and_analyze(new_seqs, "#{sp_name}/#{sp_name}.fasta", sp_name)

    new_seqs = seqs.select{|s| s.get_annotations(:complete).empty?}

    # -------------------------------------------- UniProt (tr)
    tr_name = "tr_#{@options[:tax_group]}"
    blast_and_analyze(new_seqs, "#{tr_name}/#{tr_name}.fasta", tr_name)

    # -------------------------------------------- Test Code
    # the sequences without a reliable similarity with an orthologue are processed with Test Code
    testcode_input = seqs.select{|s| !s.get_annotations(:tcode).empty?}

    # To test tcode on its own, enable the next line and comment out every
    # stage above it in this method.
    # testcode_input=seqs

    testcode_input.each do |seq|
      TestCode.new(seq)
    end
  end

  private

  # Blasts +seqs+ against +database+ and passes each sequence, together with
  # the blast query at the same index, to analiza_orf_y_fl under +db_name+.
  def blast_and_analyze(seqs, database, db_name)
    my_blast = run_blastx(seqs, database, db_name)

    seqs.each_with_index do |seq,i|
      analiza_orf_y_fl(seq, my_blast.querys[i], @options, db_name)
    end
  end

end
122
+
@@ -0,0 +1,167 @@
1
+ require 'json'
2
+ require 'scbi_fasta'
3
+ require 'sequence'
4
+
5
+ require 'fl2_stats'
6
+ include Fl2Stats
7
+
8
# Manager side of the map/reduce job: reads sequences from the input FASTA
# file, hands them out as work and writes the annotations of each finished
# sequence to the result files under fl2_results/.
class MyWorkerManager < ScbiMapreduce::WorkManager

  # Column header shared by annotations.txt and tcode_result.txt.
  RESULT_HEADER = "Query_id\tfasta_length\tSubject_id\tdb_name\tStatus\tt_code\te_value\tp_ident\tprotein_length\ts_length\tWarning_msgs\tframe\tORF_start\tORF_end\ts_start\ts_end\tDescription\tProtein_sequence"

  # Opens the input file and the output files and stores the shared settings.
  #
  # options    - options object; options[:fasta] is the input FASTA path. The
  #              whole object is later sent to the workers.
  # chunk_size - stored in @@chunk_size (default 100).
  def self.init_work_manager(options,chunk_size=100)
    input_file = options[:fasta]

    # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
    Dir.mkdir('fl2_results') if !File.exist?('fl2_results')

    @@fasta_file = FastaQualFile.new(input_file,'')
    @@chunk_size = chunk_size
    @@options = options

    @@annotation_file = File.open("fl2_results/annotations.txt", 'w')
    @@annotation_file.puts RESULT_HEADER

    @@alignment_file = File.open("fl2_results/alignments.txt", 'w')
    @@prot_file = File.open("fl2_results/proteins.fasta", 'w')
    @@nts_file = File.open("fl2_results/nt_seq.txt", 'w')
    @@tcode_file = File.open("fl2_results/tcode_result.txt", 'w')
    @@tcode_file.puts RESULT_HEADER

    # @@error_fasta_file = File.open("fl2_results/error_seqs.fasta", 'w')
    # @@error_file = File.open("fl2_results/errors_info.txt", 'w')
  end

  # Closes every file opened by init_work_manager, then calls summary_stats.
  def self.end_work_manager
    @@fasta_file.close

    @@annotation_file.close
    @@alignment_file.close
    @@prot_file.close
    @@nts_file.close
    @@tcode_file.close

    # @@error_fasta_file.close
    # @@error_file.close

    summary_stats
  end

  # Prints the failing object together with the original exception's message
  # and backtrace.
  def error_received(worker_error, obj)
    puts "Error while processing object #{obj.inspect}\n" + worker_error.original_exception.message + ":\n" +worker_error.original_exception.backtrace.join("\n")
  end

  # Logs that processing stops because of too many errors.
  #
  # NOTE(review): @@error_count and @@count are never assigned in this class;
  # presumably the superclass defines them - confirm.
  def too_many_errors_received
    $LOG.error "Too many errors: #{@@error_count} errors on #{@@count} executed sequences, exiting before finishing"
  end

  # Returns the options object that is sent to each worker as initial config.
  def worker_initial_config
    @@options
  end

  # Called every time a worker needs new work.
  # Returns a Sequence built from the next FASTA entry, or nil when the entry
  # name is nil (no more data).
  def next_work
    n,f,q = @@fasta_file.next_seq
    return nil if n.nil?

    Sequence.new(n,f,q)
  end

  # Called each time a work unit is finished: writes every sequence in +obj+.
  def work_received(obj)
    obj.each do |seq|
      write_seq(seq)
    end
  end

  # Writes the annotations of +seq+ to the result files. Exactly one of the
  # following applies, checked in this order:
  #
  # * :complete annotation       - its message goes to the annotation file,
  #                                plus alignment/protein/nucleotide details.
  # * :tmp_annotation annotation - the first element of its message goes to
  #                                the annotation file, plus the details,
  #                                skipping empty alignment/protein messages.
  # * :tcode annotation          - its message goes to the tcode file.
  #
  # Any StandardError raised while writing is reported on stdout with the
  # sequence name and otherwise ignored, so one bad sequence does not stop
  # the run.
  def write_seq(seq)
    begin
      if (complete = seq.get_annotations(:complete).first)
        @@annotation_file.puts complete[:message]
        write_details(seq, false)
      elsif (partial = seq.get_annotations(:tmp_annotation).first)
        @@annotation_file.puts partial[:message][0]
        write_details(seq, true)
      elsif (tcode = seq.get_annotations(:tcode).first)
        @@tcode_file.puts tcode[:message]
      end
    rescue
      puts "Error printing #{seq.seq_name}"
    end
  end

  private

  # Writes the first :alignment, :protein and :nucleotide annotation of +seq+
  # (when present) to their files. With +skip_empty+ set, alignment and
  # protein messages that are empty are not written; nucleotide messages are
  # always written.
  def write_details(seq, skip_empty)
    alignment = seq.get_annotations(:alignment).first
    if alignment && !(skip_empty && alignment[:message].empty?)
      @@alignment_file.puts alignment[:message]
    end

    protein = seq.get_annotations(:protein).first
    if protein && !(skip_empty && protein[:message].empty?)
      @@prot_file.puts protein[:message]
    end

    nucleotide = seq.get_annotations(:nucleotide).first
    @@nts_file.puts nucleotide[:message] if nucleotide
  end

end
167
+