full_lengther_next 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,139 @@
1
+
2
class String

  # Translates the receiver, read as DNA from its first base, into a string
  # of one-letter amino acid codes.
  #
  # The sequence is upcased and cut into consecutive triplets. Every complete
  # triplet found in the codon table yields its amino acid ('*' for the stop
  # codons TAG, TGA and TAA); any other complete triplet (ambiguity codes,
  # unknown characters) yields a lowercase 'x'. A trailing group of one or two
  # bases is ignored.
  #
  # @return [String] the translated sequence
  def translate
    codon_table = {
      'GCT'=>'A', 'GCC'=>'A', 'GCA'=>'A', 'GCG'=>'A',
      'CGT'=>'R', 'CGC'=>'R', 'CGA'=>'R', 'CGG'=>'R', 'AGA'=>'R', 'AGG'=>'R',
      'AAT'=>'N', 'AAC'=>'N',
      'GAT'=>'D', 'GAC'=>'D',
      'TGT'=>'C', 'TGC'=>'C',
      'CAA'=>'Q', 'CAG'=>'Q',
      'GAA'=>'E', 'GAG'=>'E',
      'GGT'=>'G', 'GGC'=>'G', 'GGA'=>'G', 'GGG'=>'G',
      'CAT'=>'H', 'CAC'=>'H',
      'ATT'=>'I', 'ATC'=>'I', 'ATA'=>'I',
      'TTA'=>'L', 'TTG'=>'L', 'CTT'=>'L', 'CTC'=>'L', 'CTA'=>'L', 'CTG'=>'L',
      'ATG'=>'M',
      'AAA'=>'K', 'AAG'=>'K',
      'TTT'=>'F', 'TTC'=>'F',
      'CCT'=>'P', 'CCC'=>'P', 'CCA'=>'P', 'CCG'=>'P',
      'TCT'=>'S', 'TCC'=>'S', 'TCA'=>'S', 'TCG'=>'S', 'AGT'=>'S', 'AGC'=>'S',
      'ACT'=>'T', 'ACC'=>'T', 'ACA'=>'T', 'ACG'=>'T',
      'TGG'=>'W',
      'TAT'=>'Y', 'TAC'=>'Y',
      'GTT'=>'V', 'GTC'=>'V', 'GTA'=>'V', 'GTG'=>'V',
      'TAG'=>'*', 'TGA'=>'*', 'TAA'=>'*'
    }

    upcase.split('').each_slice(3).map { |bases|
      codon = bases.join
      # an incomplete trailing triplet produces nothing (removed by compact)
      next unless codon.length == 3

      # if the triplet is not found, an 'x' is added; this also covers
      # triplets with ambiguity codes, which are never keys of the table
      codon_table.fetch(codon, 'x')
    }.compact.join
  end

  # Scans a list of codons for open reading frames (ATG ... stop codon).
  #
  # An ORF is reported only when its nucleotide length, ATG and stop codon
  # included, is at least 200. An ORF still open when the list ends is not
  # reported. The receiver itself is not read by this method.
  #
  # @param a [Array<String>] codons, expected in upper case
  # @param frame [Object] frame label stored verbatim in every reported ORF
  # @return [Array<Array>] one +[orf_sequence, t_start, t_end, frame,
  #   stop_codon]+ entry per ORF. +t_start+ and +t_end+ are 1-based base
  #   positions within the codon list (the A of the ATG and the last base of
  #   the stop codon); +stop_codon+ tells whether any stop codon had been seen
  #   before the one that closes this ORF.
  def generate_orf_old(a, frame)
    stop_codons = %w[TAG TGA TAA]

    all_orfs = []
    in_orf = false
    stop_seen = false
    orf = ''
    t_start = 0
    t_end = 0

    a.each do |codon|
      t_end += 3
      is_stop = stop_codons.include?(codon)

      if in_orf
        orf += codon
        next unless is_stop

        all_orfs.push([orf, t_start, t_end, frame, stop_seen]) if orf.length >= 200

        # reset the state: the next ORF can only start after this stop codon
        orf = ''
        stop_seen = true
        in_orf = false
        t_start = t_end
      elsif codon == 'ATG'
        in_orf = true
        orf += codon
        t_start += 1 # from "bases consumed so far" to the 1-based position of the A
      else
        stop_seen = true if is_stop
        t_start += 3
      end
    end

    all_orfs
  end

  # Collects the ORFs found in the six reading frames of the receiver.
  #
  # Frames 1, 2 and 3 are read on the upcased receiver after dropping 0, 1 and
  # 2 leading characters; frames -1, -2 and -3 are read the same way on its
  # reverse complement (see #complementary_dna). The positions reported for
  # each ORF are the ones computed by the ORF generator on the already
  # shifted strand.
  #
  # The original code calls +generate_orf+, which this class does not define.
  # That method is still used when something else provides it; otherwise the
  # sibling #generate_orf_old is used instead of failing with NoMethodError.
  # A frame for which the generator returns nil contributes nothing.
  #
  # @return [Array<Array>] the ORF entries of all six frames, in frame order
  def orf_finder_old
    generator = respond_to?(:generate_orf, true) ? :generate_orf : :generate_orf_old
    forward = upcase
    # now the ORFs of the complementary strand
    reverse_strand = forward.complementary_dna

    res = []
    [[forward, [1, 2, 3]], [reverse_strand, [-1, -2, -3]]].each do |strand, frames|
      frames.each do |frame|
        codons = strand.split('').each_slice(3).map { |bases| bases.join }
        res.concat(Array(send(generator, codons, frame)))
        # the next frame starts one base further
        strand = strand.sub(/^./, '')
      end
    end
    res
  end

  # Returns the reverse complement of the receiver.
  #
  # A/T/C/G are complemented keeping their case. The ambiguity codes
  # N, R, W, M, K, S, Y, H, B, D and V, in either case, become an uppercase
  # 'N'. Any other character has no entry in the table and is dropped, so the
  # result can be shorter than the receiver.
  #
  # @return [String] the reverse-complemented sequence
  def complementary_dna
    complement = {
      'A' => 'T', 'a' => 't', 'T' => 'A', 't' => 'a',
      'C' => 'G', 'c' => 'g', 'G' => 'C', 'g' => 'c'
    }
    %w[N R W M K S Y H B D V].each do |code|
      complement[code] = 'N'
      complement[code.downcase] = 'N'
    end

    reverse.split('').map { |base| complement[base] }.join
  end

end
@@ -0,0 +1,33 @@
1
+
2
+
3
class String

  # Returns the longest common substring (a contiguous run of characters)
  # shared by the receiver and +s2+.
  #
  # When several common substrings have the maximal length, the one that ends
  # first in the receiver is returned. An empty string is returned when the
  # two strings share no character.
  #
  # The comparison walks the actual characters of both strings, so newline
  # characters are compared like any other character. Only two rows of the
  # dynamic-programming table are kept in memory at a time.
  #
  # @param s2 [String] the string to compare with
  # @return [String] the longest common substring, taken from the receiver
  def lcs(s2)
    own_chars = chars.to_a
    other_chars = s2.chars.to_a

    best_len = 0
    best_end = 0 # index just past the last character of the best match in self

    # previous_row[j + 1] is the length of the common run ending at the
    # previous character of self and at other_chars[j]; index 0 is a zero
    # border so that no negative index is ever needed
    previous_row = Array.new(other_chars.size + 1, 0)

    own_chars.each_with_index do |own_char, i|
      current_row = Array.new(other_chars.size + 1, 0)

      other_chars.each_with_index do |other_char, j|
        next unless own_char == other_char

        run = previous_row[j] + 1
        current_row[j + 1] = run

        # strict comparison keeps the first match found for a given length
        if run > best_len
          best_len = run
          best_end = i + 1
        end
      end

      previous_row = current_row
    end

    self[best_end - best_len, best_len]
  end
end
@@ -0,0 +1,122 @@
1
+ $: << File.expand_path(File.join(File.dirname(__FILE__)))
2
+
3
+ require 'scbi_mapreduce'
4
+ require 'scbi_blast'
5
+ require 'json'
6
+ require 'sequence'
7
+ require 'fl_string_utils'
8
+ require "lcs" # like the class simliar of seqtrim, return the longest common sequence
9
+ require "test_code"
10
+
11
+ require 'fl_analysis'
12
+ include FlAnalysis
13
+
14
+
15
# Worker side of the map-reduce run: receives the options once, then
# annotates every batch of sequences it is handed through #process_object.
class MyWorker < ScbiMapreduce::Worker

  # Hook called when the worker starts. Nothing is loaded at the moment; any
  # error raised here is printed with its backtrace.
  def starting_worker
    # $WORKER_LOG.info "Loading actions"
  rescue StandardError => e
    puts(e.message + e.backtrace.join("\n"))
  end

  # Stores the parameters sent by the manager.
  #
  # @param obj [Hash] the options; this class reads :user_db and :tax_group
  def receive_initial_config(obj)
    # Reads the parameters
    # $WORKER_LOG.info "Params received: #{obj.to_json}"
    @options = obj
  end

  # Processes one unit of work and hands it back to the manager.
  #
  # @param obj the sequences to annotate (iterated with each_with_index and
  #   select, so a collection is expected)
  # @return the same object, whose sequences were passed through the pipeline
  def process_object(obj)
    full_lenghter2(obj)
    obj
  end

  # Hook called when the worker finishes. Nothing to release.
  def closing_worker
  end

  # Runs blastx using the input and database parameters.
  #
  # @param input the sequences to blast
  # @param database [String] database given to the -db blast option
  # @param user_db_name [String] database label; only used by the
  #   commented-out progress messages
  # @return the result of BatchBlast#do_blast_seqs in :xml mode
  def run_blastx(input, database, user_db_name)
    # puts "\n#{user_db_name} ..... executing BLASTx"

    blast = BatchBlast.new("-db #{database}", 'blastx', "-evalue 1e-6 -num_alignments 1 -num_descriptions 1")
    blast_result = blast.do_blast_seqs(input, :xml)

    # puts "#{user_db_name} ..... BLASTx finished"

    blast_result
  end

  # Annotation pipeline for a batch of sequences: optional user database,
  # then UniProt sp, then UniProt tr, and finally Test Code.
  #
  # After each of the first two stages only the sequences that still have no
  # :complete annotation are sent to the next database.
  #
  # @param seqs the sequences to annotate; they must respond to
  #   get_annotations
  def full_lenghter2(seqs)
    # -------------------------------------------- User database
    # if the user has included his own database in the parameters entry,
    # the location of the database is tested, and blast and the results analysis is done
    user_db = @options[:user_db]

    if user_db
      # Strip the directory part. A path without "/" is left untouched, so the
      # name is always set (it used to stay nil in that case).
      user_db_name = user_db.sub(/.+\//, '')

      # File.exist? replaces File.exists?, which newer Ruby versions removed
      unless File.exist?("#{user_db}.psq")
        puts "user database: #{user_db} was not found"
        exit
      end

      blast_and_analyse(seqs, "#{user_db}", user_db_name)

      new_seqs = seqs.select { |s| s.get_annotations(:complete).empty? }
    else
      new_seqs = seqs
    end

    # -------------------------------------------- UniProt (sp)
    sp_name = "sp_#{@options[:tax_group]}"
    blast_and_analyse(new_seqs, "#{sp_name}/#{sp_name}.fasta", sp_name)

    new_seqs = seqs.select { |s| s.get_annotations(:complete).empty? }

    # -------------------------------------------- UniProt (tr)
    tr_name = "tr_#{@options[:tax_group]}"
    blast_and_analyse(new_seqs, "#{tr_name}/#{tr_name}.fasta", tr_name)

    # -------------------------------------------- Test Code
    # the sequences without a reliable similarity with an orthologue are processed with Test Code
    # NOTE(review): the selection keeps the sequences that carry a :tcode
    # annotation; confirm that the analysis step is what sets that annotation.
    testcode_input = seqs.select { |s| !s.get_annotations(:tcode).empty? }

    # enable this line to test tcode; all the lines above in this method must be commented out
    # testcode_input=seqs

    testcode_input.each do |seq|
      TestCode.new(seq)
    end
  end

  private

  # Blasts +seqs+ against +database+ and analyses every sequence together
  # with the blast query result found at the same index.
  def blast_and_analyse(seqs, database, db_name)
    my_blast = run_blastx(seqs, database, db_name)

    # split and parse blast
    seqs.each_with_index do |seq, i|
      analiza_orf_y_fl(seq, my_blast.querys[i], @options, db_name)
    end
  end

end
122
+
@@ -0,0 +1,167 @@
1
+ require 'json'
2
+ require 'scbi_fasta'
3
+ require 'sequence'
4
+
5
+ require 'fl2_stats'
6
+ include Fl2Stats
7
+
8
# Manager side of the map-reduce run: reads the input fasta file, hands the
# sequences to the workers and writes their annotations to the result files
# under fl2_results/.
class MyWorkerManager < ScbiMapreduce::WorkManager

  # Column header shared by annotations.txt and tcode_result.txt
  RESULT_HEADER = "Query_id\tfasta_length\tSubject_id\tdb_name\tStatus\tt_code\te_value\tp_ident\tprotein_length\ts_length\tWarning_msgs\tframe\tORF_start\tORF_end\ts_start\ts_end\tDescription\tProtein_sequence"

  # Opens the input and output files and prepares the global data.
  #
  # @param options [Hash] run options; :fasta is the input file
  # @param chunk_size [Integer] stored for later use
  def self.init_work_manager(options, chunk_size = 100)
    input_file = options[:fasta]

    # File.exist? replaces File.exists?, which newer Ruby versions removed
    Dir.mkdir('fl2_results') unless File.exist?('fl2_results')

    @@fasta_file = FastaQualFile.new(input_file, '')
    @@chunk_size = chunk_size
    @@options = options

    @@annotation_file = File.open("fl2_results/annotations.txt", 'w')
    @@annotation_file.puts RESULT_HEADER

    @@alignment_file = File.open("fl2_results/alignments.txt", 'w')
    @@prot_file = File.open("fl2_results/proteins.fasta", 'w')
    @@nts_file = File.open("fl2_results/nt_seq.txt", 'w')
    @@tcode_file = File.open("fl2_results/tcode_result.txt", 'w')
    @@tcode_file.puts RESULT_HEADER

    # @@error_fasta_file = File.open("fl2_results/error_seqs.fasta", 'w')
    # @@error_file = File.open("fl2_results/errors_info.txt", 'w')
  end

  # Closes every file opened by init_work_manager, then calls summary_stats.
  def self.end_work_manager
    @@fasta_file.close

    @@annotation_file.close
    @@alignment_file.close
    @@prot_file.close
    @@nts_file.close
    @@tcode_file.close

    # @@error_fasta_file.close
    # @@error_file.close

    summary_stats
  end

  # Prints the object a worker failed on, with the message and backtrace of
  # the original exception.
  def error_received(worker_error, obj)
    puts "Error while processing object #{obj.inspect}\n" + worker_error.original_exception.message + ":\n" + worker_error.original_exception.backtrace.join("\n")
  end

  # Logs that the run is being aborted because of too many errors.
  # NOTE(review): @@error_count and @@count are not assigned in this class;
  # presumably the superclass maintains them - confirm.
  def too_many_errors_received
    $LOG.error "Too many errors: #{@@error_count} errors on #{@@count} executed sequences, exiting before finishing"
  end

  # send initial config
  def worker_initial_config
    @@options
  end

  # This method is called every time a worker needs a new work.
  #
  # @return [Sequence, nil] the next sequence of the fasta file, or nil when
  #   no more data is available
  def next_work
    # Building chunks of @@chunk_size sequences here is disabled; one
    # sequence is returned per call.
    name, fasta, qual = @@fasta_file.next_seq

    return nil if name.nil?

    Sequence.new(name, fasta, qual)
  end

  # This method is executed each time an obj is finished: every sequence of
  # the received object is written to the result files.
  def work_received(obj)
    obj.each do |seq|
      # puts seq.seq_name
      write_seq(seq)
    end
  end

  # Writes the annotations of one sequence to the result files.
  #
  # Only the first annotation of each kind is used, and only the first of
  # these cases that applies is written:
  # * :complete       - annotation, alignment, protein and nucleotide files
  # * :tmp_annotation - same files; the annotation line is the first element
  #                     of the message, and empty alignment/protein messages
  #                     are skipped
  # * :tcode          - Test Code result file
  #
  # Any error is reported on stdout and otherwise ignored, so that one bad
  # sequence does not stop the run.
  def write_seq(seq)
    # -------------------------------------------------------- Complete Seqs
    if (complete = seq.get_annotations(:complete).first)
      @@annotation_file.puts complete[:message]

      if (alignment = seq.get_annotations(:alignment).first)
        @@alignment_file.puts alignment[:message]
      end

      if (protein = seq.get_annotations(:protein).first)
        @@prot_file.puts protein[:message]
      end

      if (nucleotide = seq.get_annotations(:nucleotide).first)
        @@nts_file.puts nucleotide[:message]
      end
    # -------------------------------------------------------- Non Complete Seqs
    elsif (partial = seq.get_annotations(:tmp_annotation).first)
      @@annotation_file.puts partial[:message][0]

      alignment = seq.get_annotations(:alignment).first
      @@alignment_file.puts alignment[:message] if alignment && !alignment[:message].empty?

      protein = seq.get_annotations(:protein).first
      @@prot_file.puts protein[:message] if protein && !protein[:message].empty?

      if (nucleotide = seq.get_annotations(:nucleotide).first)
        @@nts_file.puts nucleotide[:message]
      end
    # -------------------------------------------------------- Test Code
    elsif (tcode = seq.get_annotations(:tcode).first)
      @@tcode_file.puts tcode[:message]
    end
    # -------------------------------------------------------- Errors
    # Writing the :error annotations to error_seqs.fasta / errors_info.txt is
    # disabled, like the opening of those files in init_work_manager.
  rescue
    puts "Error printing #{seq.seq_name}"
  end

end
167
+