full_lengther_next 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,144 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # 15-2-2011 Noe Fernandez-Pozo
4
+ # Script to create your own Full-LengtherNext User database.
5
+
6
+ require 'net/ftp'
7
+
8
# Require exactly two arguments (a taxonomic group and a UniProt taxonomic
# division); otherwise print the usage text and exit with a non-zero status.
if ARGV.size != 2
  puts "incorrect number of arguments, you need a taxonomic group like 'Coniferopsida', you can search it in 'http://www.ncbi.nlm.nih.gov/Taxonomy/'\n" \
       "and a UniProt taxonomic group from this list:\n" \
       "\n" \
       "fungi\n" \
       "human\n" \
       "invertebrates\n" \
       "mammals\n" \
       "plants\n" \
       "rodents\n" \
       "vertebrates\n" \
       "\n" \
       "mode of use: ruby make_user_db.rb coniferopsida plants\n\n"

  Process.exit(-1)
end

# my_group:      taxonomic group used to filter the UniProt entries.
# uniprot_group: UniProt division naming the .dat files that are read.
my_group, uniprot_group = ARGV
28
+
29
+ ################################################### Functions
30
+
31
# Appends to +output_file+, as FASTA records, the complete proteins of one
# taxonomic group found in a UniProtKB flat file.
#
# An entry is written only when all of the following hold:
# * an OC line contains +my_group+ (case-insensitive, matched literally) and
#   no DE line seen before it carries "Flags: Fragment";
# * no FT NON_TER / FT NON_CONS feature line follows;
# * the sequence starts with M (or m).
#
# The FASTA header is ">id description organism organelle", where the
# description is the first DE line with "RecName: Full=" replaced by "sp="
# and "SubName: Full=" replaced by "tr=".
#
# output_file:: open, writable IO-like object; records are added with +puts+.
# file_name::   path of the UniProtKB .dat file to read.
# my_group::    name of the taxonomic group to keep.
def filter_incomplete_seqs(output_file, file_name, my_group)
  puts " filtering sequences"

  # UniProtKB fragments with FT NON_CONS and FT NON_TER features.
  #
  # * FT NON_TER: The residue at an extremity of the sequence is not the terminal residue. If applied to position 1, this signifies that the first position is not the N-terminus of the complete molecule. If applied to the last position, it means that this position is not the C-terminus of the complete molecule. There is no description field for this key. Examples of NON_TER key feature lines:
  #     FT NON_TER 1 1
  #     FT NON_TER 29 29
  # * FT NON_CONS: Non-consecutive residues. Indicates that two residues in a sequence are not consecutive and that there are a number of unreported or missing residues between them. Example of a NON_CONS key feature line:
  #     FT NON_CONS 1683 1684
  #
  # NON_CONS fragments are not indicated as non-consecutive in InterPro and being non-consecutive the match to methods may be incorrect if the method spans the 'break'.

  # Built once instead of once per line; the group name is escaped so that
  # regex metacharacters in it are matched literally.
  group_line = /^OC\s+[\w\s\;]*#{Regexp.escape(my_group)}/i

  in_entry = false
  print_seq = false
  incomplete = false
  id = ''
  description = ''
  organism_name = ''
  organelle = ''
  seq = ''

  # Block form so the input file is closed even if parsing raises.
  File.open(file_name) do |input|
    input.each_line do |line|
      unless in_entry
        # Lines before the first AC line of an entry are ignored; the AC line
        # starts a new entry and resets the per-entry state.
        if line =~ /^AC\s+(\w+);/
          id = $1
          in_entry = true
          print_seq = false
          incomplete = false
          description = ''
          organism_name = ''
          organelle = ''
          seq = ''
        end
        next
      end

      if line =~ /^DE\s+(.+)/
        # Only the first DE line becomes the description. The capture keeps
        # any trailing ';' exactly as it appears in the flat file.
        if description == ''
          description = $1
          description.sub!(/RecName: Full=/, 'sp=')
          description.sub!(/SubName: Full=/, 'tr=')
        end
        incomplete = true if line =~ /Flags: Fragment/
      elsif line =~ /^OS\s+(.+)/
        organism_name = $1
      elsif line =~ /^OG\s+(.+)/
        organelle = $1
      elsif line =~ group_line && !incomplete
        print_seq = true
      elsif line =~ /^FT\s+NON_TER\s+/
        print_seq = false
      elsif line =~ /^FT\s+NON_CONS\s+(\d+)\s+/
        print_seq = false
      elsif line =~ /^\s+([\w\s]+)/
        # Lines starting with whitespace are accumulated as sequence data.
        seq << $1
      elsif line =~ /^\/\//
        # End of entry: strip whitespace from the sequence and decide.
        seq.gsub!(/\s+/, '')
        print_seq = false if seq !~ /^M/i
        in_entry = false

        if print_seq
          output_file.puts ">#{id} #{description} #{organism_name} #{organelle}\n#{seq}"
        end
      end
    end
  end
end
106
+
107
+ ########################################################
108
+ ## MAIN
109
+ ########################################################
110
+
111
# Builds the user database: filters the SwissProt and TrEMBL flat files of
# the chosen UniProt division into <BLASTDB>/<my_group>/<my_group>.fasta and
# formats that FASTA file with makeblastdb.
require 'fileutils'

ROOT_PATH=File.dirname(__FILE__)

# $: << File.expand_path(File.join(ROOT_PATH, "classes"))

# load gem path, only to test locally
# $: << File.expand_path('~/progs/ruby/gems/full_lengther_next/lib')

# NOTE(review): the library loaded here also assigns ROOT_PATH, so after this
# require the constant presumably points at the library's own
# 'full_lengther_next' directory rather than at this script's directory.
# The statement order is kept as it was; confirm which location is meant to
# hold blast_dbs.
require 'full_lengther_next'

# Use $BLASTDB when it names an existing path, otherwise ROOT_PATH/blast_dbs.
# (File.exist? replaces File.exists?, which newer Ruby versions removed.)
if ENV['BLASTDB'] && File.exist?(ENV['BLASTDB'])
  formatted_db_path = ENV['BLASTDB']
else # otherwise use ROOTPATH + DB
  formatted_db_path = File.expand_path(File.join(ROOT_PATH, "blast_dbs"))
end

ENV['BLASTDB']=formatted_db_path

# Create the directory that was actually checked (inside BLASTDB), not a
# "blast_dbs/<group>" path relative to the current working directory.
group_dir = File.join(ENV['BLASTDB'], my_group)
FileUtils.mkdir_p(group_dir) unless File.exist?(group_dir)

output_file_path=File.join(group_dir,my_group+".fasta")

# Block form closes the FASTA file even if filtering raises.
File.open(output_file_path, "w") do |output_file|
  filter_incomplete_seqs(output_file, File.join(ENV['BLASTDB'], "uniprot_sprot_#{uniprot_group}.dat"), my_group)
  filter_incomplete_seqs(output_file, File.join(ENV['BLASTDB'], "uniprot_trembl_#{uniprot_group}.dat"), my_group)
end

# Argument-array form: no shell is involved, so paths containing spaces or
# shell metacharacters are passed through intact. Standard output is read and
# discarded, as the backtick call did.
IO.popen(['makeblastdb', '-in', output_file_path, '-dbtype', 'prot', '-parse_seqids']) { |io| io.read }
warn "makeblastdb exited with an error while formatting #{output_file_path}" unless $?.success?

puts "make_user_db.rb has finished"
@@ -0,0 +1,13 @@
1
# Put this file's directory at the front of the load path, unless it is
# already there either as given or in expanded form.
lib_dir = File.dirname(__FILE__)
unless $:.include?(lib_dir) || $:.include?(File.expand_path(lib_dir))
  $:.unshift(lib_dir)
end

# Directory holding the library's own files.
# NOTE(review): this is a top-level constant, so a script that defines its own
# ROOT_PATH before requiring this file (make_user_db.rb does) gets it
# reassigned here.
ROOT_PATH=File.join(File.dirname(__FILE__),'full_lengther_next')

# Make the classes directory requirable; skip it when already present so that
# loading this file more than once does not add duplicate entries.
classes_dir = File.expand_path(File.join(ROOT_PATH, 'classes'))
$: << classes_dir unless $:.include?(classes_dir)


module FullLengtherNext
  VERSION = '0.0.1'.freeze

  # The spelling of this constant name is kept as is, since code outside this
  # file may reference it.
  FULLLENGHTER_VERSION = VERSION
end
@@ -0,0 +1,94 @@
1
+
2
# Helpers shared by the analysis code: protein containment check and
# coordinate/frame corrections for hits.
module CommonFunctions

  # Checks how much of the aligned query of a hit is found, fragment by
  # fragment and in order, inside +full_prot+.
  #
  # The query (+hit.q_seq+) is cut at every run of '-', 'X' or 'x'; each
  # fragment is compared with the not yet consumed part of +full_prot+ using
  # String#lcs, which is defined outside this file (presumably the longest
  # common substring -- confirm). The lengths of the common parts plus the
  # number of 'X' and '-' characters in the query must reach 70% of the
  # length of the query with each masked run collapsed to a single '-'.
  #
  # hit::       object responding to +q_seq+.
  # full_prot:: protein string; runs of '-', 'X', 'x' are removed first.
  # q::         not used by the current implementation.
  #
  # Returns [is_ok, start]: +is_ok+ tells whether the 70% threshold was
  # reached, +start+ is the index in the cleaned +full_prot+ where the common
  # part of the first fragment begins (0 when there are no fragments).
  #
  # NOTE(review): a query that begins with a masked character yields an empty
  # first fragment; how String#lcs treats an empty string is not visible
  # here, so the start index for that case should be confirmed.
  def contenidos_en_prot(hit, full_prot, q)
    masked_x = hit.q_seq.count('X') + hit.q_seq.count('-')

    remaining = full_prot.gsub(/[\-Xx]+/, '')
    compare_prot = hit.q_seq.gsub(/[\-Xx]+/, '-')

    # nil means "no fragment seen yet". A numeric sentinel would be mistaken
    # for a real position on proteins long enough to reach that index.
    first_start = nil
    suma_fragments = 0

    compare_prot.split(/\-+/).each do |fragment|
      similar_fragment = remaining.lcs(fragment)
      suma_fragments += similar_fragment.length

      fragment_start = remaining.index(similar_fragment)
      first_start = fragment_start if first_start.nil?

      # The next fragment is searched only after the end of this match.
      remaining = remaining[(fragment_start + similar_fragment.length)..remaining.length]
    end

    # true when enough similarity was found between query and subject
    is_ok = (suma_fragments + masked_x >= compare_prot.length * 0.7)

    return [is_ok, first_start || 0]
  end


  # Moves a hit of a negative-frame query onto the complementary strand.
  #
  # query_fasta:: query sequence; must respond to +complementary_dna+, which
  #               is defined outside this file.
  # h_qframe::    frame of the hit; its sign is inverted.
  # h_qstart::    start of the hit on the query (integer).
  # h_qend::      end of the hit on the query (integer).
  #
  # Returns [complementary sequence, inverted frame, new begin, new end].
  def reverse_seq(query_fasta, h_qframe, h_qstart, h_qend)
    q_frame = -h_qframe.to_i

    # qend and qstart are swapped on purpose: BLAST reports them reversed
    # when the sequence has a negative frame.
    q_beg = query_fasta.length - h_qend - 1
    q_end = query_fasta.length - h_qstart - 1

    query_fasta = query_fasta.complementary_dna

    return [query_fasta, q_frame, q_beg, q_end]
  end


  # Shifts a pair of coordinates according to the reading frame: +1 for
  # frames 2/-2, +2 for frames 3/-3, unchanged for any other frame.
  #
  # Returns [ref_start, ref_end].
  def corrige_frame(ref_frame, ref_start, ref_end)
    frame = ref_frame.abs
    return [ref_start, ref_end] unless frame == 2 || frame == 3

    shift = frame - 1
    return [ref_start + shift, ref_end + shift]
  end

end
@@ -0,0 +1,222 @@
1
+
2
# Builds the text report fl2_results/summary_stats.txt from the tab-separated
# result files fl2_results/annotations.txt and fl2_results/tcode_result.txt.
# All paths are relative to the current working directory.
module Fl2Stats

  # -------------------------------------------------------------------------------- Main

  # Writes the whole summary report: the annotation section, the test code
  # section and the total number of sequences counted in both result files.
  # The report file is closed (and therefore flushed) when this returns.
  def summary_stats
    File.open('fl2_results/summary_stats.txt', 'w') do |stats_file|
      num1 = annotation_stats(stats_file)
      num2 = testcode_stats(stats_file)

      stats_file.puts "\nInput sequences in your fasta: #{num1 + num2}\n\n"
    end
  end

  # ---------------------------------------------------------------------------------- Functions

  # Increments the per-database slot of +array+ according to the prefix of
  # +db_name+: slot 2 for "sp_", slot 3 for "tr_", slot 1 for anything else.
  # Slot 0 is not touched. +array+ is modified in place and returned.
  def stats_my_db(db_name, array)
    slot = case db_name
           when /^sp_/ then 2
           when /^tr_/ then 3
           else 1
           end
    array[slot] += 1

    return array
  end


  # Reads fl2_results/annotations.txt, writes the annotation section of the
  # report to +stats_file+ and returns the number of sequences counted.
  #
  # Columns used (0-based, tab-separated): 1 sequence length, 2 accession,
  # 3 database name, 4 status, 10 messages. The header line (starting with
  # "Query_id") and empty lines are skipped.
  def annotation_stats(stats_file)
    seqs_number = 0
    error_1_num = 0
    array_of_all_accs = []
    array_of_complete_accs = []

    seqs_by_length = fl2_new_length_bins
    complete_by_length = fl2_new_length_bins

    # status => [total, user_db, sp, tr]
    by_status = {}
    ['Complete', 'Putative Complete', 'C-terminus', 'Putative C-terminus',
     'N-terminus', 'Putative N-terminus', 'Internal', 'Coding Seq'].each do |status|
      by_status[status] = [0, 0, 0, 0]
    end

    # File.foreach closes the file once the last line has been read.
    File.foreach('fl2_results/annotations.txt') do |line|
      line.chomp!
      next if line.empty? || line =~ /^Query_id\t/

      fasta_length, acc, db_name, status, msgs = line.split("\t").values_at(1, 2, 3, 4, 10)

      seqs_number += 1
      array_of_all_accs.push acc
      fl2_tally_length(seqs_by_length, fasta_length)

      error_1_num += 1 if msgs =~ /ERROR#1/

      # Sequences with a status outside the list are counted only in the
      # totals above.
      counts = by_status[status]
      next unless counts

      counts[0] += 1
      stats_my_db(db_name, counts)

      if status == 'Complete'
        array_of_complete_accs.push acc
        fl2_tally_length(complete_by_length, fasta_length)
      end
    end

    complete = by_status['Complete']
    different_complete = array_of_complete_accs.uniq.count

    stats_file.puts "--- Annotation Summary ---"
    stats_file.puts "\n------------------------------ Summary of sequences found by similarity -----"

    stats_file.puts "\n\tSequences found: #{seqs_number}\t\t#{fl2_length_summary(seqs_by_length)}"
    stats_file.puts "\tDifferent IDs: #{array_of_all_accs.uniq.count}"

    stats_file.puts "\n\tsequences with sense and antisense hits error: #{error_1_num}"
    stats_file.puts "\n------------------------------------------------- Full-Length Sequences -----"
    stats_file.puts "\tComplete Seqs: #{complete[0]} (#{fl2_percent(complete[0], seqs_number)} %)\t\t#{fl2_length_summary(complete_by_length)}"
    stats_file.puts "\tDifferent IDs: #{different_complete} (#{fl2_percent(different_complete, seqs_number)} %)"
    stats_file.puts "\n\t\tuser_db: #{complete[1]}\n\t\tsp: #{complete[2]}\n\t\ttr: #{complete[3]}"
    stats_file.puts "-----------------------------------------------------------------------------"

    [['putative completes', 'Putative Complete'],
     ['n-terminus', 'N-terminus'],
     ['putative_n_terminus', 'Putative N-terminus'],
     ['c-terminus', 'C-terminus'],
     ['putative_c_terminus', 'Putative C-terminus'],
     ['internal', 'Internal'],
     ['coding sequences with unknown status', 'Coding Seq']].each do |label, status|
      stats_file.puts fl2_db_breakdown(label, by_status[status])
    end

    return seqs_number
  end


  # Reads fl2_results/tcode_result.txt, writes the test code section of the
  # report to +stats_file+ and returns the number of sequences counted.
  #
  # Columns used (0-based, tab-separated): 1 sequence length, 4 status
  # ('coding', 'putative_coding' or 'unknown'). The header line (starting
  # with "Query_id") and empty lines are skipped. The 200 and 500 length
  # counters are each computed against their own threshold.
  def testcode_stats(stats_file)
    seqs_number = 0
    coding = 0
    putative_coding = 0
    unknown = 0

    coding_by_length = fl2_new_length_bins
    unknown_by_length = fl2_new_length_bins

    File.foreach('fl2_results/tcode_result.txt') do |line|
      line.chomp!
      next if line.empty? || line =~ /^Query_id\t/

      fasta_length, status = line.split("\t").values_at(1, 4)
      seqs_number += 1

      case status
      when 'coding'
        coding += 1
        fl2_tally_length(coding_by_length, fasta_length)
      when 'putative_coding'
        putative_coding += 1
      when 'unknown'
        unknown += 1
        fl2_tally_length(unknown_by_length, fasta_length)
      end
    end

    stats_file.puts "\n--------------------------- Test Code Summary\n\n\ttotal seqs: #{seqs_number}"
    stats_file.puts "\n\tcoding sequences: #{coding}"
    stats_file.puts "\t\tlonger than 200 bp: #{coding_by_length[200][0]}"
    stats_file.puts "\t\tshorter than 200 bp: #{coding_by_length[200][1]}"
    stats_file.puts "\t\tlonger than 500 bp: #{coding_by_length[500][0]}"
    stats_file.puts "\t\tshorter than 500 bp: #{coding_by_length[500][1]}"
    stats_file.puts "\n\tputative coding sequences: #{putative_coding}\n"
    stats_file.puts "\n\tunknown: #{unknown} (#{fl2_percent(unknown, seqs_number)} %)"
    stats_file.puts "\t\tlonger than 200 bp: #{unknown_by_length[200][0]}"
    stats_file.puts "\t\tshorter than 200 bp: #{unknown_by_length[200][1]}"
    stats_file.puts "\t\tlonger than 500 bp: #{unknown_by_length[500][0]}"
    stats_file.puts "\t\tshorter than 500 bp: #{unknown_by_length[500][1]}"
    stats_file.puts "\n\tUnknown sequences have a bad test code score or haven't got an ORF longer than 200 nt"
    stats_file.puts "---------------------------------------------"

    return seqs_number
  end

  private

  # Fresh length counters: threshold => [count with length >= threshold,
  # count with length < threshold].
  def fl2_new_length_bins
    { 200 => [0, 0], 500 => [0, 0] }
  end

  # Counts one sequence of length +fasta_length+ (converted with to_i) in
  # every threshold of +bins+.
  def fl2_tally_length(bins, fasta_length)
    length = fasta_length.to_i
    bins.each do |threshold, counts|
      counts[length >= threshold ? 0 : 1] += 1
    end
  end

  # Formats the counters as "(>200: a, <200: b)<TAB>(>500: c, <500: d)".
  def fl2_length_summary(bins)
    "(>200: #{bins[200][0]}, <200: #{bins[200][1]})\t(>500: #{bins[500][0]}, <500: #{bins[500][1]})"
  end

  # Percentage of +part+ over +total+ with three decimals; "0.000" when
  # +total+ is zero, so that an empty input does not print NaN.
  def fl2_percent(part, total)
    return '0.000' if total == 0

    '%.3f' % (part.to_f / total.to_f * 100)
  end

  # One report paragraph: the total of a status followed by its breakdown
  # per database (user_db, sp, tr).
  def fl2_db_breakdown(label, counts)
    "\n\t#{label}: #{counts[0]}\n\t\tuser_db: #{counts[1]}\n\t\tsp: #{counts[2]}\n\t\ttr: #{counts[3]}"
  end

end