full_lengther_next 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,144 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # 15-2-2011 Noe Fernandez-Pozo
4
+ # Script to create your own Full-LengtherNext User database.
5
+
6
+ require 'net/ftp'
7
+
8
# Require exactly two arguments (a taxonomic group and a UniProt division);
# otherwise print the usage text and stop with exit status -1.
unless ARGV.size == 2

  puts "incorrect number of arguments, you need a taxonomic group like 'Coniferopsida', you can search it in 'http://www.ncbi.nlm.nih.gov/Taxonomy/'
and a UniProt taxonomic group from this list:

fungi
human
invertebrates
mammals
plants
rodents
vertebrates

mode of use: ruby make_user_db.rb coniferopsida plants\n\n"

  exit(-1)
end

# First argument: taxonomic group to keep; second: UniProt division of the .dat files.
my_group, uniprot_group = ARGV
28
+
29
+ ################################################### Functions
30
+
31
# Scans a UniProtKB flat file and appends the complete sequences of one
# taxonomic group to +output_file+ in FASTA format.
#
# A record starts at its first "AC" line and ends at "//". It is written only
# when all of the following hold at the end of the record:
# * an OC (lineage) line contained +my_group+ (case-insensitive) and no
#   "Flags: Fragment" DE line had been seen in the record up to that point;
# * no "FT NON_TER" / "FT NON_CONS" line was seen after that OC line;
# * the sequence starts with M (or m).
#
# output_file:: any object responding to +puts+ (e.g. an open File).
# file_name::   path of the UniProtKB .dat file to read.
# my_group::    taxonomic group name; matched literally (regex metacharacters
#               are escaped).
#
# Each written entry has the header
# ">id description organism organelle" followed by the sequence on one line.
def filter_incomplete_seqs(output_file, file_name, my_group)
  puts " filtering sequences"

  # UniProtKB fragments with FT NON_CONS and FT NON_TER features.
  #
  # * FT NON_TER: The residue at an extremity of the sequence is not the
  #   terminal residue. If applied to position 1, the first position is not the
  #   N-terminus of the complete molecule; if applied to the last position, it
  #   is not the C-terminus. Examples:
  #     FT NON_TER 1 1
  #     FT NON_TER 29 29
  # * FT NON_CONS: Non-consecutive residues; a number of residues are missing
  #   between the two positions. Example:
  #     FT NON_CONS 1683 1684

  # Built once instead of once per line; the group name is escaped so that it
  # is matched as plain text.
  group_pattern = /^OC\s+[\w\s;]*#{Regexp.escape(my_group)}/i

  in_record = false
  print_seq = false
  incomplete = false
  id = ''
  description = ''
  organism_name = ''
  organelle = ''
  seq = String.new

  # File.foreach closes the file when the iteration finishes or raises.
  File.foreach(file_name) do |line|
    unless in_record
      # Outside a record only the (first) accession line is of interest.
      next unless line =~ /^AC\s+(\w+);/

      id = Regexp.last_match(1)
      in_record = true
      print_seq = false
      incomplete = false
      description = ''
      organism_name = ''
      organelle = ''
      seq = String.new
      next
    end

    case line
    when /^DE\s+(.+);*/
      # Only the first DE line provides the description.
      if description.empty?
        description = Regexp.last_match(1)
        description.sub!(/RecName: Full=/, 'sp=')
        description.sub!(/SubName: Full=/, 'tr=')
      end
      incomplete = true if line.include?('Flags: Fragment')
    when /^OS\s+(.+)/
      organism_name = Regexp.last_match(1)
    when /^OG\s+(.+)/
      organelle = Regexp.last_match(1)
    when group_pattern
      print_seq = true unless incomplete
    when /^FT\s+NON_TER\s+/, /^FT\s+NON_CONS\s+(\d+)\s+/
      print_seq = false
    when /^\s+([\w\s]+)/
      # Sequence lines are the ones starting with whitespace.
      seq << Regexp.last_match(1)
    when %r{^//}
      # End of record: normalise the sequence and decide whether to emit it.
      seq.gsub!(/\s+/, '')
      print_seq = false unless seq =~ /^M/i
      in_record = false

      output_file.puts ">#{id} #{description} #{organism_name} #{organelle}\n#{seq}" if print_seq
    end
  end
end
106
+
107
+ ########################################################
108
+ ## MAIN
109
+ ########################################################
110
+
111
require 'fileutils'

# Directory containing this script.
ROOT_PATH = File.dirname(__FILE__)

# $: << File.expand_path(File.join(ROOT_PATH, "classes"))

# load gem path, only to test locally
# $: << File.expand_path('~/progs/ruby/gems/full_lengther_next/lib')

# NOTE(review): the library file of the same name also assigns ROOT_PATH, so
# the value read below is presumably the one set by this require - confirm.
require 'full_lengther_next'

# Use $BLASTDB when it points to an existing path, otherwise ROOT_PATH/blast_dbs.
# File.exist? replaces File.exists?, which was removed in Ruby 3.2.
blast_db_env = ENV['BLASTDB']
formatted_db_path =
  if blast_db_env && File.exist?(blast_db_env)
    blast_db_env
  else
    File.expand_path(File.join(ROOT_PATH, "blast_dbs"))
  end

ENV['BLASTDB'] = formatted_db_path

# Create the group directory at the same location the FASTA file is written to
# (the original created the relative path "blast_dbs/<group>" instead).
group_dir = File.join(formatted_db_path, my_group)
FileUtils.mkdir_p(group_dir)

output_file_path = File.join(group_dir, my_group + ".fasta")

# The block form closes the FASTA file even if filtering raises.
File.open(output_file_path, "w") do |output_file|
  %w[sprot trembl].each do |section|
    dat_path = File.join(formatted_db_path, "uniprot_#{section}_#{uniprot_group}.dat")
    filter_incomplete_seqs(output_file, dat_path, my_group)
  end
end

# Argument-list form: no shell is involved, so paths with spaces or shell
# metacharacters are passed through intact. stdout is discarded, as it was
# with the original backticks.
formatted = system('makeblastdb', '-in', output_file_path, '-dbtype', 'prot', '-parse_seqids',
                   :out => File::NULL)
warn "makeblastdb could not be run or reported an error for #{output_file_path}" unless formatted

puts "make_user_db.rb has finished"
@@ -0,0 +1,13 @@
1
# Put this file's directory at the front of the load path, unless it is
# already listed either as given or as an absolute path.
lib_dir = File.dirname(__FILE__)
already_listed = $LOAD_PATH.include?(lib_dir) || $LOAD_PATH.include?(File.expand_path(lib_dir))
$LOAD_PATH.unshift(lib_dir) unless already_listed

# Directory named after the gem, next to this file.
ROOT_PATH = File.join(lib_dir, 'full_lengther_next')

# Make the files under ROOT_PATH/classes requirable by bare name.
$LOAD_PATH << File.expand_path(File.join(ROOT_PATH, 'classes'))
7
+
8
+
9
# Namespace carrying the gem's version constants.
module FullLengtherNext
  # Version string of this release.
  VERSION = '0.0.1'

  # Second name for VERSION; both constants refer to the same string.
  FULLLENGHTER_VERSION = VERSION
end
@@ -0,0 +1,94 @@
1
+
2
# Helpers for comparing a BLAST hit against a protein sequence and for
# adjusting coordinates by reading frame.
#
# NOTE(review): String#lcs and String#complementary_dna are not defined in this
# module; they are presumably String extensions provided elsewhere in the gem.
module CommonFunctions

  # Checks how much of the hit's query sequence (hit.q_seq) can be found in
  # +full_prot+.
  #
  # Runs of '-', 'X' and 'x' are removed from +full_prot+ and used as
  # separators to split hit.q_seq into fragments. For every fragment, the
  # result of full_prot.lcs(fragment) (presumably the longest common
  # substring - confirm) is located in the protein, its length is added to the
  # running total, and the protein is cut just after the match so that the
  # next fragment is only searched further downstream.
  #
  # hit::       object responding to +q_seq+ (String).
  # full_prot:: protein sequence to search in.
  # q::         unused; kept so existing callers keep working.
  #
  # Returns [is_ok, start]:
  # * is_ok - true when matched residues plus the number of 'X' and '-'
  #   characters in hit.q_seq reach 70% of the length of the normalised query
  #   (separator runs collapsed to a single '-').
  # * start - index, in the stripped protein, of the match for the first
  #   fragment; 0 when there were no fragments.
  def contenidos_en_prot(hit, full_prot, q)
    masked_x = hit.q_seq.count('X') + hit.q_seq.count('-')

    remaining_prot = full_prot.gsub(/[\-Xx]+/, '')
    compare_prot = hit.q_seq.gsub(/[\-Xx]+/, '-')

    # nil means "no fragment matched yet". The previous 9999 sentinel turned
    # any real start index >= 9999 into 0.
    first_match_start = nil
    suma_fragments = 0

    compare_prot.split(/\-+/).each do |fragment|
      similar_fragment = remaining_prot.lcs(fragment)
      suma_fragments += similar_fragment.length

      fragment_start = remaining_prot.index(similar_fragment)
      first_match_start ||= fragment_start

      # Continue the search after the part that has just been matched.
      remaining_prot = remaining_prot[(fragment_start + similar_fragment.length)..remaining_prot.length]
    end

    is_ok = (suma_fragments + masked_x >= compare_prot.length * 0.7)

    [is_ok, first_match_start || 0]
  end

  # Converts a hit on a negative frame into coordinates on the complementary
  # strand.
  #
  # query_fasta:: nucleotide sequence (must respond to +length+ and
  #               +complementary_dna+).
  # h_qframe::    frame of the hit; its sign is flipped in the result.
  # h_qstart::    hit start on the query (Integer).
  # h_qend::      hit end on the query (Integer).
  #
  # Returns [query_fasta.complementary_dna, -frame, new_start, new_end].
  def reverse_seq(query_fasta, h_qframe, h_qstart, h_qend)
    q_frame = -h_qframe.to_i

    # qend and qstart are used the other way round because BLAST reports them
    # reversed when the sequence has a negative frame.
    q_beg = query_fasta.length - h_qend - 1
    q_end = query_fasta.length - h_qstart - 1

    [query_fasta.complementary_dna, q_frame, q_beg, q_end]
  end

  # Shifts a start/end pair according to the reading frame: +1 for frames 2
  # and -2, +2 for frames 3 and -3, unchanged for any other frame.
  #
  # Returns [ref_start, ref_end].
  def corrige_frame(ref_frame, ref_start, ref_end)
    shift = { 2 => 1, 3 => 2 }[ref_frame.abs]
    return [ref_start, ref_end] unless shift

    [ref_start + shift, ref_end + shift]
  end

end
@@ -0,0 +1,222 @@
1
+
2
# Writes fl2_results/summary_stats.txt from the two result tables found in
# fl2_results/ (annotations.txt and tcode_result.txt). All paths are relative
# to the current working directory.
module Fl2Stats

  # -------------------------------------------------------------------------------- Main

  # Creates fl2_results/summary_stats.txt with the annotation summary, the
  # TestCode summary and the total number of sequences in both tables.
  # The report file is closed (and therefore flushed) before returning.
  def summary_stats
    File.open('fl2_results/summary_stats.txt', 'w') do |stats_file|
      annotated = annotation_stats(stats_file)
      tested = testcode_stats(stats_file)

      stats_file.puts "\nInput sequences in your fasta: #{annotated + tested}\n\n"
    end
  end

  # ---------------------------------------------------------------------------------- Functions

  # Increments the per-database slot of a four-element counter array:
  # index 2 when +db_name+ starts with "sp_", index 3 when it starts with
  # "tr_", index 1 otherwise (reported as "user_db"). Index 0 is not touched.
  #
  # Returns the same (mutated) array.
  def stats_my_db(db_name, array)
    slot = case db_name
           when /^sp_/ then 2
           when /^tr_/ then 3
           else 1
           end
    array[slot] += 1

    array
  end

  # Reads fl2_results/annotations.txt (tab separated; lines starting with
  # "Query_id\t" are treated as headers) and writes the annotation summary to
  # +stats_file+.
  #
  # Columns used: 2 = sequence length, 3 = accession, 4 = database name,
  # 5 = status, 11 = messages.
  #
  # Returns the number of non-header lines.
  def annotation_stats(stats_file)
    seqs_number = 0
    error_1_num = 0
    all_accs = []
    complete_accs = []
    all_lengths = fl2_new_length_bins
    complete_lengths = fl2_new_length_bins

    # status => [total, user_db, sp, tr]
    counters = {}
    ['Complete', 'Putative Complete', 'C-terminus', 'N-terminus',
     'Putative C-terminus', 'Putative N-terminus', 'Internal', 'Coding Seq'].each do |status|
      counters[status] = [0, 0, 0, 0]
    end

    # File.foreach closes the file once the iteration is over.
    File.foreach('fl2_results/annotations.txt') do |line|
      line.chomp!
      next if line =~ /^Query_id\t/

      fields = line.split("\t")
      length = fields[1].to_i
      acc = fields[2]
      db_name = fields[3]
      status = fields[4]
      msgs = fields[10]

      seqs_number += 1
      all_accs.push(acc)
      fl2_count_length(all_lengths, length)
      error_1_num += 1 if msgs =~ /ERROR#1/

      # Lines with a status outside the known list only add to the totals above.
      counts = counters[status]
      next unless counts

      counts[0] += 1
      stats_my_db(db_name, counts)

      next unless status == 'Complete'

      complete_accs.push(acc)
      fl2_count_length(complete_lengths, length)
    end

    complete = counters['Complete']
    unique_complete = complete_accs.uniq.count

    stats_file.puts "--- Annotation Summary ---"
    stats_file.puts "\n------------------------------ Summary of sequences found by similarity -----"

    stats_file.puts "\n\tSequences found: #{seqs_number}\t\t#{fl2_length_summary(all_lengths)}"
    stats_file.puts "\tDifferent IDs: #{all_accs.uniq.count}"

    stats_file.puts "\n\tsequences with sense and antisense hits error: #{error_1_num}"
    stats_file.puts "\n------------------------------------------------- Full-Length Sequences -----"
    stats_file.puts "\tComplete Seqs: #{complete[0]} (" + '%.3f' % fl2_percentage(complete[0], seqs_number) +
                    " %)\t\t#{fl2_length_summary(complete_lengths)}"
    stats_file.puts "\tDifferent IDs: #{unique_complete} (" + '%.3f' % fl2_percentage(unique_complete, seqs_number) + " %)"
    stats_file.puts "\n\t\tuser_db: #{complete[1]}\n\t\tsp: #{complete[2]}\n\t\ttr: #{complete[3]}"
    stats_file.puts "-----------------------------------------------------------------------------"

    [['putative completes', 'Putative Complete'],
     ['n-terminus', 'N-terminus'],
     ['putative_n_terminus', 'Putative N-terminus'],
     ['c-terminus', 'C-terminus'],
     ['putative_c_terminus', 'Putative C-terminus'],
     ['internal', 'Internal'],
     ['coding sequences with unknown status', 'Coding Seq']].each do |label, status|
      stats_file.puts fl2_db_breakdown(label, counters[status])
    end

    seqs_number
  end

  # Reads fl2_results/tcode_result.txt (tab separated; lines starting with
  # "Query_id\t" are treated as headers) and writes the TestCode summary to
  # +stats_file+.
  #
  # Columns used: 2 = sequence length, 5 = status ('coding',
  # 'putative_coding' or 'unknown').
  #
  # Returns the number of non-header lines.
  def testcode_stats(stats_file)
    seqs_number = 0
    coding = 0
    putative_coding = 0
    unknown = 0
    coding_lengths = fl2_new_length_bins
    unknown_lengths = fl2_new_length_bins

    File.foreach('fl2_results/tcode_result.txt') do |line|
      line.chomp!
      next if line =~ /^Query_id\t/

      fields = line.split("\t")
      length = fields[1].to_i
      status = fields[4]

      seqs_number += 1

      case status
      when 'coding'
        coding += 1
        # The 500 bp counters use their own threshold; they were previously
        # incremented under the ">= 200" condition.
        fl2_count_length(coding_lengths, length)
      when 'putative_coding'
        putative_coding += 1
      when 'unknown'
        unknown += 1
        fl2_count_length(unknown_lengths, length)
      end
    end

    stats_file.puts "\n--------------------------- Test Code Summary\n\n\ttotal seqs: #{seqs_number}"
    stats_file.puts "\n\tcoding sequences: #{coding}"
    fl2_puts_length_lines(stats_file, coding_lengths)
    stats_file.puts "\n\tputative coding sequences: #{putative_coding}\n"
    stats_file.puts "\n\tunknown: #{unknown} (" + '%.3f' % fl2_percentage(unknown, seqs_number) + " %)"
    fl2_puts_length_lines(stats_file, unknown_lengths)
    stats_file.puts "\n\tUnknown sequences have a bad test code score or haven't got an ORF longer than 200 nt"
    stats_file.puts "---------------------------------------------"

    seqs_number
  end

  private

  # Fresh set of length counters for the 200 and 500 thresholds.
  def fl2_new_length_bins
    { :longer_200 => 0, :shorter_200 => 0, :longer_500 => 0, :shorter_500 => 0 }
  end

  # Adds one sequence of the given length to the counters (">=" counts as longer).
  def fl2_count_length(bins, length)
    bins[length >= 200 ? :longer_200 : :shorter_200] += 1
    bins[length >= 500 ? :longer_500 : :shorter_500] += 1
  end

  # part/total as a percentage; 0.0 for an empty total instead of NaN.
  def fl2_percentage(part, total)
    total.zero? ? 0.0 : part.to_f / total.to_f * 100
  end

  # "(>200: a, <200: b)<TAB>(>500: c, <500: d)" fragment of the annotation report.
  def fl2_length_summary(bins)
    "(>200: #{bins[:longer_200]}, <200: #{bins[:shorter_200]})\t(>500: #{bins[:longer_500]}, <500: #{bins[:shorter_500]})"
  end

  # Total plus user_db/sp/tr breakdown for one status of the annotation report.
  def fl2_db_breakdown(label, counts)
    "\n\t#{label}: #{counts[0]}\n\t\tuser_db: #{counts[1]}\n\t\tsp: #{counts[2]}\n\t\ttr: #{counts[3]}"
  end

  # The four "longer/shorter than N bp" lines of the TestCode report.
  def fl2_puts_length_lines(stats_file, bins)
    stats_file.puts "\t\tlonger than 200 bp: #{bins[:longer_200]}"
    stats_file.puts "\t\tshorter than 200 bp: #{bins[:shorter_200]}"
    stats_file.puts "\t\tlonger than 500 bp: #{bins[:longer_500]}"
    stats_file.puts "\t\tshorter than 500 bp: #{bins[:shorter_500]}"
  end

end