full_lengther_next 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +4 -0
- data/Manifest.txt +27 -0
- data/PostInstall.txt +6 -0
- data/README.rdoc +147 -0
- data/Rakefile +37 -0
- data/bin/download_fln_dbs.rb +197 -0
- data/bin/full_lengther_next +173 -0
- data/bin/make_user_db.rb +144 -0
- data/lib/full_lengther_next.rb +13 -0
- data/lib/full_lengther_next/classes/common_functions.rb +94 -0
- data/lib/full_lengther_next/classes/fl2_stats.rb +222 -0
- data/lib/full_lengther_next/classes/fl_analysis.rb +688 -0
- data/lib/full_lengther_next/classes/fl_string_utils.rb +139 -0
- data/lib/full_lengther_next/classes/lcs.rb +33 -0
- data/lib/full_lengther_next/classes/my_worker.rb +122 -0
- data/lib/full_lengther_next/classes/my_worker_manager.rb +167 -0
- data/lib/full_lengther_next/classes/orf.rb +32 -0
- data/lib/full_lengther_next/classes/sequence.rb +111 -0
- data/lib/full_lengther_next/classes/test_code.rb +877 -0
- data/lib/full_lengther_next/classes/une_los_hit.rb +287 -0
- data/script/console +10 -0
- data/script/destroy +14 -0
- data/script/generate +14 -0
- data/test/test_full_lengther_next.rb +11 -0
- data/test/test_helper.rb +3 -0
- metadata +150 -0
data/bin/make_user_db.rb
ADDED
@@ -0,0 +1,144 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# 15-2-2011 Noe Fernandez-Pozo
|
4
|
+
# Script to create your own Full-LengtherNext User database.
|
5
|
+
|
6
|
+
require 'net/ftp'
|
7
|
+
|
8
|
+
# Expect exactly two command-line arguments (a taxonomic group and a UniProt
# taxonomic division); otherwise print the usage text and exit with status -1.
unless ARGV.size == 2
  usage_lines = [
    "incorrect number of arguments, you need a taxonomic group like 'Coniferopsida', you can search it in 'http://www.ncbi.nlm.nih.gov/Taxonomy/'",
    "and a UniProt taxonomic group from this list:",
    "",
    "fungi",
    "human",
    "invertebrates",
    "mammals",
    "plants",
    "rodents",
    "vertebrates",
    "",
    "mode of use: ruby make_user_db.rb coniferopsida plants"
  ]

  puts usage_lines.join("\n") + "\n\n"

  Process.exit(-1)
end

# First argument: taxonomic group to keep; second: UniProt division whose
# .dat files are read further down in this script.
my_group, uniprot_group = ARGV
|
28
|
+
|
29
|
+
################################################### Functions
|
30
|
+
|
31
|
+
# Reads a UniProt flat file (.dat) and appends to +output_file+, in FASTA
# format, every entry that:
#   * has an OC (lineage) line matching +my_group+ (case-insensitive),
#   * is not flagged "Flags: Fragment" on a DE line seen before that OC line,
#   * has no FT NON_TER / FT NON_CONS feature after the OC match, and
#   * has a sequence starting with M.
#
# output_file - any object responding to #puts (an open File in this script).
# file_name   - path of the UniProt .dat file to read.
# my_group    - taxonomic group name; it is interpolated into a regexp as-is,
#               so regexp metacharacters in it are interpreted, not escaped.
#
# The input file is read with File.foreach so its handle is closed when the
# method returns or raises (the previous File.open(...).each_line left it open).
def filter_incomplete_seqs(output_file,file_name, my_group)

  puts " filtering sequences"

  # UniProtKB fragments with FT NON_CONS and FT NON_TER features.
  #
  # * FT NON_TER: The residue at an extremity of the sequence is not the terminal residue. If applied to position 1, this signifies that the first position is not the N-terminus of the complete molecule. If applied to the last position, it means that this position is not the C-terminus of the complete molecule. There is no description field for this key. Examples of NON_TER key feature lines:
  #   FT NON_TER 1 1
  #   FT NON_TER 29 29
  # * FT NON_CONS: Non-consecutive residues. Indicates that two residues in a sequence are not consecutive and that there are a number of unreported or missing residues between them. Example of a NON_CONS key feature line:
  #   FT NON_CONS 1683 1684
  #
  # NON_CONS fragments are not indicated as non-consecutive in InterPro and being non-consecutive the match to methods may be incorrect if the method spans the 'break'.

  # Built once per call; it used to be rebuilt for every input line.
  group_lineage = /^OC\s+[\w\s\;]*#{my_group}/i

  in_record = false
  print_seq = false
  incomplete = false
  id = ''
  description = ''
  organism_name = ''
  seq = ''
  organelle = ''

  File.foreach(file_name) do |line|
    unless in_record
      # Outside a record: wait for the accession line, then reset all state.
      if line =~ /^AC\s+(\w+);/
        id = $1
        in_record = true
        description = ''
        organism_name = ''
        seq = ''
        print_seq = false
        incomplete = false
        organelle = ''
      end
      next
    end

    if line =~ /^DE\s+(.+)\;*/
      # Only the first DE line provides the description.
      # NOTE(review): the greedy (.+) also captures a trailing ';', so the
      # description keeps it - confirm whether that is intended.
      if description == ''
        description = $1
        description.sub!(/RecName: Full=/,'sp=')
        description.sub!(/SubName: Full=/,'tr=')
      end
      incomplete = true if line =~ /Flags: Fragment/
    elsif line =~ /^OS\s+(.+)/
      organism_name = $1
    elsif line =~ /^OG\s+(.+)/
      organelle = $1
    elsif (line =~ group_lineage) && !incomplete
      print_seq = true
    elsif line =~ /^FT\s+NON_TER\s+/
      print_seq = false
    elsif line =~ /^FT\s+NON_CONS\s+(\d+)\s+/
      print_seq = false
    elsif line =~ /^\s+([\w\s]+)/
      # Lines starting with whitespace are accumulated as sequence data.
      seq += $1
    elsif line =~ /^\/\//
      # End of record: strip whitespace and require an initial M.
      seq.gsub!(/\s*/,'')
      print_seq = false if seq !~ /^M/i
      in_record = false

      output_file.puts ">#{id} #{description} #{organism_name} #{organelle}\n#{seq}" if print_seq
    end
  end
end
|
106
|
+
|
107
|
+
########################################################
|
108
|
+
## MAIN
|
109
|
+
########################################################
|
110
|
+
|
111
|
+
require 'fileutils'

# NOTE(review): lib/full_lengther_next.rb also assigns a top-level ROOT_PATH,
# so requiring it below re-initializes this constant - confirm that is intended.
ROOT_PATH=File.dirname(__FILE__)

# $: << File.expand_path(File.join(ROOT_PATH, "classes"))

# load gem path, only to test locally
# $: << File.expand_path('~/progs/ruby/gems/full_lengther_next/lib')

require 'full_lengther_next'

# Use $BLASTDB when it points at an existing path; otherwise fall back to a
# "blast_dbs" directory next to this script.
if ENV['BLASTDB'] && File.exist?(ENV['BLASTDB'])
  formatted_db_path = ENV['BLASTDB']
else # otherwise use ROOTPATH + DB
  formatted_db_path = File.expand_path(File.join(ROOT_PATH, "blast_dbs"))
end

ENV['BLASTDB']=formatted_db_path

# Create the group directory in the same place the FASTA file is written.
# (The previous code checked $BLASTDB/<group> but created a cwd-relative
# "blast_dbs/<group>", which only agreed when both happened to coincide.)
group_dir = File.join(formatted_db_path, my_group)
FileUtils.mkdir_p(group_dir)

output_file_path = File.join(group_dir, my_group + ".fasta")

# Block form guarantees the FASTA file is flushed and closed even if
# filtering raises.
File.open(output_file_path, "w") do |output_file|
  filter_incomplete_seqs(output_file, File.join(formatted_db_path, "uniprot_sprot_#{uniprot_group}.dat"), my_group)
  filter_incomplete_seqs(output_file, File.join(formatted_db_path, "uniprot_trembl_#{uniprot_group}.dat"), my_group)
end

# Argument-vector form: no shell is involved, so paths containing spaces or
# shell metacharacters are passed to makeblastdb unchanged.
unless system('makeblastdb', '-in', output_file_path, '-dbtype', 'prot', '-parse_seqids')
  warn "makeblastdb could not format #{output_file_path}"
end

puts "make_user_db.rb has finished"
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# Put this file's directory on the load path unless it is already there.
# The directory is added in expanded (absolute) form so that later requires
# still resolve after the process changes its working directory.
fln_lib_dir = File.expand_path(File.dirname(__FILE__))
$:.unshift(fln_lib_dir) unless
  $:.include?(File.dirname(__FILE__)) || $:.include?(fln_lib_dir)

# Directory holding the gem's own files; kept exactly as before (it may be
# relative) because other files read this constant.
ROOT_PATH=File.join(File.dirname(__FILE__),'full_lengther_next')

# Make the files under full_lengther_next/classes requirable by bare name.
fln_classes_dir = File.expand_path(File.join(ROOT_PATH, 'classes'))
$: << fln_classes_dir unless $:.include?(fln_classes_dir)


module FullLengtherNext
  # Gem version string (frozen so it cannot be modified in place).
  VERSION = '0.0.1'.freeze

  # Alias of VERSION under a second name.
  FULLLENGHTER_VERSION = VERSION
end
|
@@ -0,0 +1,94 @@
|
|
1
|
+
|
2
|
+
# Helper methods meant to be mixed into other classes.
#
# Two of them rely on String methods that are not defined in this module:
# String#lcs (used by contenidos_en_prot) and String#complementary_dna (used
# by reverse_seq). They must be provided elsewhere before these are called.
module CommonFunctions

  # Checks how much of the aligned query sequence of +hit+ can be located,
  # fragment by fragment and in order, inside +full_prot+.
  #
  # hit       - object responding to #q_seq (the aligned query string).
  # full_prot - protein string to search in; '-', 'X' and 'x' are removed
  #             from it first.
  # q         - not used by the active code (only by disabled debug output).
  #
  # The query is split into fragments at runs of '-', 'X' or 'x'. For every
  # fragment, String#lcs gives the piece shared with the not-yet-consumed
  # part of the protein; the search then continues after that piece.
  #
  # Returns [is_ok, start_index]:
  #   is_ok       - true when matched residues plus the number of 'X' and '-'
  #                 characters in the query reach 70% of the query length
  #                 (with each masked/gap run collapsed to one character).
  #   start_index - index in the cleaned protein where the first fragment's
  #                 match begins, or 0 when there were no fragments.
  #
  # A nil marker is used for "no start found yet". The previous 9999 sentinel
  # turned any real start index >= 9999 into 0.
  def contenidos_en_prot(hit, full_prot, q)
    query_aln = hit.q_seq
    masked_x = query_aln.count('X') + query_aln.count('-')

    remaining_prot = full_prot.gsub(/[\-Xx]+/,'')
    compare_prot = query_aln.gsub(/[\-Xx]+/,'-')

    q_index_start = nil
    suma_fragments = 0

    compare_prot.split(/\-+/).each do |fragment|
      similar_fragment = remaining_prot.lcs(fragment)
      suma_fragments += similar_fragment.length

      fr_index_start = remaining_prot.index(similar_fragment)

      # Only the first fragment defines the reported start position.
      q_index_start = fr_index_start if q_index_start.nil?

      # Continue searching after the piece that was just matched.
      remaining_prot = remaining_prot[(fr_index_start + similar_fragment.length)..remaining_prot.length]
    end

    is_ok = (suma_fragments + masked_x >= compare_prot.length * 0.7)

    return [is_ok, q_index_start || 0]
  end


  # Converts a hit to the opposite strand.
  #
  # query_fasta - nucleotide string; must respond to #complementary_dna.
  # h_qframe    - hit frame (anything responding to #to_i); its sign is flipped.
  # h_qstart, h_qend - integer hit coordinates on the query.
  #
  # Returns [complementary_dna of query_fasta, negated frame, new begin, new end],
  # where new begin = length - h_qend - 1 and new end = length - h_qstart - 1.
  def reverse_seq(query_fasta, h_qframe, h_qstart, h_qend)
    q_frame = -h_qframe.to_i

    # qend and qstart are swapped because BLAST reports them reversed when
    # the sequence has a negative frame.
    q_beg = query_fasta.length - h_qend - 1
    q_end = query_fasta.length - h_qstart - 1

    query_fasta = query_fasta.complementary_dna

    return [query_fasta, q_frame, q_beg, q_end]
  end


  # Shifts both coordinates by the frame offset: +1 when the absolute frame
  # is 2, +2 when it is 3, unchanged for any other frame.
  #
  # Returns [ref_start, ref_end].
  def corrige_frame(ref_frame,ref_start,ref_end)
    if ref_frame.abs == 2
      ref_start = ref_start + 1
      ref_end = ref_end + 1
    elsif ref_frame.abs == 3
      ref_start = ref_start + 2
      ref_end = ref_end + 2
    end

    return [ref_start,ref_end]
  end

end
|
@@ -0,0 +1,222 @@
|
|
1
|
+
|
2
|
+
# Builds the text report fl2_results/summary_stats.txt from the two result
# tables fl2_results/annotations.txt and fl2_results/tcode_result.txt (all
# paths are relative to the current working directory).
module Fl2Stats

  # -------------------------------------------------------------------------------- Main

  # Writes the whole summary report: annotation section, test code section
  # and the total number of rows counted in both tables. The report file is
  # opened in block form so it is flushed and closed on return.
  def summary_stats
    File.open('fl2_results/summary_stats.txt', 'w') do |stats_file|
      num1 = annotation_stats(stats_file)
      num2 = testcode_stats(stats_file)

      stats_file.puts "\nInput sequences in your fasta: #{num1 + num2}\n\n"
    end
  end

  # ---------------------------------------------------------------------------------- Functions

  # Increments the per-database slot of +array+ for +db_name+:
  # index 2 when the name starts with "sp_", index 3 when it starts with
  # "tr_", index 1 otherwise. Index 0 is left to the caller.
  #
  # Returns the same (mutated) array.
  def stats_my_db(db_name, array)
    if db_name =~ /^sp_/
      array[2] += 1
    elsif db_name =~ /^tr_/
      array[3] += 1
    else
      array[1] += 1
    end

    return array
  end


  # Reads fl2_results/annotations.txt (tab-separated; lines starting with
  # "Query_id\t" are skipped as headers) and writes the annotation section
  # of the report to +stats_file+.
  #
  # Columns used: 2nd = sequence length, 3rd = accession, 4th = database
  # name, 5th = status, 11th = messages.
  #
  # Returns the number of non-header lines read.
  def annotation_stats(stats_file)
    seqs_number = 0
    error_1_num = 0
    array_of_all_accs = []
    array_of_complete_accs = []

    all_lengths = fl2_new_length_tally
    complete_lengths = fl2_new_length_tally

    # One [total, user_db, sp, tr] counter per recognised status.
    by_status = {}
    ['Complete', 'Putative Complete', 'C-terminus', 'N-terminus',
     'Putative C-terminus', 'Putative N-terminus', 'Internal', 'Coding Seq'].each do |label|
      by_status[label] = [0,0,0,0]
    end

    File.foreach('fl2_results/annotations.txt') do |line|
      line.chomp!
      next if line =~ /^Query_id\t/

      fasta_length, acc, db_name, status, msgs = line.split("\t").values_at(1, 2, 3, 4, 10)

      seqs_number += 1
      array_of_all_accs.push acc
      fl2_tally_length(all_lengths, fasta_length)

      error_1_num += 1 if msgs =~ /ERROR#1/

      counts = by_status[status]
      next unless counts # rows with any other status only count towards the totals

      counts[0] += 1
      stats_my_db(db_name, counts)

      if status == 'Complete'
        array_of_complete_accs.push acc
        fl2_tally_length(complete_lengths, fasta_length)
      end
    end

    complete = by_status['Complete']
    different_complete = array_of_complete_accs.uniq.count

    stats_file.puts "--- Annotation Summary ---"
    stats_file.puts "\n------------------------------ Summary of sequences found by similarity -----"

    stats_file.puts "\n\tSequences found: #{seqs_number}\t\t#{fl2_length_summary(all_lengths)}"
    stats_file.puts "\tDifferent IDs: #{array_of_all_accs.uniq.count}"

    stats_file.puts "\n\tsequences with sense and antisense hits error: #{error_1_num}"
    stats_file.puts "\n------------------------------------------------- Full-Length Sequences -----"
    stats_file.puts "\tComplete Seqs: #{complete[0]} (#{fl2_percent(complete[0], seqs_number)} %)\t\t#{fl2_length_summary(complete_lengths)}"
    stats_file.puts "\tDifferent IDs: #{different_complete} (#{fl2_percent(different_complete, seqs_number)} %)"
    stats_file.puts "\n\t\tuser_db: #{complete[1]}\n\t\tsp: #{complete[2]}\n\t\ttr: #{complete[3]}"
    stats_file.puts "-----------------------------------------------------------------------------"

    # Remaining statuses, in report order: [heading, status key].
    [['putative completes', 'Putative Complete'],
     ['n-terminus', 'N-terminus'],
     ['putative_n_terminus', 'Putative N-terminus'],
     ['c-terminus', 'C-terminus'],
     ['putative_c_terminus', 'Putative C-terminus'],
     ['internal', 'Internal'],
     ['coding sequences with unknown status', 'Coding Seq']].each do |heading, key|
      counts = by_status[key]
      stats_file.puts "\n\t#{heading}: #{counts[0]}\n\t\tuser_db: #{counts[1]}\n\t\tsp: #{counts[2]}\n\t\ttr: #{counts[3]}"
    end

    return seqs_number
  end


  # Reads fl2_results/tcode_result.txt (tab-separated; lines starting with
  # "Query_id\t" are skipped as headers) and writes the test code section of
  # the report to +stats_file+.
  #
  # Columns used: 2nd = sequence length, 5th = status ('coding',
  # 'putative_coding' or 'unknown').
  #
  # The 500 bp counters are evaluated against a 500 threshold; they used to
  # be incremented together with the 200 bp counters, so the report repeated
  # the 200 bp figures.
  #
  # Returns the number of non-header lines read.
  def testcode_stats(stats_file)
    seqs_number = 0
    coding = 0
    putative_coding = 0
    unknown = 0

    coding_lengths = fl2_new_length_tally
    unknown_lengths = fl2_new_length_tally

    File.foreach('fl2_results/tcode_result.txt') do |line|
      line.chomp!
      next if line =~ /^Query_id\t/

      fasta_length, status = line.split("\t").values_at(1, 4)

      seqs_number += 1

      if status == 'coding'
        coding += 1
        fl2_tally_length(coding_lengths, fasta_length)
      elsif status == 'putative_coding'
        putative_coding += 1
      elsif status == 'unknown'
        unknown += 1
        fl2_tally_length(unknown_lengths, fasta_length)
      end
    end

    stats_file.puts "\n--------------------------- Test Code Summary\n\n\ttotal seqs: #{seqs_number}"
    stats_file.puts "\n\tcoding sequences: #{coding}"
    fl2_puts_length_lines(stats_file, coding_lengths)
    stats_file.puts "\n\tputative coding sequences: #{putative_coding}\n"
    stats_file.puts "\n\tunknown: #{unknown} (#{fl2_percent(unknown, seqs_number)} %)"
    fl2_puts_length_lines(stats_file, unknown_lengths)
    stats_file.puts "\n\tUnknown sequences have a bad test code score or haven't got an ORF longer than 200 nt"
    stats_file.puts "---------------------------------------------"

    return seqs_number
  end

  private

  # Fresh set of length counters for the 200 and 500 thresholds.
  def fl2_new_length_tally
    { :longer_200 => 0, :shorter_200 => 0, :longer_500 => 0, :shorter_500 => 0 }
  end

  # Counts +fasta_length+ (converted with #to_i) on each side of both thresholds.
  def fl2_tally_length(tally, fasta_length)
    length = fasta_length.to_i
    tally[length >= 200 ? :longer_200 : :shorter_200] += 1
    tally[length >= 500 ? :longer_500 : :shorter_500] += 1
  end

  # One-line "(>200: a, <200: b)\t(>500: c, <500: d)" rendering of a tally.
  def fl2_length_summary(tally)
    "(>200: #{tally[:longer_200]}, <200: #{tally[:shorter_200]})\t(>500: #{tally[:longer_500]}, <500: #{tally[:shorter_500]})"
  end

  # Four-line "longer/shorter than N bp" rendering of a tally.
  def fl2_puts_length_lines(stats_file, tally)
    stats_file.puts "\t\tlonger than 200 bp: #{tally[:longer_200]}"
    stats_file.puts "\t\tshorter than 200 bp: #{tally[:shorter_200]}"
    stats_file.puts "\t\tlonger than 500 bp: #{tally[:longer_500]}"
    stats_file.puts "\t\tshorter than 500 bp: #{tally[:shorter_500]}"
  end

  # +part+ as a percentage of +total+, formatted with three decimals.
  def fl2_percent(part, total)
    '%.3f' % (part.to_f / total.to_f * 100)
  end

end
|