full_lengther_next 0.0.8 → 0.5.6
Sign up to get free protection for your applications and to get access to all the features.
- data/.gemtest +0 -0
- data/History.txt +2 -2
- data/Manifest.txt +33 -18
- data/Rakefile +4 -2
- data/bin/download_fln_dbs.rb +310 -158
- data/bin/full_lengther_next +160 -103
- data/bin/make_test_dataset.rb +236 -0
- data/bin/make_user_db.rb +101 -117
- data/bin/plot_fln.rb +270 -0
- data/bin/plot_taxonomy.rb +70 -0
- data/lib/expresscanvas.zip +0 -0
- data/lib/full_lengther_next.rb +3 -3
- data/lib/full_lengther_next/classes/artifacts.rb +66 -0
- data/lib/full_lengther_next/classes/blast_functions.rb +326 -0
- data/lib/full_lengther_next/classes/cdhit.rb +154 -0
- data/lib/full_lengther_next/classes/chimeric_seqs.rb +315 -57
- data/lib/full_lengther_next/classes/common_functions.rb +105 -63
- data/lib/full_lengther_next/classes/exonerate_result.rb +258 -0
- data/lib/full_lengther_next/classes/fl_analysis.rb +226 -617
- data/lib/full_lengther_next/classes/fl_string_utils.rb +4 -2
- data/lib/full_lengther_next/classes/fln_stats.rb +598 -557
- data/lib/full_lengther_next/classes/handle_db.rb +30 -0
- data/lib/full_lengther_next/classes/my_worker.rb +308 -138
- data/lib/full_lengther_next/classes/my_worker_EST.rb +54 -0
- data/lib/full_lengther_next/classes/my_worker_manager_EST.rb +69 -0
- data/lib/full_lengther_next/classes/my_worker_manager_fln.rb +389 -0
- data/lib/full_lengther_next/classes/nc_rna.rb +5 -7
- data/lib/full_lengther_next/classes/reptrans.rb +210 -0
- data/lib/full_lengther_next/classes/sequence.rb +439 -80
- data/lib/full_lengther_next/classes/test_code.rb +15 -16
- data/lib/full_lengther_next/classes/types.rb +12 -0
- data/lib/full_lengther_next/classes/une_los_hit.rb +148 -230
- data/lib/full_lengther_next/classes/warnings.rb +40 -0
- metadata +207 -93
- data/lib/full_lengther_next/classes/lcs.rb +0 -33
- data/lib/full_lengther_next/classes/my_worker_manager.rb +0 -240
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'types'
|
1
2
|
|
2
3
|
module NcRna
|
3
4
|
|
@@ -8,14 +9,11 @@ module NcRna
|
|
8
9
|
raise "BLAST query name and sequence are different"
|
9
10
|
end
|
10
11
|
|
11
|
-
|
12
|
+
hit=blast_query.hits.first
|
12
13
|
|
13
|
-
if
|
14
|
-
|
15
|
-
seq.
|
16
|
-
else
|
17
|
-
unknown_annot = seq.get_annotations(:tcode_unknown).first
|
18
|
-
seq.annotate(:tcode, unknown_annot[:message],true)
|
14
|
+
if !hit.nil? && hit.align_len >= 40 # There is match in blast and it has a good length.
|
15
|
+
seq.hit = hit
|
16
|
+
seq.type = NCRNA
|
19
17
|
end
|
20
18
|
end
|
21
19
|
end
|
@@ -0,0 +1,210 @@
|
|
1
|
+
require 'scbi_mapreduce'
|
2
|
+
require 'my_worker_manager_EST' #Second server
|
3
|
+
require 'fln_stats'
|
4
|
+
require 'types'
|
5
|
+
|
6
|
+
########################################################################
|
7
|
+
# MAIN FUNCTION
|
8
|
+
########################################################################
|
9
|
+
# Builds the representative transcriptome under ./fln_results.
#
# Steps, in order:
# 1. Cluster the protein-annotated sequences and write their representatives
#    to Representative_transcriptome.fasta.
# 2. If options[:est_db] is given, pool coding + unknown sequences, reduce
#    them with cd-hit, build a nucleotide BLAST database from the EST file
#    (only when <blast_path>.nsq does not exist yet) and run the EST pass.
# 3. Keep the remaining sequences longer than 500 nt, order them by test
#    code score (descending) and then by length (descending), record stats
#    and append them to the FASTA.
# 4. Write the HTML and text statistics reports.
#
# seqs_annotation_prot - sequences annotated against a protein database.
# seqs_some_coding     - sequences classified as putatively coding.
# seqs_unknown         - sequences without classification.
# options              - options hash; reads :est_db and :high_clustering here
#                        (plus whatever count_cpu / do_blast_with_EST read).
def reptrans(seqs_annotation_prot, seqs_some_coding ,seqs_unknown, options)
  cpus = count_cpu(options)
  stats_hash = initialize_stats_hash_reptrans

  # Paths
  #---------------------------------------------
  main_path = File.join(Dir.pwd, 'fln_results')
  reptrans_fasta = File.join(main_path, 'Representative_transcriptome.fasta')
  blast_path = File.join(main_path, 'ESTdb')
  cluster_prot_annotated_path = File.join(main_path, 'Prot_clusters')
  cluster_EST_annotated_path = File.join(main_path, 'EST_clusters')
  html_file = File.join(main_path, 'Representative_transcriptome_stats.html')
  txt_file = File.join(main_path, 'Representative_transcriptome_stats.txt')

  # Prot annotations sequence analysis
  #---------------------------------------------
  analysis_over_DB_annotated_seqs(seqs_annotation_prot, reptrans_fasta, cluster_prot_annotated_path, stats_hash, 'prot_annotated', options[:high_clustering])
  seqs_annotation_prot = nil # drop the local reference, it is not needed any more

  # NOT Prot annotations sequence analysis
  #---------------------------------------------
  putative_seqs = seqs_some_coding
  if !options[:est_db].nil? # WITH EST DATABASE
    putative_seqs += seqs_unknown # Coding & unknown
    putative_seqs = reduce_pool_sequences(putative_seqs, main_path, cpus)
    # File.exist? instead of File.exists?, which was removed in Ruby 3.2.
    if !File.exist?(blast_path + '.nsq')
      $LOG.info "Start makeblastdb over EST DB"
      # Argument-list form: no shell is involved, so paths containing spaces
      # or shell metacharacters are passed through untouched. stdout goes to
      # the log file, as the former "> file" redirection did.
      system('makeblastdb', '-in', options[:est_db].to_s, '-out', blast_path,
             '-dbtype', 'nucl', '-parse_seqids',
             :out => File.join(main_path, 'log_makeblast_db'))
      $LOG.info "Ended makeblastdb over EST DB"
    end
    putative_seqs = do_blast_with_EST(putative_seqs, options, reptrans_fasta, blast_path, cluster_EST_annotated_path, stats_hash)
  end

  # Coding sequence analysis
  #---------------------------------------------
  if !putative_seqs.nil? && !putative_seqs.empty?
    putative_seqs = select_seqs_more_500pb(putative_seqs)
    putative_seqs = reduce_pool_sequences(putative_seqs, main_path, cpus) if options[:est_db].nil? # NOT EST database
    # Order by test code (first) and sequence length (last), both descending.
    putative_seqs.sort! do |s1, s2|
      if s2.t_code == s1.t_code
        s2.fasta_length <=> s1.fasta_length
      else
        s2.t_code <=> s1.t_code
      end
    end
    putative_seqs.each do |coding_seq|
      coding_stats_reptrans(coding_seq, stats_hash)
    end

    write_fasta(putative_seqs, reptrans_fasta, 'a')
  end
  write_reptrans_stats(stats_hash, html_file, txt_file)
end
|
63
|
+
########################################################################
|
64
|
+
# END MAIN FUNCTION
|
65
|
+
########################################################################
|
66
|
+
|
67
|
+
# Clusters database-annotated sequences, picks one representative per cluster,
# updates the stats, writes the cluster report and writes the representatives
# to the FASTA file.
#
# seqs_annotation_DB - sequences carrying a database hit.
# reptrans_fasta     - path of the representative transcriptome FASTA.
# cluster_file_path  - path of the cluster report to write.
# stats_hash         - stats accumulator; stats_hash[key_stats] is incremented
#                      by the number of representatives.
# key_stats          - key inside stats_hash to update.
# pfam_clustering    - when truthy, the representatives are clustered a second
#                      time by their :pfam_id annotation. Defaults to false so
#                      that five-argument callers (such as the EST pass) work
#                      instead of raising ArgumentError.
#
# NOTE(review): the FASTA is opened in 'w' mode, so calling this twice with the
# same reptrans_fasta replaces what the first call wrote — confirm that this is
# what the EST pass should do.
def analysis_over_DB_annotated_seqs(seqs_annotation_DB, reptrans_fasta, cluster_file_path, stats_hash, key_stats, pfam_clustering = false)
  clusters_seqs_annot_DB = clustering_by_id(seqs_annotation_DB)
  representative_seqs_annot_DB = select_representative(clusters_seqs_annot_DB)
  if pfam_clustering
    # pfam id, fix get the annotation guide on my_worker_manager_fln (@@func_annot_type) to this scope
    clusters_seqs_annot_DB = clustering_by_annot(representative_seqs_annot_DB, :pfam_id)
    # merge clusters by id and by pfam
    representative_seqs_annot_DB = select_representative(clusters_seqs_annot_DB)
  end
  stats_hash[key_stats] += representative_seqs_annot_DB.length
  report_clustering(cluster_file_path, clusters_seqs_annot_DB, representative_seqs_annot_DB)
  write_fasta(representative_seqs_annot_DB, reptrans_fasta, 'w')
end
|
78
|
+
|
79
|
+
# Writes the cluster report: one line per representative, formatted as
# "<representative name>\t<member1>;<member2>;...".
#
# cluster_file_path            - path of the report (overwritten).
# clusters_seqs_annot_DB       - array of clusters (arrays of sequences).
# representative_seqs_annot_DB - representatives; element i belongs to
#                                cluster i.
def report_clustering(cluster_file_path, clusters_seqs_annot_DB, representative_seqs_annot_DB)
  # Block form closes the file even when a sequence raises while writing.
  File.open(cluster_file_path, 'w') do |cluster_file|
    representative_seqs_annot_DB.each_with_index do |rep_seq, i|
      cluster_seqs = clusters_seqs_annot_DB[i].map { |seq| seq.seq_name }.join(';')
      cluster_file.puts "#{rep_seq.seq_name}\t#{cluster_seqs}"
    end
  end
  nil
end
|
87
|
+
|
88
|
+
# Removes redundancy from a pool of sequences with cd-hit (identity 0.95).
#
# The pool is written to <main_path>/temp.fasta, cd-hit writes its result to
# <main_path>/temp_cln.fasta, and only the sequences whose names survive in
# that result are returned. cd-hit is skipped when temp_cln.fasta already
# exists.
#
# putative_seqs - sequences to reduce.
# main_path     - working directory for the temporary and log files.
# cpu           - number of threads handed to cd-hit (-T).
#
# Returns the subset of putative_seqs kept by cd-hit.
def reduce_pool_sequences(putative_seqs, main_path, cpu)
  temp_fasta = File.join(main_path, 'temp.fasta')
  temp_fasta_clean = File.join(main_path, 'temp_cln.fasta')
  log_file = File.join(main_path, 'log_cd_hit_Cod_Unk')
  write_fasta(putative_seqs, temp_fasta, 'w')
  $LOG.info "Start cd-hit with coding and unknow sequences"
  # File.exist? instead of File.exists?, which was removed in Ruby 3.2.
  unless File.exist?(temp_fasta_clean)
    # Argument-list form avoids the shell, so paths with spaces are safe;
    # stdout is redirected to the log file as before.
    system('cd-hit', '-i', temp_fasta, '-o', temp_fasta_clean,
           '-c', '0.95', '-M', '0', '-T', cpu.to_s, :out => log_file)
  end
  $LOG.info "Ended cd-hit with coding and unknow sequences"
  cd_hit_names_putative_seqs = load_cd_hit_sequences_names(temp_fasta_clean)
  select_seqs_with_name(putative_seqs, cd_hit_names_putative_seqs)
end
|
100
|
+
|
101
|
+
# Runs the second map-reduce server, which compares the putative sequences
# against the EST BLAST database.
#
# Sequences that obtained an EST hit are clustered and their representatives
# are written through analysis_over_DB_annotated_seqs; the rest are returned.
#
# putative_seqs              - sequences to compare against the EST database.
# options                    - options hash; reads :server_ip, :port, :workers
#                              and :chunk_size, and sets :chimera to nil.
# reptrans_fasta             - path of the representative transcriptome FASTA.
# blast_path                 - path (prefix) of the EST BLAST database.
# cluster_EST_annotated_path - path of the EST cluster report.
# stats_hash                 - stats accumulator.
#
# Returns the sequences left without EST annotation.
def do_blast_with_EST(putative_seqs, options, reptrans_fasta, blast_path, cluster_EST_annotated_path, stats_hash) # Second server to representative transcriptome
  $LOG.info 'Starting server for EST analysis'
  custom_worker_file = File.join(File.dirname(ROOT_PATH), 'lib', 'full_lengther_next', 'classes', 'my_worker_EST.rb')
  options[:chimera] = nil # Inactive chimeras system on RepTrans, this resume the BLAST's output

  MyWorkerManagerEst.init_work_manager(putative_seqs, options, blast_path)
  server_EST = ScbiMapreduce::Manager.new(options[:server_ip], options[:port], options[:workers], MyWorkerManagerEst, custom_worker_file, STDOUT, FULL_LENGTHER_NEXT_INIT)
  server_EST.chunk_size = options[:chunk_size]
  server_EST.start_server
  $LOG.info 'Closing server for EST analysis'

  seqs_with_EST, putative_seqs = MyWorkerManagerEst.get_array_seqs
  unless seqs_with_EST.empty?
    # analysis_over_DB_annotated_seqs takes six arguments; the pfam clustering
    # flag is passed explicitly (false) to avoid an ArgumentError.
    # NOTE(review): that helper opens reptrans_fasta in 'w' mode — confirm it
    # should replace the file content written before this pass.
    analysis_over_DB_annotated_seqs(seqs_with_EST, reptrans_fasta, cluster_EST_annotated_path, stats_hash, 'est_annotated', false)
  end
  putative_seqs
end
|
118
|
+
|
119
|
+
|
120
|
+
# Reads the sequence names from a FASTA file (here, the cd-hit output).
#
# Every line starting with '>' is a header; the returned name is that line
# without its trailing newline and with every '>' character removed.
#
# file - path of the FASTA file.
#
# Returns an array of names in file order.
def load_cd_hit_sequences_names(file)
  names = []
  # File.foreach closes the handle when done (File.open(...).readlines did not).
  File.foreach(file) do |line|
    names << line.chomp.delete('>') if line =~ /^>/
  end
  names
end
|
131
|
+
|
132
|
+
# Keeps the sequences strictly longer than a minimum length.
#
# seqs_array - sequences responding to fasta_length.
# min_length - exclusive lower bound in nucleotides (default 500, the value
#              that used to be hard-coded).
#
# Returns a new array; seqs_array is not modified.
def select_seqs_more_500pb(seqs_array, min_length = 500)
  seqs_array.select { |seq| seq.fasta_length > min_length }
end
|
136
|
+
|
137
|
+
# Keeps the sequences whose name appears in a list of names.
#
# array_seqs  - sequences responding to seq_name.
# array_names - names to keep.
#
# Returns a new array with the matching sequences, in their original order.
def select_seqs_with_name(array_seqs, array_names)
  # Index the names once: a hash lookup per sequence instead of scanning the
  # whole name array for every sequence.
  wanted = {}
  array_names.each { |name| wanted[name] = true }
  array_seqs.select { |seq| wanted.key?(seq.seq_name) }
end
|
141
|
+
|
142
|
+
# Writes sequences to a file in FASTA format (">name", then the sequence).
#
# seqs_array - sequences responding to seq_name and seq_fasta.
# file_name  - path of the output file.
# mode       - File.open mode, e.g. 'w' to overwrite or 'a' to append.
def write_fasta(seqs_array, file_name, mode)
  # Block form closes the file even when a sequence raises while writing.
  File.open(file_name, mode) do |file|
    seqs_array.each do |seq|
      file.puts ">#{seq.seq_name}\n#{seq.seq_fasta}"
    end
  end
  nil
end
|
149
|
+
|
150
|
+
# Groups sequences that share the same hit accession.
#
# seqs_with_hit - sequences responding to get_acc.
#
# Returns an array of clusters (arrays of sequences). Clusters are ordered by
# the first appearance of their accession and keep the input order inside.
def clustering_by_id(seqs_with_hit)
  # group_by keeps keys in insertion order, so this matches the former
  # index-lookup loop without rescanning the accession list per sequence.
  seqs_with_hit.group_by { |seq| seq.get_acc }.values
end
|
164
|
+
|
165
|
+
# Groups sequences that share the same functional annotation.
#
# The annotation value is a ';'-separated list; its items are sorted before
# comparing, so "B;A" and "A;B" fall into the same cluster. Sequences whose
# annotation is missing (nil) or '-' each get a cluster of their own.
#
# seqs_with_hit   - sequences responding to functional_annotations (a hash).
# annotation_type - key to read from functional_annotations (e.g. :pfam_id).
#
# Returns an array of clusters: annotated clusters first, in order of first
# appearance, followed by the single-sequence clusters without annotation.
def clustering_by_annot(seqs_with_hit, annotation_type)
  annotated = {} # normalized annotation => cluster (insertion ordered)
  no_annotation_clusters = []
  seqs_with_hit.each do |seq|
    annot = seq.functional_annotations[annotation_type]
    annot = annot.split(';').sort.join(';') unless annot.nil?
    if annot.nil? || annot == '-'
      no_annotation_clusters << [seq]
    else
      (annotated[annot] ||= []) << seq
    end
  end
  annotated.values.concat(no_annotation_clusters)
end
|
187
|
+
|
188
|
+
# Chooses one representative sequence per cluster.
#
# Preference order inside a cluster:
# 1. the longest sequence whose type is COMPLETE (full-length);
# 2. otherwise, the longest sequence among those with the best hit identity.
#
# "Longest" is measured on seq_fasta.length. The previous code compared the
# sequence strings themselves, which orders them alphabetically, not by size.
#
# clusters_seqs_annot_prot - array of non-empty clusters (arrays of sequences
#                            responding to type, seq_fasta and get_pident).
#
# Returns an array with one sequence per cluster, in cluster order.
# Side effect kept from the original: a cluster without COMPLETE sequences is
# sorted in place by descending identity.
def select_representative(clusters_seqs_annot_prot)
  clusters_seqs_annot_prot.map do |cluster|
    full_lengths = cluster.select { |s| s.type == COMPLETE }
    seq = full_lengths.max_by { |fl| fl.seq_fasta.length } # nil when there is none
    if seq.nil?
      cluster.sort! { |cl1, cl2| cl2.get_pident <=> cl1.get_pident }
      best_pident = cluster.first.get_pident
      best_hits = cluster.select { |s| s.get_pident == best_pident }
      seq = best_hits.max_by { |s| s.seq_fasta.length }
    end
    seq
  end
end
|
201
|
+
|
202
|
+
# Derives the number of CPUs to use from options[:workers].
#
# When :workers is an Array, the result is its length plus one; otherwise
# the :workers value is returned unchanged.
#
# options - options hash; only :workers is read.
def count_cpu(options)
  workers = options[:workers]
  # is_a? instead of comparing the class name as a string.
  workers.is_a?(Array) ? workers.length + 1 : workers
end
|
@@ -1,113 +1,472 @@
|
|
1
|
-
|
2
1
|
require 'orf'
|
2
|
+
require 'types'
|
3
|
+
require 'warnings'
|
4
|
+
require 'common_functions'
|
3
5
|
|
4
6
|
# A transcript (unigene) with the classification, hit, warnings and
# annotations gathered for it, plus the writers that dump it to the
# per-category output files.
#
# Boolean attributes use the true/false literals: the TRUE/FALSE constants
# used before were removed from Ruby in 3.2.
class Sequence

  # Codon => one-letter amino acid ('*' marks a stop codon). Built once
  # instead of on every print_alignment call.
  CODON_TABLE = {
    'GCT'=>'A','GCC'=>'A','GCA'=>'A','GCG'=>'A',
    'CGT'=>'R','CGC'=>'R','CGA'=>'R','CGG'=>'R','AGA'=>'R','AGG'=>'R',
    'AAT'=>'N','AAC'=>'N',
    'GAT'=>'D','GAC'=>'D',
    'TGT'=>'C','TGC'=>'C',
    'CAA'=>'Q','CAG'=>'Q',
    'GAA'=>'E','GAG'=>'E',
    'GGT'=>'G','GGC'=>'G','GGA'=>'G','GGG'=>'G',
    'CAT'=>'H','CAC'=>'H',
    'ATT'=>'I','ATC'=>'I','ATA'=>'I',
    'TTA'=>'L','TTG'=>'L','CTT'=>'L','CTC'=>'L','CTA'=>'L','CTG'=>'L',
    'ATG'=>'M',
    'AAA'=>'K','AAG'=>'K',
    'TTT'=>'F','TTC'=>'F',
    'CCT'=>'P','CCC'=>'P','CCA'=>'P','CCG'=>'P',
    'TCT'=>'S','TCC'=>'S','TCA'=>'S','TCG'=>'S','AGT'=>'S','AGC'=>'S',
    'ACT'=>'T','ACC'=>'T','ACA'=>'T','ACG'=>'T',
    'TGG'=>'W',
    'TAT'=>'Y','TAC'=>'Y',
    'GTT'=>'V','GTC'=>'V','GTA'=>'V','GTG'=>'V',
    'TAG'=>'*','TGA'=>'*','TAA'=>'*'
  }.freeze

  attr_accessor :seq_name, :seq_fasta, :fasta_length, :db_name, :seq_nt, :seq_aa, :db, :type, :status, :id, :orfs, :area_without_annotation, :save_fasta, :ignore, :hit, :t_code, :functional_annotations

  # seq_name  - sequence identifier.
  # seq_fasta - nucleotide sequence.
  # seq_qual  - accepted for compatibility; not used in this class.
  def initialize(seq_name, seq_fasta, seq_qual='')
    @seq_name = seq_name
    @seq_fasta = seq_fasta
    @fasta_length = seq_fasta.length
    @db_name = nil
    @seq_nt = nil # Unigen sequence with tagged ATG & stop
    @seq_aa = nil # Protein sequence generated over unigen
    @db = nil
    @type = UNKNOWN # See types.rb
    @status = false # true => Sure, false => Putative
    @id = nil # Prot or EST id, can be several => array
    @warnings = []
    @annotations = []
    @functional_annotations = {}
    @orfs = []

    @area_without_annotation = false
    @save_fasta = true
    @ignore = false
    @hit = nil
    @t_code = 0
  end

  # Builds an Orf from the given values and appends it to @orfs.
  def add_orf(orf_seq, orf_t_start, orf_t_end, orf_q_frame, orf_stop_codon, orf_type)
    orf = Orf.new(orf_seq, orf_t_start, orf_t_end, orf_q_frame, orf_stop_codon, orf_type)
    @orfs << orf
  end

  # Replaces the uppercase IUPAC degenerate codes of @seq_fasta with concrete
  # lowercase nucleotides. Each entry is [candidate nucleotides, use counter].
  def change_degenerated_nt!
    translate_hash = {}
    translate_hash['R'] = [['a','g'],0]
    translate_hash['W'] = [['a','t'],0]
    translate_hash['M'] = [['a','c'],0]
    translate_hash['K'] = [['g','t'],0]
    translate_hash['S'] = [['g','c'],0]
    translate_hash['Y'] = [['c','t'],0]
    translate_hash['H'] = [['a','t','c'],0]
    translate_hash['B'] = [['g','t','c'],0]
    translate_hash['D'] = [['g','a','t'],0]
    translate_hash['V'] = [['g','a','c'],0]
    translate_hash['N'] = [['g','a','c','t'],0]

    fix_degenerated_fasta!(translate_hash)
  end

  # Rewrites @seq_fasta, substituting every uppercase degenerate code.
  # The per-code counter is incremented before it is used, so successive
  # occurrences of one code rotate through its candidate nucleotides.
  def fix_degenerated_fasta!(translate_hash)
    res = @seq_fasta.split('').map do |nt|
      if nt =~ /[RWMKSYHBDVN]/
        candidates, _uses = translate_hash[nt]
        translate_hash[nt][1] += 1
        nt = candidates[translate_hash[nt][1] % candidates.length]
      end
      nt
    end
    @seq_fasta = res.compact.join
  end

  def clean_orfs
    @orfs = []
  end

  # Back to the initial classification: UNKNOWN and putative.
  def reset_classification
    @type = UNKNOWN
    @status = false
  end

  def clean_warnings
    @warnings = []
  end

  def clean_annotations
    @annotations = []
  end

  # Accession of the current hit.
  def get_acc
    hit.acc
  end

  # Identity of the current hit.
  def get_pident
    hit.ident
  end

  # Turns @hit into an (empty) array, the shape write_chimera iterates over.
  def format_chimera!
    @hit = []
  end

  # Registers one warning or an array of warnings (see check_warn for the
  # accepted shapes of each warning).
  def warnings(warn)
    if warn.is_a?(Array)
      warn.each do |w|
        @warnings << check_warn(w)
      end
    else
      @warnings << check_warn(warn)
    end
  end

  # Appends a copy of every warning in array_warnings to this sequence.
  def clone_warnings(array_warnings)
    @warnings.concat(array_warnings.map { |warn| warn.dup })
  end

  # Resolves a warning into its message.
  #
  # warn - either a tag, or an array [tag, value1, value2, ...].
  #
  # The tag is looked up in $warnings_hash; when it is not found the tag
  # itself is the message. With the array form, each value replaces the next
  # '(*replace*)' placeholder of the message.
  #
  # The caller's array is not modified (the previous version removed its
  # first element with shift).
  def check_warn(warn)
    check = warn
    replace = nil
    if warn.is_a?(Array)
      check, *replace = warn # tag first, then the values to insert
    end

    message = $warnings_hash[check]
    message = check if message.nil? # If not exists the message

    unless replace.nil?
      message = message.dup # Duplicate memory to avoid overwrite original warning hash messages
      replace.each do |rep|
        message.sub!('(*replace*)', "#{rep}")
      end
    end
    return message
  end

  # Stores the test code score; a score >= 0.95 marks the sequence as sure.
  def test_code(test_code)
    @t_code = test_code
    if @t_code >= 0.95
      @status = true
    end
  end

  # FASTA record for this sequence's name and the given sequence string.
  def get_fasta(seq)
    ">#{@seq_name}\n#{seq}"
  end

  # Writes this sequence to the output file matching its type.
  #
  # output_files - hash of open files. Keys read here: 'seqs', 'db', 'prot',
  #                'nts', 'align', and the type constants themselves.
  #
  # Raises RuntimeError for a type that is neither handled nor FAILED.
  def write_info(output_files) # Output_files is a hash
    if @save_fasta
      output_files['seqs'].puts get_fasta(@seq_fasta)
    end
    case @type
    when OTHER
      write_other(output_files[@type])
    when CHIMERA
      write_chimera(output_files[@type])
    when MISASSEMBLED
      write_misassembled(output_files[@type])
    when UNKNOWN
      write_unknown(output_files[@type])
    when COMPLETE .. INTERNAL
      write_prot_annot(output_files['db'])
      write_prot_seq(output_files['prot'])
      write_nt_seq(output_files['nts'])
      write_align(output_files['align'])
    when NCRNA
      write_ncrna(output_files[@type])
    when CODING
      write_coding(output_files[@type])
    else
      if @type != FAILED
        raise "#{@type} is an incorrect type"
      end
    end
  end

  # All warning messages joined by a single space.
  def all_warns
    @warnings.join(' ')
  end

  def write_other(file)
    file.puts "#{@seq_name}\t#{@fasta_length}\t#{@hit.acc}\t#{@db_name}\t#{all_warns}"
  end

  # One line per hit, followed by a blank line. Coordinates are printed
  # with +1 added.
  def write_chimera(file) #TODO : write 'SOLVED' tag
    @hit.each do |h|
      file.puts "#{@seq_name}\t#{@fasta_length}\t#{h.acc}\t#{@db_name}\t#{h.q_frame}\t#{h.e_val}\t#{h.ident}\t#{h.q_beg + 1}\t#{h.q_end + 1}\t#{h.s_beg + 1}\t#{h.s_end + 1}\t#{h.definition}"
    end
    file.puts
  end

  def write_misassembled(file)
    file.puts "#{@seq_name}\t#{@fasta_length}\t#{@hit.acc}\t#{@db_name}"
  end

  def write_unknown(file)
    # hit is an array. 2 => q_frame, 1 ORF end, 0 ORF beg
    if @hit.is_a?(Array)
      orf_beg = @hit[0]
      orf_end = @hit[1]
      q_frame = @hit[2]
    else
      orf_beg = '-'
      orf_end = '-'
      q_frame = '-'
    end
    file.puts "#{@seq_name}\t#{@fasta_length}\t#{@t_code}\t#{all_warns}\t#{q_frame}\t#{orf_beg}\t#{orf_end}"
  end

  # Protein annotation line; the functional annotation columns default to
  # nine '-' when no functional annotation is stored.
  def write_prot_annot(file)
    final_func_annot = Array.new(9, '-')
    if !@functional_annotations.empty?
      final_func_annot = @functional_annotations.values
    end
    file.puts "#{@seq_name}\t#{@fasta_length}\t#{@hit.acc}\t#{@db_name}\t#{prot_annot_calification}\t#{@hit.e_val}\t#{@hit.ident}\t#{@hit.full_subject_length}\t#{@seq_aa.length}\t#{all_warns}\t#{@hit.q_frame}\t#{@hit.q_beg + 1}\t#{@hit.q_end + 1}\t#{@hit.s_beg + 1}\t#{@hit.s_end + 1}\t#{@hit.definition}\t#{final_func_annot.join("\t")}"
  end

  def write_ncrna(file)
    file.puts "#{@seq_name}\t#{@fasta_length}\t#{@hit.acc}\t#{@hit.e_val}\t#{@hit.ident}\t#{@hit.q_beg + 1}\t#{@hit.q_end + 1}\t#{@hit.s_beg + 1}\t#{@hit.s_end + 1}\t#{@hit.definition}"
  end

  def write_coding(file)
    # hit is an array. 2 => q_frame, 1 ORF end, 0 ORF beg
    calification = 'Putative'
    if @status
      calification = 'Sure'
    end
    file.puts "#{@seq_name}\t#{@fasta_length}\t#{calification}\t#{@t_code}\t#{@hit.last}\t#{@hit.first}\t#{@hit[1]}"
  end

  #Write complementary files
  def write_prot_seq(file)
    file.puts get_fasta(@seq_aa)
  end

  # Writes query and subject aligned sequences on two lines, then a blank
  # line, padding the names with tabs.
  def write_align(file)
    # NOTE(review): length/8 is an integer division, so the former .ceil on
    # its result had no effect; the truncating behavior is kept — confirm
    # whether rounding up was intended.
    tabs = seq_name.length / 8
    if tabs == 0
      tabs = 1
    end
    second_tab = 0
    if seq_name.length > 7
      second_tab = 1
    end
    file.puts "#{@seq_name}#{"\t"*tabs}#{@hit.q_seq}\n#{@hit.acc}#{"\t"*(tabs+second_tab)}#{@hit.s_seq}"
    file.puts
  end

  def write_nt_seq(file)
    file.puts "#{@seq_name}\t#{@fasta_length}\t#{@seq_nt}"
  end

  # Human-readable name of @type, or nil for a type not listed here.
  def calification
    case @type
    when FAILED
      'Failed'
    when OTHER
      'Other'
    when CHIMERA
      'Chimera'
    when MISASSEMBLED
      'Misassembled'
    when UNKNOWN
      'Unknown'
    when COMPLETE
      'Complete'
    when N_TERMINAL
      'N_terminal'
    when C_TERMINAL
      'C_terminal'
    when INTERNAL
      'Internal'
    when CODING
      'Coding'
    when NCRNA
      'NcRNA'
    end
  end

  # Type name followed by 'Sure' or 'Putative', e.g. "Complete Sure".
  def prot_annot_calification
    info = "#{calification} "
    if @status
      info << 'Sure'
    else
      info << 'Putative'
    end
    return info
  end

  # Prints to stdout a colored view of hit h over the nucleotide sequence
  # nts, in three parts: 5 prime region, aligned core and 3 prime region.
  #
  # h                          - hit to display.
  # nts                        - nucleotide sequence of the query.
  # show_nts                   - when truthy, nucleotides are printed below
  #                              the amino acids.
  # original_query_coordinates - optional [beg, end] pair of the alignment
  #                              before it was modified; when given, the
  #                              zones that changed are highlighted.
  def show_alignment(h, nts, show_nts, original_query_coordinates = nil)
    puts "Prot id:\t#{h.acc}", "Alignment length:\t#{h.align_len} aa", "Subject length:\t#{h.s_len} aa", "Query length:\t#{nts.length/3} aa"
    puts prot_annot_calification
    puts

    aa_unigen = nts[h.q_frame - 1 .. nts.length-1].translate
    index = contenidos_en_prot(h.q_seq, aa_unigen)

    # View displacements 5-prime/align/3-prime
    # Each subzone is [first coordinate, second coordinate, color code].
    subzone_align = nil
    if !original_query_coordinates.nil?
      subzone_align = {}
      if h.q_beg > original_query_coordinates.first #alignment has transferred characters to 5 prime
        subzone_align['beg'] = [original_query_coordinates.first, h.q_beg-3, 42] # -3 to exclude the last aa
      elsif h.q_beg < original_query_coordinates.first
        subzone_align['beg'] = [h.q_beg, original_query_coordinates.first-3, 46] #alignment has received characters from 5 prime
      end

      if h.q_end < original_query_coordinates.last #alignment has transferred characters to 3 prime
        subzone_align['end'] = [h.q_end, original_query_coordinates.last, 42]
      elsif h.q_end > original_query_coordinates.last
        subzone_align['end'] = [original_query_coordinates.last, h.q_end, 43] #alignment has received characters from 3 prime
      end
    end

    # Print 5 prime
    if index > 0 # 5 prime exists
      aa_align = aa_unigen[0 .. index-1].split('')
      nt_align = nts[h.q_frame-1..h.q_beg-1]
      print_alignment(aa_align, nt_align, 36, show_nts, subzone_align)
      reduce_coordinates(subzone_align, aa_align, h)
    end

    # Print core alignment or protein
    aa_align = h.q_seq.split('')
    nt_align = nts[h.q_beg..h.q_end]
    print_alignment(aa_align, nt_align, 32, show_nts, subzone_align)
    reduce_coordinates(subzone_align, aa_align, h)

    # Print 3 prime
    gaps = h.q_seq.count('-')
    three_prime_beg = index+h.q_seq.length-gaps
    if aa_unigen.length > three_prime_beg # 3 prime exists
      aa_align = aa_unigen[three_prime_beg .. aa_unigen.length-1].split('')
      fs = check_frame_shift(h)
      nt_align = nts[h.q_end+1-fs..nts.length-1]
      print_alignment(aa_align, nt_align, 33, show_nts, subzone_align)
    end

  end

  # Prints amino acids (and optionally their codons) in rows of 60 residues,
  # wrapped in ANSI color escapes.
  #
  # aa_align     - array of one-letter amino acids; '-' is a gap.
  # nt_align     - nucleotides covering aa_align (gaps consume no codon).
  # color        - ANSI color code for the region.
  # show_nts     - when truthy, prints the codon row and pads amino acids.
  # mark_subzone - optional hash of [first, second, color] zones; residues
  #                whose nucleotide coordinate falls inside use that color.
  #
  # An amino acid that does not match the translation of its codon is shown
  # as '?'. Codons containing 'N' and amino acids 'X' are not checked.
  def print_alignment(aa_align, nt_align, color, show_nts, mark_subzone = nil)
    original_color = color
    c = CODON_TABLE

    nt_line = ''
    aa_line = ''
    gaps = 0
    count = 0
    aa_align.each_with_index do |aa, n|
      if aa == '-'
        nt_line << '---'
        gaps += 1
      else
        # Check aa with codon
        codon_window = (n-gaps)*3
        codon = nt_align[codon_window..codon_window+2]
        nt_line << "#{codon}"
        if aa.upcase != 'X'
          if codon.upcase.include?('N')
            traslated_aa = '-'
          else
            # NOTE(review): the table keys are uppercase and codon is looked
            # up as is, so a codon with lowercase letters is reported as a
            # mismatch — confirm whether that is intended.
            traslated_aa = c[codon]
          end
          if traslated_aa != '-' && traslated_aa != aa
            puts "#{traslated_aa} #{aa}"
            aa = '?'
          end
        end
      end
      if !mark_subzone.nil?
        nts_coordenate = (n-gaps)*3
        mark_subzone.values.each do |subzone|
          if nts_coordenate >= subzone[0] && nts_coordenate <= subzone[1] #0 => first coordenate, 1 => second coordenate
            color = subzone.last
          end
        end
      end
      space = nil
      if show_nts
        space = ' '
      end
      aa_line << "\e[#{color}m#{space}#{aa}#{space}\e[0m"
      color = original_color
      line_length = 60
      if (n+1) % line_length == 0 || n+1 == aa_align.length
        count = n + 1
        print "#{count}\t"
        puts aa_line
        if show_nts
          print "#{count*3}\t"
          puts nt_line
        end
        aa_line = ''
        nt_line = ''
      end
    end

  end

  # Shifts the subzone coordinates left by the nucleotides already printed
  # (3 per element of aa_align, plus the frame offset), in place.
  def reduce_coordinates(subzone_align, aa_align, h)
    if !subzone_align.nil?
      aligned = 3 * aa_align.length + h.q_frame-1
      subzone_align.values.each do |subzone|
        subzone[0] -= aligned
        subzone[1] -= aligned
      end
    end
  end

  # Sets and returns @area_without_annotation: true when at least 150 nt
  # lie before the hit start or after the hit end. With an array of hits
  # only the first one is considered.
  def area_without_annotation?
    reference_hit = @hit.is_a?(Array) ? @hit.first : @hit
    upstream_annotation_space = reference_hit.q_beg
    downstream_annotation_space = @fasta_length - reference_hit.q_end
    if upstream_annotation_space >= 150 || downstream_annotation_space >= 150
      @area_without_annotation = true
    end
    return @area_without_annotation
  end

  # Copy of this sequence with its own warnings and annotations arrays
  # (elements duplicated) and an empty ORF list. Other attributes are the
  # ones produced by dup.
  def clone
    new_seq = self.dup
    new_seq.clean_annotations
    new_seq.clean_warnings
    new_seq.clean_orfs
    new_seq.clone_warnings(@warnings)
    new_seq.clone_annotations(@annotations)
    return new_seq
  end

  # Appends a copy of every annotation in array_annotations.
  def clone_annotations(array_annotations)
    @annotations.concat(array_annotations.map { |annotation| annotation.dup })
  end

end
|