full_lengther_next 0.0.8 → 0.5.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gemtest +0 -0
- data/History.txt +2 -2
- data/Manifest.txt +33 -18
- data/Rakefile +4 -2
- data/bin/download_fln_dbs.rb +310 -158
- data/bin/full_lengther_next +160 -103
- data/bin/make_test_dataset.rb +236 -0
- data/bin/make_user_db.rb +101 -117
- data/bin/plot_fln.rb +270 -0
- data/bin/plot_taxonomy.rb +70 -0
- data/lib/expresscanvas.zip +0 -0
- data/lib/full_lengther_next.rb +3 -3
- data/lib/full_lengther_next/classes/artifacts.rb +66 -0
- data/lib/full_lengther_next/classes/blast_functions.rb +326 -0
- data/lib/full_lengther_next/classes/cdhit.rb +154 -0
- data/lib/full_lengther_next/classes/chimeric_seqs.rb +315 -57
- data/lib/full_lengther_next/classes/common_functions.rb +105 -63
- data/lib/full_lengther_next/classes/exonerate_result.rb +258 -0
- data/lib/full_lengther_next/classes/fl_analysis.rb +226 -617
- data/lib/full_lengther_next/classes/fl_string_utils.rb +4 -2
- data/lib/full_lengther_next/classes/fln_stats.rb +598 -557
- data/lib/full_lengther_next/classes/handle_db.rb +30 -0
- data/lib/full_lengther_next/classes/my_worker.rb +308 -138
- data/lib/full_lengther_next/classes/my_worker_EST.rb +54 -0
- data/lib/full_lengther_next/classes/my_worker_manager_EST.rb +69 -0
- data/lib/full_lengther_next/classes/my_worker_manager_fln.rb +389 -0
- data/lib/full_lengther_next/classes/nc_rna.rb +5 -7
- data/lib/full_lengther_next/classes/reptrans.rb +210 -0
- data/lib/full_lengther_next/classes/sequence.rb +439 -80
- data/lib/full_lengther_next/classes/test_code.rb +15 -16
- data/lib/full_lengther_next/classes/types.rb +12 -0
- data/lib/full_lengther_next/classes/une_los_hit.rb +148 -230
- data/lib/full_lengther_next/classes/warnings.rb +40 -0
- metadata +207 -93
- data/lib/full_lengther_next/classes/lcs.rb +0 -33
- data/lib/full_lengther_next/classes/my_worker_manager.rb +0 -240
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'types'
|
1
2
|
|
2
3
|
module NcRna
|
3
4
|
|
@@ -8,14 +9,11 @@ module NcRna
|
|
8
9
|
raise "BLAST query name and sequence are different"
|
9
10
|
end
|
10
11
|
|
11
|
-
|
12
|
+
hit=blast_query.hits.first
|
12
13
|
|
13
|
-
if
|
14
|
-
|
15
|
-
seq.
|
16
|
-
else
|
17
|
-
unknown_annot = seq.get_annotations(:tcode_unknown).first
|
18
|
-
seq.annotate(:tcode, unknown_annot[:message],true)
|
14
|
+
if !hit.nil? && hit.align_len >= 40 # There is match in blast and it has a good length.
|
15
|
+
seq.hit = hit
|
16
|
+
seq.type = NCRNA
|
19
17
|
end
|
20
18
|
end
|
21
19
|
end
|
@@ -0,0 +1,210 @@
|
|
1
|
+
require 'scbi_mapreduce'
|
2
|
+
require 'my_worker_manager_EST' #Second server
|
3
|
+
require 'fln_stats'
|
4
|
+
require 'types'
|
5
|
+
|
6
|
+
########################################################################
|
7
|
+
# MAIN FUNCTION
|
8
|
+
########################################################################
|
9
|
+
# Builds the representative transcriptome fasta and its statistics report.
#
# Every output is written under <current working directory>/fln_results.
# Protein-annotated sequences are clustered first. The coding sequences
# (plus the unknown ones when options[:est_db] is set) are then reduced with
# reduce_pool_sequences, optionally compared against the EST database,
# filtered to more than 500 nt, ordered and appended to the fasta.
#
# seqs_annotation_prot - sequences annotated against the protein databases.
# seqs_some_coding     - sequences classified as coding.
# seqs_unknown         - sequences without classification.
# options              - options hash; :est_db and :high_clustering are read
#                        here, the hash is also handed to count_cpu and
#                        do_blast_with_EST.
#
# Returns the value of write_reptrans_stats.
def reptrans(seqs_annotation_prot, seqs_some_coding ,seqs_unknown, options)
  cpus = count_cpu(options)
  stats_hash = initialize_stats_hash_reptrans

  # Paths
  #---------------------------------------------
  main_path = File.join(Dir.pwd, 'fln_results')
  reptrans_fasta = File.join(main_path, 'Representative_transcriptome.fasta')
  blast_path = File.join(main_path, 'ESTdb')
  cluster_prot_annotated_path = File.join(main_path, 'Prot_clusters')
  cluster_EST_annotated_path = File.join(main_path, 'EST_clusters')
  html_file = File.join(main_path, 'Representative_transcriptome_stats.html')
  txt_file = File.join(main_path, 'Representative_transcriptome_stats.txt')

  # Prot annotations sequence analysis
  #---------------------------------------------
  analysis_over_DB_annotated_seqs(seqs_annotation_prot, reptrans_fasta, cluster_prot_annotated_path, stats_hash, 'prot_annotated', options[:high_clustering])
  seqs_annotation_prot = nil # drop the local reference once the sequences are written

  # NOT Prot annotations sequence analysis
  #---------------------------------------------
  putative_seqs = seqs_some_coding
  unless options[:est_db].nil? # WITH EST DATABASE
    putative_seqs += seqs_unknown # Coding & unknown
    putative_seqs = reduce_pool_sequences(putative_seqs, main_path, cpus)
    # File.exist? replaces File.exists?, which no longer exists in Ruby >= 3.2
    unless File.exist?(blast_path + '.nsq')
      $LOG.info "Start makeblastdb over EST DB"
      # Argument-list form: no shell is involved, so paths containing spaces
      # or shell metacharacters are passed through untouched.
      system('makeblastdb', '-in', options[:est_db].to_s, '-out', blast_path,
             '-dbtype', 'nucl', '-parse_seqids',
             :out => File.join(main_path, 'log_makeblast_db'))
      $LOG.info "Ended makeblastdb over EST DB"
    end
    putative_seqs = do_blast_with_EST(putative_seqs, options, reptrans_fasta, blast_path, cluster_EST_annotated_path, stats_hash)
  end

  # Coding sequence analysis
  #---------------------------------------------
  if !putative_seqs.nil? && !putative_seqs.empty?
    putative_seqs = select_seqs_more_500pb(putative_seqs)
    putative_seqs = reduce_pool_sequences(putative_seqs, main_path, cpus) if options[:est_db].nil? # NOT EST database
    # Highest testcode first; ties broken by longest sequence first
    putative_seqs.sort! do |s1, s2|
      if s2.t_code == s1.t_code
        s2.fasta_length <=> s1.fasta_length
      else
        s2.t_code <=> s1.t_code
      end
    end
    putative_seqs.each { |coding_seq| coding_stats_reptrans(coding_seq, stats_hash) }

    write_fasta(putative_seqs, reptrans_fasta, 'a')
  end
  write_reptrans_stats(stats_hash, html_file, txt_file)
end
|
63
|
+
########################################################################
|
64
|
+
# END MAIN FUNCTION
|
65
|
+
########################################################################
|
66
|
+
|
67
|
+
# Clusters annotated sequences, keeps one representative per cluster, updates
# the statistics, reports the clusters and writes the representatives.
#
# seqs_annotation_DB - sequences that carry a database hit.
# reptrans_fasta     - path of the representative transcriptome fasta.
# cluster_file_path  - path of the cluster report.
# stats_hash         - statistics hash; stats_hash[key_stats] is incremented
#                      by the number of representatives.
# key_stats          - key to update in stats_hash.
# pfam_clustering    - when truthy, representatives are clustered a second
#                      time by their :pfam_id functional annotation.
#                      Defaults to false so the five-argument call made for
#                      the EST pass is valid.
# fasta_mode         - File.open mode used for reptrans_fasta. Defaults to
#                      'w'; pass 'a' to add to an already written fasta.
def analysis_over_DB_annotated_seqs(seqs_annotation_DB, reptrans_fasta, cluster_file_path, stats_hash, key_stats, pfam_clustering = false, fasta_mode = 'w')
  clusters_seqs_annot_DB = clustering_by_id(seqs_annotation_DB)
  representative_seqs_annot_DB = select_representative(clusters_seqs_annot_DB)
  if pfam_clustering
    # Original author's note: the annotation guide (@@func_annot_type in
    # my_worker_manager_fln) should be brought into this scope instead of
    # hard-coding :pfam_id.
    clusters_seqs_annot_DB = clustering_by_annot(representative_seqs_annot_DB, :pfam_id)
    representative_seqs_annot_DB = select_representative(clusters_seqs_annot_DB) # merge clusters by id and by pfam
  end
  stats_hash[key_stats] += representative_seqs_annot_DB.length
  report_clustering(cluster_file_path, clusters_seqs_annot_DB, representative_seqs_annot_DB)
  write_fasta(representative_seqs_annot_DB, reptrans_fasta, fasta_mode)
end

# Writes one line per cluster: the representative name, a tab, and the names
# of the cluster members joined with ';'. Representative i is paired with
# cluster i.
#
# Returns nil.
def report_clustering(cluster_file_path, clusters_seqs_annot_DB, representative_seqs_annot_DB)
  # Block form closes the file even when a write raises
  File.open(cluster_file_path, 'w') do |cluster_file|
    representative_seqs_annot_DB.each_with_index do |rep_seq, i|
      cluster_seqs = clusters_seqs_annot_DB[i].map { |seq| seq.seq_name }.join(';')
      cluster_file.puts "#{rep_seq.seq_name}\t#{cluster_seqs}"
    end
  end
  nil
end

# Removes redundant sequences with cd-hit (identity threshold 0.95).
#
# The pool is written to <main_path>/temp.fasta, cd-hit writes
# <main_path>/temp_cln.fasta, and only the sequences whose names appear in
# that output are kept. cd-hit is not run when temp_cln.fasta already exists.
#
# putative_seqs - sequences to reduce.
# main_path     - working directory for the temporary and log files.
# cpu           - value passed to cd-hit's -T option.
#
# Returns the retained sequences.
def reduce_pool_sequences(putative_seqs, main_path, cpu)
  temp_fasta = File.join(main_path, 'temp.fasta')
  temp_fasta_clean = File.join(main_path, 'temp_cln.fasta')
  log_file = File.join(main_path, 'log_cd_hit_Cod_Unk')
  write_fasta(putative_seqs, temp_fasta, 'w')
  $LOG.info "Start cd-hit with coding and unknow sequences"
  # File.exist? replaces File.exists?, which no longer exists in Ruby >= 3.2
  unless File.exist?(temp_fasta_clean)
    # Argument-list form avoids the shell, so paths with spaces are safe
    system('cd-hit', '-i', temp_fasta, '-o', temp_fasta_clean, '-c', '0.95',
           '-M', '0', '-T', cpu.to_s, :out => log_file)
  end
  $LOG.info "Ended cd-hit with coding and unknow sequences"
  cd_hit_names_putative_seqs = load_cd_hit_sequences_names(temp_fasta_clean)
  select_seqs_with_name(putative_seqs, cd_hit_names_putative_seqs)
end

# Runs the second map-reduce server, which compares the putative sequences
# against the EST database, then clusters and writes the sequences that
# obtained an EST hit.
#
# Note: options[:chimera] is set to nil on the caller's hash.
#
# Returns the second array given by MyWorkerManagerEst.get_array_seqs
# (presumably the sequences without EST hit; confirm against the manager).
def do_blast_with_EST(putative_seqs, options, reptrans_fasta, blast_path, cluster_EST_annotated_path, stats_hash) # Second server to representative transcriptome
  $LOG.info 'Starting server for EST analysis'
  custom_worker_file = File.join(File.dirname(ROOT_PATH), 'lib', 'full_lengther_next', 'classes', 'my_worker_EST.rb')
  options[:chimera] = nil # Chimera detection is switched off for RepTrans

  MyWorkerManagerEst.init_work_manager(putative_seqs, options, blast_path)
  server_EST = ScbiMapreduce::Manager.new(options[:server_ip], options[:port], options[:workers], MyWorkerManagerEst, custom_worker_file, STDOUT, FULL_LENGTHER_NEXT_INIT)
  server_EST.chunk_size = options[:chunk_size]
  server_EST.start_server
  $LOG.info 'Closing server for EST analysis'

  seqs_with_EST, putative_seqs = MyWorkerManagerEst.get_array_seqs
  unless seqs_with_EST.empty?
    # No pfam clustering for EST hits, and append ('a') so the protein
    # representatives already in reptrans_fasta are not overwritten.
    analysis_over_DB_annotated_seqs(seqs_with_EST, reptrans_fasta, cluster_EST_annotated_path, stats_hash, 'est_annotated', false, 'a')
  end
  putative_seqs
end
|
118
|
+
|
119
|
+
|
120
|
+
# Collects the header lines of a fasta file.
#
# file - path of the fasta file to read.
#
# Returns an array with every line starting with '>', without the trailing
# newline and with all '>' characters removed.
def load_cd_hit_sequences_names(file)
  names = []
  # File.foreach streams the file and closes it when done; the previous
  # File.open(file).readlines never closed the handle.
  File.foreach(file) do |line|
    next unless line =~ /^>/
    names << line.chomp.delete('>')
  end
  names
end
|
131
|
+
|
132
|
+
# Keeps the sequences whose fasta_length is strictly greater than 500.
#
# Returns a new array; seqs_array is not modified.
def select_seqs_more_500pb(seqs_array)
  seqs_array.select do |candidate|
    candidate.fasta_length > 500
  end
end
|
136
|
+
|
137
|
+
# Keeps the sequences whose seq_name appears in array_names.
#
# array_seqs  - sequences to filter (order is preserved).
# array_names - names to keep.
#
# Returns a new array with the matching sequences.
def select_seqs_with_name(array_seqs, array_names)
  # Index the names once: a hash lookup per sequence instead of scanning
  # array_names for every sequence.
  wanted = {}
  array_names.each { |name| wanted[name] = true }
  array_seqs.select { |seq| wanted.key?(seq.seq_name) }
end
|
141
|
+
|
142
|
+
# Writes sequences in fasta format (">name" line followed by the sequence).
#
# seqs_array - objects responding to seq_name and seq_fasta.
# file_name  - destination path.
# mode       - File.open mode, e.g. 'w' to overwrite or 'a' to append.
#
# Returns nil.
def write_fasta(seqs_array, file_name, mode)
  # Block form guarantees the handle is closed even if a write raises
  File.open(file_name, mode) do |file|
    seqs_array.each do |seq|
      file.puts ">#{seq.seq_name}\n#{seq.seq_fasta}"
    end
  end
  nil
end
|
149
|
+
|
150
|
+
# Groups sequences that share the same get_acc value.
#
# seqs_with_hit - objects responding to get_acc.
#
# Returns an array of clusters (arrays of sequences). Clusters appear in the
# order their accession is first seen, and members keep their input order.
def clustering_by_id(seqs_with_hit)
  # group_by keeps first-seen key order and avoids searching the list of
  # accessions again for every sequence.
  seqs_with_hit.group_by { |seq| seq.get_acc }.values
end
|
164
|
+
|
165
|
+
# Groups sequences by one of their functional annotations.
#
# The annotation value is a ';'-separated list; its items are sorted before
# comparing, so "b;a" and "a;b" fall in the same cluster. Sequences whose
# annotation is missing or '-' each get a cluster of their own, placed after
# all annotated clusters.
#
# seqs_with_hit   - objects responding to functional_annotations (a hash).
# annotation_type - key to read from functional_annotations.
#
# Returns an array of clusters (arrays of sequences).
def clustering_by_annot(seqs_with_hit, annotation_type)
  clusters_by_annot = {} # insertion-ordered: first-seen annotation first
  no_annotation_clusters = []
  seqs_with_hit.each do |seq|
    annot = seq.functional_annotations[annotation_type]
    annot = annot.split(';').sort.join(';') unless annot.nil?
    if annot.nil? || annot == '-'
      no_annotation_clusters << [seq]
    else
      (clusters_by_annot[annot] ||= []) << seq
    end
  end
  clusters_by_annot.values.concat(no_annotation_clusters)
end
|
187
|
+
|
188
|
+
# Picks one representative sequence per cluster.
#
# Preference order:
#   1. the longest sequence whose type is COMPLETE;
#   2. otherwise, the longest sequence among those with the highest
#      get_pident in the cluster.
#
# When a cluster has no COMPLETE sequence it is sorted in place by
# descending get_pident (as before).
#
# clusters_seqs_annot_prot - array of clusters (arrays of sequences).
#
# Returns an array with one sequence per cluster, in cluster order.
def select_representative(clusters_seqs_annot_prot)
  clusters_seqs_annot_prot.map do |cluster|
    candidates = cluster.select { |s| s.type == COMPLETE }
    if candidates.empty?
      cluster.sort! { |cl1, cl2| cl2.get_pident <=> cl1.get_pident }
      best_pident = cluster.first.get_pident
      candidates = cluster.select { |s| s.get_pident == best_pident }
    end
    # Compare lengths: comparing the sequence strings themselves orders them
    # alphabetically and does not select the longest one.
    candidates.max_by { |s| s.seq_fasta.length }
  end
end
|
201
|
+
|
202
|
+
# Derives a CPU count from options[:workers].
#
# When options[:workers] is an Array the result is its length plus one;
# any other value is returned unchanged.
def count_cpu(options)
  workers = options[:workers]
  return workers.length + 1 if workers.instance_of?(Array)

  workers
end
|
@@ -1,113 +1,472 @@
|
|
1
|
-
|
2
1
|
require 'orf'
|
2
|
+
require 'types'
|
3
|
+
require 'warnings'
|
4
|
+
require 'common_functions'
|
3
5
|
|
4
6
|
class Sequence
|
5
7
|
|
6
|
-
attr_accessor :seq_name
|
7
|
-
|
8
|
-
def initialize(seq_name,seq_fasta,seq_qual='')
|
9
|
-
|
10
|
-
@seq_name=seq_name
|
8
|
+
attr_accessor :seq_name, :seq_fasta, :fasta_length, :db_name, :seq_nt, :seq_aa, :db, :type, :status, :id, :orfs, :area_without_annotation, :save_fasta, :ignore, :hit, :t_code, :functional_annotations
|
9
|
+
|
10
|
+
# Creates a sequence with no classification, hit, warnings or annotations.
#
# seq_name  - sequence identifier.
# seq_fasta - sequence string; its length is stored in @fasta_length.
# seq_qual  - accepted for compatibility; not used here.
def initialize(seq_name, seq_fasta, seq_qual='')
  @seq_name = seq_name
  @seq_fasta = seq_fasta
  @fasta_length = seq_fasta.length
  @db_name = nil
  @seq_nt = nil # Unigene sequence with tagged ATG & stop
  @seq_aa = nil # Protein sequence generated over the unigene
  @db = nil
  @type = UNKNOWN # See types.rb
  # Literals instead of TRUE/FALSE: those constants were removed in Ruby 3.2
  @status = false # true => Sure, false => Putative
  @id = nil # Prot or EST id, can be several => array
  @warnings = []
  @annotations = []
  @functional_annotations = {}
  @orfs = []

  @area_without_annotation = false
  @save_fasta = true
  @ignore = false
  @hit = nil
  @t_code = 0
end
|
32
|
+
|
23
33
|
|
24
|
-
def add_orf(orf_seq, orf_t_start, orf_t_end,
|
25
|
-
orf = Orf.new(orf_seq, orf_t_start, orf_t_end,
|
26
|
-
@orfs
|
34
|
+
# Builds an Orf from the given values and appends it to @orfs.
#
# Returns @orfs.
def add_orf(orf_seq, orf_t_start, orf_t_end, orf_q_frame, orf_stop_codon, orf_type)
  @orfs << Orf.new(orf_seq, orf_t_start, orf_t_end, orf_q_frame, orf_stop_codon, orf_type)
end
|
27
38
|
|
39
|
+
|
40
|
+
# Replaces the degenerate nucleotide codes of @seq_fasta with concrete
# nucleotides.
#
# A table is built that maps each code to its candidate nucleotides and a
# usage counter starting at 0; the substitution itself is done by
# fix_degenerated_fasta!.
def change_degenerated_nt!
  candidates_by_code = [
    ['R', %w[a g]],
    ['W', %w[a t]],
    ['M', %w[a c]],
    ['K', %w[g t]],
    ['S', %w[g c]],
    ['Y', %w[c t]],
    ['H', %w[a t c]],
    ['B', %w[g t c]],
    ['D', %w[g a t]],
    ['V', %w[g a c]],
    ['N', %w[g a c t]]
  ]

  translate_hash = {}
  candidates_by_code.each do |code, nucleotides|
    translate_hash[code] = [nucleotides, 0]
  end

  fix_degenerated_fasta!(translate_hash)
end
|
57
|
+
|
29
58
|
|
30
|
-
def
|
31
|
-
|
59
|
+
# Substitutes every uppercase degenerate code (R W M K S Y H B D V N) in
# @seq_fasta and stores the result back in @seq_fasta.
#
# translate_hash - maps a code to [candidate nucleotides, counter]. The
#                  counter is incremented on each occurrence and then used
#                  (modulo the number of candidates) to pick the replacement,
#                  so repeated codes rotate through their candidates. The
#                  counters in the hash are modified.
#
# Returns the new @seq_fasta.
def fix_degenerated_fasta!(translate_hash)
  @seq_fasta = @seq_fasta.gsub(/[RWMKSYHBDVN]/) do |code|
    entry = translate_hash[code]
    entry[1] += 1
    entry[0][entry[1] % entry[0].length]
  end
end
|
33
|
-
|
34
|
-
def
|
35
|
-
@
|
36
|
-
@rejected_message=message
|
78
|
+
|
79
|
+
# Replaces @orfs with a new empty array (the previous array object is left
# untouched).
def clean_orfs
  @orfs = Array.new
end
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
82
|
+
|
83
|
+
# Puts the sequence back in its unclassified state: type UNKNOWN and
# status false (putative).
def reset_classification
  @type = UNKNOWN
  # Literal false: the FALSE constant was removed in Ruby 3.2
  @status = false
end
|
87
|
+
|
88
|
+
# Replaces @warnings with a new empty array (the previous array object is
# left untouched).
def clean_warnings
  @warnings = Array.new
end
|
91
|
+
|
92
|
+
# Replaces @annotations with a new empty array (the previous array object is
# left untouched).
def clean_annotations
  @annotations = Array.new
end
|
95
|
+
|
96
|
+
# Returns the acc value of the stored hit.
def get_acc
  hit.acc
end
|
100
|
+
|
101
|
+
# Returns the ident value of the stored hit.
def get_pident
  hit.ident
end
|
105
|
+
|
106
|
+
# Turns @hit into a new empty array, discarding any previous hit.
def format_chimera!
  @hit = Array.new
end
|
43
109
|
|
44
|
-
def
|
110
|
+
# Adds one or several warnings to @warnings.
#
# warn - a single warning, or an Array of warnings; each one is passed
#        through check_warn before being stored.
def warnings(warn)
  if warn.instance_of?(Array)
    warn.each { |single| @warnings << check_warn(single) }
  else
    @warnings << check_warn(warn)
  end
end
|
45
119
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
120
|
+
# Appends a duplicate of every element of array_warnings to @warnings, so
# later in-place changes to the copies do not affect the source array.
#
# Returns @warnings.
def clone_warnings(array_warnings)
  # concat instead of map-for-side-effect: no throwaway result array
  @warnings.concat(array_warnings.map { |warn| warn.dup })
end
|
123
|
+
|
124
|
+
# Resolves a warning to its message text.
#
# warn - either a warning tag, or an Array whose first element is the tag
#        and whose remaining elements are substituted, in order, for the
#        '(*replace*)' placeholders of the message.
#
# The message is looked up in $warnings_hash; when the tag is not found the
# tag itself is used as the message.
#
# Returns the message.
def check_warn(warn)
  if warn.instance_of?(Array)
    # Destructure instead of warn.shift so the caller's array is not modified
    check, *replace = warn
  else
    check = warn
    replace = nil
  end

  message = $warnings_hash[check]
  message = check if message.nil? # tag without a registered message
  return message if replace.nil?

  message = message.dup # never edit the shared message stored in $warnings_hash
  replace.each do |rep|
    # Block form inserts the value literally; a replacement *string* would
    # have its backslash sequences (\0, \1, ...) interpreted.
    message.sub!('(*replace*)') { rep.to_s }
  end
  message
end
|
57
|
-
|
58
|
-
def
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
tranlaste_hash = {}
|
64
|
-
tranlaste_hash['R']= [['a','g'],0]
|
65
|
-
tranlaste_hash['W']= [['a','t'],0]
|
66
|
-
tranlaste_hash['M']= [['a','c'],0]
|
67
|
-
tranlaste_hash['K']= [['g','t'],0]
|
68
|
-
tranlaste_hash['S']= [['g','c'],0]
|
69
|
-
tranlaste_hash['Y']= [['c','t'],0]
|
70
|
-
tranlaste_hash['H']= [['a','t','c'],0]
|
71
|
-
tranlaste_hash['B']= [['g','t','c'],0]
|
72
|
-
tranlaste_hash['D']= [['g','a','t'],0]
|
73
|
-
tranlaste_hash['V']= [['g','a','c'],0]
|
74
|
-
tranlaste_hash['N']= [['g','a','c','t'],0]
|
75
|
-
|
76
|
-
########################################
|
77
|
-
|
78
|
-
fix_degenerated_fasta!(tranlaste_hash)
|
79
|
-
|
80
|
-
|
145
|
+
|
146
|
+
# Stores the testcode value in @t_code and marks the sequence as sure
# (@status = true) when the value is at least 0.95. @status is left as it
# was for lower values.
def test_code(test_code)
  @t_code = test_code
  # Literal true: the TRUE constant was removed in Ruby 3.2
  @status = true if @t_code >= 0.95
end
|
82
|
-
|
83
|
-
def fix_degenerated_fasta!(tranlaste_hash)
|
84
|
-
s = @seq_fasta
|
85
|
-
res = []
|
86
152
|
|
87
|
-
|
153
|
+
# Formats seq as a fasta record headed by this sequence's name.
#
# Returns ">name", a newline and seq (no trailing newline).
def get_fasta(seq)
  header = ">#{@seq_name}"
  "#{header}\n#{seq}"
end
|
88
157
|
|
89
|
-
|
90
|
-
|
91
|
-
|
158
|
+
# Writes this sequence to the output files that correspond to its type.
#
# output_files - hash of open output handles. 'seqs' receives the fasta
#                record when @save_fasta is set; the remaining handles are
#                looked up by @type, or by 'db', 'prot', 'nts' and 'align'
#                for types in the COMPLETE..INTERNAL range.
#
# Nothing else is written for FAILED sequences.
#
# Raises RuntimeError for any other unrecognised type.
def write_info(output_files) # Output_files is a hash
  output_files['seqs'].puts get_fasta(@seq_fasta) if @save_fasta

  case @type
  when OTHER        then write_other(output_files[@type])
  when CHIMERA      then write_chimera(output_files[@type])
  when MISASSEMBLED then write_misassembled(output_files[@type])
  when UNKNOWN      then write_unknown(output_files[@type])
  when COMPLETE .. INTERNAL
    write_prot_annot(output_files['db'])
    write_prot_seq(output_files['prot'])
    write_nt_seq(output_files['nts'])
    write_align(output_files['align'])
  when NCRNA        then write_ncrna(output_files[@type])
  when CODING       then write_coding(output_files[@type])
  else
    raise "#{@type} is an incorrect type" if @type != FAILED
  end
end
|
92
186
|
|
93
|
-
|
187
|
+
# Returns every stored warning joined by a single space.
def all_warns
  @warnings.join(' ')
end
|
94
191
|
|
95
|
-
|
96
|
-
|
97
|
-
|
192
|
+
# Writes one tab-separated line: name, length, hit accession, database name
# and warnings.
def write_other(file)
  columns = [@seq_name, @fasta_length, @hit.acc, @db_name, all_warns]
  file.puts columns.map { |value| value.to_s }.join("\t")
end
|
98
195
|
|
99
|
-
|
196
|
+
# Writes one tab-separated line per hit of a chimeric sequence, followed by
# an empty line. Query and subject coordinates are written plus one.
# TODO : write 'SOLVED' tag
def write_chimera(file)
  @hit.each do |h|
    columns = [
      @seq_name, @fasta_length, h.acc, @db_name, h.q_frame, h.e_val, h.ident,
      h.q_beg + 1, h.q_end + 1, h.s_beg + 1, h.s_end + 1, h.definition
    ]
    file.puts columns.map { |value| value.to_s }.join("\t")
  end
  file.puts
end
|
202
|
+
|
203
|
+
# Writes one tab-separated line: name, length, hit accession and database
# name.
def write_misassembled(file)
  columns = [@seq_name, @fasta_length, @hit.acc, @db_name]
  file.puts columns.map { |value| value.to_s }.join("\t")
end
|
206
|
+
|
207
|
+
# Writes one tab-separated line: name, length, testcode, warnings, frame,
# ORF begin and ORF end.
#
# When the hit is an Array it is read as [ORF begin, ORF end, q_frame];
# otherwise the three ORF columns are written as '-'.
def write_unknown(file)
  orf_beg, orf_end, q_frame =
    if hit.instance_of?(Array)
      [@hit[0], @hit[1], @hit[2]]
    else
      ['-', '-', '-']
    end
  columns = [@seq_name, @fasta_length, @t_code, all_warns, q_frame, orf_beg, orf_end]
  file.puts columns.map { |value| value.to_s }.join("\t")
end
|
220
|
+
|
221
|
+
# Writes the protein annotation line of the sequence (tab-separated): name,
# length, hit accession, database, status description, e-value, identity,
# subject length, protein length, warnings, frame, query and subject
# coordinates (plus one), hit definition and the functional annotations.
#
# When there are no functional annotations, nine '-' columns are written in
# their place.
def write_prot_annot(file)
  func_columns = @functional_annotations.empty? ? Array.new(9, '-') : @functional_annotations.values
  columns = [
    @seq_name, @fasta_length, @hit.acc, @db_name, prot_annot_calification,
    @hit.e_val, @hit.ident, @hit.full_subject_length, @seq_aa.length, all_warns,
    @hit.q_frame, @hit.q_beg + 1, @hit.q_end + 1, @hit.s_beg + 1, @hit.s_end + 1,
    @hit.definition
  ].map { |value| value.to_s }
  columns << func_columns.join("\t")
  file.puts columns.join("\t")
end
|
228
|
+
|
229
|
+
# Writes one tab-separated line for a ncRNA sequence: name, length, hit
# accession, e-value, identity, query and subject coordinates (plus one) and
# hit definition.
def write_ncrna(file)
  columns = [
    @seq_name, @fasta_length, @hit.acc, @hit.e_val, @hit.ident,
    @hit.q_beg + 1, @hit.q_end + 1, @hit.s_beg + 1, @hit.s_end + 1,
    @hit.definition
  ]
  file.puts columns.map { |value| value.to_s }.join("\t")
end
|
232
|
+
|
233
|
+
# Writes one tab-separated line for a coding sequence: name, length,
# 'Sure' or 'Putative' (according to @status), testcode, and then the last,
# first and second elements of @hit.
#
# The original comment describes @hit here as an array of
# [ORF begin, ORF end, q_frame].
def write_coding(file)
  calification = @status ? 'Sure' : 'Putative'
  columns = [@seq_name, @fasta_length, calification, @t_code, @hit.last, @hit.first, @hit[1]]
  file.puts columns.map { |value| value.to_s }.join("\t")
end
|
241
|
+
|
242
|
+
#Write complementary files
|
243
|
+
# Writes the protein sequence (@seq_aa) as a fasta record.
def write_prot_seq(file)
  record = get_fasta(@seq_aa)
  file.puts record
end
|
246
|
+
|
247
|
+
# Writes the query/subject alignment as two lines (sequence name + query
# alignment, hit accession + subject alignment) followed by an empty line.
#
# The number of tabs after the name is the name length divided by 8
# (integer division), with a minimum of 1; the accession line gets one more
# tab when the name is longer than 7 characters.
# NOTE(review): the original applied .ceil to an integer division, which has
# no effect; confirm whether rounding up was intended.
def write_align(file)
  name_length = seq_name.length
  tabs = [name_length / 8, 1].max
  second_tab = name_length > 7 ? 1 : 0
  query_line = "#{@seq_name}#{"\t" * tabs}#{@hit.q_seq}"
  subject_line = "#{@hit.acc}#{"\t" * (tabs + second_tab)}#{@hit.s_seq}"
  file.puts "#{query_line}\n#{subject_line}"
  file.puts
end
|
259
|
+
|
260
|
+
# Writes one tab-separated line: name, length and the tagged nucleotide
# sequence (@seq_nt).
def write_nt_seq(file)
  columns = [@seq_name, @fasta_length, @seq_nt]
  file.puts columns.map { |value| value.to_s }.join("\t")
end
|
263
|
+
|
264
|
+
# Returns the human-readable description of @type, or nil when the type is
# not one of the known constants.
def calification
  case @type
  when FAILED       then 'Failed'
  when OTHER        then 'Other'
  when CHIMERA      then 'Chimera'
  when MISASSEMBLED then 'Misassembled'
  when UNKNOWN      then 'Unknown'
  when COMPLETE     then 'Complete'
  when N_TERMINAL   then 'N_terminal'
  when C_TERMINAL   then 'C_terminal'
  when INTERNAL     then 'Internal'
  when CODING       then 'Coding'
  when NCRNA        then 'NcRNA'
  end
end
|
292
|
+
|
293
|
+
# Returns the type description followed by a space and 'Sure' or 'Putative',
# depending on @status.
def prot_annot_calification
  certainty = @status ? 'Sure' : 'Putative'
  "#{calification} #{certainty}"
end
|
302
|
+
|
303
|
+
# Prints to standard output a summary of a hit and the coloured alignment of
# the query, in three parts: the region before the alignment, the alignment
# itself, and the region after it.
#
# h                          - hit to display.
# nts                        - nucleotide sequence of the query.
# show_nts                   - when truthy the nucleotides are printed too.
# original_query_coordinates - optional pair of coordinates; when given, the
#                              differences between them and h.q_beg/h.q_end
#                              are highlighted with another colour.
def show_alignment(h, nts, show_nts, original_query_coordinates = nil)
  puts "Prot id:\t#{h.acc}", "Alignment length:\t#{h.align_len} aa", "Subject length:\t#{h.s_len} aa", "Query length:\t#{nts.length/3} aa"
  puts prot_annot_calification
  puts

  last_nt = nts.length - 1
  aa_unigen = nts[h.q_frame - 1 .. last_nt].translate
  index = contenidos_en_prot(h.q_seq, aa_unigen)

  # Regions displaced between the original coordinates and the alignment
  # (5-prime / 3-prime). Each entry is [first coordinate, last coordinate, colour].
  subzone_align = nil
  unless original_query_coordinates.nil?
    subzone_align = {}
    original_beg = original_query_coordinates.first
    original_end = original_query_coordinates.last

    if h.q_beg > original_beg # alignment has transferred characters to 5 prime
      subzone_align['beg'] = [original_beg, h.q_beg - 3, 42] # -3 to exclude the last aa
    elsif h.q_beg < original_beg # alignment has received characters from 5 prime
      subzone_align['beg'] = [h.q_beg, original_beg - 3, 46]
    end

    if h.q_end < original_end # alignment has transferred characters to 3 prime
      subzone_align['end'] = [h.q_end, original_end, 42]
    elsif h.q_end > original_end # alignment has received characters from 3 prime
      subzone_align['end'] = [original_end, h.q_end, 43]
    end
  end

  # Print 5 prime
  if index > 0 # 5 prime exists
    upstream_aa = aa_unigen[0 .. index - 1].split('')
    upstream_nt = nts[h.q_frame - 1 .. h.q_beg - 1]
    print_alignment(upstream_aa, upstream_nt, 36, show_nts, subzone_align)
    reduce_coordinates(subzone_align, upstream_aa, h)
  end

  # Print core alignment or protein
  core_aa = h.q_seq.split('')
  core_nt = nts[h.q_beg .. h.q_end]
  print_alignment(core_aa, core_nt, 32, show_nts, subzone_align)
  reduce_coordinates(subzone_align, core_aa, h)

  # Print 3 prime
  gaps = h.q_seq.count('-')
  three_prime_beg = index + h.q_seq.length - gaps
  if aa_unigen.length > three_prime_beg # 3 prime exists
    downstream_aa = aa_unigen[three_prime_beg .. aa_unigen.length - 1].split('')
    fs = check_frame_shift(h)
    downstream_nt = nts[h.q_end + 1 - fs .. last_nt]
    print_alignment(downstream_aa, downstream_nt, 33, show_nts, subzone_align)
  end
end
|
111
|
-
|
112
|
-
|
113
|
-
|
353
|
+
|
354
|
+
|
355
|
+
# Prints an amino-acid alignment (and optionally its codons) to standard
# output in lines of 60 residues, coloured with ANSI escape codes.
#
# aa_align     - array of one-character strings; '-' is a gap.
# nt_align     - nucleotides coding for the non-gap residues of aa_align.
# color        - ANSI colour code used for the residues.
# show_nts     - when truthy a codon line is printed under each residue line
#                and every residue is padded with spaces.
# mark_subzone - optional hash whose values are [first, last, colour];
#                residues whose nucleotide coordinate lies in the range are
#                printed with that colour instead.
#
# Every residue other than 'X' is checked against the translation of its
# codon. On mismatch both are printed on a line of their own and the residue
# is shown as '?'. Codons containing N are not checked.
def print_alignment(aa_align, nt_align, color, show_nts, mark_subzone = nil)
  original_color = color
  c={ 'GCT'=>'A','GCC'=>'A','GCA'=>'A','GCG'=>'A',
      'CGT'=>'R','CGC'=>'R','CGA'=>'R','CGG'=>'R','AGA'=>'R','AGG'=>'R',
      'AAT'=>'N','AAC'=>'N',
      'GAT'=>'D','GAC'=>'D',
      'TGT'=>'C','TGC'=>'C',
      'CAA'=>'Q','CAG'=>'Q',
      'GAA'=>'E','GAG'=>'E',
      'GGT'=>'G','GGC'=>'G','GGA'=>'G','GGG'=>'G',
      'CAT'=>'H','CAC'=>'H',
      'ATT'=>'I','ATC'=>'I','ATA'=>'I',
      'TTA'=>'L','TTG'=>'L','CTT'=>'L','CTC'=>'L','CTA'=>'L','CTG'=>'L',
      'ATG'=>'M',
      'AAA'=>'K','AAG'=>'K',
      'TTT'=>'F','TTC'=>'F',
      'CCT'=>'P','CCC'=>'P','CCA'=>'P','CCG'=>'P',
      'TCT'=>'S','TCC'=>'S','TCA'=>'S','TCG'=>'S','AGT'=>'S','AGC'=>'S',
      'ACT'=>'T','ACC'=>'T','ACA'=>'T','ACG'=>'T',
      'TGG'=>'W',
      'TAT'=>'Y','TAC'=>'Y',
      'GTT'=>'V','GTC'=>'V','GTA'=>'V','GTG'=>'V',
      'TAG'=>'*','TGA'=>'*','TAA'=>'*'}

  line_length = 60
  space = show_nts ? ' ' : nil
  nt_line = ''
  aa_line = ''
  gaps = 0
  aa_align.each_with_index do |aa, n|
    if aa == '-'
      nt_line << '---'
      gaps += 1
    else
      # Check aa with codon
      codon_window = (n - gaps) * 3
      # to_s: slicing past the end of nt_align yields nil
      codon = nt_align[codon_window..codon_window + 2].to_s
      nt_line << codon
      if aa.upcase != 'X'
        codon_key = codon.upcase # the table keys are uppercase
        traslated_aa = codon_key.include?('N') ? '-' : c[codon_key]
        if traslated_aa != '-' && traslated_aa != aa
          puts "#{traslated_aa} #{aa}"
          aa = '?'
        end
      end
    end
    unless mark_subzone.nil?
      nts_coordenate = (n - gaps) * 3
      mark_subzone.values.each do |subzone|
        # subzone[0] => first coordinate, subzone[1] => last coordinate
        color = subzone.last if nts_coordenate >= subzone[0] && nts_coordenate <= subzone[1]
      end
    end
    aa_line << "\e[#{color}m#{space}#{aa}#{space}\e[0m"
    color = original_color
    if (n + 1) % line_length == 0 || n + 1 == aa_align.length
      count = n + 1
      print "#{count}\t"
      puts aa_line
      if show_nts
        print "#{count * 3}\t"
        puts nt_line
      end
      aa_line = ''
      nt_line = ''
    end
  end
end
|
433
|
+
|
434
|
+
# Shifts the coordinates of every marked subzone to the left by the number
# of nucleotides already printed (three per element of aa_align, plus
# h.q_frame - 1). The subzone arrays are modified in place.
#
# Does nothing when subzone_align is nil.
def reduce_coordinates(subzone_align, aa_align, h)
  return nil if subzone_align.nil?

  printed_nts = 3 * aa_align.length + h.q_frame - 1
  subzone_align.values.each do |zone|
    zone[0] = zone[0] - printed_nts
    zone[1] = zone[1] - printed_nts
  end
end
|
443
|
+
|
444
|
+
# Tells whether the sequence has at least 150 nt not covered by its hit,
# either before the hit (hit.q_beg) or after it (@fasta_length - hit.q_end).
#
# When @hit is an Array its first element is used. A positive result is
# stored in @area_without_annotation; the flag is never cleared here.
#
# Returns @area_without_annotation.
def area_without_annotation?
  hit = @hit.class == Array ? @hit.first : @hit
  upstream_annotation_space = hit.q_beg
  downstream_annotation_space = @fasta_length - hit.q_end
  if upstream_annotation_space >= 150 || downstream_annotation_space >= 150
    # Literal true: the TRUE constant was removed in Ruby 3.2
    @area_without_annotation = true
  end
  @area_without_annotation
end
|
457
|
+
|
458
|
+
# Returns a copy of the sequence with its own warnings and annotations
# arrays (filled with duplicates of the current ones) and an empty ORF list.
# All other attributes are shared with the original, as done by dup.
def clone
  copy = dup
  # Give the copy fresh containers before refilling them, so nothing is
  # appended to the arrays still referenced by this object.
  copy.clean_annotations
  copy.clean_warnings
  copy.clean_orfs
  copy.clone_warnings(@warnings)
  copy.clone_annotations(@annotations)
  copy
end
|
467
|
+
|
468
|
+
# Appends a duplicate of every element of array_annotations to @annotations,
# so later in-place changes to the copies do not affect the source array.
#
# Returns @annotations.
def clone_annotations(array_annotations)
  # concat instead of map-for-side-effect: no throwaway result array
  @annotations.concat(array_annotations.map { |annotation| annotation.dup })
end
|
471
|
+
|
472
|
+
end
|