full_lengther_next 0.0.8 → 0.5.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (36) hide show
  1. data/.gemtest +0 -0
  2. data/History.txt +2 -2
  3. data/Manifest.txt +33 -18
  4. data/Rakefile +4 -2
  5. data/bin/download_fln_dbs.rb +310 -158
  6. data/bin/full_lengther_next +160 -103
  7. data/bin/make_test_dataset.rb +236 -0
  8. data/bin/make_user_db.rb +101 -117
  9. data/bin/plot_fln.rb +270 -0
  10. data/bin/plot_taxonomy.rb +70 -0
  11. data/lib/expresscanvas.zip +0 -0
  12. data/lib/full_lengther_next.rb +3 -3
  13. data/lib/full_lengther_next/classes/artifacts.rb +66 -0
  14. data/lib/full_lengther_next/classes/blast_functions.rb +326 -0
  15. data/lib/full_lengther_next/classes/cdhit.rb +154 -0
  16. data/lib/full_lengther_next/classes/chimeric_seqs.rb +315 -57
  17. data/lib/full_lengther_next/classes/common_functions.rb +105 -63
  18. data/lib/full_lengther_next/classes/exonerate_result.rb +258 -0
  19. data/lib/full_lengther_next/classes/fl_analysis.rb +226 -617
  20. data/lib/full_lengther_next/classes/fl_string_utils.rb +4 -2
  21. data/lib/full_lengther_next/classes/fln_stats.rb +598 -557
  22. data/lib/full_lengther_next/classes/handle_db.rb +30 -0
  23. data/lib/full_lengther_next/classes/my_worker.rb +308 -138
  24. data/lib/full_lengther_next/classes/my_worker_EST.rb +54 -0
  25. data/lib/full_lengther_next/classes/my_worker_manager_EST.rb +69 -0
  26. data/lib/full_lengther_next/classes/my_worker_manager_fln.rb +389 -0
  27. data/lib/full_lengther_next/classes/nc_rna.rb +5 -7
  28. data/lib/full_lengther_next/classes/reptrans.rb +210 -0
  29. data/lib/full_lengther_next/classes/sequence.rb +439 -80
  30. data/lib/full_lengther_next/classes/test_code.rb +15 -16
  31. data/lib/full_lengther_next/classes/types.rb +12 -0
  32. data/lib/full_lengther_next/classes/une_los_hit.rb +148 -230
  33. data/lib/full_lengther_next/classes/warnings.rb +40 -0
  34. metadata +207 -93
  35. data/lib/full_lengther_next/classes/lcs.rb +0 -33
  36. data/lib/full_lengther_next/classes/my_worker_manager.rb +0 -240
@@ -1,3 +1,4 @@
1
+ require 'types'
1
2
 
2
3
  module NcRna
3
4
 
@@ -8,14 +9,11 @@ module NcRna
8
9
  raise "BLAST query name and sequence are different"
9
10
  end
10
11
 
11
- q=blast_query
12
+ hit=blast_query.hits.first
12
13
 
13
- if (!q.hits[0].nil?) # There is match in blast.
14
- nc_annotations = "#{q.query_def}\t#{seq.fasta_length}\t#{q.hits[0].acc}\tncRNA\tPutative ncRNA\t\t#{q.hits[0].e_val}\t#{q.hits[0].ident}\t\t\t\t#{q.hits[0].q_frame}\t#{q.hits[0].q_beg}\t#{q.hits[0].q_end}\t#{q.hits[0].s_beg.to_i}\t#{q.hits[0].s_end.to_i}\t#{q.hits[0].definition}\t"
15
- seq.annotate(:ncrna,nc_annotations,true)
16
- else
17
- unknown_annot = seq.get_annotations(:tcode_unknown).first
18
- seq.annotate(:tcode, unknown_annot[:message],true)
14
+ if !hit.nil? && hit.align_len >= 40 # There is match in blast and it has a good length.
15
+ seq.hit = hit
16
+ seq.type = NCRNA
19
17
  end
20
18
  end
21
19
  end
@@ -0,0 +1,210 @@
1
+ require 'scbi_mapreduce'
2
+ require 'my_worker_manager_EST' #Second server
3
+ require 'fln_stats'
4
+ require 'types'
5
+
6
+ ########################################################################
7
+ # MAIN FUNCTION
8
+ ########################################################################
9
# Builds the representative transcriptome under ./fln_results.
#
# Steps, in order:
#   1. cluster the protein-annotated sequences and write their representatives
#      (this call creates/truncates the representative FASTA),
#   2. when options[:est_db] is given, pool coding + unknown sequences, reduce
#      them with cd-hit and compare them against the EST database,
#   3. keep the remaining putative sequences longer than 500 nt, order them by
#      test-code score and length, and append them to the representative FASTA,
#   4. write the HTML/TXT statistics reports.
#
# seqs_annotation_prot - sequences with a protein hit.
# seqs_some_coding     - sequences classified as coding without a protein hit.
# seqs_unknown         - unclassified sequences (only used with an EST database).
# options              - options hash; reads :est_db and :high_clustering here
#                        and is forwarded to the helpers.
def reptrans(seqs_annotation_prot, seqs_some_coding ,seqs_unknown, options)
  cpus = count_cpu(options)
  stats_hash = initialize_stats_hash_reptrans

  # Paths
  #---------------------------------------------
  main_path = File.join(Dir.pwd, 'fln_results')
  reptrans_fasta = File.join(main_path, 'Representative_transcriptome.fasta')
  blast_path = File.join(main_path, 'ESTdb')
  cluster_prot_annotated_path = File.join(main_path, 'Prot_clusters')
  cluster_EST_annotated_path = File.join(main_path, 'EST_clusters')
  html_file = File.join(main_path, 'Representative_transcriptome_stats.html')
  txt_file = File.join(main_path, 'Representative_transcriptome_stats.txt')

  # Prot annotations sequence analysis
  #---------------------------------------------
  analysis_over_DB_annotated_seqs(seqs_annotation_prot, reptrans_fasta, cluster_prot_annotated_path, stats_hash, 'prot_annotated', options[:high_clustering])
  seqs_annotation_prot = nil # drop the local reference, it is not needed any more

  # NOT Prot annotations sequence analysis
  #---------------------------------------------
  putative_seqs = seqs_some_coding
  unless options[:est_db].nil? # WITH EST DATABASE
    putative_seqs += seqs_unknown # Coding & unknown
    putative_seqs = reduce_pool_sequences(putative_seqs, main_path, cpus)
    # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
    unless File.exist?("#{blast_path}.nsq")
      $LOG.info "Start makeblastdb over EST DB"
      # Argument-vector form: no shell is involved, so paths containing spaces
      # or shell metacharacters are passed through untouched. stdout goes to
      # the log file, as the former "> file" redirection did.
      system('makeblastdb', '-in', options[:est_db].to_s, '-out', blast_path,
             '-dbtype', 'nucl', '-parse_seqids',
             out: File.join(main_path, 'log_makeblast_db'))
      $LOG.info "Ended makeblastdb over EST DB"
    end
    putative_seqs = do_blast_with_EST(putative_seqs, options, reptrans_fasta, blast_path, cluster_EST_annotated_path, stats_hash)
  end

  # Coding sequence analysis
  #---------------------------------------------
  if !putative_seqs.nil? && !putative_seqs.empty?
    putative_seqs = select_seqs_more_500pb(putative_seqs)
    putative_seqs = reduce_pool_sequences(putative_seqs, main_path, cpus) if options[:est_db].nil? # NOT EST database
    # Highest test-code score first; ties are broken by the longest sequence.
    putative_seqs.sort! do |s1, s2|
      if s2.t_code == s1.t_code
        s2.fasta_length <=> s1.fasta_length
      else
        s2.t_code <=> s1.t_code
      end
    end
    putative_seqs.each { |coding_seq| coding_stats_reptrans(coding_seq, stats_hash) }

    write_fasta(putative_seqs, reptrans_fasta, 'a')
  end
  write_reptrans_stats(stats_hash, html_file, txt_file)
end
63
+ ########################################################################
64
+ # END MAIN FUNCTION
65
+ ########################################################################
66
+
67
# Clusters database-annotated sequences, keeps one representative per cluster,
# records the count in the stats hash and writes the cluster report and the
# representative FASTA (opened in 'w' mode, i.e. the FASTA is truncated).
#
# seqs_annotation_DB - sequences carrying a database hit.
# reptrans_fasta     - path of the representative transcriptome FASTA.
# cluster_file_path  - path of the cluster report.
# stats_hash         - statistics hash; stats_hash[key_stats] is incremented by
#                      the number of representatives.
# key_stats          - key to update in stats_hash.
# pfam_clustering    - when truthy, the representatives are clustered a second
#                      time by their :pfam_id annotation. Defaults to false so
#                      that five-argument callers keep working.
def analysis_over_DB_annotated_seqs(seqs_annotation_DB, reptrans_fasta, cluster_file_path, stats_hash, key_stats, pfam_clustering = false)
  clusters_seqs_annot_DB = clustering_by_id(seqs_annotation_DB)
  representative_seqs_annot_DB = select_representative(clusters_seqs_annot_DB)
  if pfam_clustering
    # TODO: take the annotation key from my_worker_manager_fln (@@func_annot_type)
    # instead of hard-coding :pfam_id here.
    clusters_seqs_annot_DB = clustering_by_annot(representative_seqs_annot_DB, :pfam_id)
    # NOTE(review): after this step the reported clusters contain only the
    # per-id representatives, not every original member.
    representative_seqs_annot_DB = select_representative(clusters_seqs_annot_DB)
  end
  stats_hash[key_stats] += representative_seqs_annot_DB.length
  report_clustering(cluster_file_path, clusters_seqs_annot_DB, representative_seqs_annot_DB)
  write_fasta(representative_seqs_annot_DB, reptrans_fasta, 'w')
end
78
+
79
# Writes the cluster report: one line per representative with the
# representative name, a tab, and the ';'-joined names of the sequences in the
# cluster found at the same position of clusters_seqs_annot_DB.
#
# cluster_file_path            - destination path (overwritten).
# clusters_seqs_annot_DB       - array of clusters (arrays of sequences).
# representative_seqs_annot_DB - representatives, index-aligned with the clusters.
def report_clustering(cluster_file_path, clusters_seqs_annot_DB, representative_seqs_annot_DB)
  # Block form guarantees the handle is closed even if a sequence raises.
  File.open(cluster_file_path, 'w') do |cluster_file|
    representative_seqs_annot_DB.each_with_index do |rep_seq, i|
      member_names = clusters_seqs_annot_DB[i].map { |seq| seq.seq_name }.join(';')
      cluster_file.puts "#{rep_seq.seq_name}\t#{member_names}"
    end
  end
end
87
+
88
# Removes redundancy from a pool of sequences with cd-hit (identity 0.95).
#
# The pool is dumped to <main_path>/temp.fasta; cd-hit writes its output to
# <main_path>/temp_cln.fasta. cd-hit is skipped when that output file already
# exists, so a previous result is reused.
#
# putative_seqs - sequences to reduce.
# main_path     - working directory for the temporary and log files.
# cpu           - value passed to cd-hit's -T option.
#
# Returns the members of putative_seqs whose names appear in the cd-hit output.
def reduce_pool_sequences(putative_seqs, main_path, cpu)
  temp_fasta = File.join(main_path, 'temp.fasta')
  temp_fasta_clean = File.join(main_path, 'temp_cln.fasta')
  log_file = File.join(main_path, 'log_cd_hit_Cod_Unk')
  write_fasta(putative_seqs, temp_fasta, 'w')
  $LOG.info "Start cd-hit with coding and unknow sequences"
  # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
  unless File.exist?(temp_fasta_clean)
    # Argument-vector form avoids shell interpretation of the paths; stdout is
    # sent to the log file as the former "> file" redirection did.
    system('cd-hit', '-i', temp_fasta, '-o', temp_fasta_clean,
           '-c', '0.95', '-M', '0', '-T', cpu.to_s, out: log_file)
  end
  $LOG.info "Ended cd-hit with coding and unknow sequences"
  kept_names = load_cd_hit_sequences_names(temp_fasta_clean)
  select_seqs_with_name(putative_seqs, kept_names)
end
100
+
101
# Runs the second map-reduce server, which compares the putative sequences
# against the EST database, then clusters and reports the sequences that
# obtained an EST hit.
#
# putative_seqs              - sequences to compare against the EST database.
# options                    - options hash; reads :server_ip, :port, :workers
#                              and :chunk_size. options[:chimera] is set to nil
#                              IN PLACE to switch the chimera system off.
# reptrans_fasta             - representative transcriptome FASTA path.
# blast_path                 - EST BLAST database path (without extension).
# cluster_EST_annotated_path - path of the EST cluster report.
# stats_hash                 - statistics hash, updated under 'est_annotated'.
#
# Returns the second array given by MyWorkerManagerEst.get_array_seqs
# (presumably the sequences without an EST hit - confirm against that class).
def do_blast_with_EST(putative_seqs, options, reptrans_fasta, blast_path, cluster_EST_annotated_path, stats_hash) # Second server to representative transcriptome
  $LOG.info 'Starting server for EST analysis'
  custom_worker_file = File.join(File.dirname(ROOT_PATH), 'lib', 'full_lengther_next', 'classes', 'my_worker_EST.rb')
  options[:chimera] = nil # Chimera detection is disabled in RepTrans; this also shortens the BLAST output

  MyWorkerManagerEst.init_work_manager(putative_seqs, options, blast_path)
  server_EST = ScbiMapreduce::Manager.new(options[:server_ip], options[:port], options[:workers], MyWorkerManagerEst, custom_worker_file, STDOUT, FULL_LENGTHER_NEXT_INIT)
  server_EST.chunk_size = options[:chunk_size]
  server_EST.start_server
  $LOG.info 'Closing server for EST analysis'

  seqs_with_EST, putative_seqs = MyWorkerManagerEst.get_array_seqs
  unless seqs_with_EST.empty?
    # The sixth argument (pfam clustering) is passed explicitly: the callee
    # declares six parameters and the former five-argument call raised
    # ArgumentError. Pfam-based clustering is not requested for EST hits.
    analysis_over_DB_annotated_seqs(seqs_with_EST, reptrans_fasta, cluster_EST_annotated_path, stats_hash, 'est_annotated', false)
  end
  return putative_seqs
end
118
+
119
+
120
# Reads a FASTA file and returns the header of every record, in file order,
# without the leading '>' and without the trailing newline.
#
# file - path of the FASTA file (here: the cd-hit output).
#
# Only the leading '>' marker is removed, so a '>' inside a name is kept and
# the name still matches the one that was written to the FASTA.
def load_cd_hit_sequences_names(file)
  names = []
  # File.foreach streams the file and closes the handle when done.
  File.foreach(file) do |line|
    names << line.chomp.sub(/\A>/, '') if line.start_with?('>')
  end
  names
end
131
+
132
# Returns the sequences whose fasta_length is strictly greater than
# min_length (500 by default, which is the historical behaviour).
#
# seqs_array - sequences responding to #fasta_length.
# min_length - exclusive lower bound for the sequence length.
def select_seqs_more_500pb(seqs_array, min_length = 500)
  seqs_array.select { |seq| seq.fasta_length > min_length }
end
136
+
137
# Returns the members of array_seqs whose seq_name is listed in array_names,
# keeping the order of array_seqs.
#
# array_seqs  - sequences responding to #seq_name.
# array_names - names to keep.
def select_seqs_with_name(array_seqs, array_names)
  # Hash lookup instead of Array#include? keeps this linear in the pool size.
  wanted = {}
  array_names.each { |name| wanted[name] = true }
  array_seqs.select { |seq| wanted.key?(seq.seq_name) }
end
141
+
142
# Writes sequences in FASTA format: a '>' header line with seq_name followed
# by a line with seq_fasta.
#
# seqs_array - sequences responding to #seq_name and #seq_fasta.
# file_name  - destination path.
# mode       - File.open mode ('w' overwrites, 'a' appends).
def write_fasta(seqs_array, file_name, mode)
  # Block form closes the handle even when writing raises.
  File.open(file_name, mode) do |file|
    seqs_array.each do |seq|
      file.puts ">#{seq.seq_name}\n#{seq.seq_fasta}"
    end
  end
end
149
+
150
# Groups sequences that share the same hit accession (seq.get_acc).
#
# seqs_with_hit - sequences responding to #get_acc.
#
# Returns an array of clusters (arrays of sequences). Clusters are ordered by
# the first appearance of their accession and members keep the input order.
def clustering_by_id(seqs_with_hit)
  # Hashes preserve insertion order, so this yields the same layout as the
  # former index-based bookkeeping without the repeated Array#index scans.
  seqs_with_hit.group_by { |seq| seq.get_acc }.values
end
164
+
165
# Groups sequences that carry the same functional annotation.
#
# The annotation is read from seq.functional_annotations[annotation_type] and
# normalised by sorting its ';'-separated items, so "B;A" and "A;B" fall in the
# same cluster. Sequences whose annotation is missing (nil) or '-' are never
# merged: each one becomes its own single-member cluster.
#
# seqs_with_hit   - sequences responding to #functional_annotations (a Hash).
# annotation_type - key to read from the functional annotations.
#
# Returns the annotated clusters (ordered by first appearance of each
# annotation) followed by the single-member clusters of unannotated sequences.
def clustering_by_annot(seqs_with_hit, annotation_type)
  annotated = {} # normalised annotation => cluster; insertion order is kept
  unannotated = []
  seqs_with_hit.each do |seq|
    annot = seq.functional_annotations[annotation_type]
    annot = annot.split(';').sort.join(';') unless annot.nil?
    if annot.nil? || annot == '-'
      unannotated << [seq]
    else
      (annotated[annot] ||= []) << seq
    end
  end
  annotated.values.concat(unannotated)
end
187
+
188
# Picks one representative sequence per cluster.
#
# Selection rule for each cluster:
#   * if it contains sequences of type COMPLETE, the longest of them;
#   * otherwise, among the sequences with the best identity percentage
#     (get_pident), the longest one.
#
# "Longest" is measured on the nucleotide string length. The previous code
# compared the sequence strings themselves, i.e. alphabetically, which does not
# select by length. The clusters are no longer reordered in place.
#
# clusters_seqs_annot_prot - array of clusters (non-empty arrays of sequences
#                            responding to #type, #seq_fasta and #get_pident).
#
# Returns an array with one representative per cluster, index-aligned with
# the input.
def select_representative(clusters_seqs_annot_prot)
  clusters_seqs_annot_prot.map do |cluster|
    candidates = cluster.select { |s| s.type == COMPLETE }
    if candidates.empty?
      best_pident = cluster.max_by { |s| s.get_pident }.get_pident
      candidates = cluster.select { |s| s.get_pident == best_pident }
    end
    candidates.max_by { |s| s.seq_fasta.length }
  end
end
201
+
202
# Computes the CPU count handed to external tools.
#
# options - options hash; options[:workers] is either an Array (its length
#           plus one is returned) or any other value, which is returned
#           unchanged.
def count_cpu(options)
  workers = options[:workers]
  # is_a? also accepts Array subclasses, unlike comparing the class name.
  workers.is_a?(Array) ? workers.length + 1 : workers
end
@@ -1,113 +1,472 @@
1
-
2
1
  require 'orf'
2
+ require 'types'
3
+ require 'warnings'
4
+ require 'common_functions'
3
5
 
4
6
# One input sequence (unigene) together with the classification, hit,
# warnings and annotations attached to it, plus the writers that dump it to
# the result files.
#
# Relies on names defined in other files: the type constants (FAILED, OTHER,
# CHIMERA, MISASSEMBLED, UNKNOWN, COMPLETE, N_TERMINAL, C_TERMINAL, INTERNAL,
# CODING, NCRNA), the Orf class, the $warnings_hash global, String#translate
# and the helpers contenidos_en_prot / check_frame_shift.
class Sequence

  # Codon => one-letter amino acid ('*' for stop codons). Keys are upper case.
  CODON_TABLE = {
    'GCT'=>'A','GCC'=>'A','GCA'=>'A','GCG'=>'A',
    'CGT'=>'R','CGC'=>'R','CGA'=>'R','CGG'=>'R','AGA'=>'R','AGG'=>'R',
    'AAT'=>'N','AAC'=>'N',
    'GAT'=>'D','GAC'=>'D',
    'TGT'=>'C','TGC'=>'C',
    'CAA'=>'Q','CAG'=>'Q',
    'GAA'=>'E','GAG'=>'E',
    'GGT'=>'G','GGC'=>'G','GGA'=>'G','GGG'=>'G',
    'CAT'=>'H','CAC'=>'H',
    'ATT'=>'I','ATC'=>'I','ATA'=>'I',
    'TTA'=>'L','TTG'=>'L','CTT'=>'L','CTC'=>'L','CTA'=>'L','CTG'=>'L',
    'ATG'=>'M',
    'AAA'=>'K','AAG'=>'K',
    'TTT'=>'F','TTC'=>'F',
    'CCT'=>'P','CCC'=>'P','CCA'=>'P','CCG'=>'P',
    'TCT'=>'S','TCC'=>'S','TCA'=>'S','TCG'=>'S','AGT'=>'S','AGC'=>'S',
    'ACT'=>'T','ACC'=>'T','ACA'=>'T','ACG'=>'T',
    'TGG'=>'W',
    'TAT'=>'Y','TAC'=>'Y',
    'GTT'=>'V','GTC'=>'V','GTA'=>'V','GTG'=>'V',
    'TAG'=>'*','TGA'=>'*','TAA'=>'*'
  }.freeze

  # Degenerate nucleotide code => concrete nucleotides it may be replaced by.
  DEGENERATED_NT = {
    'R' => ['a','g'],
    'W' => ['a','t'],
    'M' => ['a','c'],
    'K' => ['g','t'],
    'S' => ['g','c'],
    'Y' => ['c','t'],
    'H' => ['a','t','c'],
    'B' => ['g','t','c'],
    'D' => ['g','a','t'],
    'V' => ['g','a','c'],
    'N' => ['g','a','c','t']
  }.freeze

  # Amino acids printed per line by print_alignment.
  ALIGNMENT_LINE_LENGTH = 60

  attr_accessor :seq_name, :seq_fasta, :fasta_length, :db_name, :seq_nt, :seq_aa, :db, :type, :status, :id, :orfs, :area_without_annotation, :save_fasta, :ignore, :hit, :t_code, :functional_annotations

  # seq_name  - sequence identifier.
  # seq_fasta - nucleotide string.
  # seq_qual  - accepted for interface compatibility; not used.
  #
  # The boolean flags use the true/false literals: the TRUE/FALSE constants
  # are no longer defined on modern Ruby.
  def initialize(seq_name, seq_fasta, seq_qual='')
    @seq_name = seq_name
    @seq_fasta = seq_fasta
    @fasta_length = seq_fasta.length
    @db_name = nil
    @seq_nt = nil # Unigene sequence with tagged ATG & stop
    @seq_aa = nil # Protein sequence generated over the unigene
    @db = nil
    @type = UNKNOWN # See types.rb
    @status = false # true => Sure, false => Putative
    @id = nil # Prot or EST id, can be several => array
    @warnings = []
    @annotations = []
    @functional_annotations = {}
    @orfs = []

    @area_without_annotation = false
    @save_fasta = true
    @ignore = false
    @hit = nil
    @t_code = 0
  end

  # Builds an Orf from the given values and appends it to the ORF list.
  def add_orf(orf_seq, orf_t_start, orf_t_end, orf_q_frame, orf_stop_codon, orf_type)
    @orfs << Orf.new(orf_seq, orf_t_start, orf_t_end, orf_q_frame, orf_stop_codon, orf_type)
  end

  # Replaces every upper-case degenerate nucleotide code of the sequence by a
  # concrete (lower-case) nucleotide, using fresh per-code counters.
  def change_degenerated_nt!
    translate_hash = {}
    DEGENERATED_NT.each { |code, nts| translate_hash[code] = [nts, 0] }
    fix_degenerated_fasta!(translate_hash)
  end

  # Rewrites @seq_fasta, replacing each character matching [RWMKSYHBDVN].
  #
  # translate_hash - code => [candidate nucleotides, counter]. The counter is
  #                  incremented on every occurrence of the code and selects
  #                  the candidate (counter modulo number of candidates), so
  #                  the replacements rotate through the candidates. The
  #                  counters are updated in place.
  def fix_degenerated_fasta!(translate_hash)
    fixed = @seq_fasta.each_char.map do |nt|
      if nt =~ /[RWMKSYHBDVN]/
        entry = translate_hash[nt]
        entry[1] += 1
        entry[0][entry[1] % entry[0].length]
      else
        nt
      end
    end
    @seq_fasta = fixed.compact.join
  end

  # Empties the ORF list.
  def clean_orfs
    @orfs = []
  end

  # Resets the classification to UNKNOWN / putative.
  def reset_classification
    @type = UNKNOWN
    @status = false
  end

  # Empties the warning list.
  def clean_warnings
    @warnings = []
  end

  # Empties the annotation list.
  def clean_annotations
    @annotations = []
  end

  # Accession of the current hit.
  def get_acc
    hit.acc
  end

  # Identity value of the current hit.
  def get_pident
    hit.ident
  end

  # Turns @hit into an empty list (write_chimera iterates over several hits).
  def format_chimera!
    @hit = []
  end

  # Adds one warning, or every warning of an Array, to the sequence. Each
  # item is resolved through check_warn; to pass a single warning with
  # replacement values, wrap it: warnings([[tag, value1, value2]]).
  def warnings(warn)
    if warn.is_a?(Array)
      warn.each { |w| @warnings << check_warn(w) }
    else
      @warnings << check_warn(warn)
    end
  end

  # Appends a copy of every warning of array_warnings to this sequence.
  def clone_warnings(array_warnings)
    array_warnings.map { |warn| @warnings << warn.dup }
  end

  # Resolves a warning to its message.
  #
  # warn - either a tag, or an Array [tag, value, ...] whose values replace,
  #        in order, the '(*replace*)' placeholders of the message.
  #
  # The message is looked up in $warnings_hash; an unknown tag is used as the
  # message itself. The caller's array is left untouched (it used to lose its
  # first element).
  def check_warn(warn)
    check = warn
    replace = nil
    check, *replace = warn if warn.is_a?(Array)

    message = $warnings_hash[check]
    message = check if message.nil? # No message registered for this tag

    unless replace.nil?
      message = message.dup # Never modify the messages stored in the warning hash
      replace.each { |rep| message.sub!('(*replace*)', "#{rep}") }
    end
    return message
  end

  # Stores the test-code score; a score of 0.95 or more marks the sequence
  # as sure.
  def test_code(test_code)
    @t_code = test_code
    @status = true if @t_code >= 0.95
  end

  # FASTA record made of this sequence's name and the given sequence string.
  def get_fasta(seq)
    ">#{@seq_name}\n#{seq}"
  end

  # Writes the sequence to the output files that correspond to its type.
  #
  # output_files - hash of open IO objects. Keys read here: 'seqs', 'db',
  #                'prot', 'nts', 'align' and the type constants.
  #
  # Raises RuntimeError for a type that is neither handled nor FAILED.
  def write_info(output_files)
    output_files['seqs'].puts get_fasta(@seq_fasta) if @save_fasta
    case @type
    when OTHER
      write_other(output_files[@type])
    when CHIMERA
      write_chimera(output_files[@type])
    when MISASSEMBLED
      write_misassembled(output_files[@type])
    when UNKNOWN
      write_unknown(output_files[@type])
    when COMPLETE .. INTERNAL
      write_prot_annot(output_files['db'])
      write_prot_seq(output_files['prot'])
      write_nt_seq(output_files['nts'])
      write_align(output_files['align'])
    when NCRNA
      write_ncrna(output_files[@type])
    when CODING
      write_coding(output_files[@type])
    else
      raise "#{@type} is an incorrect type" if @type != FAILED
    end
  end

  # All warnings joined by a single space.
  def all_warns
    @warnings.join(' ')
  end

  def write_other(file)
    file.puts "#{@seq_name}\t#{@fasta_length}\t#{@hit.acc}\t#{@db_name}\t#{all_warns}"
  end

  # One line per hit followed by an empty line. Coordinates are written
  # 1-based (stored value + 1).
  def write_chimera(file) #TODO : write 'SOLVED' tag
    @hit.each do |h|
      file.puts "#{@seq_name}\t#{@fasta_length}\t#{h.acc}\t#{@db_name}\t#{h.q_frame}\t#{h.e_val}\t#{h.ident}\t#{h.q_beg + 1}\t#{h.q_end + 1}\t#{h.s_beg + 1}\t#{h.s_end + 1}\t#{h.definition}"
    end
    file.puts
  end

  def write_misassembled(file)
    file.puts "#{@seq_name}\t#{@fasta_length}\t#{@hit.acc}\t#{@db_name}"
  end

  # When @hit is an Array it holds ORF data: 0 => ORF begin, 1 => ORF end,
  # 2 => query frame. Otherwise '-' is written in those columns.
  def write_unknown(file)
    if hit.is_a?(Array)
      orf_beg = @hit[0]
      orf_end = @hit[1]
      q_frame = @hit[2]
    else
      orf_beg = '-'
      orf_end = '-'
      q_frame = '-'
    end
    file.puts "#{@seq_name}\t#{@fasta_length}\t#{@t_code}\t#{all_warns}\t#{q_frame}\t#{orf_beg}\t#{orf_end}"
  end

  # Protein annotation line. The functional annotation columns default to
  # nine '-' when no functional annotation is stored.
  def write_prot_annot(file)
    final_func_annot = Array.new(9, '-')
    final_func_annot = @functional_annotations.values unless @functional_annotations.empty?
    file.puts "#{@seq_name}\t#{@fasta_length}\t#{@hit.acc}\t#{@db_name}\t#{prot_annot_calification}\t#{@hit.e_val}\t#{@hit.ident}\t#{@hit.full_subject_length}\t#{@seq_aa.length}\t#{all_warns}\t#{@hit.q_frame}\t#{@hit.q_beg + 1}\t#{@hit.q_end + 1}\t#{@hit.s_beg + 1}\t#{@hit.s_end + 1}\t#{@hit.definition}\t#{final_func_annot.join("\t")}"
  end

  def write_ncrna(file)
    file.puts "#{@seq_name}\t#{@fasta_length}\t#{@hit.acc}\t#{@hit.e_val}\t#{@hit.ident}\t#{@hit.q_beg + 1}\t#{@hit.q_end + 1}\t#{@hit.s_beg + 1}\t#{@hit.s_end + 1}\t#{@hit.definition}"
  end

  # @hit is an Array here: 0 => ORF begin, 1 => ORF end, 2 => query frame.
  # Columns are written as frame, begin, end.
  def write_coding(file)
    calification = @status ? 'Sure' : 'Putative'
    file.puts "#{@seq_name}\t#{@fasta_length}\t#{calification}\t#{@t_code}\t#{@hit.last}\t#{@hit.first}\t#{@hit[1]}"
  end

  # Write complementary files
  def write_prot_seq(file)
    file.puts get_fasta(@seq_aa)
  end

  # Writes the query and subject alignment strings on two lines, padded with
  # tabs, followed by an empty line.
  def write_align(file)
    # Integer division: one tab per full group of 8 characters, at least one.
    tabs = seq_name.length / 8
    tabs = 1 if tabs == 0
    second_tab = seq_name.length > 7 ? 1 : 0
    file.puts "#{@seq_name}#{"\t"*tabs}#{@hit.q_seq}\n#{@hit.acc}#{"\t"*(tabs+second_tab)}#{@hit.s_seq}"
    file.puts
  end

  def write_nt_seq(file)
    file.puts "#{@seq_name}\t#{@fasta_length}\t#{@seq_nt}"
  end

  # Human readable name of the current type; nil for a type not listed here.
  def calification
    case @type
    when FAILED       then 'Failed'
    when OTHER        then 'Other'
    when CHIMERA      then 'Chimera'
    when MISASSEMBLED then 'Misassembled'
    when UNKNOWN      then 'Unknown'
    when COMPLETE     then 'Complete'
    when N_TERMINAL   then 'N_terminal'
    when C_TERMINAL   then 'C_terminal'
    when INTERNAL     then 'Internal'
    when CODING       then 'Coding'
    when NCRNA        then 'NcRNA'
    end
  end

  # Type name followed by 'Sure' or 'Putative', e.g. "Complete Sure".
  def prot_annot_calification
    info = "#{calification} "
    info << (@status ? 'Sure' : 'Putative')
    return info
  end

  # Prints to stdout the translated query split in three coloured regions:
  # the part before the hit, the aligned part and the part after the hit.
  #
  # h        - hit (reads acc, align_len, s_len, q_frame, q_beg, q_end, q_seq).
  # nts      - nucleotide sequence of the query.
  # show_nts - when truthy, the codons are printed under the amino acids.
  # original_query_coordinates - optional [begin, end] of the alignment before
  #            it was modified; the zones gained or lost with respect to it
  #            are printed with a different colour.
  #
  # The numeric colours are ANSI SGR codes inserted as "\e[<code>m".
  def show_alignment(h, nts, show_nts, original_query_coordinates = nil)
    puts "Prot id:\t#{h.acc}", "Alignment length:\t#{h.align_len} aa", "Subject length:\t#{h.s_len} aa", "Query length:\t#{nts.length/3} aa"
    puts prot_annot_calification
    puts

    aa_unigen = nts[h.q_frame - 1 .. nts.length-1].translate
    index = contenidos_en_prot(h.q_seq, aa_unigen)

    # Displacements of the alignment borders: 5-prime / 3-prime
    subzone_align = nil
    if !original_query_coordinates.nil?
      subzone_align = {}
      if h.q_beg > original_query_coordinates.first # alignment has transferred characters to 5 prime
        subzone_align['beg'] = [original_query_coordinates.first, h.q_beg-3, 42] # -3 to exclude the last aa
      elsif h.q_beg < original_query_coordinates.first # alignment has received characters from 5 prime
        subzone_align['beg'] = [h.q_beg, original_query_coordinates.first-3, 46]
      end

      if h.q_end < original_query_coordinates.last # alignment has transferred characters to 3 prime
        subzone_align['end'] = [h.q_end, original_query_coordinates.last, 42]
      elsif h.q_end > original_query_coordinates.last # alignment has received characters from 3 prime
        subzone_align['end'] = [original_query_coordinates.last, h.q_end, 43]
      end
    end

    # Print 5 prime
    if index > 0 # 5 prime exists
      aa_align = aa_unigen[0 .. index-1].split('')
      nt_align = nts[h.q_frame-1..h.q_beg-1]
      print_alignment(aa_align, nt_align, 36, show_nts, subzone_align)
      reduce_coordinates(subzone_align, aa_align, h)
    end

    # Print core alignment or protein
    aa_align = h.q_seq.split('')
    nt_align = nts[h.q_beg..h.q_end]
    print_alignment(aa_align, nt_align, 32, show_nts, subzone_align)
    reduce_coordinates(subzone_align, aa_align, h)

    # Print 3 prime
    gaps = h.q_seq.count('-')
    three_prime_beg = index+h.q_seq.length-gaps
    if aa_unigen.length > three_prime_beg # 3 prime exists
      aa_align = aa_unigen[three_prime_beg .. aa_unigen.length-1].split('')
      fs = check_frame_shift(h)
      nt_align = nts[h.q_end+1-fs..nts.length-1]
      print_alignment(aa_align, nt_align, 33, show_nts, subzone_align)
    end
  end

  # Prints amino acids (and optionally their codons) in lines of
  # ALIGNMENT_LINE_LENGTH residues, each line prefixed by the running count.
  #
  # aa_align     - array of one-letter amino acids; '-' is a gap and consumes
  #                no codon.
  # nt_align     - nucleotides covering aa_align, three per non-gap residue.
  # color        - ANSI SGR code used for the residues.
  # show_nts     - when truthy, residues are padded with spaces and the codon
  #                line is printed under them.
  # mark_subzone - optional hash whose values are [first, last, color]; a
  #                residue whose nucleotide offset lies in [first, last] takes
  #                that colour instead.
  #
  # Each residue other than X is checked against the translation of its codon;
  # on mismatch both letters are printed and the residue is shown as '?'.
  # Codons containing N are not checked. The codon lookup is case-insensitive,
  # like the N check, and a missing codon no longer raises.
  def print_alignment(aa_align, nt_align, color, show_nts, mark_subzone = nil)
    original_color = color
    space = show_nts ? ' ' : nil
    nt_line = ''
    aa_line = ''
    gaps = 0
    aa_align.each_with_index do |aa, n|
      if aa == '-'
        nt_line << '---'
        gaps += 1
      else
        # Check aa with codon
        codon_window = (n-gaps)*3
        codon = nt_align[codon_window..codon_window+2]
        nt_line << "#{codon}"
        if aa.upcase != 'X'
          codon_key = codon.to_s.upcase
          translated_aa = codon_key.include?('N') ? '-' : CODON_TABLE[codon_key]
          if translated_aa != '-' && translated_aa != aa
            puts "#{translated_aa} #{aa}"
            aa = '?'
          end
        end
      end
      if !mark_subzone.nil?
        nts_coordinate = (n-gaps)*3
        mark_subzone.values.each do |subzone|
          # 0 => first coordinate, 1 => second coordinate
          color = subzone.last if nts_coordinate >= subzone[0] && nts_coordinate <= subzone[1]
        end
      end
      aa_line << "\e[#{color}m#{space}#{aa}#{space}\e[0m"
      color = original_color
      if (n+1) % ALIGNMENT_LINE_LENGTH == 0 || n+1 == aa_align.length
        count = n + 1
        print "#{count}\t"
        puts aa_line
        if show_nts
          print "#{count*3}\t"
          puts nt_line
        end
        aa_line = ''
        nt_line = ''
      end
    end
  end

  # Shifts the subzone coordinates by the nucleotides already printed
  # (3 per element of aa_align plus the frame offset). Modifies the arrays
  # stored in subzone_align; does nothing when subzone_align is nil.
  def reduce_coordinates(subzone_align, aa_align, h)
    if !subzone_align.nil?
      aligned = 3 * aa_align.length + h.q_frame-1
      subzone_align.values.each do |subzone|
        subzone[0] -= aligned
        subzone[1] -= aligned
      end
    end
  end

  # Sets (and returns) the area_without_annotation flag when at least 150
  # nucleotides lie before the hit start or after the hit end. Uses the first
  # hit when @hit is an Array. The flag is never reset to false here.
  def area_without_annotation?
    reference_hit = @hit.is_a?(Array) ? @hit.first : @hit
    upstream_annotation_space = reference_hit.q_beg
    downstream_annotation_space = @fasta_length - reference_hit.q_end
    if upstream_annotation_space >= 150 || downstream_annotation_space >= 150
      @area_without_annotation = true
    end
    return @area_without_annotation
  end

  # Shallow copy with its own warning and annotation lists (their items are
  # duplicated) and an empty ORF list. Other attributes are shared with the
  # original, as Object#dup does.
  def clone
    new_seq = self.dup
    new_seq.clean_annotations
    new_seq.clean_warnings
    new_seq.clean_orfs
    new_seq.clone_warnings(@warnings)
    new_seq.clone_annotations(@annotations)
    return new_seq
  end

  # Appends a copy of every annotation of array_annotations to this sequence.
  def clone_annotations(array_annotations)
    array_annotations.map { |annotation| @annotations << annotation.dup }
  end

end