full_lengther_next 0.0.8 → 0.5.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. data/.gemtest +0 -0
  2. data/History.txt +2 -2
  3. data/Manifest.txt +33 -18
  4. data/Rakefile +4 -2
  5. data/bin/download_fln_dbs.rb +310 -158
  6. data/bin/full_lengther_next +160 -103
  7. data/bin/make_test_dataset.rb +236 -0
  8. data/bin/make_user_db.rb +101 -117
  9. data/bin/plot_fln.rb +270 -0
  10. data/bin/plot_taxonomy.rb +70 -0
  11. data/lib/expresscanvas.zip +0 -0
  12. data/lib/full_lengther_next.rb +3 -3
  13. data/lib/full_lengther_next/classes/artifacts.rb +66 -0
  14. data/lib/full_lengther_next/classes/blast_functions.rb +326 -0
  15. data/lib/full_lengther_next/classes/cdhit.rb +154 -0
  16. data/lib/full_lengther_next/classes/chimeric_seqs.rb +315 -57
  17. data/lib/full_lengther_next/classes/common_functions.rb +105 -63
  18. data/lib/full_lengther_next/classes/exonerate_result.rb +258 -0
  19. data/lib/full_lengther_next/classes/fl_analysis.rb +226 -617
  20. data/lib/full_lengther_next/classes/fl_string_utils.rb +4 -2
  21. data/lib/full_lengther_next/classes/fln_stats.rb +598 -557
  22. data/lib/full_lengther_next/classes/handle_db.rb +30 -0
  23. data/lib/full_lengther_next/classes/my_worker.rb +308 -138
  24. data/lib/full_lengther_next/classes/my_worker_EST.rb +54 -0
  25. data/lib/full_lengther_next/classes/my_worker_manager_EST.rb +69 -0
  26. data/lib/full_lengther_next/classes/my_worker_manager_fln.rb +389 -0
  27. data/lib/full_lengther_next/classes/nc_rna.rb +5 -7
  28. data/lib/full_lengther_next/classes/reptrans.rb +210 -0
  29. data/lib/full_lengther_next/classes/sequence.rb +439 -80
  30. data/lib/full_lengther_next/classes/test_code.rb +15 -16
  31. data/lib/full_lengther_next/classes/types.rb +12 -0
  32. data/lib/full_lengther_next/classes/une_los_hit.rb +148 -230
  33. data/lib/full_lengther_next/classes/warnings.rb +40 -0
  34. metadata +207 -93
  35. data/lib/full_lengther_next/classes/lcs.rb +0 -33
  36. data/lib/full_lengther_next/classes/my_worker_manager.rb +0 -240
@@ -1,3 +1,4 @@
1
+ require 'types'
1
2
 
2
3
  module NcRna
3
4
 
@@ -8,14 +9,11 @@ module NcRna
8
9
  raise "BLAST query name and sequence are different"
9
10
  end
10
11
 
11
- q=blast_query
12
+ hit=blast_query.hits.first
12
13
 
13
- if (!q.hits[0].nil?) # There is match in blast.
14
- nc_annotations = "#{q.query_def}\t#{seq.fasta_length}\t#{q.hits[0].acc}\tncRNA\tPutative ncRNA\t\t#{q.hits[0].e_val}\t#{q.hits[0].ident}\t\t\t\t#{q.hits[0].q_frame}\t#{q.hits[0].q_beg}\t#{q.hits[0].q_end}\t#{q.hits[0].s_beg.to_i}\t#{q.hits[0].s_end.to_i}\t#{q.hits[0].definition}\t"
15
- seq.annotate(:ncrna,nc_annotations,true)
16
- else
17
- unknown_annot = seq.get_annotations(:tcode_unknown).first
18
- seq.annotate(:tcode, unknown_annot[:message],true)
14
+ if !hit.nil? && hit.align_len >= 40 # There is match in blast and it has a good length.
15
+ seq.hit = hit
16
+ seq.type = NCRNA
19
17
  end
20
18
  end
21
19
  end
@@ -0,0 +1,210 @@
1
+ require 'scbi_mapreduce'
2
+ require 'my_worker_manager_EST' #Second server
3
+ require 'fln_stats'
4
+ require 'types'
5
+
6
+ ########################################################################
7
+ # MAIN FUNCTION
8
+ ########################################################################
9
# Builds the representative transcriptome (RepTrans) inside ./fln_results.
#
# seqs_annotation_prot - sequences annotated against a protein database.
# seqs_some_coding     - sequences classified as coding.
# seqs_unknown         - sequences without classification; only used when an
#                        EST database is supplied.
# options              - option hash; this method reads :high_clustering and
#                        :est_db (count_cpu and do_blast_with_EST read more).
#
# Writes the representative FASTA, the cluster reports and the stats files.
def reptrans(seqs_annotation_prot, seqs_some_coding, seqs_unknown, options)
  cpus = count_cpu(options)
  stats_hash = initialize_stats_hash_reptrans

  # Paths
  #---------------------------------------------
  main_path = File.join(Dir.pwd, 'fln_results')
  reptrans_fasta = File.join(main_path, 'Representative_transcriptome.fasta')
  blast_path = File.join(main_path, 'ESTdb')
  cluster_prot_annotated_path = File.join(main_path, 'Prot_clusters')
  cluster_EST_annotated_path = File.join(main_path, 'EST_clusters')
  html_file = File.join(main_path, 'Representative_transcriptome_stats.html')
  txt_file = File.join(main_path, 'Representative_transcriptome_stats.txt')

  # Prot annotations sequence analysis
  #---------------------------------------------
  analysis_over_DB_annotated_seqs(seqs_annotation_prot, reptrans_fasta, cluster_prot_annotated_path, stats_hash, 'prot_annotated', options[:high_clustering])
  seqs_annotation_prot = nil # drop the local reference once it is no longer needed

  # NOT Prot annotations sequence analysis
  #---------------------------------------------
  putative_seqs = seqs_some_coding
  unless options[:est_db].nil? # WITH EST DATABASE
    putative_seqs += seqs_unknown # Coding & unknown
    putative_seqs = reduce_pool_sequences(putative_seqs, main_path, cpus)
    # File.exist? replaces the deprecated File.exists? (removed in current Ruby).
    # The BLAST database is only built when its .nsq file is not there yet.
    unless File.exist?(blast_path + '.nsq')
      $LOG.info "Start makeblastdb over EST DB"
      system("makeblastdb -in #{options[:est_db]} -out #{blast_path} -dbtype nucl -parse_seqids > #{File.join(main_path, 'log_makeblast_db')}")
      $LOG.info "Ended makeblastdb over EST DB"
    end
    putative_seqs = do_blast_with_EST(putative_seqs, options, reptrans_fasta, blast_path, cluster_EST_annotated_path, stats_hash)
  end

  # Coding sequence analysis
  #---------------------------------------------
  if !putative_seqs.nil? && !putative_seqs.empty?
    putative_seqs = select_seqs_more_500pb(putative_seqs)
    putative_seqs = reduce_pool_sequences(putative_seqs, main_path, cpus) if options[:est_db].nil? # NOT EST database
    # Descending order: test code first, sequence length as tie-breaker.
    putative_seqs.sort! do |s1, s2|
      if s2.t_code == s1.t_code
        s2.fasta_length <=> s1.fasta_length
      else
        s2.t_code <=> s1.t_code
      end
    end
    putative_seqs.each do |coding_seq|
      coding_stats_reptrans(coding_seq, stats_hash)
    end

    write_fasta(putative_seqs, reptrans_fasta, 'a')
  end
  write_reptrans_stats(stats_hash, html_file, txt_file)
end
63
+ ########################################################################
64
+ # END MAIN FUNCTION
65
+ ########################################################################
66
+
67
# Clusters database-annotated sequences, keeps one representative per cluster
# and records the result.
#
# seqs_annotation_DB - sequences carrying a database hit.
# reptrans_fasta     - FASTA path; it is (re)written with the representatives.
# cluster_file_path  - path of the cluster report.
# stats_hash         - stats hash; stats_hash[key_stats] is incremented by the
#                      number of representatives.
# key_stats          - key to increment in stats_hash.
# pfam_clustering    - when truthy, the representatives are clustered a second
#                      time by their :pfam_id annotation. Defaults to false so
#                      that five-argument callers keep working.
def analysis_over_DB_annotated_seqs(seqs_annotation_DB, reptrans_fasta, cluster_file_path, stats_hash, key_stats, pfam_clustering = false)
  clusters_seqs_annot_DB = clustering_by_id(seqs_annotation_DB)
  representative_seqs_annot_DB = select_representative(clusters_seqs_annot_DB)
  if pfam_clustering
    # pfam id, fix get the annotation guide on my_worker_manager_fln (@@func_annot_type) to this scope
    clusters_seqs_annot_DB = clustering_by_annot(representative_seqs_annot_DB, :pfam_id)
    # merge clusters by id and by pfam
    representative_seqs_annot_DB = select_representative(clusters_seqs_annot_DB)
  end
  stats_hash[key_stats] += representative_seqs_annot_DB.length
  report_clustering(cluster_file_path, clusters_seqs_annot_DB, representative_seqs_annot_DB)
  write_fasta(representative_seqs_annot_DB, reptrans_fasta, 'w')
end
78
+
79
# Writes the cluster report: one line per representative with its name, a tab
# and the ';'-joined names of every sequence in the cluster at the same index.
#
# cluster_file_path            - output path (overwritten).
# clusters_seqs_annot_DB       - array of clusters (arrays of sequences).
# representative_seqs_annot_DB - representatives, index-aligned with the clusters.
def report_clustering(cluster_file_path, clusters_seqs_annot_DB, representative_seqs_annot_DB)
  # Block form guarantees the handle is closed even if a write raises.
  File.open(cluster_file_path, 'w') do |cluster_file|
    representative_seqs_annot_DB.each_with_index do |rep_seq, i|
      cluster_seqs = clusters_seqs_annot_DB[i].map { |seq| seq.seq_name }.join(';')
      cluster_file.puts "#{rep_seq.seq_name}\t#{cluster_seqs}"
    end
  end
end
87
+
88
# Reduces redundancy in a pool of sequences with cd-hit and returns only the
# sequences whose names appear in the cd-hit output.
#
# putative_seqs - sequences to reduce.
# main_path     - working directory for temp.fasta, temp_cln.fasta and the log.
# cpu           - value passed to the cd-hit -T option.
def reduce_pool_sequences(putative_seqs, main_path, cpu)
  temp_fasta = File.join(main_path, 'temp.fasta')
  temp_fasta_clean = File.join(main_path, 'temp_cln.fasta')
  log_file = File.join(main_path, 'log_cd_hit_Cod_Unk')
  write_fasta(putative_seqs, temp_fasta, 'w')
  $LOG.info "Start cd-hit with coding and unknow sequences"
  # An existing temp_cln.fasta is reused and cd-hit is not run again.
  # File.exist? replaces the deprecated File.exists? (removed in current Ruby).
  unless File.exist?(temp_fasta_clean)
    system("cd-hit -i #{temp_fasta} -o #{temp_fasta_clean} -c 0.95 -M 0 -T #{cpu} > #{log_file}")
  end
  $LOG.info "Ended cd-hit with coding and unknow sequences"
  cd_hit_names_putative_seqs = load_cd_hit_sequences_names(temp_fasta_clean)
  return select_seqs_with_name(putative_seqs, cd_hit_names_putative_seqs)
end
100
+
101
# Runs the second map-reduce server (EST analysis) for the representative
# transcriptome and returns the second array given by
# MyWorkerManagerEst.get_array_seqs (the remaining putative sequences).
#
# Side effect: sets options[:chimera] to nil on the hash that was passed in.
def do_blast_with_EST(putative_seqs, options, reptrans_fasta, blast_path, cluster_EST_annotated_path, stats_hash) # Second server to representative transcriptome
  $LOG.info 'Starting server for EST analysis'
  custom_worker_file = File.join(File.dirname(ROOT_PATH), 'lib', 'full_lengther_next', 'classes', 'my_worker_EST.rb')
  options[:chimera] = nil # Inactive chimeras system on RepTrans, this resume the BLAST's output

  MyWorkerManagerEst.init_work_manager(putative_seqs, options, blast_path)
  server_EST = ScbiMapreduce::Manager.new(options[:server_ip], options[:port], options[:workers], MyWorkerManagerEst, custom_worker_file, STDOUT, FULL_LENGTHER_NEXT_INIT)
  server_EST.chunk_size = options[:chunk_size]
  server_EST.start_server
  $LOG.info 'Closing server for EST analysis'

  seqs_with_EST, putative_seqs = MyWorkerManagerEst.get_array_seqs
  unless seqs_with_EST.empty?
    # analysis_over_DB_annotated_seqs takes six arguments; the pfam clustering
    # flag is passed explicitly (false) so the call no longer raises ArgumentError.
    # NOTE(review): that method writes reptrans_fasta in 'w' mode, which appears
    # to overwrite the protein representatives written earlier - confirm intent.
    analysis_over_DB_annotated_seqs(seqs_with_EST, reptrans_fasta, cluster_EST_annotated_path, stats_hash, 'est_annotated', false)
  end
  return putative_seqs
end
118
+
119
+
120
# Reads a FASTA file and returns its header lines as an array of names.
# A header is any line starting with '>'; the trailing newline and every '>'
# character are removed from it.
#
# file - path of the FASTA file to read.
def load_cd_hit_sequences_names(file)
  names = []
  # Block form closes the handle; the former File.open(file).readlines leaked it.
  File.open(file) do |fasta|
    fasta.each_line do |line|
      names << line.chomp.gsub('>', '') if line =~ /^>/
    end
  end
  return names
end
131
+
132
# Returns a new array holding only the sequences longer than 500 nucleotides
# (fasta_length strictly greater than 500).
def select_seqs_more_500pb(seqs_array)
  seqs_array.reject { |candidate| candidate.fasta_length <= 500 }
end
136
+
137
# Returns the sequences of array_seqs whose seq_name appears in array_names,
# keeping the order of array_seqs.
def select_seqs_with_name(array_seqs, array_names)
  # Index the names once so each membership test is a hash lookup instead of a
  # scan over the whole name list.
  wanted = {}
  array_names.each { |name| wanted[name] = true }
  return array_seqs.select { |seq| wanted.key?(seq.seq_name) }
end
141
+
142
# Writes every sequence as a FASTA record (">name" line followed by the
# sequence line).
#
# seqs_array - objects responding to seq_name and seq_fasta.
# file_name  - output path.
# mode       - File.open mode, e.g. 'w' to overwrite or 'a' to append.
def write_fasta(seqs_array, file_name, mode)
  # Block form guarantees the handle is closed even if a write raises.
  File.open(file_name, mode) do |file|
    seqs_array.each do |seq|
      file.puts ">#{seq.seq_name}\n#{seq.seq_fasta}"
    end
  end
end
149
+
150
# Groups sequences that share the same hit accession (seq.get_acc).
# Returns an array of clusters (arrays of sequences); clusters appear in the
# order their accession was first seen and keep the input order inside.
def clustering_by_id(seqs_with_hit)
  clusters = []
  cluster_of = {} # accession => cluster array already stored in clusters
  seqs_with_hit.each do |seq|
    acc = seq.get_acc
    cluster = cluster_of[acc]
    if cluster.nil?
      cluster = []
      cluster_of[acc] = cluster
      clusters << cluster
    end
    cluster << seq
  end
  return clusters
end
164
+
165
# Groups sequences by one functional annotation.
#
# seqs_with_hit   - sequences responding to functional_annotations (a hash).
# annotation_type - key looked up in functional_annotations (e.g. :pfam_id).
#
# The annotation is split on ';', sorted and re-joined so that the order of the
# ids does not matter. Sequences whose annotation is missing or '-' are never
# merged: each one becomes its own cluster, appended after the annotated ones.
def clustering_by_annot(seqs_with_hit, annotation_type)
  clusters = []
  cluster_of = {} # normalized annotation => cluster array stored in clusters
  no_annotation_clusters = []
  seqs_with_hit.each do |seq|
    annot = seq.functional_annotations[annotation_type]
    annot = annot.split(';').sort.join(';') unless annot.nil?
    if annot.nil? || annot == '-'
      no_annotation_clusters << [seq]
    else
      cluster = cluster_of[annot]
      if cluster.nil?
        cluster = []
        cluster_of[annot] = cluster
        clusters << cluster
      end
      cluster << seq
    end
  end
  clusters.concat(no_annotation_clusters)
  return clusters
end
187
+
188
# Picks one representative sequence per cluster and returns them in cluster
# order.
#
# Rule: the longest sequence of type COMPLETE; if the cluster has none, the
# longest among the sequences sharing the best identity (get_pident).
#
# Length is measured on seq_fasta. The former code compared the sequence
# strings themselves, which orders them alphabetically rather than by length.
#
# Side effect (kept from the original): a cluster without COMPLETE sequences is
# sorted in place by descending identity.
def select_representative(clusters_seqs_annot_prot)
  seqs = []
  clusters_seqs_annot_prot.each do |cluster|
    candidates = cluster.select { |s| s.type == COMPLETE }
    if candidates.empty?
      cluster.sort! { |cl1, cl2| cl2.get_pident <=> cl1.get_pident }
      best_pident = cluster.first.get_pident
      candidates = cluster.select { |s| s.get_pident == best_pident }
    end
    seqs << candidates.max_by { |s| s.seq_fasta.length }
  end
  return seqs
end
201
+
202
# Returns the cpu count derived from options[:workers]: for an Array, its
# length plus one; otherwise the value itself.
def count_cpu(options)
  workers = options[:workers]
  return workers.length + 1 if workers.instance_of?(Array)
  workers
end
@@ -1,113 +1,472 @@
1
-
2
1
  require 'orf'
2
+ require 'types'
3
+ require 'warnings'
4
+ require 'common_functions'
3
5
 
4
6
  class Sequence
5
7
 
6
- attr_accessor :seq_name,:seq_fasta,:seq_qual,:orfs,:sec_desc,:fasta_length
7
-
8
- def initialize(seq_name,seq_fasta,seq_qual='')
9
- fasta_ori = seq_fasta.dup
10
- @seq_name=seq_name
8
+ attr_accessor :seq_name, :seq_fasta, :fasta_length, :db_name, :seq_nt, :seq_aa, :db, :type, :status, :id, :orfs, :area_without_annotation, :save_fasta, :ignore, :hit, :t_code, :functional_annotations
9
+
10
# Creates a sequence in its initial state: type UNKNOWN, putative status, no
# hit, no warnings and no annotations.
#
# seq_name  - sequence identifier.
# seq_fasta - nucleotide sequence; its length is stored in fasta_length.
# seq_qual  - accepted for interface compatibility; not used here.
#
# Boolean flags use the true/false literals: the TRUE/FALSE constants are
# deprecated and no longer defined in current Ruby.
def initialize(seq_name, seq_fasta, seq_qual='')
  @seq_name = seq_name
  @seq_fasta = seq_fasta
  @fasta_length = seq_fasta.length
  @db_name = nil
  @seq_nt = nil # Unigen sequence with tagged ATG & stop
  @seq_aa = nil # Protein sequence generated over unigen
  @db = nil
  @type = UNKNOWN # See types.rb
  @status = false # true => Sure, false => Putative
  @id = nil # Prot or EST id, can be several => array
  @warnings = []
  @annotations = []
  @functional_annotations = {}
  @orfs = []

  @area_without_annotation = false
  @save_fasta = true
  @ignore = false
  @hit = nil
  @t_code = 0
end
32
+
23
33
 
24
- def add_orf(orf_seq, orf_t_start, orf_t_end, orf_frame, orf_stop_codon, orf_type)
25
- orf = Orf.new(orf_seq, orf_t_start, orf_t_end, orf_frame, orf_stop_codon, orf_type)
26
- @orfs.push orf
34
+ def add_orf(orf_seq, orf_t_start, orf_t_end, orf_q_frame, orf_stop_codon, orf_type)
35
+ orf = Orf.new(orf_seq, orf_t_start, orf_t_end, orf_q_frame, orf_stop_codon, orf_type)
36
+ @orfs << orf
37
+ end
27
38
 
39
+
40
+ def change_degenerated_nt!
41
+ translate_hash = {}
42
+ translate_hash['R']= [['a','g'],0]
43
+ translate_hash['W']= [['a','t'],0]
44
+ translate_hash['M']= [['a','c'],0]
45
+ translate_hash['K']= [['g','t'],0]
46
+ translate_hash['S']= [['g','c'],0]
47
+ translate_hash['Y']= [['c','t'],0]
48
+ translate_hash['H']= [['a','t','c'],0]
49
+ translate_hash['B']= [['g','t','c'],0]
50
+ translate_hash['D']= [['g','a','t'],0]
51
+ translate_hash['V']= [['g','a','c'],0]
52
+ translate_hash['N']= [['g','a','c','t'],0]
53
+
54
+
55
+ fix_degenerated_fasta!(translate_hash)
28
56
  end
57
+
29
58
 
30
- def rejected?
31
- return @rejected
59
# Replaces each upper-case degenerate nucleotide code (R W M K S Y H B D V N)
# in @seq_fasta with one of its concrete nucleotides.
#
# translate_hash - code => [candidate nucleotides, counter]. The counter is
#                  incremented on every occurrence (the hash is modified) and
#                  selects the candidate at counter % candidates.length, so
#                  successive occurrences rotate through the candidates.
def fix_degenerated_fasta!(translate_hash)
  @seq_fasta = @seq_fasta.gsub(/[RWMKSYHBDVN]/) do |code|
    entry = translate_hash[code]
    entry[1] += 1
    entry[0][entry[1] % entry[0].length]
  end
end
33
-
34
- def reject!(message='')
35
- @rejected=true
36
- @rejected_message=message
78
+
79
+ def clean_orfs
80
+ @orfs=[]
37
81
  end
38
-
39
- # :complete, :tmp_annotation, :error, :protein, :nucleotide, :alignment, :tcode
40
- def get_annotations(annotation_type)
41
- return @annotations.select{|a| a[:annotation_type]==annotation_type}
82
+
83
# Puts the sequence back in its initial classification: type UNKNOWN and
# putative status. Uses the false literal because the FALSE constant is
# deprecated and no longer defined in current Ruby.
def reset_classification
  @type = UNKNOWN
  @status = false
end
87
+
88
+ def clean_warnings
89
+ @warnings = []
90
+ end
91
+
92
+ def clean_annotations
93
+ @annotations = []
94
+ end
95
+
96
+ def get_acc
97
+ acc=hit.acc
98
+ return acc
99
+ end
100
+
101
+ def get_pident
102
+ pident=hit.ident
103
+ return pident
104
+ end
105
+
106
+ def format_chimera!
107
+ @hit = []
42
108
  end
43
109
 
44
- def annotate(annotation_type, message='', replace_existing = false)
110
# Records one warning, or every warning of an Array, in @warnings after
# resolving each of them through check_warn.
def warnings(warn)
  return @warnings << check_warn(warn) unless warn.instance_of?(Array)

  warn.each { |single| @warnings << check_warn(single) }
end
45
119
 
46
- if replace_existing
47
- @annotations.reverse_each do |annotation|
48
- if annotation[:annotation_type]==annotation_type
49
- @annotations.delete(annotation)
50
- end
120
+ def clone_warnings(array_warnings)
121
+ array_warnings.map{|warn| @warnings << warn.dup}
122
+ end
123
+
124
# Resolves a warning into its message text.
#
# warn - either a warning tag, or an Array whose first element is the tag and
#        whose remaining elements are the values that fill the '(*replace*)'
#        placeholders of the message, in order.
#
# The message is looked up in $warnings_hash; when the tag has no entry the
# tag itself is used as the message. The caller's array and the stored
# messages are left untouched.
def check_warn(warn)
  check = warn
  replace = nil
  if warn.is_a?(Array)
    # Destructure instead of shift so the caller's array is not modified.
    check, *replace = warn
  end

  message = $warnings_hash[check]
  message = check if message.nil? # If not exists the message

  unless replace.nil?
    message = message.dup # Duplicate to avoid overwriting the original warning hash messages
    replace.each do |rep|
      # Block form inserts the value literally (no backslash interpretation).
      message.sub!('(*replace*)') { rep.to_s }
    end
  end
  return message
end
57
-
58
- def change_degenerated_nt!
59
-
60
-
61
- ########################################
62
-
63
- tranlaste_hash = {}
64
- tranlaste_hash['R']= [['a','g'],0]
65
- tranlaste_hash['W']= [['a','t'],0]
66
- tranlaste_hash['M']= [['a','c'],0]
67
- tranlaste_hash['K']= [['g','t'],0]
68
- tranlaste_hash['S']= [['g','c'],0]
69
- tranlaste_hash['Y']= [['c','t'],0]
70
- tranlaste_hash['H']= [['a','t','c'],0]
71
- tranlaste_hash['B']= [['g','t','c'],0]
72
- tranlaste_hash['D']= [['g','a','t'],0]
73
- tranlaste_hash['V']= [['g','a','c'],0]
74
- tranlaste_hash['N']= [['g','a','c','t'],0]
75
-
76
- ########################################
77
-
78
- fix_degenerated_fasta!(tranlaste_hash)
79
-
80
-
145
+
146
# Stores the test-code score in @t_code and marks the sequence status as sure
# (true) when the score is at least 0.95. Uses the true literal because the
# TRUE constant is deprecated and no longer defined in current Ruby.
def test_code(test_code)
  @t_code = test_code
  @status = true if @t_code >= 0.95
end
82
-
83
- def fix_degenerated_fasta!(tranlaste_hash)
84
- s = @seq_fasta
85
- res = []
86
152
 
87
- nts_of_a_line = s.split('')
153
+ def get_fasta(seq)
154
+ fasta = ">#{@seq_name}\n#{seq}"
155
+ return fasta
156
+ end
88
157
 
89
- nts_of_a_line.map{
90
- |e|
91
- # puts "#{e} "
158
+ def write_info(output_files) # Output_files is a hash
159
+ if @save_fasta
160
+ output_files['seqs'].puts get_fasta(@seq_fasta)
161
+ end
162
+ case @type
163
+ when OTHER
164
+ write_other(output_files[@type])
165
+ when CHIMERA
166
+ write_chimera(output_files[@type])
167
+ when MISASSEMBLED
168
+ write_misassembled(output_files[@type])
169
+ when UNKNOWN
170
+ write_unknown(output_files[@type])
171
+ when COMPLETE .. INTERNAL
172
+ write_prot_annot(output_files['db'])
173
+ write_prot_seq(output_files['prot'])
174
+ write_nt_seq(output_files['nts'])
175
+ write_align(output_files['align'])
176
+ when NCRNA
177
+ write_ncrna(output_files[@type])
178
+ when CODING
179
+ write_coding(output_files[@type])
180
+ else
181
+ if @type != FAILED
182
+ raise "#{@type} is an incorrect type"
183
+ end
184
+ end
185
+ end
92
186
 
93
- if (e =~ /[RWMKSYHBDVN]/)
187
+ def all_warns
188
+ all = @warnings.join(' ')
189
+ return all
190
+ end
94
191
 
95
- # puts "#{e} "
96
- tranlaste_hash[e][1] += 1
97
- # puts "#{e} #{tranlaste_hash[e][1]}"
192
+ def write_other(file)
193
+ file.puts "#{@seq_name}\t#{@fasta_length}\t#{@hit.acc}\t#{@db_name}\t#{all_warns}"
194
+ end
98
195
 
99
- e = tranlaste_hash[e][0][tranlaste_hash[e][1]%tranlaste_hash[e][0].length]
196
+ def write_chimera(file) #TODO : write 'SOLVED' tag
197
+ @hit.each do |h|
198
+ file.puts "#{@seq_name}\t#{@fasta_length}\t#{h.acc}\t#{@db_name}\t#{h.q_frame}\t#{h.e_val}\t#{h.ident}\t#{h.q_beg + 1}\t#{h.q_end + 1}\t#{h.s_beg + 1}\t#{h.s_end + 1}\t#{h.definition}"
199
+ end
200
+ file.puts
201
+ end
202
+
203
+ def write_misassembled(file)
204
+ file.puts "#{@seq_name}\t#{@fasta_length}\t#{@hit.acc}\t#{@db_name}"
205
+ end
206
+
207
+ def write_unknown(file)
208
+ # ħit is an array. 2 => q_frame, 1 ORF end, 0 ORF beg
209
+ if hit.class.to_s == 'Array'
210
+ orf_beg = @hit[0]
211
+ orf_end = @hit[1]
212
+ q_frame = @hit[2]
213
+ else
214
+ orf_beg = '-'
215
+ orf_end = '-'
216
+ q_frame = '-'
217
+ end
218
+ file.puts "#{@seq_name}\t#{@fasta_length}\t#{@t_code}\t#{all_warns}\t#{q_frame}\t#{orf_beg}\t#{orf_end}"
219
+ end
220
+
221
+ def write_prot_annot(file)
222
+ final_func_annot = Array.new(9, '-')
223
+ if !@functional_annotations.empty?
224
+ final_func_annot = @functional_annotations.values
225
+ end
226
+ file.puts "#{@seq_name}\t#{@fasta_length}\t#{@hit.acc}\t#{@db_name}\t#{prot_annot_calification}\t#{@hit.e_val}\t#{@hit.ident}\t#{@hit.full_subject_length}\t#{@seq_aa.length}\t#{all_warns}\t#{@hit.q_frame}\t#{@hit.q_beg + 1}\t#{@hit.q_end + 1}\t#{@hit.s_beg + 1}\t#{@hit.s_end + 1}\t#{@hit.definition}\t#{final_func_annot.join("\t")}"
227
+ end
228
+
229
+ def write_ncrna(file)
230
+ file.puts "#{@seq_name}\t#{@fasta_length}\t#{@hit.acc}\t#{@hit.e_val}\t#{@hit.ident}\t#{@hit.q_beg + 1}\t#{@hit.q_end + 1}\t#{@hit.s_beg + 1}\t#{@hit.s_end + 1}\t#{@hit.definition}"
231
+ end
232
+
233
+ def write_coding(file)
234
+ # ħit is an array. 2 => q_frame, 1 ORF end, 0 ORF beg
235
+ calification = 'Putative'
236
+ if @status
237
+ calification = 'Sure'
238
+ end
239
+ file.puts "#{@seq_name}\t#{@fasta_length}\t#{calification}\t#{@t_code}\t#{@hit.last}\t#{@hit.first}\t#{@hit[1]}"
240
+ end
241
+
242
+ #Write complementary files
243
+ def write_prot_seq(file)
244
+ file.puts get_fasta(@seq_aa)
245
+ end
246
+
247
+ def write_align(file)
248
+ tabs = (seq_name.length/8).ceil
249
+ if tabs == 0
250
+ tabs = 1
251
+ end
252
+ second_tab = 0
253
+ if seq_name.length > 7
254
+ second_tab = 1
255
+ end
256
+ file.puts "#{@seq_name}#{"\t"*tabs}#{@hit.q_seq}\n#{@hit.acc}#{"\t"*(tabs+second_tab)}#{@hit.s_seq}"
257
+ file.puts
258
+ end
259
+
260
+ def write_nt_seq(file)
261
+ file.puts "#{@seq_name}\t#{@fasta_length}\t#{@seq_nt}"
262
+ end
263
+
264
# Returns the human-readable name of the current @type, or nil when @type is
# none of the known type constants.
def calification
  descriptions = [
    [FAILED, 'Failed'],
    [OTHER, 'Other'],
    [CHIMERA, 'Chimera'],
    [MISASSEMBLED, 'Misassembled'],
    [UNKNOWN, 'Unknown'],
    [COMPLETE, 'Complete'],
    [N_TERMINAL, 'N_terminal'],
    [C_TERMINAL, 'C_terminal'],
    [INTERNAL, 'Internal'],
    [CODING, 'Coding'],
    [NCRNA, 'NcRNA']
  ]
  # Same matching rule (===) and same precedence as a case/when chain.
  descriptions.each do |type_constant, type_description|
    return type_description if type_constant === @type
  end
  nil
end
292
+
293
+ def prot_annot_calification
294
+ info = "#{calification} "
295
+ if @status
296
+ info << 'Sure'
297
+ else
298
+ info << 'Putative'
299
+ end
300
+ return info
301
+ end
302
+
303
+ def show_alignment(h, nts, show_nts, original_query_coordinates = nil)
304
+ puts "Prot id:\t#{h.acc}", "Alignment length:\t#{h.align_len} aa", "Subject length:\t#{h.s_len} aa", "Query length:\t#{nts.length/3} aa"
305
+ puts prot_annot_calification
306
+ puts
307
+
308
+ aa_unigen = nts[h.q_frame - 1 .. nts.length-1].translate
309
+ index = contenidos_en_prot(h.q_seq, aa_unigen)
310
+
311
+ # View desplacements 5-prime/align/3-prime
312
+ subzone_align = nil
313
+ if !original_query_coordinates.nil?
314
+ subzone_align = {}
315
+ if h.q_beg > original_query_coordinates.first #alignment has transferred characters to 5 prime
316
+ subzone_align['beg'] = [original_query_coordinates.first, h.q_beg-3, 42] # -3 to exclude the last aa
317
+ elsif h.q_beg < original_query_coordinates.first
318
+ subzone_align['beg'] = [h.q_beg, original_query_coordinates.first-3, 46] #alignment has received characters from 5 prime
319
+ end
100
320
 
101
- # puts "#{e}"
321
+ if h.q_end < original_query_coordinates.last #alignment has transferred characters to 3 prime
322
+ subzone_align['end'] = [h.q_end, original_query_coordinates.last, 42]
323
+ elsif h.q_end > original_query_coordinates.last
324
+ subzone_align['end'] = [original_query_coordinates.last, h.q_end, 43] #alignment has received characters from 3 prime
102
325
  end
326
+ end
103
327
 
104
- res.push e
328
+ # Print 5 prime
329
+ if index > 0 # 5 prime exists
330
+ aa_align = aa_unigen[0 .. index-1].split('')
331
+ nt_align = nts[h.q_frame-1..h.q_beg-1]
332
+ print_alignment(aa_align, nt_align, 36, show_nts, subzone_align)
333
+ reduce_coordinates(subzone_align, aa_align, h)
334
+ end
105
335
 
106
- }
336
+ # Print core alignment or protein
337
+ aa_align = h.q_seq.split('')
338
+ nt_align = nts[h.q_beg..h.q_end]
339
+ print_alignment(aa_align, nt_align, 32, show_nts, subzone_align)
340
+ reduce_coordinates(subzone_align, aa_align, h)
341
+
342
+ # Print 3 prime
343
+ gaps = h.q_seq.count('-')
344
+ three_prime_beg = index+h.q_seq.length-gaps
345
+ if aa_unigen.length > three_prime_beg # 3 prime exists
346
+ aa_align = aa_unigen[three_prime_beg .. aa_unigen.length-1].split('')
347
+ fs = check_frame_shift(h)
348
+ nt_align = nts[h.q_end+1-fs..nts.length-1]
349
+ print_alignment(aa_align, nt_align, 33, show_nts, subzone_align)
350
+ end
107
351
 
108
- @seq_fasta=res.compact.join
109
- # @seq_fasta='dario'
110
352
  end
111
-
112
-
113
- end
353
+
354
+
355
+ def print_alignment(aa_align, nt_align, color, show_nts, mark_subzone = nil)
356
+ original_color = color
357
+ c={ 'GCT'=>'A','GCC'=>'A','GCA'=>'A','GCG'=>'A',
358
+ 'CGT'=>'R','CGC'=>'R','CGA'=>'R','CGG'=>'R','AGA'=>'R','AGG'=>'R',
359
+ 'AAT'=>'N','AAC'=>'N',
360
+ 'GAT'=>'D','GAC'=>'D',
361
+ 'TGT'=>'C','TGC'=>'C',
362
+ 'CAA'=>'Q','CAG'=>'Q',
363
+ 'GAA'=>'E','GAG'=>'E',
364
+ 'GGT'=>'G','GGC'=>'G','GGA'=>'G','GGG'=>'G',
365
+ 'CAT'=>'H','CAC'=>'H',
366
+ 'ATT'=>'I','ATC'=>'I','ATA'=>'I',
367
+ 'TTA'=>'L','TTG'=>'L','CTT'=>'L','CTC'=>'L','CTA'=>'L','CTG'=>'L',
368
+ 'ATG'=>'M',
369
+ 'AAA'=>'K','AAG'=>'K',
370
+ 'TTT'=>'F','TTC'=>'F',
371
+ 'CCT'=>'P','CCC'=>'P','CCA'=>'P','CCG'=>'P',
372
+ 'TCT'=>'S','TCC'=>'S','TCA'=>'S','TCG'=>'S','AGT'=>'S','AGC'=>'S',
373
+ 'ACT'=>'T','ACC'=>'T','ACA'=>'T','ACG'=>'T',
374
+ 'TGG'=>'W',
375
+ 'TAT'=>'Y','TAC'=>'Y',
376
+ 'GTT'=>'V','GTC'=>'V','GTA'=>'V','GTG'=>'V',
377
+ 'TAG'=>'*','TGA'=>'*','TAA'=>'*'}
378
+
379
+ nt_line = ''
380
+ aa_line = ''
381
+ gaps = 0
382
+ count = 0
383
+ aa_align.each_with_index do |aa, n|
384
+ if aa == '-'
385
+ nt_line << '---'
386
+ gaps += 1
387
+ else
388
+ # Check aa with codon
389
+ codon_window = (n-gaps)*3
390
+ codon = nt_align[codon_window..codon_window+2]
391
+ nt_line << "#{codon}"
392
+ if aa.upcase != 'X'
393
+ if codon.upcase.include?('N')
394
+ traslated_aa = '-'
395
+ else
396
+ traslated_aa = c[codon]
397
+ end
398
+ if traslated_aa != '-' && traslated_aa != aa
399
+ puts "#{traslated_aa} #{aa}"
400
+ aa = '?'
401
+ end
402
+ end
403
+ end
404
+ if !mark_subzone.nil?
405
+ nts_coordenate = (n-gaps)*3
406
+ mark_subzone.values.each do |subzone|
407
+ if nts_coordenate >= subzone[0] && nts_coordenate <= subzone[1] #0 => first coordenate, 1 => second coordenate
408
+ color = subzone.last
409
+ end
410
+ end
411
+ end
412
+ space = nil
413
+ if show_nts
414
+ space = ' '
415
+ end
416
+ aa_line << "\e[#{color}m#{space}#{aa}#{space}\e[0m"
417
+ color = original_color
418
+ line_length = 60
419
+ if (n+1) % line_length == 0 || n+1 == aa_align.length
420
+ count = n + 1
421
+ print "#{count}\t"
422
+ puts aa_line
423
+ if show_nts
424
+ print "#{count*3}\t"
425
+ puts nt_line
426
+ end
427
+ aa_line = ''
428
+ nt_line = ''
429
+ end
430
+ end
431
+
432
+ end
433
+
434
+ def reduce_coordinates(subzone_align, aa_align, h)
435
+ if !subzone_align.nil?
436
+ aligned = 3 * aa_align.length + h.q_frame-1
437
+ subzone_align.values.each do |subzone|
438
+ subzone[0]-= aligned
439
+ subzone[1]-= aligned
440
+ end
441
+ end
442
+ end
443
+
444
# Tells whether the sequence has a stretch of at least 150 nt without
# annotation before the hit start (q_beg) or after the hit end
# (fasta_length - q_end). When @hit is an Array, its first hit is used.
#
# The flag is stored in @area_without_annotation; it is only ever switched on
# here, never back off. Uses the true literal because the TRUE constant is
# deprecated and no longer defined in current Ruby.
def area_without_annotation?
  reference_hit = @hit.is_a?(Array) ? @hit.first : @hit
  upstream_annotation_space = reference_hit.q_beg
  downstream_annotation_space = @fasta_length - reference_hit.q_end
  if upstream_annotation_space >= 150 || downstream_annotation_space >= 150
    @area_without_annotation = true
  end
  return @area_without_annotation
end
457
+
458
+ def clone
459
+ new_seq = self.dup
460
+ new_seq.clean_annotations
461
+ new_seq.clean_warnings
462
+ new_seq.clean_orfs
463
+ new_seq.clone_warnings(@warnings)
464
+ new_seq.clone_annotations(@annotations)
465
+ return new_seq
466
+ end
467
+
468
+ def clone_annotations(array_annotations)
469
+ array_annotations.map{|annotation| @annotations << annotation.dup}
470
+ end
471
+
472
+ end