RubyGems - full_lengther_next - Versions diffs - 0.0.6 → 0.0.8 - Mend

full_lengther_next 0.0.6 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

data/History.txt +8 -0
data/Manifest.txt +1 -0
data/Rakefile +1 -1
data/bin/full_lengther_next +8 -1
data/bin/make_user_db.rb +5 -5
data/lib/full_lengther_next.rb +3 -3
data/lib/full_lengther_next/classes/chimeric_seqs.rb +78 -0
data/lib/full_lengther_next/classes/fln_stats.rb +148 -36
data/lib/full_lengther_next/classes/my_worker.rb +53 -5
data/lib/full_lengther_next/classes/my_worker_manager.rb +93 -29
metadata +5 -4

data/History.txt CHANGED

@@ -1,3 +1,11 @@
+=== 0.0.8 2012-11-28
+Protection against empty seqs when all seqs match against user_db
+=== 0.0.7 2012-07-25
+Chimera detection
 === 0.0.6 2012-04-16
 Fixed some cosmetic issues and parameters names

data/Manifest.txt CHANGED

@@ -3,6 +3,7 @@ bin/make_user_db.rb
 bin/full_lengther_next
 History.txt
 lib/full_lengther_next/classes/common_functions.rb
+lib/full_lengther_next/classes/chimeric_seqs.rb
 lib/full_lengther_next/classes/fl_analysis.rb
 lib/full_lengther_next/classes/fl_string_utils.rb
 lib/full_lengther_next/classes/fln_stats.rb

data/Rakefile CHANGED

@@ -20,7 +20,7 @@ $hoe = Hoe.spec 'full_lengther_next' do
   # self.extra_deps << ['gnuplot','>=0']
   # self.extra_deps << ['term-ansicolor','>=1.0.5']
   self.extra_deps << ['xml-simple','>=1.0.12']
-  self.extra_deps << ['scbi_blast','>=0.0.32']
+  self.extra_deps << ['scbi_blast','>=0.0.37']
   self.extra_deps << ['scbi_mapreduce','>=0.0.29']
   self.extra_deps << ['scbi_fasta','>=0.1.7']
   # self.extra_deps << ['scbi_fastq','>=0.0.13']

data/bin/full_lengther_next CHANGED

@@ -50,6 +50,11 @@ optparse = OptionParser.new do |opts|
 		options[:distance] = distance.to_i
 	end
+	options[:chimera] = nil
+	opts.on( '-q', '--chimera_detection', "apply chimera detection mode\n\n" ) do |chimera|
+		options[:chimera] = chimera
+	end
 	options[:workers] = 2
 	opts.on( '-w', '--workers INTEGER/FILE', 'Number of CPUs, or a file containing machine names to launch workers with ssh' ) do |workers|
     if File.exists?(workers)
@@ -91,7 +96,7 @@ optparse = OptionParser.new do |opts|
 	# Set a banner, displayed at the top of the help screen.
-	opts.banner = "Usage: full_lengther_next -f input.fasta -g [fungi|human|invertebrates|mammals|plants|rodents|vertebrates] [options]\n\n"
+	opts.banner = "\nUsage: full_lengther_next -f input.fasta -g [fungi|human|invertebrates|mammals|plants|rodents|vertebrates] [options]\n\n"
 	# This displays the help screen
 	opts.on( '-h', '--help', 'Display this screen' ) do
@@ -172,6 +177,8 @@ require 'my_worker_manager'
 $LOG = Logger.new(STDOUT)
 $LOG.datetime_format = "%Y-%m-%d %H:%M:%S"
+# puts "ROOT_PATH: #{ROOT_PATH}"
 custom_worker_file = File.join(File.dirname(ROOT_PATH),'lib','full_lengther_next','classes','my_worker.rb')
 	$LOG.info 'Starting server'

data/bin/make_user_db.rb CHANGED

@@ -126,16 +126,16 @@ end
 ENV['BLASTDB']=formatted_db_path
-if !File.exists?(File.join(ENV['BLASTDB'], my_group))
-	Dir.mkdir("blast_dbs/#{my_group}")
+if !File.exists?(File.join(formatted_db_path, my_group))
+	Dir.mkdir(File.join(formatted_db_path,my_group))
 end
-output_file_path=File.join(ENV['BLASTDB'],my_group,my_group+".fasta")
+output_file_path=File.join(formatted_db_path,my_group,my_group+".fasta")
 output_file = File.new(output_file_path, "w")
-filter_incomplete_seqs(output_file, File.join(ENV['BLASTDB'], "uniprot_sprot_#{uniprot_group}.dat"), my_group)
-filter_incomplete_seqs(output_file, File.join(ENV['BLASTDB'], "uniprot_trembl_#{uniprot_group}.dat"), my_group)
+filter_incomplete_seqs(output_file, File.join(formatted_db_path, "uniprot_sprot_#{uniprot_group}.dat"), my_group)
+filter_incomplete_seqs(output_file, File.join(formatted_db_path, "uniprot_trembl_#{uniprot_group}.dat"), my_group)
 output_file.close

data/lib/full_lengther_next.rb CHANGED

@@ -1,13 +1,13 @@
 $:.unshift(File.dirname(__FILE__)) unless
   $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
-root_path=File.join(File.dirname(__FILE__),'full_lengther_next')
+# ROOT_PATH=File.join(File.dirname(__FILE__),'full_lengther_next')
-$: << File.expand_path(File.join(root_path, 'classes'))
+$: << File.expand_path(File.join(File.dirname(__FILE__), 'full_lengther_next', 'classes'))
 module FullLengtherNext
-   VERSION = '0.0.6'
+   VERSION = '0.0.8'
   FULLLENGHTER_VERSION = VERSION
 end

data/lib/full_lengther_next/classes/chimeric_seqs.rb ADDED

@@ -0,0 +1,78 @@
+require 'scbi_blast'
+module ChimericSeqs
+	def search_chimeras(seq, blast_query, options, db_name)
+		# used to detect if the sequence and the blast are from different query
+		if (seq.seq_name != blast_query.query_def)
+			puts "#{seq.seq_name} --> #{blast_query.query_def}"
+			raise "BLAST query name and sequence are different"
+		end
+		q=blast_query
+		# puts "#{q.query_def}"
+		ref_hit_beg = q.hits[0].q_beg
+		ref_hit_end = q.hits[0].q_end
+		q.hits.each do |hit|
+			# puts "---------#{hit.acc}"
+			# if overlaps or is contained in the ref hit
+			if ((ref_hit_beg <= hit.q_beg) && (ref_hit_end > hit.q_beg)) || ((hit.q_beg <= ref_hit_beg) && (hit.q_end > ref_hit_beg))
+				# puts "hits overlapping: ref_hit #{ref_hit_beg}-#{ref_hit_end}, current hit #{hit.q_beg}-#{hit.q_end}"
+				ref_hit_beg = [ref_hit_beg,hit.q_beg].min
+				ref_hit_end = [ref_hit_end,hit.q_end].max
+				# puts "modified ref_hit #{ref_hit_beg}-#{ref_hit_end}"
+			end
+		end
+		q.hits.each do |hit|
+			if ((ref_hit_beg <= hit.q_beg) && (ref_hit_end > hit.q_beg)) || ((hit.q_beg <= ref_hit_beg) && (hit.q_end > ref_hit_beg))
+			else
+				if (hit.acc != q.hits[0].acc)
+					# puts "\nreference: #{ref_hit_beg} - #{ref_hit_end}"
+					# puts "hit 0: #{q.hits[0].q_beg} - #{q.hits[0].q_end}"
+					# puts "current: #{hit.q_beg} - #{hit.q_end}"
+					# puts "putative chimeric seq: \n#{q.hits[0].definition}\n#{hit.definition}\n------------------------------------------"
+					chimera_annotations = "\n#{q.query_def}\t#{seq.fasta_length}\t#{q.hits[0].acc}\t#{db_name}\tPutative chimera\t\t#{q.hits[0].e_val}\t#{q.hits[0].ident}\t\t\tPutative chimera detected showing similarity with two different genes #{q.hits[0].acc} - #{hit.acc}\t#{q.hits[0].q_frame}\t#{q.hits[0].q_beg}\t#{q.hits[0].q_end}\t#{q.hits[0].s_beg.to_i}\t#{q.hits[0].s_end.to_i}\t#{q.hits[0].definition}\t\n#{q.query_def}\t#{seq.fasta_length}\t#{hit.acc}\t#{db_name}\tPutative chimera\t\t#{hit.e_val}\t#{hit.ident}\t\t\tPutative chimera detected showing similarity with two different genes #{q.hits[0].acc} - #{hit.acc}\t#{hit.q_frame}\t#{hit.q_beg}\t#{hit.q_end}\t#{hit.s_beg.to_i}\t#{hit.s_end.to_i}\t#{hit.definition}\t"
+					seq.annotate(:chimera,chimera_annotations,false)
+				end
+				return
+			end
+		end
+	end
+	def select_best_blast(tmp_blast_obj, new_seqs)
+		my_seqs ={}
+		new_seqs.each do |seq|
+			my_seqs[seq.seq_name] = true
+		end
+		reverse_counter = (tmp_blast_obj.querys.length - 1)
+		tmp_blast_obj.querys.reverse_each do |query|
+			if (!my_seqs[query.query_def]) # los marcados como quimeras se eliminan para no utilizarse posteriormente
+				# tmp_blast_obj.querys[reverse_counter].delete
+				tmp_blast_obj.querys.delete_at(reverse_counter)
+			else
+				reverse_hit_counter = (query.hits.length - 1)
+				query.hits.reverse_each do |hit|
+					if (hit.acc != query.hits[0].acc)
+						tmp_blast_obj.querys[reverse_counter].hits.delete_at(reverse_hit_counter)
+					end
+					reverse_hit_counter -= 1
+				end
+			end
+			reverse_counter -= 1
+		end
+		return tmp_blast_obj
+	end
+end

data/lib/full_lengther_next/classes/fln_stats.rb CHANGED

@@ -4,20 +4,28 @@ module FlnStats
 	def summary_stats
 		stats_file = File.open('fln_results/summary_stats.html', 'w')
+		size_filter1 = 200
+		size_filter2 = 500
 		# recogemos los trozos de html fijos
 		(html_head, html_st, html_uni, html_db, html_as, html_end) = html_code
 		total_seqs = 0
 		status_suma = 0
 		#recogemos los datos que necesitamos de los ficheros de resultados
-		(status_array, db_usage, seqs_number1, error_1_num, seq_uniq, complete_uniq, db_uni_500, db_uni_200, db_longest_one) = annotation_stats
-		(tcode_array, seqs_number2, unk_200, tc_uni_500, tc_uni_200, tc_longest_one) = testcode_stats
-		(ncrna_total, nc_uni_500, nc_uni_200, nc_longest_one)=ncrna_stats
+		(status_array, db_usage, seqs_number1, error_1_num, seq_uniq, complete_uniq, db_uni_500, db_uni_200, db_longest_one) = annotation_stats(size_filter1,size_filter2)
+		(tcode_array, seqs_number2, tc_uni_500, tc_uni_200, tc_longest_one) = testcode_stats(size_filter1,size_filter2)
+		(ncrna_total, nc_uni_500, nc_uni_200, nc_longest_one)=ncrna_stats(size_filter1,size_filter2)
+		(chimera_total, ch_uni_500, ch_uni_200, ch_longest_one, ch_db_usage)=chimera_stats(size_filter1,size_filter2)
-		total_seqs = seqs_number1 + seqs_number2 + ncrna_total.to_i
-		uni_500 = (db_uni_500 + tc_uni_500 + nc_uni_500)
-		uni_200 = (db_uni_200 + tc_uni_200 + nc_uni_200)
-		longest_one = [db_longest_one, tc_longest_one, nc_longest_one].max
+		seqs_number1 = (seqs_number1+chimera_total.to_i)
+		total_seqs = (seqs_number1 + seqs_number2 + ncrna_total.to_i)
+		uni_500 = (db_uni_500 + tc_uni_500 + nc_uni_500 + ch_uni_500)
+		uni_200 = (db_uni_200 + tc_uni_200 + nc_uni_200 + ch_uni_200)
+		longest_one = [db_longest_one, tc_longest_one, nc_longest_one, ch_longest_one].max
+		db_usage[0] += ch_db_usage[0]
+		db_usage[1] += ch_db_usage[1]
+		db_usage[2] += ch_db_usage[2]
 		stats_file.puts html_head
 		if (total_seqs.to_i > 0)
@@ -46,6 +54,15 @@ module FlnStats
 				end
 				status_suma += status[0]
 			end
+			# adding chimeric seqs
+			stats_file.puts '				<tr>
+					<td colspan="2" align="left">Putative chimera</td>
+					<td align="right">'+chimera_total.to_s+'</td>
+					<td align="right">'+'%.2f' % (100*chimera_total.to_f/total_seqs.to_f).to_s+' %</td>
+				</tr>'
+				status_suma += chimera_total
 			# añadimos los coding, P.coding
 			tcode_array.each do |status|
 				if (status[1] == 'Coding')
@@ -64,6 +81,7 @@ module FlnStats
 				end
 				status_suma += status[0]
 			end
 			# se ponen los ncRNA
 			stats_file.puts '				<tr>
 					<td colspan="2" align="left">Putative ncRNA</td>
@@ -156,12 +174,12 @@ module FlnStats
 					<td align="right">'+'%.2f' % (100*total_seqs.to_f/total_seqs.to_f).to_s+' %</td>
 				</tr>'
 				stats_file.puts '				<tr>
-					<td align="left">Unigenes >500pb</td>
+					<td align="left">Unigenes >'+size_filter2.to_s+'pb</td>
 					<td align="right">'+uni_500.to_s+'</td>
 					<td align="right">'+'%.2f' % (100*uni_500.to_f/total_seqs.to_f).to_s+' %</td>
 				</tr>'
 				stats_file.puts '				<tr>
-					<td align="left">Unigenes >200pb</td>
+					<td align="left">Unigenes >'+size_filter1.to_s+'pb</td>
 					<td align="right">'+uni_200.to_s+'</td>
 					<td align="right">'+'%.2f' % (100*uni_200.to_f/total_seqs.to_f).to_s+' %</td>
 				</tr>'
@@ -175,6 +193,8 @@ module FlnStats
 					<td align="right">'+seqs_number1.to_s+'</td>
 					<td align="right">'+'%.2f' % (100*seqs_number1.to_f/total_seqs.to_f).to_s+' %</td>
 				</tr>'
+			if (seqs_number1.to_i > 0)
 				stats_file.puts '				<tr>
 					<td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Different orthologue IDs</td>
 					<td align="right">'+seq_uniq.to_s+'</td>
@@ -195,21 +215,49 @@ module FlnStats
 					<td align="right">'+error_1_num.to_s+'</td>
 					<td align="right">'+'%.2f' % (100*error_1_num.to_f/seqs_number1.to_f).to_s+' %</td>
 				</tr>'
+				stats_file.puts '				<tr>
+					<td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Putative chimera</td>
+					<td align="right">'+chimera_total.to_s+'</td>
+					<td align="right">'+'%.2f' % (100*chimera_total.to_f/seqs_number1.to_f).to_s+' %</td>
+				</tr>'
+			end
 				stats_file.puts '				<tr>
 					<td align="left">Without orthologue <sup>1</sup></td>
 					<td align="right">'+no_db.to_s+'</td>
 					<td align="right">'+'%.2f' % (100*seqs_number2.to_f/total_seqs.to_f).to_s+' %</td>
 				</tr>'
+			if (no_db.to_i > 0) && (seqs_number2.to_i > 0)
 				stats_file.puts '				<tr>
-					<td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Coding</td>
+					<td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Coding (all)</td>
 					<td align="right">'+tcode_array[0][0].to_s+'</td>
 					<td align="right">'+'%.2f' % (100*tcode_array[0][0].to_f/no_db.to_f).to_s+' %</td>
 				</tr>'
 				stats_file.puts '				<tr>
-					<td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Putative Coding</td>
+					<td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Coding > '+size_filter1.to_s+'bp</td>
+					<td align="right">'+tcode_array[0][2].to_s+'</td>
+					<td align="right">'+'%.2f' % (100*tcode_array[0][2].to_f/no_db.to_f).to_s+' %</td>
+				</tr>'
+				stats_file.puts '				<tr>
+					<td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Coding > '+size_filter2.to_s+'bp</td>
+					<td align="right">'+tcode_array[0][3].to_s+'</td>
+					<td align="right">'+'%.2f' % (100*tcode_array[0][3].to_f/no_db.to_f).to_s+' %</td>
+				</tr>'
+				stats_file.puts '				<tr>
+					<td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Putative Coding (all)</td>
 					<td align="right">'+tcode_array[1][0].to_s+'</td>
 					<td align="right">'+'%.2f' % (100*tcode_array[1][0].to_f/no_db.to_f).to_s+' %</td>
 				</tr>'
+				stats_file.puts '				<tr>
+					<td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Putative Coding > '+size_filter1.to_s+'bp</td>
+					<td align="right">'+tcode_array[1][2].to_s+'</td>
+					<td align="right">'+'%.2f' % (100*tcode_array[1][2].to_f/no_db.to_f).to_s+' %</td>
+				</tr>'
+				stats_file.puts '				<tr>
+					<td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Putative Coding > '+size_filter2.to_s+'bp</td>
+					<td align="right">'+tcode_array[1][3].to_s+'</td>
+					<td align="right">'+'%.2f' % (100*tcode_array[1][3].to_f/no_db.to_f).to_s+' %</td>
+				</tr>'
 				stats_file.puts '				<tr>
 					<td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Putative ncRNA</td>
 					<td align="right">'+ncrna_total.to_s+'</td>
@@ -221,16 +269,19 @@ module FlnStats
 					<td align="right">'+'%.2f' % (100*tcode_array[2][0].to_f/no_db.to_f).to_s+' %</td>
 				</tr>'
 				stats_file.puts '				<tr>
-					<td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Unknown < 200bp</td>
-					<td align="right">'+unk_200.to_s+'</td>
-					<td align="right">'+'%.2f' % (100*unk_200.to_f/no_db.to_f).to_s+' %</td>
-				</tr>
-			</table>
+					<td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Unknown > '+size_filter1.to_s+'bp</td>
+					<td align="right">'+tcode_array[2][2].to_s+'</td>
+					<td align="right">'+'%.2f' % (100*tcode_array[2][2].to_f/no_db.to_f).to_s+' %</td>
+				</tr>'
+				stats_file.puts '				<tr>
+					<td align="left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Unknown > '+size_filter2.to_s+'bp</td>
+					<td align="right">'+tcode_array[2][3].to_s+'</td>
+					<td align="right">'+'%.2f' % (100*tcode_array[2][3].to_f/no_db.to_f).to_s+' %</td>
+				</tr>'
+			end
+			stats_file.puts '			</table>
 			<sup>1</sup> Percents for subclassifications of this category were calculated using this line as 100% reference.'
 		end
 		stats_file.puts html_end
@@ -309,7 +360,7 @@ module FlnStats
 		end
-		def annotation_stats
+		def annotation_stats(size_filter1,size_filter2)
 			seqs_number = 0
 			array_of_all_accs = []
@@ -353,10 +404,10 @@ module FlnStats
 					end
 					# -------------------------------------------------------------------------
-					if (fasta_length.to_i >= 200)
+					if (fasta_length.to_i >= size_filter1)
 						uni_200 += 1
 					end
-					if (fasta_length.to_i >= 500)
+					if (fasta_length.to_i >= size_filter2)
 						uni_500 += 1
 					end
 					# -------------------------------------------------------------------------
@@ -394,18 +445,17 @@ module FlnStats
 		end
-		def testcode_stats
+		def testcode_stats(size_filter1,size_filter2)
 			seqs_number = 0
-			unk_200 = 0
 			uni_500 = 0
 			uni_200 = 0
 			longest_one = 0
 			# total, status
-			coding_stats = [0,'Coding']
-			p_coding_stats = [0,'Putative Coding']
-			unknown_stats = [0,'Unknown']
+			coding_stats = [0,'Coding',0,0]
+			p_coding_stats = [0,'Putative Coding',0,0]
+			unknown_stats = [0,'Unknown',0,0]
 			File.open('fln_results/new_coding.txt').each do |line|
 				line.chomp!
@@ -419,17 +469,31 @@ module FlnStats
 					end
 					# -------------------------------------------------------------------------
-					if (fasta_length.to_i >= 200)
+					if (fasta_length.to_i >= size_filter1)
 						uni_200 += 1
 					end
-					if (fasta_length.to_i >= 500)
+					if (fasta_length.to_i >= size_filter2)
 						uni_500 += 1
 					end
 					# -------------------------------------------------------------------------
-					if (fasta_length.to_i < 200)
-						if (status == 'unknown')
-							unk_200 += 1
+					if (fasta_length.to_i > size_filter1)
+						if (status == 'coding')
+							coding_stats[2] += 1
+						elsif (status == 'putative_coding')
+							p_coding_stats[2] += 1
+						elsif (status == 'unknown')
+							unknown_stats[2] += 1
+						end
+					end
+					if (fasta_length.to_i > size_filter2)
+						if (status == 'coding')
+							coding_stats[3] += 1
+						elsif (status == 'putative_coding')
+							p_coding_stats[3] += 1
+						elsif (status == 'unknown')
+							unknown_stats[3] += 1
 						end
 					end
@@ -447,11 +511,11 @@ module FlnStats
 			status_array = [coding_stats, p_coding_stats, unknown_stats]
-			return [status_array, seqs_number, unk_200, uni_500, uni_200, longest_one]
+			return [status_array, seqs_number, uni_500, uni_200, longest_one]
 		end
-		def ncrna_stats
+		def ncrna_stats(size_filter1,size_filter2)
 			uni_500 = 0
 			uni_200 = 0
@@ -468,10 +532,10 @@ module FlnStats
 						longest_one = fasta_length.to_i
 					end
 					# -------------------------------------------------------------------------
-					if (fasta_length.to_i >= 200)
+					if (fasta_length.to_i >= size_filter1)
 						uni_200 += 1
 					end
-					if (fasta_length.to_i >= 500)
+					if (fasta_length.to_i >= size_filter2)
 						uni_500 += 1
 					end
 					# -------------------------------------------------------------------------
@@ -484,5 +548,53 @@ module FlnStats
 			return [nc_total, uni_500, uni_200, longest_one]
 		end
+		def chimera_stats(size_filter1,size_filter2)
+			uni_500 = 0
+			uni_200 = 0
+			ch_total = 0
+			longest_one = 0
+			db_usage = [0,0,0]
+			if !File.exists?('fln_results/chimeric_sequences.txt')
+				return [0, 0, 0, longest_one, db_usage]
+			else
+				File.open('fln_results/chimeric_sequences.txt').each do |line|
+					line.chomp!
+					if (!line.empty?)
+						(name,fasta_length,acc,db_name,status) = line.split("\t")
+						if (status == 'Putative chimera')
+							if (fasta_length.to_i > longest_one)
+								longest_one = fasta_length.to_i
+							end
+							# -------------------------------------------------------------------------
+							if (fasta_length.to_i >= size_filter1)
+								uni_200 += 1
+							end
+							if (fasta_length.to_i >= size_filter2)
+								uni_500 += 1
+							end
+							# -------------------------------------------------------------------------
+							if (db_name =~ /^sp_/)
+								db_usage[1] += 1
+							elsif (db_name =~ /^tr_/)
+								db_usage[2] += 1
+							else
+								db_usage[0] += 1
+							end
+							# -------------------------------------------------------------------------
+							ch_total += 1
+						end
+					end
+				end
+				db_usage.each_with_index do |db,i|
+					db_usage[i] = db/2
+				end
+				return [(ch_total/2), (uni_500/2), (uni_200/2), longest_one, db_usage]
+			end
+		end
 end

data/lib/full_lengther_next/classes/my_worker.rb CHANGED

@@ -8,6 +8,9 @@ require 'fl_string_utils'
 require "lcs" # like the class simliar of seqtrim, return the longest common sequence
 require "test_code"
+require 'chimeric_seqs'
+include ChimericSeqs
 require 'fl_analysis'
 include FlAnalysis
@@ -46,7 +49,12 @@ class MyWorker < ScbiMapreduce::Worker
 	# ejecuta blast utilizando los parametros fichero de entrada, base de datos, fichero de salida y tipo de blast
 	def run_blast(input, database, blast_type, evalue)
-		blast=BatchBlast.new("-db #{database}",blast_type,"-evalue #{evalue} -max_target_seqs 1")
+		if (@options[:chimera].nil?)
+			blast=BatchBlast.new("-db #{database}",blast_type,"-evalue #{evalue} -max_target_seqs 1")
+		else
+			blast=BatchBlast.new("-db #{database}",blast_type,"-evalue #{evalue}")
+		end
 		blast_result = blast.do_blast_seqs(input, :xml)
 		return blast_result
@@ -73,34 +81,71 @@ class MyWorker < ScbiMapreduce::Worker
 			# do blast
 			my_blast = run_blast(seqs, "#{@options[:user_db]}", 'blastx', '1e-6')
+			# chimera detection
+			if (!@options[:chimera].nil?)
+				seqs.each_with_index do |seq,i|
+					if (!my_blast.querys[i].hits[0].nil?)
+						search_chimeras(seq, my_blast.querys[i], @options, user_db_name)
+					end
+				end
+				seqs=seqs.select{|s| s.get_annotations(:chimera).empty?}
+				my_blast = select_best_blast(my_blast, seqs)
+			end
 			# split and parse blast
 			seqs.each_with_index do |seq,i|
 				analiza_orf_y_fl(seq, my_blast.querys[i], @options, user_db_name)
 			end
-			new_seqs=seqs.select{|s| s.get_annotations(:complete).empty?}
+			new_seqs=seqs.select{|s| (s.get_annotations(:complete).empty? && s.get_annotations(:chimera).empty?)}
 		else
 			new_seqs = seqs
 		end
+		return if new_seqs.empty?
 		# -------------------------------------------- UniProt (sp)
 		# blast
 		sp_path=File.join("sp_#{@options[:tax_group]}","sp_#{@options[:tax_group]}.fasta")
 		my_blast = run_blast(new_seqs, sp_path, 'blastx', '1e-6')
+		# chimera detection
+		if (!@options[:chimera].nil?)
+			new_seqs.each_with_index do |seq,i|
+				if (!my_blast.querys[i].hits[0].nil?)
+					search_chimeras(seq, my_blast.querys[i], @options, "sp_#{@options[:tax_group]}")
+				end
+			end
+			new_seqs=seqs.select{|s| s.get_annotations(:chimera).empty?}
+			my_blast = select_best_blast(my_blast, new_seqs)
+		end
 		# split and parse blast
 		new_seqs.each_with_index do |seq,i|
 			analiza_orf_y_fl(seq, my_blast.querys[i], @options, "sp_#{@options[:tax_group]}")
 		end
-		new_seqs=seqs.select{|s| s.get_annotations(:complete).empty?}
+		new_seqs=seqs.select{|s| (s.get_annotations(:complete).empty? && s.get_annotations(:chimera).empty?)}
+		return if new_seqs.empty?
 		# -------------------------------------------- UniProt (tr)
 		# blast
 		tr_path=File.join("tr_#{@options[:tax_group]}","tr_#{@options[:tax_group]}.fasta")
 		my_blast = run_blast(new_seqs, tr_path, 'blastx', '1e-6')
+		# chimera detection
+		if (!@options[:chimera].nil?)
+			new_seqs.each_with_index do |seq,i|
+				if (!my_blast.querys[i].hits[0].nil?)
+					search_chimeras(seq, my_blast.querys[i], @options, "tr_#{@options[:tax_group]}")
+				end
+			end
+			new_seqs=new_seqs.select{|s| s.get_annotations(:chimera).empty?}
+			my_blast = select_best_blast(my_blast, new_seqs)
+		end
 		# split and parse blast
 		new_seqs.each_with_index do |seq,i|
 			analiza_orf_y_fl(seq, my_blast.querys[i], @options, "tr_#{@options[:tax_group]}")
@@ -108,8 +153,9 @@ class MyWorker < ScbiMapreduce::Worker
 		# -------------------------------------------- Test Code
 		# the sequences without a reliable similarity with an orthologue are processed with Test Code
-		testcode_input=seqs.select{|s| !s.get_annotations(:apply_tcode).empty?}
+		testcode_input=seqs.select{|s| (!s.get_annotations(:apply_tcode).empty? && s.get_annotations(:chimera).empty?)}
+		return if testcode_input.empty?
 # active this line to test tcode, and comment all lines above in this function
 # testcode_input=seqs
@@ -119,6 +165,8 @@ class MyWorker < ScbiMapreduce::Worker
 		# -------------------------------------------- nc RNA
 		unknown_seqs=seqs.select{|s| !s.get_annotations(:tcode_unknown).empty?}
+		return if unknown_seqs.empty?
 		# run blastn
 		ncrna_path=File.join('nc_rna_db','ncrna_fln_100.fasta')
 		my_blast = run_blast(unknown_seqs, ncrna_path, 'blastn', '1e-3')

data/lib/full_lengther_next/classes/my_worker_manager.rb CHANGED

@@ -34,6 +34,15 @@ class MyWorkerManager < ScbiMapreduce::WorkManager
 		@@nc_rna_file = File.open("fln_results/nc_rnas.txt", 'w')
 		@@nc_rna_file.puts file_head
+		if (!options[:chimera].nil?)
+			@@chimera_file = File.open("fln_results/chimeric_sequences.txt", 'w')
+			@@chimera_file.puts file_head
+		else
+			if File.exists?("fln_results/chimeric_sequences.txt")
+				File.delete("fln_results/chimeric_sequences.txt")
+			end
+		end
 		# @@error_fasta_file = File.open("fln_results/error_seqs.fasta", 'w')
 		# @@error_file = File.open("fln_results/errors_info.txt", 'w')
@@ -50,6 +59,10 @@ class MyWorkerManager < ScbiMapreduce::WorkManager
 		@@tcode_file.close
 		@@nc_rna_file.close
+		if (!@@options[:chimera].nil?)
+			@@chimera_file.close
+		end
 		# @@error_fasta_file.close
 		# @@error_file.close
@@ -113,50 +126,101 @@ class MyWorkerManager < ScbiMapreduce::WorkManager
 	def write_seq(seq)
 		begin
-# --------------------------------------------------------     Complete Seqs
-			if (e=seq.get_annotations(:complete).first)
+# --------------------------------------------------------     Chimeric Seqs
+			if (!@@options[:chimera].nil?)
+				if (q=seq.get_annotations(:chimera).first)
+					@@chimera_file.puts q[:message]
+# --------------------------------------------------     Complete Seqs
+				elsif (e=seq.get_annotations(:complete).first)
+					@@annotation_file.puts e[:message]
+					if (a=seq.get_annotations(:alignment).first)
+						@@alignment_file.puts a[:message]
+					end
+					if (p=seq.get_annotations(:protein).first)
+						@@prot_file.puts p[:message]
+					end
+					if (n=seq.get_annotations(:nucleotide).first)
+						@@nts_file.puts n[:message]
+					end
+# ---------------------------------------------------    Non Complete Seqs
+				elsif (e=seq.get_annotations(:tmp_annotation).first)
-				@@annotation_file.puts e[:message]
+					@@annotation_file.puts e[:message][0]
-				if (a=seq.get_annotations(:alignment).first)
-					@@alignment_file.puts a[:message]
-				end
+					if (a=seq.get_annotations(:alignment).first)
+						if !a[:message].empty?
+							@@alignment_file.puts a[:message]
+						end
+					end
-				if (p=seq.get_annotations(:protein).first)
-					@@prot_file.puts p[:message]
-				end
+					if (p=seq.get_annotations(:protein).first)
+						if !p[:message].empty?
+							@@prot_file.puts p[:message]
+						end
+					end
-				if (n=seq.get_annotations(:nucleotide).first)
-					@@nts_file.puts n[:message]
+					if (n=seq.get_annotations(:nucleotide).first)
+						@@nts_file.puts n[:message]
+					end
+# -------------------------------------------------     nc RNA
+				elsif (nc=seq.get_annotations(:ncrna).first)
+					@@nc_rna_file.puts nc[:message]
+# -------------------------------------------------     Test Code
+				elsif (t=seq.get_annotations(:tcode).first)
+	  				@@tcode_file.puts t[:message]
 				end
-# --------------------------------------------------------     Non Complete Seqs
-			elsif (e=seq.get_annotations(:tmp_annotation).first)
+# ---------------------------------------------------------------------------------
+# --------------------------------------------------------    without Chimeric Seqs Mode
+			else
+# -------------------------------------------------     Complete Seqs
+				if (e=seq.get_annotations(:complete).first)
-				@@annotation_file.puts e[:message][0]
+					@@annotation_file.puts e[:message]
-				if (a=seq.get_annotations(:alignment).first)
-					if !a[:message].empty?
+					if (a=seq.get_annotations(:alignment).first)
 						@@alignment_file.puts a[:message]
 					end
-				end
-				if (p=seq.get_annotations(:protein).first)
-					if !p[:message].empty?
+					if (p=seq.get_annotations(:protein).first)
 						@@prot_file.puts p[:message]
 					end
-				end
-				if (n=seq.get_annotations(:nucleotide).first)
-					@@nts_file.puts n[:message]
+					if (n=seq.get_annotations(:nucleotide).first)
+						@@nts_file.puts n[:message]
+					end
+# -------------------------------------------------     Non Complete Seqs
+				elsif (e=seq.get_annotations(:tmp_annotation).first)
+					@@annotation_file.puts e[:message][0]
+					if (a=seq.get_annotations(:alignment).first)
+						if !a[:message].empty?
+							@@alignment_file.puts a[:message]
+						end
+					end
+					if (p=seq.get_annotations(:protein).first)
+						if !p[:message].empty?
+							@@prot_file.puts p[:message]
+						end
+					end
+					if (n=seq.get_annotations(:nucleotide).first)
+						@@nts_file.puts n[:message]
+					end
+# -------------------------------------------------     nc RNA
+				elsif (nc=seq.get_annotations(:ncrna).first)
+					@@nc_rna_file.puts nc[:message]
+# -------------------------------------------------     Test Code
+				elsif (t=seq.get_annotations(:tcode).first)
+	  				@@tcode_file.puts t[:message]
 				end
-# --------------------------------------------------------     nc RNA
-			elsif (nc=seq.get_annotations(:ncrna).first)
-				@@nc_rna_file.puts nc[:message]
-# --------------------------------------------------------     Test Code
-			elsif (t=seq.get_annotations(:tcode).first)
-  				@@tcode_file.puts t[:message]
 			end
-# --------------------------------------------------------     errors
+# -------------------------------------------------     errors
 			# if e=seq.get_annotations(:error).first
 			# 	if !e[:message].empty?
 			# 		@@error_fasta_file.puts ">#{seq.seq_name}\n#{seq.seq_fasta}"

metadata CHANGED

@@ -2,7 +2,7 @@
 name: full_lengther_next
 version: !ruby/object:Gem::Version
   prerelease:
-  version: 0.0.6
+  version: 0.0.8
 platform: ruby
 authors:
 - Noe Fernandez & Dario Guerrero
@@ -10,7 +10,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-04-16 00:00:00 Z
+date: 2012-11-28 00:00:00 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: xml-simple
@@ -31,7 +31,7 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 0.0.32
+        version: 0.0.37
   type: :runtime
   version_requirements: *id002
 - !ruby/object:Gem::Dependency
@@ -97,6 +97,7 @@ files:
 - bin/full_lengther_next
 - History.txt
 - lib/full_lengther_next/classes/common_functions.rb
+- lib/full_lengther_next/classes/chimeric_seqs.rb
 - lib/full_lengther_next/classes/fl_analysis.rb
 - lib/full_lengther_next/classes/fl_string_utils.rb
 - lib/full_lengther_next/classes/fln_stats.rb
@@ -142,7 +143,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubyforge_project: full_lengther_next
-rubygems_version: 1.7.2
+rubygems_version: 1.8.24
 signing_key:
 specification_version: 3
 summary: FULL-LENGTHERNEXT is a tool adapted to NGS technologies, able to work in parallel and in a distributed way to minimise computing time