RubyGems - full_lengther_next - Versions diffs - 0.0.1 - Mend

full_lengther_next 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

data/History.txt +4 -0
data/Manifest.txt +27 -0
data/PostInstall.txt +6 -0
data/README.rdoc +147 -0
data/Rakefile +37 -0
data/bin/download_fln_dbs.rb +197 -0
data/bin/full_lengther_next +173 -0
data/bin/make_user_db.rb +144 -0
data/lib/full_lengther_next.rb +13 -0
data/lib/full_lengther_next/classes/common_functions.rb +94 -0
data/lib/full_lengther_next/classes/fl2_stats.rb +222 -0
data/lib/full_lengther_next/classes/fl_analysis.rb +688 -0
data/lib/full_lengther_next/classes/fl_string_utils.rb +139 -0
data/lib/full_lengther_next/classes/lcs.rb +33 -0
data/lib/full_lengther_next/classes/my_worker.rb +122 -0
data/lib/full_lengther_next/classes/my_worker_manager.rb +167 -0
data/lib/full_lengther_next/classes/orf.rb +32 -0
data/lib/full_lengther_next/classes/sequence.rb +111 -0
data/lib/full_lengther_next/classes/test_code.rb +877 -0
data/lib/full_lengther_next/classes/une_los_hit.rb +287 -0
data/script/console +10 -0
data/script/destroy +14 -0
data/script/generate +14 -0
data/test/test_full_lengther_next.rb +11 -0
data/test/test_helper.rb +3 -0
metadata +150 -0

data/bin/make_user_db.rb ADDED Viewed

@@ -0,0 +1,144 @@
+#!/usr/bin/env ruby
+# 15-2-2011 Noe Fernandez-Pozo
+# Script to create your own Full-LengtherNext User database.
+require 'net/ftp'
# Require exactly two arguments: the NCBI taxonomic group to extract and the
# UniProt taxonomic division whose .dat files will be scanned. On any other
# argument count, print the usage text to stderr (so it never mixes with
# regular output) and exit with a failure status.
unless ARGV.size == 2
  warn "incorrect number of arguments, you need a taxonomic group like 'Coniferopsida', you can search it in 'http://www.ncbi.nlm.nih.gov/Taxonomy/'\n" \
       "and a UniProt taxonomic group from this list:\n" \
       "\tfungi\n" \
       "\thuman\n" \
       "\tinvertebrates\n" \
       "\tmammals\n" \
       "\tplants\n" \
       "\trodents\n" \
       "\tvertebrates\n" \
       "mode of use: ruby make_user_db.rb coniferopsida plants\n\n"
  Process.exit(-1)
end
# my_group: taxonomic group name; uniprot_group: UniProt division name.
my_group, uniprot_group = ARGV
+################################################### Functions
# Scans a UniProtKB flat file (.dat) and appends, in FASTA format, every entry
# that belongs to +my_group+ and looks like a complete protein.
#
# An entry is written only when all of the following hold:
#   * one of its OC (lineage) lines contains +my_group+ (case-insensitive);
#   * no DE line seen before that OC line carries "Flags: Fragment";
#   * it has no "FT NON_TER" or "FT NON_CONS" feature line;
#   * its sequence starts with M (methionine).
#
# UniProtKB background on the two feature keys:
#   * FT NON_TER: the residue at an extremity of the sequence is not the
#     terminal residue of the complete molecule, e.g. "FT NON_TER 1 1".
#   * FT NON_CONS: two residues are not consecutive; residues are missing
#     between them, e.g. "FT NON_CONS 1683 1684".
#
# output_file - IO-like object receiving the FASTA records via #puts.
# file_name   - path of the UniProtKB .dat file to read.
# my_group    - taxonomic group name, matched literally.
#
# The FASTA header is ">id description organism organelle"; the id is taken
# from the first AC line of the entry.
def filter_incomplete_seqs(output_file, file_name, my_group)
  puts " filtering sequences"

  # Built once instead of once per input line; the group name is escaped so
  # regexp metacharacters in it are matched literally.
  lineage_re = /^OC\s+[\w\s\;]*#{Regexp.escape(my_group)}/i

  in_record = false
  print_seq = false
  incomplete = false
  id = ''
  description = ''
  organism_name = ''
  organelle = ''
  seq = String.new

  # File.foreach closes the file when the iteration ends or raises.
  File.foreach(file_name) do |line|
    unless in_record
      # Everything before the first AC line of an entry is ignored.
      next unless line =~ /^AC\s+(\w+);/

      id = Regexp.last_match(1)
      in_record = true
      print_seq = false
      incomplete = false
      description = ''
      organism_name = ''
      organelle = ''
      seq = String.new
      next
    end

    case line
    when /^DE\s+(.+)\;*/
      # Only the first DE line provides the description.
      # NOTE(review): (.+) is greedy, so a trailing ';' stays in the text.
      if description == ''
        description = Regexp.last_match(1)
        description.sub!(/RecName: Full=/, 'sp=')
        description.sub!(/SubName: Full=/, 'tr=')
      end
      incomplete = true if line =~ /Flags: Fragment/
    when /^OS\s+(.+)/
      organism_name = Regexp.last_match(1)
    when /^OG\s+(.+)/
      organelle = Regexp.last_match(1)
    when lineage_re
      print_seq = true unless incomplete
    when /^FT\s+NON_TER\s+/, /^FT\s+NON_CONS\s+(\d+)\s+/
      print_seq = false
    when /^\s+([\w\s]+)/
      # Lines starting with whitespace are treated as sequence data.
      seq << Regexp.last_match(1)
    when %r{^//}
      # End of entry: strip whitespace from the sequence and decide.
      seq.gsub!(/\s+/, '')
      print_seq = false if seq !~ /^M/i
      in_record = false
      if print_seq
        output_file.puts ">#{id} #{description} #{organism_name} #{organelle}\n#{seq}"
      end
    end
  end
end
+########################################################
+##  MAIN
+########################################################
ROOT_PATH=File.dirname(__FILE__)
# $: << File.expand_path(File.join(ROOT_PATH, "classes"))
# load gem path, only to test locally
# $: << File.expand_path('~/progs/ruby/gems/full_lengther_next/lib')
# NOTE(review): lib/full_lengther_next.rb also assigns the top-level ROOT_PATH
# constant, so after this require the fallback path below is resolved against
# the value set by the library, not the one assigned above. Left as is because
# other scripts of the gem may rely on the same location - confirm before changing.
require 'full_lengther_next'
require 'fileutils'

# Use $BLASTDB when it points to an existing path, otherwise ROOT_PATH/blast_dbs.
if ENV['BLASTDB'] && File.exist?(ENV['BLASTDB'])
  formatted_db_path = ENV['BLASTDB']
else
  formatted_db_path = File.expand_path(File.join(ROOT_PATH, "blast_dbs"))
end
ENV['BLASTDB'] = formatted_db_path

# Create the directory that is actually used for the output below (the
# original created "blast_dbs/<group>" relative to the working directory).
group_dir = File.join(ENV['BLASTDB'], my_group)
FileUtils.mkdir_p(group_dir)

output_file_path = File.join(group_dir, my_group + ".fasta")
# Block form guarantees the FASTA file is flushed and closed even on errors.
File.open(output_file_path, "w") do |output_file|
  %w[sprot trembl].each do |section|
    dat_path = File.join(ENV['BLASTDB'], "uniprot_#{section}_#{uniprot_group}.dat")
    filter_incomplete_seqs(output_file, dat_path, my_group)
  end
end

# Argument-list form bypasses the shell, so paths with spaces or shell
# metacharacters are passed through verbatim. Standard output is discarded,
# as it was with the original backtick call.
built = system('makeblastdb', '-in', output_file_path, '-dbtype', 'prot', '-parse_seqids', out: File::NULL)
warn "makeblastdb did not complete successfully for #{output_file_path}" unless built
puts "make_user_db.rb has finished"

data/lib/full_lengther_next.rb ADDED Viewed

@@ -0,0 +1,13 @@
# Make this directory requirable unless it is already on the load path,
# either as given or in expanded form.
lib_dir = File.dirname(__FILE__)
already_on_path = [lib_dir, File.expand_path(lib_dir)].any? { |dir| $LOAD_PATH.include?(dir) }
$LOAD_PATH.unshift(lib_dir) unless already_on_path

# Directory holding the gem's files; its "classes" subdirectory is appended
# to the load path so the class files can be required by bare name.
ROOT_PATH = File.join(lib_dir, 'full_lengther_next')
$LOAD_PATH.push(File.expand_path(File.join(ROOT_PATH, 'classes')))

# Namespace carrying the gem version constants.
module FullLengtherNext
  VERSION = '0.0.1'
  FULLLENGHTER_VERSION = VERSION
end

data/lib/full_lengther_next/classes/common_functions.rb ADDED Viewed

@@ -0,0 +1,94 @@
# Sequence/coordinate helpers meant to be mixed into other objects.
#
# The methods rely on String#lcs and String#complementary_dna, which are not
# defined in this module (presumably provided by the gem's string extensions -
# verify against the including code).
module CommonFunctions
  # Checks how much of the query side of a hit can be located, fragment by
  # fragment and in order, inside a protein sequence.
  #
  # hit       - object responding to #q_seq (aligned query string).
  # full_prot - protein sequence to search in.
  # q         - unused; kept so existing callers keep working.
  #
  # Runs of '-', 'X' and 'x' in hit.q_seq act as separators between fragments,
  # and the same characters are removed from full_prot before searching. For
  # each fragment the value of String#lcs is located in the not-yet-consumed
  # part of full_prot, and the search then continues after that match.
  #
  # Returns [is_ok, start]:
  #   is_ok - true when matched residues plus the number of 'X' and '-'
  #           characters in hit.q_seq reach 70% of the collapsed query length.
  #   start - index of the first fragment's match in the cleaned full_prot,
  #           or 0 when there was no fragment at all.
  def contenidos_en_prot(hit, full_prot, q)
    masked_x = hit.q_seq.count('X') + hit.q_seq.count('-')
    remaining = full_prot.gsub(/[\-Xx]+/, '')
    compare_prot = hit.q_seq.gsub(/[\-Xx]+/, '-')

    matched_residues = 0
    # nil means "no fragment processed yet". A numeric sentinel (previously
    # 9999) wrongly turned real start positions >= 9999 into 0.
    first_start = nil

    compare_prot.split(/\-+/).each do |fragment|
      common = remaining.lcs(fragment)
      matched_residues += common.length
      fragment_start = remaining.index(common)
      first_start ||= fragment_start
      # Continue after this match so fragments must appear in order.
      remaining = remaining[(fragment_start + common.length)..remaining.length]
    end

    is_ok = matched_residues + masked_x >= compare_prot.length * 0.7
    [is_ok, first_start || 0]
  end

  # Converts a hit on a negative frame into coordinates on the reverse strand.
  #
  # query_fasta - nucleotide sequence of the query.
  # h_qframe    - frame of the hit; its sign is flipped in the result.
  # h_qstart    - hit start on the query.
  # h_qend      - hit end on the query.
  #
  # Returns [query_fasta.complementary_dna, flipped frame, new start, new end].
  def reverse_seq(query_fasta, h_qframe, h_qstart, h_qend)
    flipped_frame = -h_qframe.to_i
    # qend and qstart are swapped on purpose: for negative-frame sequences
    # BLAST reports them the other way round.
    new_start = query_fasta.length - h_qend - 1
    new_end = query_fasta.length - h_qstart - 1
    [query_fasta.complementary_dna, flipped_frame, new_start, new_end]
  end

  # Shifts a start/end pair according to the reading frame: +1 for frame
  # 2/-2, +2 for frame 3/-3, unchanged for any other frame.
  #
  # Returns [ref_start, ref_end] after the shift.
  def corrige_frame(ref_frame, ref_start, ref_end)
    offset = { 2 => 1, 3 => 2 }[ref_frame.abs]
    return [ref_start, ref_end] unless offset

    [ref_start + offset, ref_end + offset]
  end
end

data/lib/full_lengther_next/classes/fl2_stats.rb ADDED Viewed

@@ -0,0 +1,222 @@
# Builds the plain-text summary report from the result tables found in the
# fl2_results directory (relative to the current working directory).
module Fl2Stats
  # --------------------------------------------------------------------------------       Main

  # Writes fl2_results/summary_stats.txt: the annotation summary, the test
  # code summary and the total number of sequences seen in both tables.
  # The report file is closed (and therefore flushed) before returning.
  def summary_stats
    File.open('fl2_results/summary_stats.txt', 'w') do |stats_file|
      annotated = annotation_stats(stats_file)
      tested = testcode_stats(stats_file)
      stats_file.puts "\nInput sequences in your fasta: #{annotated + tested}\n\n"
    end
  end

  # ----------------------------------------------------------------------------------      Functions

  # Increments the per-database slot of a 4-element counter array:
  # index 2 for names starting with "sp_", index 3 for "tr_", index 1 for
  # anything else (reported as user_db). Index 0 is left to the caller.
  #
  # Returns the same (mutated) array.
  def stats_my_db(db_name, array)
    slot = case db_name
           when /^sp_/ then 2
           when /^tr_/ then 3
           else 1
           end
    array[slot] += 1
    array
  end

  # Reads fl2_results/annotations.txt (tab separated; the line starting with
  # "Query_id" is skipped) and writes the annotation summary to +stats_file+.
  #
  # Columns used: 2 = sequence length, 3 = accession, 4 = database name,
  # 5 = status, 11 = messages (counted when they contain "ERROR#1").
  #
  # stats_file - IO-like object receiving the report via #puts.
  #
  # Returns the number of data rows read.
  def annotation_stats(stats_file)
    seqs_number = 0
    error_1_num = 0
    all_accs = []
    complete_accs = []
    all_sizes = fl2_new_length_buckets
    complete_sizes = fl2_new_length_buckets

    # One [total, user_db, sp, tr] counter array per known status.
    by_status = {}
    ['Complete', 'Putative Complete', 'C-terminus', 'Putative C-terminus',
     'N-terminus', 'Putative N-terminus', 'Internal', 'Coding Seq'].each do |status|
      by_status[status] = [0, 0, 0, 0]
    end

    # File.foreach closes the input file once iteration finishes.
    File.foreach('fl2_results/annotations.txt') do |line|
      line.chomp!
      next if line =~ /^Query_id\t/

      fields = line.split("\t")
      fasta_length = fields[1].to_i
      acc = fields[2]
      db_name = fields[3]
      status = fields[4]
      msgs = fields[10]

      seqs_number += 1
      all_accs.push(acc)
      fl2_tally_length(all_sizes, fasta_length)
      error_1_num += 1 if msgs =~ /ERROR#1/

      counts = by_status[status]
      next unless counts # rows with an unknown status only count towards totals

      counts[0] += 1
      stats_my_db(db_name, counts)
      next unless status == 'Complete'

      complete_accs.push(acc)
      fl2_tally_length(complete_sizes, fasta_length)
    end

    complete = by_status['Complete']
    unique_complete = complete_accs.uniq.count

    stats_file.puts "--- Annotation Summary ---"
    stats_file.puts "\n------------------------------ Summary of sequences found by similarity -----"
    stats_file.puts "\n\tSequences found: #{seqs_number}\t\t#{fl2_length_summary(all_sizes)}"
    stats_file.puts "\tDifferent IDs:   #{all_accs.uniq.count}"
    stats_file.puts "\n\tsequences with sense and antisense hits error: #{error_1_num}"
    stats_file.puts "\n------------------------------------------------- Full-Length Sequences -----"
    stats_file.puts "\tComplete Seqs: #{complete[0]} (#{fl2_percentage(complete[0], seqs_number)} %)\t\t#{fl2_length_summary(complete_sizes)}"
    stats_file.puts "\tDifferent IDs: #{unique_complete} (#{fl2_percentage(unique_complete, seqs_number)} %)"
    stats_file.puts fl2_db_breakdown(complete)
    stats_file.puts "-----------------------------------------------------------------------------"

    # Report label => status value, in report order.
    [['putative completes', 'Putative Complete'],
     ['n-terminus', 'N-terminus'],
     ['putative_n_terminus', 'Putative N-terminus'],
     ['c-terminus', 'C-terminus'],
     ['putative_c_terminus', 'Putative C-terminus'],
     ['internal', 'Internal'],
     ['coding sequences with unknown status', 'Coding Seq']].each do |label, status|
      counts = by_status[status]
      stats_file.puts "\n\t#{label}: #{counts[0]}#{fl2_db_breakdown(counts)}"
    end

    seqs_number
  end

  # Reads fl2_results/tcode_result.txt (tab separated; the line starting with
  # "Query_id" is skipped) and writes the test code summary to +stats_file+.
  #
  # Column 2 is the sequence length and column 5 the status ('coding',
  # 'putative_coding' or 'unknown'). Coding and unknown sequences are split
  # by length at 200 and, independently, at 500.
  #
  # stats_file - IO-like object receiving the report via #puts.
  #
  # Returns the number of data rows read.
  def testcode_stats(stats_file)
    seqs_number = 0
    putative_coding = 0
    totals = { 'coding' => 0, 'unknown' => 0 }
    sizes = { 'coding' => fl2_new_length_buckets, 'unknown' => fl2_new_length_buckets }

    File.foreach('fl2_results/tcode_result.txt') do |line|
      line.chomp!
      next if line =~ /^Query_id\t/

      fields = line.split("\t")
      fasta_length = fields[1].to_i
      status = fields[4]

      seqs_number += 1
      case status
      when 'coding', 'unknown'
        totals[status] += 1
        # Each threshold is evaluated on its own; previously the 500 counters
        # were driven by the 200 comparison.
        fl2_tally_length(sizes[status], fasta_length)
      when 'putative_coding'
        putative_coding += 1
      end
    end

    stats_file.puts "\n--------------------------- Test Code Summary\n\n\ttotal seqs: #{seqs_number}"
    stats_file.puts "\n\tcoding sequences: #{totals['coding']}"
    fl2_puts_length_lines(stats_file, sizes['coding'])
    stats_file.puts "\n\tputative coding sequences: #{putative_coding}\n"
    stats_file.puts "\n\tunknown: #{totals['unknown']} (#{fl2_percentage(totals['unknown'], seqs_number)} %)"
    fl2_puts_length_lines(stats_file, sizes['unknown'])
    stats_file.puts "\n\tUnknown sequences have a bad test code score or haven't got an ORF longer than 200 nt"
    stats_file.puts "---------------------------------------------"

    seqs_number
  end

  private

  # Fresh length counters: threshold => [count >= threshold, count < threshold].
  def fl2_new_length_buckets
    { 200 => [0, 0], 500 => [0, 0] }
  end

  # Adds one sequence of the given length to every threshold's counters.
  def fl2_tally_length(buckets, length)
    buckets.each do |threshold, counts|
      counts[length >= threshold ? 0 : 1] += 1
    end
  end

  # One-line "(>200: a, <200: b)<TAB>(>500: c, <500: d)" rendering.
  def fl2_length_summary(buckets)
    "(>200: #{buckets[200][0]}, <200: #{buckets[200][1]})\t(>500: #{buckets[500][0]}, <500: #{buckets[500][1]})"
  end

  # Writes the four "longer/shorter than N bp" lines of the test code summary.
  def fl2_puts_length_lines(stats_file, buckets)
    [200, 500].each do |threshold|
      longer, shorter = buckets[threshold]
      stats_file.puts "\t\tlonger than #{threshold} bp: #{longer}"
      stats_file.puts "\t\tshorter than #{threshold} bp: #{shorter}"
    end
  end

  # Per-database lines for a [total, user_db, sp, tr] counter array.
  def fl2_db_breakdown(counts)
    "\n\t\tuser_db: #{counts[1]}\n\t\tsp: #{counts[2]}\n\t\ttr: #{counts[3]}"
  end

  # part/total as a percentage with three decimals; "0.000" when total is 0
  # (instead of formatting NaN).
  def fl2_percentage(part, total)
    ratio = total.to_i.zero? ? 0.0 : part.to_f / total.to_f * 100
    '%.3f' % ratio
  end
end