RubyGems - full_lengther_next - Versions diffs - 0.0.8 → 0.5.6 - Mend

full_lengther_next 0.0.8 → 0.5.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36)

data/.gemtest +0 -0
data/History.txt +2 -2
data/Manifest.txt +33 -18
data/Rakefile +4 -2
data/bin/download_fln_dbs.rb +310 -158
data/bin/full_lengther_next +160 -103
data/bin/make_test_dataset.rb +236 -0
data/bin/make_user_db.rb +101 -117
data/bin/plot_fln.rb +270 -0
data/bin/plot_taxonomy.rb +70 -0
data/lib/expresscanvas.zip +0 -0
data/lib/full_lengther_next.rb +3 -3
data/lib/full_lengther_next/classes/artifacts.rb +66 -0
data/lib/full_lengther_next/classes/blast_functions.rb +326 -0
data/lib/full_lengther_next/classes/cdhit.rb +154 -0
data/lib/full_lengther_next/classes/chimeric_seqs.rb +315 -57
data/lib/full_lengther_next/classes/common_functions.rb +105 -63
data/lib/full_lengther_next/classes/exonerate_result.rb +258 -0
data/lib/full_lengther_next/classes/fl_analysis.rb +226 -617
data/lib/full_lengther_next/classes/fl_string_utils.rb +4 -2
data/lib/full_lengther_next/classes/fln_stats.rb +598 -557
data/lib/full_lengther_next/classes/handle_db.rb +30 -0
data/lib/full_lengther_next/classes/my_worker.rb +308 -138
data/lib/full_lengther_next/classes/my_worker_EST.rb +54 -0
data/lib/full_lengther_next/classes/my_worker_manager_EST.rb +69 -0
data/lib/full_lengther_next/classes/my_worker_manager_fln.rb +389 -0
data/lib/full_lengther_next/classes/nc_rna.rb +5 -7
data/lib/full_lengther_next/classes/reptrans.rb +210 -0
data/lib/full_lengther_next/classes/sequence.rb +439 -80
data/lib/full_lengther_next/classes/test_code.rb +15 -16
data/lib/full_lengther_next/classes/types.rb +12 -0
data/lib/full_lengther_next/classes/une_los_hit.rb +148 -230
data/lib/full_lengther_next/classes/warnings.rb +40 -0
metadata +207 -93
data/lib/full_lengther_next/classes/lcs.rb +0 -33
data/lib/full_lengther_next/classes/my_worker_manager.rb +0 -240

data/bin/make_user_db.rb CHANGED

@@ -3,142 +3,126 @@
 # 15-2-2011 Noe Fernandez-Pozo
 # Script to create your own Full-LengtherNext User database.
-require 'net/ftp'
-#receive one argument or fail
-if (ARGV.size != 2)
-	puts "incorrect number of arguments, you need a taxonomic group like 'Coniferopsida', you can search it in 'http://www.ncbi.nlm.nih.gov/Taxonomy/'
-and a UniProt taxonomic group from this list:
-	fungi
-	human
-	invertebrates
-	mammals
-	plants
-	rodents
-	vertebrates
-mode of use: ruby make_user_db.rb coniferopsida plants\n\n"
-  Process.exit(-1);
-end
-(my_group,uniprot_group)=ARGV
-################################################### Functions
-def filter_incomplete_seqs(output_file,file_name, my_group)
-	puts " filtering sequences"
-	# UniProtKB fragments with FT NON_CONS and FT NON_TER features.
-	#
-	#     * FT NON_TER: The residue at an extremity of the sequence is not the terminal residue. If applied to position 1, this signifies that the first position is not the N-terminus of the complete molecule. If applied to the last position, it means that this position is not the C-terminus of the complete molecule. There is no description field for this key. Examples of NON_TER key feature lines:
-	#       FT NON_TER 1 1
-	#       FT NON_TER 29 29
-	#     * FT NON_CONS: Non-consecutive residues. Indicates that two residues in a sequence are not consecutive and that there are a number of unreported or missing residues between them. Example of a NON_CONS key feature line:
-	#       FT NON_CONS 1683 1684
-	#
-	# NON_CONS fragments are not indicated as non-consecutive in InterPro and being non-consecutive the match to methods may be incorrect if the method spans the 'break'.
-	newseq=false
-	print_seq=false
-	incomplete=false
-	id=''
-	description = ''
-	organism_name = ''
-	seq = ''
-	organelle = ''
-	File.open(file_name).each_line do |line|
-		if (newseq == false)
-			if (line =~ /^AC\s+(\w+);/)
-				id=$1
-				newseq = true
-				description = ''
-				organism_name = ''
-				seq = ''
-				print_seq = false
-				incomplete = false
+ROOT_PATH=File.dirname(__FILE__)
+$: << File.expand_path(File.join(ROOT_PATH, "../lib/full_lengther_next/classes/"))
+require 'cdhit'
+require 'handle_db'
+require 'optparse'
+##############################################################################################
+## METHODS
+#############################################################################################
+def get_seqs(index, taxon, isoform_hash) # Builds FASTA text from the rows of a tab-separated index file whose taxonomy column lists `taxon`; returns that text as one String.
+	seqs = ''
+	File.open(index).each do |line|
+		line.chomp!
+		fields = line.split("\t")
+		if fields[2].split(';').include?(taxon) # column 2 is a ';'-separated list (presumably the taxonomic lineage); exact element match, case-sensitive
+			if fields[5] == '-' # a lone '-' in column 5 means "no organelle"
 				organelle = ''
+			else
+				organelle = fields[5].gsub('-','')
 			end
-		else
-			if (line =~ /^DE\s+(.+)\;*/)
-				if (description == '')
-					description = $1
-					description.sub!(/RecName: Full=/,'sp=')
-					description.sub!(/SubName: Full=/,'tr=')
-				end
-				if (line =~ /Flags: Fragment/)
-					# puts "#{id} #{line}"
-					incomplete = true
-				end
-			elsif (line =~ /^OS\s+(.+)/)
-				organism_name = $1
-			elsif (line =~ /^OG\s+(.+)/)
-				organelle = $1
-			elsif (line =~ /^OC\s+[\w\s\;]*#{my_group}/i) && (!incomplete)
-				print_seq=true
-				# puts "#{id}   #{organism_name} print_seq?: #{print_seq}"
-			elsif (line =~ /^FT\s+NON_TER\s+/)
-				print_seq=false
-				# puts "#{id}   NON_TER"
-			elsif (line =~ /^FT\s+NON_CONS\s+(\d+)\s+/)
-				print_seq=false
-				# puts "#{id}   NON_CONS"
-			elsif (line =~ /^\s+([\w\s]+)/)
-				seq += $1
-			elsif (line =~ /^\/\//)
-				seq.gsub!(/\s*/,'')
-				if (seq !~ /^M/i)
-					print_seq=false
-				end
-				newseq = false
-				if (print_seq)
-					output_file.puts ">#{id} #{description} #{organism_name} #{organelle}\n#{seq}"
-				end
+			seqs << ">#{[fields[0], fields[1], fields[3], organelle].join(' ')}\n#{fields[4]}\n" # FASTA record: header from columns 0, 1, 3 and the organelle; column 4 is written as the sequence line
+			if !isoform_hash.nil? # callers pass nil to skip isoform lookup
+				accid = fields[1].split(' ').first.split('-').first # first word of column 1, cut at the first '-'
+				var_splice = isoform_hash[accid]
+				seqs << var_splice + "\n" if !var_splice.nil? # presumably ready-made FASTA text for the isoforms - TODO confirm against load_isoform_hash
 			end
 		end
 	end
+	return seqs
 end
-########################################################
-##  MAIN
-########################################################
+##########################################################################################
+## OPTIONS
+##########################################################################################
-ROOT_PATH=File.dirname(__FILE__)
+options = {}
-# $: << File.expand_path(File.join(ROOT_PATH, "classes"))
+divs = %w{human fungi invertebrates mammals plants rodents vertebrates}
-# load gem path, only to test locally
-# $: << File.expand_path('~/progs/ruby/gems/full_lengther_next/lib')
+optparse = OptionParser.new do |opts|
+  options[:uniprot_div] = nil
+  opts.on( '-u', '--file String', 'Uniprot DBs to taxon search. Posible options: human, fungi, invertebrates, mammals, plants, rodents, vertebrates.') do |uniprot_div|
+		if !divs.include?(uniprot_div)
+			puts 'This uniprot division not exists:', uniprot_div
+			process.exit
+		end
+		options[:uniprot_div] = uniprot_div
+  end
-require 'full_lengther_next'
+  options[:taxon] = nil
+  opts.on( '-t', '--taxon STRING', 'Specific taxon to search in uniprot division. Write taxo between \'\'') do |taxon|
+  		options[:taxon] = taxon
+  end
-if ENV['BLASTDB'] && File.exists?(ENV['BLASTDB'])
-  formatted_db_path = ENV['BLASTDB']
-else # otherwise use ROOTPATH + DB
-  formatted_db_path = File.expand_path(File.join(ROOT_PATH, "blast_dbs"))
-end
+  options[:local] = FALSE
+  opts.on( '-l', '--local', 'Only parse downloaded files without download them again') do
+		options[:local] = TRUE
+  end
-ENV['BLASTDB']=formatted_db_path
+  options[:cdhit] = 0
+  opts.on( '-c', '--cdhit FLOAT', 'Compact databases with cdhit. 0 for deactivate, >0 - 1 to set percentage of identity. Default: 0') do |cdhit|
+		options[:cdhit] = cdhit.to_f
+  end
+  # Set a banner, displayed at the top of the help screen.
+  opts.banner = "Usage: #{File.basename(__FILE__)} [options]  \n\n"
-if !File.exists?(File.join(formatted_db_path, my_group))
-	Dir.mkdir(File.join(formatted_db_path,my_group))
-end
+  # This displays the help screen
+  opts.on( '-h', '--help', 'Display this screen' ) do
+    puts opts
+    exit
+  end
+end # End opts
-output_file_path=File.join(formatted_db_path,my_group,my_group+".fasta")
+# parse options and remove from ARGV
+optparse.parse!
-output_file = File.new(output_file_path, "w")
+########################################################
+##  MAIN
+########################################################
-filter_incomplete_seqs(output_file, File.join(formatted_db_path, "uniprot_sprot_#{uniprot_group}.dat"), my_group)
-filter_incomplete_seqs(output_file, File.join(formatted_db_path, "uniprot_trembl_#{uniprot_group}.dat"), my_group)
+if options[:taxon].nil? || options[:uniprot_div].nil?
+	puts 'Taxon or uniprot division was not specified'
+  	Process.exit(-1)
+end
-output_file.close
+if ENV['BLASTDB'] && File.exists?(ENV['BLASTDB'])
+	formatted_db_path = ENV['BLASTDB']
+else # otherwise use ROOTPATH + DB
+	formatted_db_path = File.expand_path(File.join(ROOT_PATH, "blast_dbs"))
+end
-`makeblastdb -in #{output_file_path} -dbtype 'prot' -parse_seqids`
+if !options[:local]
+	user_db_folder = File.join(formatted_db_path, options[:taxon])
+else
+	user_db_folder = File.join(Dir.pwd, options[:taxon])
+end
+output_file_path = File.join(user_db_folder, options[:taxon]+".fasta")
+user_db_folder.gsub!(' ', '_')
+output_file_path.gsub!(' ', '_')
+Dir.mkdir(user_db_folder) if !File.exists?(user_db_folder)
+isoform_hash = load_isoform_hash(File.join(formatted_db_path, 'uniprot_sprot_varsplic.fasta'))
+seqs = get_seqs(File.join(formatted_db_path, 'sp_' + options[:uniprot_div],"sp_#{options[:uniprot_div]}.index"), options[:taxon], isoform_hash)
+isoform_hash = nil
+seqs << get_seqs(File.join(formatted_db_path, 'tr_' + options[:uniprot_div],"tr_#{options[:uniprot_div]}.index"), options[:taxon], isoform_hash)
+if options[:cdhit] > 0
+	output_file = File.open(output_file_path, 'w')
+	output_file.puts seqs
+	output_file.close
+	system("cd-hit -i #{output_file_path} -o #{output_file_path}_cln -c 1 -s 0.95 -M 0") #-d length of description in .clstr file, default 20 if set to 0, it takes the fasta defline and stops at first space (BUGGED OPTION) -M 0 cd-hit uses all memory that it needs
+	cdhit = Cdhit.new(output_file_path, output_file_path+'_cln.clstr')
+	cdhit.master_to_sp_seq
+	seqs = cdhit.get_all_master
+	seqs.map!{|s| s.to_s}
+	seqs = seqs.join("\n")
+end
+do_makeblastdb(seqs, output_file_path, 'prot')
 puts "make_user_db.rb has finished"

data/bin/plot_fln.rb ADDED

@@ -0,0 +1,270 @@
+#!/usr/bin/env ruby
+require 'optparse'
+#############################################
+### FUNCTIONS
+#############################################
+def create_fln_hash(path)
+  fln_hash = {}
+  file = File.open(path, 'r').each do |line|
+    fields = line.chomp.split
+    fln_hash[fields[1]] = fields[0].to_i
+  end
+  fln_hash['<=200seqs'] = fln_hash['good_seqs'] - fln_hash['sequences_>200']
+  fln_hash['>200seqs'] = fln_hash['sequences_>200']  - fln_hash['sequences_>500']
+  fln_hash['<=200unk'] = fln_hash['unknown'] - fln_hash['unknown_>200']
+  fln_hash['>200unk'] = fln_hash['unknown_>200']  - fln_hash['unknown_>500']
+  fln_hash['<=200cod'] = fln_hash['coding'] - fln_hash['coding_>200']
+  fln_hash['>200cod'] = fln_hash['coding_>200']  - fln_hash['coding_>500']
+  fln_hash['no_match_db'] = fln_hash['coding'] + fln_hash['unknown']
+  return fln_hash
+end
+def graph_table(fln_hash, output, graph_type, header_titles, categories_names, keywords, stacked_cols, titles = nil)
+  table = []
+  cmd = basic_plot_command(graph_type)
+  cmd << "set output '#{output}.png'\n"
+  if fln_hash.class.to_s == 'Array'
+    table << header(fln_hash.length, header_titles)
+    table.concat(categories(categories_names))
+    cmd << 'plot '
+    count = 0
+    fln_hash.each_with_index do |hash,i|
+      table = fill_table(hash, table, keywords, stacked_cols, graph_type)
+      if i == 0
+        first = TRUE
+      else
+        first = FALSE
+      end
+      cmd << histogram(stacked_cols, output,titles[i], count, graph_type, first)
+      if i < fln_hash.length-1
+        cmd << "\\\n"
+      end
+      if !graph_type.include?('clustered')
+        count += stacked_cols
+      else
+        count += 1
+      end
+    end
+  else
+    table << header(1, header_titles)
+    table.concat(categories(categories_names))
+    table = fill_table(fln_hash, table, keywords, stacked_cols, graph_type)
+    cmd << 'plot '+ histogram(stacked_cols, output, '', 0, graph_type, TRUE)
+  end
+  if table.length ==2 #Dummie row for rowstacked graph with a only category
+    table << table[1].dup
+    table[2].each_with_index do |cell, i|
+        if i== 0
+          table[2][i] = '&'
+        else
+          table[2][i] = 0
+        end
+    end
+  end
+  cmd.chop!
+  write_table(table, output)
+  write_cmd(cmd)
+  system('gnuplot cmd.dem')
+end
+def histogram(columns, file, name, add, graph_type, first)
+  cmd = ""
+  if first
+    cmd << "newhistogram \"#{name}\", '#{file}' using 2:xtic(1) t col,"
+  else
+    cmd << "newhistogram \"#{name}\", '' using #{2 + add}:xtic(1)  t col,"
+  end
+  if !graph_type.include?('clustered')
+    (columns-1).times do |col|
+        cmd << " '' u #{3+col+add} t col,"
+    end
+  end
+  return cmd
+end
+def write_cmd(cmd)
+  if File.exists?('cmd.dem')
+    File.delete('cmd.dem')
+  end
+  file = File.open('cmd.dem', 'a')
+  file.puts cmd
+  file.close
+end
+def header(iterations, header_titles)
+  header = ['Clasification']
+  iterations.times do
+    header_titles.each do |title|
+      header << title
+    end
+  end
+  return header
+end
+def categories(cat)
+  array_cat = cat.split(' ').map{|name| [name]}
+  return array_cat
+end
+def fill_table(fln_hash, table, keywords, stacked_cols,graph_type)
+  series = 0
+  keywords.each_with_index do |key, i|
+    if graph_type.include?('clustered')
+      row = i +1 -stacked_cols*series
+      if (i+1) % stacked_cols == 0
+        series +=1
+      end
+    else
+      row = i/stacked_cols + 1
+    end
+    value = fln_hash[key]
+    if value.nil?
+      value = 0
+    end
+    if table.length == 2
+      table[1] << value
+    else
+      table[row] << value
+    end
+  end
+  return table
+end
+def write_table(table, file_name)
+  file_table = File.open(file_name, 'w')
+    table.each do |line|
+      file_table.puts line.join(' ')
+    end
+  file_table.close
+end
+def basic_plot_command(graph_type)
+  cmd = ''
+  if graph_type.include?('clustered')
+    cmd << "unset key\n"
+  else
+    cmd << "set key under nobox\n"
+  end
+  cmd << "set style data histogram\n"
+  cmd << "set style histogram #{graph_type} title offset 2,0.25\n"
+  cmd << "set style fill solid noborder\n"
+  cmd << "set boxwidth 0.95\n"
+  cmd << "unset xtics\n"
+  cmd << "set xtics nomirror rotate by -45 scale 0\n"
+  cmd << "set xlabel \" \" offset 0,-2\n"
+  cmd << "set ylabel \"Num sequences\"\n"
+  cmd << "set ytics\n"
+  cmd << "set grid y\n"
+  cmd << "set auto y\n"
+  cmd << "set terminal png nocrop enhanced font arial 15 size 1000,600\n"
+  return cmd
+end
+def parse_file(file)
+  titles = []
+  paths =[]
+  File.open(file,'r').each do |line|
+    fields = line.chomp.split("\t")
+    if  !fields[0].nil?
+      titles << fields[0]
+    end
+    if !fields[1].nil?
+      paths << fields[1]
+    end
+  end
+  return titles, paths
+end
+##########################################################################################
+## OPTIONS
+##########################################################################################
+options = {}
+optparse = OptionParser.new do |opts|
+  options[:file]='samples'
+  opts.on( '-f', '--file FILE', 'Path to FLN execution') do |file|
+          options[:file]=file
+  end
+  options[:path] = File.join('fln_results','summary_stats.txt')
+  opts.on( '-p', '--path PATH', 'Path to FLN different FLN results' ) do |path|
+          options[:path] = File.join(path,'fln_results','summary_stats.txt')
+  end
+  # Set a banner, displayed at the top of the help screen.
+  opts.banner = "Usage: plot_fln.rb [-p PATH || -f FILE]  \n\n"
+  # This displays the help screen
+  opts.on( '-h', '--help', 'Display this screen' ) do
+    puts opts
+    exit
+  end
+end # End opts
+# parse options and remove from ARGV
+optparse.parse!
+##########################################################################################
+## MAIN
+##########################################################################################
+if File.exists?(options[:path])
+  fln_hash = create_fln_hash(options[:path])
+end
+if File.exists?(options[:file])
+  titles, paths = parse_file(options[:file])
+  fln_hash = []
+  paths.each do |path|
+    fln_hash << create_fln_hash(File.join(path,'fln_results','summary_stats.txt'))
+  end
+end
+graph_table(
+  fln_hash,
+  'status_report_table',
+  'rowstacked',
+  %w{Sure Putative},
+  'Complete N-terminal C-terminal Internal NcRNA Coding Unknown',
+  %w{complete_sure complete_putative n_terminal_sure n_terminal_putative c_terminal_sure c_terminal_putative internal internal_putative ncrna ncrna_putative coding_sure coding_putative unknown unknown_putative},
+  2,
+  titles)
+graph_table(
+  fln_hash,
+  'assembly_table',
+  'rowstacked',
+  %w{<=200nt >200nt >500nt},
+  'Unigenes Coding Unknown',
+  %w{<=200seqs >200seqs sequences_>500 <=200cod >200cod coding_>500 <=200unk >200unk unknown_>500},
+  3,
+  titles)
+graph_table(
+  fln_hash,
+  'database_usage',
+  'clustered',
+  %w{seqs},
+  'UserDB SwissProt TrEMBL ncRNA None Diff-orthologues Complete Diff-complete',
+  %w{userdb swissprot trembl ncrna no_match_db different_orthologues complete different_completes},
+  8,
+  titles)
+graph_table(
+  fln_hash,
+  'artifacts',
+  'clustered',
+  %w{seqs},
+  'Misassembled Chimeras Other',
+  %w{misassembled chimeras other_artifacts},
+  3,
+  titles)