RubyGems - bio-phyta - Versions diffs - 0.9.0 - Mend

bio-phyta 0.9.0

Files changed (17) hide show

data/.document +5 -0
data/Gemfile +22 -0
data/LICENSE.txt +20 -0
data/README.rdoc +19 -0
data/Rakefile +55 -0
data/VERSION +1 -0
data/bin/phyta-assign +175 -0
data/bin/phyta-extract +117 -0
data/bin/phyta-setup-taxonomy-db +131 -0
data/bin/phyta-split +158 -0
data/lib/blast_string_parser.rb +51 -0
data/lib/kingdom_db.rb +140 -0
data/test/helper.rb +17 -0
data/test/test_blackbox.rb +41 -0
data/test/test_blast_string_parser.rb +23 -0
data/test/test_kingdom_db.rb +118 -0
metadata +239 -0

data/.document ADDED

@@ -0,0 +1,5 @@
+lib/**/*.rb
+bin/*
+-
+features/**/*.feature
+LICENSE.txt

data/Gemfile ADDED

@@ -0,0 +1,22 @@
+source "http://rubygems.org"
+# MRI only for now
+# Runtime dependencies
+gem "bio", ">= 1.4.2"
+gem "mysql", ">= 2.8.1"
+# For JRuby: gem "mysql", "~> 2.8.1"
+gem "sequel", ">= 3.28.0"
+gem "fastercsv", ">= 1.5.4" # only for 1.8.7
+gem "nokogiri", ">= 1.5.0"
+gem "trollop", ">= 1.16.2"
+# Add dependencies to develop your gem here.
+# Include everything needed to run rake, tests, features, etc.
+group :development do
+  gem "shoulda", ">= 0"
+  gem "bundler", "~> 1.0.0"
+  gem "jeweler", "~> 1.6.4"
+  gem "rcov", ">= 0"
+end

data/LICENSE.txt ADDED

@@ -0,0 +1,20 @@
+Copyright (c) 2011 Philipp Comans
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.rdoc ADDED

@@ -0,0 +1,19 @@
+= bio-phyta
+Description goes here.
+== Contributing to bio-phyta
+* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
+* Check out the issue tracker to make sure no one has already requested it and/or contributed it
+* Fork the project
+* Start a feature/bugfix branch
+* Commit and push until you are happy with your contribution
+* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
+* Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or if changing them is otherwise necessary, that is fine, but please isolate the change in its own commit so I can cherry-pick around it.
+== Copyright
+Copyright (c) 2011 Philipp Comans. See LICENSE.txt for
+further details.

data/Rakefile ADDED

@@ -0,0 +1,55 @@
+# encoding: utf-8
+require 'rubygems'
+require 'bundler'
+begin
+  Bundler.setup(:default, :development)
+rescue Bundler::BundlerError => e
+  $stderr.puts e.message
+  $stderr.puts "Run `bundle install` to install missing gems"
+  exit e.status_code
+end
+require 'rake'
+require 'jeweler'
+Jeweler::Tasks.new do |gem|
+  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
+  gem.name = "bio-phyta"
+  gem.homepage = "http://github.com/pcomans/bioruby-phyta"
+  gem.license = "LGPL"
+  gem.summary = "Pipeline to remove contaminations from EST libraries"
+  gem.description = "Coming soon"
+  gem.email = "philipp.comans@googlemail.com"
+  gem.authors = ["Philipp Comans"]
+  # Remove test data from the gem
+  gem.files.exclude "test/data/**/*"
+  # dependencies defined in Gemfile
+end
+Jeweler::RubygemsDotOrgTasks.new
+require 'rake/testtask'
+Rake::TestTask.new(:test) do |test|
+  test.libs << 'lib' << 'test'
+  test.pattern = 'test/**/test_*.rb'
+  test.verbose = true
+end
+require 'rcov/rcovtask'
+Rcov::RcovTask.new do |test|
+  test.libs << 'test'
+  test.pattern = 'test/**/test_*.rb'
+  test.verbose = true
+  test.rcov_opts << '--exclude "gems/*"'
+end
+task :default => :test
+require 'rake/rdoctask'
+Rake::RDocTask.new do |rdoc|
+  version = File.exist?('VERSION') ? File.read('VERSION') : ""
+  rdoc.rdoc_dir = 'rdoc'
+  rdoc.title = "bio-phyta #{version}"
+  rdoc.rdoc_files.include('README*')
+  rdoc.rdoc_files.include('lib/**/*.rb')
+end

data/VERSION ADDED

@@ -0,0 +1 @@
+0.9.0

data/bin/phyta-assign ADDED

@@ -0,0 +1,175 @@
+#!/usr/bin/env ruby
+require 'rubygems'
+require 'trollop'
+SCRIPT_NAME = "phyta-assign"
+#parse command line arguments
+opts = Trollop::options do
+  opt :input_file, "The output of the BLASTplus alignment in XML format", :type => String
+  opt :output_file, "The name of the output table in CSV format", :type => String
+  opt :database_server, "Optional: The address of the MySQL database server", :type => String, :default => "localhost"
+  opt :database_user, "Optional: The name of the database user", :type => String, :default => "root", :short => "-u"
+  opt :database_password, "Optional: The password of the database user", :type => String, :default => "no password", :short => "-p"
+  opt :database_name, "Optional: The name of the NCBI taxonomy database", :type => String, :default => "kingdom_assignment_taxonomy", :short => "-n"
+end
+unless opts[:input_file_given] && opts[:output_file_given]
+  puts "Invalid arguments, see --help for more information."
+  abort
+end
+unless opts[:database_password_given]
+  opts[:database_password] = nil
+end
+#Use the correct database connector
+if RUBY_PLATFORM =~ /java/
+  puts "You are running JRuby, the jdbc/mysql database connector will be used."
+  require 'jdbc/mysql'
+else
+  require 'mysql'
+end
+require 'sequel'
+require 'nokogiri'
+require 'bio'
+require 'csv'
+if CSV.const_defined? :Reader
+  require 'fastercsv'
+  INSTALLED_CSV = FasterCSV
+else
+  INSTALLED_CSV = CSV
+end
+$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
+$LOAD_PATH.unshift(File.dirname(__FILE__))
+require 'kingdom_db'
+require 'blast_string_parser'
+rootpath = File.dirname(File.dirname(__FILE__))
+PHYTA_VERSION = File.new(File.join(rootpath,'VERSION')).read.chomp
+puts "Running #{SCRIPT_NAME} #{PHYTA_VERSION}"
+puts "Settings: " + opts.inspect
+#Initialize auxiliary classes
+blast_parser = BlastStringParser.new()
+#Open input file
+unless File.exists?(opts[:input_file])
+  puts "The input file at " + File.expand_path(opts[:input_file]) + " could not be opened!"
+  exit
+end
+file = File.new(opts[:input_file])
+reader = Nokogiri::XML::Reader(file)
+#Initialize database
+db = KingdomDB.new(opts[:database_server], opts[:database_user], opts[:database_password], opts[:database_name])
+#Initialize output file
+if File.exists?(opts[:output_file])
+  puts "The output file at " + File.expand_path(opts[:output_file]) + " already exists!"
+  exit
+end
+output = INSTALLED_CSV.open(opts[:output_file], "w", {
+                              :col_sep => ";",
+                              :headers => ["query sequence id", "hit accession number", "sgi", "evalue", "species", "subject annotation", "subject score", "kingdom"],
+                              :write_headers => true})
+filter_array = [
+                "Bacteria",
+                "Archaea",
+                "Viridiplantae",
+                "Rhodophyta",
+                "Glaucocystophyceae",
+                "Alveolata",
+                "Cryptophyta",
+                "stramenopiles", #<- Change
+                "Amoebozoa",
+                "Apusozoa",
+                "Euglenozoa",
+                "Fornicata",
+                "Haptophyceae",
+                "Heterolobosea",
+                "Jakobida",
+                "Katablepharidophyta",
+                "Malawimonadidae",
+                "Nucleariidae",
+                "Oxymonadida",
+                "Parabasalia",
+                "Rhizaria",
+                "unclassified eukaryotes",
+                "Fungi",
+                "Metazoa",
+                "Choanoflagellida",
+                "Opisthokonta incertae sedis", #"Fungi/Metazoa incertae sedis"
+                "Viruses"
+               ]
+filter_hash = db.get_filter(filter_array)
+current_query = ""
+hit_id = ""
+hit_def = ""
+hit_accession = ""
+hsp_evalue = ""
+subject_score = ""
+kingdom = ""
+#Go through the XML with a pull-parser
+reader.each do |elem|
+  if elem.name == "Iteration_query-def"&& elem.node_type == Nokogiri::XML::Node::ELEMENT_NODE
+    #We are at the beginning of an iteration
+    current_query = elem.inner_xml
+  elsif elem.name == "Hit" && elem.node_type == Nokogiri::XML::Node::ELEMENT_NODE
+    #We are at the beginning of a Hit
+    #Load the node representing this hit into memory and extract required information
+    hit = Nokogiri::XML(elem.outer_xml)
+    hit_id = hit.xpath("//Hit_id").inner_text
+    hit_def = hit.xpath("//Hit_def").inner_text
+    hit_accession = hit.xpath("//Hit_accession").inner_text
+    hsp_evalue = hit.xpath("//Hsp[1]/Hsp_evalue").inner_text.to_f #Yep, the first element really has number 1
+    subject_score = hit.xpath("//Hsp[1]/Hsp_bit-score").inner_text.to_f #Set the subject score to be the bit-score of the first HSP
+    subject_gi = blast_parser.get_sgi_info(hit_id)
+    species_name = nil
+    begin
+      species_name = blast_parser.get_species_name(hit_def)
+    rescue RuntimeError
+      puts "Can not find " + hit_def.to_s[0..20] + "..."
+      begin
+        species_name = db.name_from_gi(subject_gi)
+      rescue RuntimeError
+        puts "ERROR: Could not find gi " + subject_gi.to_s
+      end
+    end
+    begin
+      kingdom = db.match_filter(species_name, filter_hash)
+    rescue RuntimeError
+      kingdom = "NOT FOUND"
+    end
+    if kingdom.nil?
+      kingdom = "NONE"
+    end
+    output << [blast_parser.get_query_seq(current_query), hit_accession, subject_gi, hsp_evalue, species_name, blast_parser.get_subject_annotation(hit_def), subject_score, kingdom]
+  end
+end
+output.close
+puts "Parsing finished!"

data/bin/phyta-extract ADDED

@@ -0,0 +1,117 @@
+#!/usr/bin/env ruby
+SCRIPT_NAME = "phyta-extract"
+#Collects the value stored under +header+ in every row of +table+ into a Set.
+#Raises a RuntimeError when a row has no value for +header+ or when the same
+#value occurs in more than one row.
+def table_to_set(table, header)
+  seen = Set.new
+  table.each do |row|
+    value = row[header]
+    raise "Error: no entry found for header " + header.to_s + " at " + row.inspect if value.nil?
+    #Set#add? returns nil when the value is already a member of the set
+    raise "Error: duplicate entry for " + value.to_s if seen.add?(value).nil?
+  end
+  seen
+end
+#parse command line arguments
+settings = {}
+unless ARGV.size == 5
+  puts "Usage: kingdom-extraction sequences.fasta clean.csv contaminated.csv clean_output.fasta contaminated_output.fasta"
+  exit
+end
+$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
+$LOAD_PATH.unshift(File.dirname(__FILE__))
+require 'rubygems'
+require 'csv'
+require 'set'
+require 'bio'
+rootpath = File.dirname(File.dirname(__FILE__))
+PHYTA_VERSION = File.new(File.join(rootpath,'VERSION')).read.chomp
+puts "Running #{SCRIPT_NAME} #{PHYTA_VERSION}"
+settings[:input_fasta] = ARGV.shift
+settings[:input_clean] = ARGV.shift
+settings[:input_contaminated] = ARGV.shift
+settings[:output_clean] = ARGV.shift
+settings[:output_contaminated] = ARGV.shift
+unless File.exists?(settings[:input_fasta])
+  puts "The input file at " + File.expand_path(settings[:input_fasta]) + " could not be opened!"
+  exit
+end
+unless File.exists?(settings[:input_clean])
+  puts "The input file at " + File.expand_path(settings[:input_clean]) + " could not be opened!"
+  exit
+end
+unless File.exists?(settings[:input_contaminated])
+  puts "The input file at " + File.expand_path(settings[:input_contaminated]) + " could not be opened!"
+  exit
+end
+if File.exists?(settings[:output_clean])
+  puts "The input file at " + File.expand_path(settings[:output_clean]) + " already exists!"
+  exit
+end
+if File.exists?(settings[:output_contaminated])
+  puts "The input file at " + File.expand_path(settings[:output_contaminated]) + " already exists!"
+  exit
+end
+#CSV backwards compatibility
+if CSV.const_defined? :Reader
+  require 'fastercsv'
+  INSTALLED_CSV = FasterCSV
+else
+  INSTALLED_CSV = CSV
+end
+#Open output of Kingdom-Splitter, save clean and contaminated sequence ids in two sets
+puts "Reading clean..."
+clean_table = INSTALLED_CSV.open(settings[:input_clean], "r", { :col_sep => ";", :headers => :first_row, :header_converters => :symbol})
+clean = table_to_set(clean_table, :query_sequence_id)
+clean_table.close
+puts "Reading contaminated..."
+contaminated_table = INSTALLED_CSV.open(settings[:input_contaminated], "r", { :col_sep => ";", :headers => :first_row, :header_converters => :symbol})
+contaminated = table_to_set(contaminated_table, :query_sequence_id)
+contaminated_table.close
+#Initialize output files
+clean_out = File.open(settings[:output_clean], "w")
+contaminated_out = File.open(settings[:output_contaminated], "w")
+puts "Extracting FASTA sequences..."
+QUERY_SEQ_REGEXP = /\A(\S+)\s.*\z/ #Make sure this is exactly the same as in BlastStringParser in Kingdom-Assignment
+sequences = Bio::FastaFormat.open(settings[:input_fasta])
+sequences.each do |entry|
+  current = QUERY_SEQ_REGEXP.match(entry.definition)[1] #TODO do something when this comparison fails
+  if clean.include?(current)
+    #Sequence belongs in the clean set
+    clean_out.write(entry)
+  elsif contaminated.include?(current)
+    #Sequence belongs in the contaminated set
+    contaminated_out.write(entry)
+  else
+    #Sequence is not annotated
+  end
+end
+sequences.close
+clean_out.close
+contaminated_out.close
+puts "Done!"

data/bin/phyta-setup-taxonomy-db ADDED

@@ -0,0 +1,131 @@
+#!/usr/bin/env ruby
+require 'rubygems'
+require 'net/ftp'
+require 'tmpdir'
+require 'trollop'
+require 'sequel'
+#parse command line arguments
+opts = Trollop::options do
+  opt :database_server, "Optional: The address of the MySQL database server", :type => String, :default => "localhost"
+  opt :database_user, "Optional: The name of the database user", :type => String, :default => "root", :short => "-u"
+  opt :database_password, "Optional: The password of the database user", :type => String, :default => "no password", :short => "-p"
+  opt :database_name, "Optional: The name of the NCBI taxonomy database", :type => String, :default => "kingdom_assignment_taxonomy", :short => "-n"
+end
+unless opts[:database_password_given]
+  opts[:database_password] = nil
+end
+#Connect to the target database
+connect_string = 'mysql://'+ opts[:database_server] + '/' + opts[:database_name] + '?user=' + opts[:database_user]
+if !opts[:database_password].nil?
+  connect_string = connect_string + '&password=' + opts[:database_password]
+end
+if RUBY_PLATFORM =~ /java/
+  #This is JRuby, using jdbc
+  require 'jdbc/mysql'
+  connect_string = 'jdbc:' + connect_string
+else
+  require 'mysql'
+end
+PROTEIN_TABLE_NAME = 'proteinGiToTaxonId'
+NAMES_TABLE_NAME = 'names'
+NODES_TABLE_NAME = 'nodes'
+database = Sequel.connect(connect_string)
+#Test the database connection
+#Better fail now than after downloading all that stuff from the NCBI webservers
+begin
+  database.run "SHOW TABLES"
+rescue Sequel::DatabaseConnectionError => e
+  abort "Could not connect to database: #{e.message}"
+end
+#Connect to the NCBI taxonomy db
+ftp = Net::FTP.new('ftp.ncbi.nih.gov')
+ftp.login
+files = ftp.chdir('pub/taxonomy/')
+#Do the following in a temporary directory, automatically delete it afterwards
+Dir.mktmpdir do |dir|
+  Dir.chdir(dir)
+  tax_dmp = 'taxdump.tar.gz'
+  puts "Downloading #{tax_dmp}... "
+  ftp.getbinaryfile(tax_dmp, tax_dmp)
+  taxdump_md5 = ftp.gettextfile(tax_dmp + ".md5")
+  #TODO validate checksum
+  prot_dmp = 'gi_taxid_prot.dmp.gz'
+  puts "Downloading #{prot_dmp}... "
+  ftp.getbinaryfile(prot_dmp, prot_dmp)
+  puts "Extracting files..."
+  `tar -xzf #{tax_dmp}`
+  `gunzip #{prot_dmp}`
+  # The following is taken from
+  # http://bergelson.uchicago.edu/Members/mhorton/taxonomydb.build
+  puts "Populating database tables..."
+  database.drop_table(PROTEIN_TABLE_NAME) if database.table_exists?(PROTEIN_TABLE_NAME)
+  database.run "CREATE TABLE #{PROTEIN_TABLE_NAME} (
+gi INT UNSIGNED NOT NULL,
+taxonid INT UNSIGNED NOT NULL,
+PRIMARY KEY(gi)
+) engine=innodb charset=utf8;"
+  database.drop_table(NAMES_TABLE_NAME) if database.table_exists?(NAMES_TABLE_NAME)
+  database.run "CREATE TABLE #{NAMES_TABLE_NAME} (
+taxonid MEDIUMINT(11) UNSIGNED NOT NULL,
+name VARCHAR(200) NOT NULL,
+uniquename VARCHAR(100) DEFAULT NULL,
+class VARCHAR(50) NOT NULL DEFAULT '',
+  KEY taxonid (taxonid), KEY type (class), KEY name (name)
+                                   ) ENGINE=INNODB CHARSET=UTF8;"
+database.drop_table(NODES_TABLE_NAME) if database.table_exists?(NODES_TABLE_NAME)
+database.run "CREATE TABLE #{NODES_TABLE_NAME} (
+taxonid mediumint(8) unsigned NOT NULL,
+parenttaxonid mediumint(8) unsigned NOT NULL,
+rank varchar(50) default NULL,
+embl_code varchar(20) default NULL,
+division_id smallint(6) NOT NULL,
+inherited_div_flag tinyint(1) unsigned NOT NULL,
+genetic_code_id smallint(6) NOT NULL,
+inherited_gc_flag tinyint(1) unsigned NOT NULL,
+mitochondrial_genetic_codeid smallint(6) NOT NULL,
+inherited_mgc_flag tinyint(1) unsigned NOT NULL,
+genbank_hidden_flag tinyint(1) unsigned NOT NULL,
+hidden_subtree_root_flag tinyint(1) unsigned NOT NULL,
+comments varchar(255) default NULL,
+PRIMARY KEY  (taxonid), KEY parenttaxonid (parenttaxonid)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8"
+database.run "TRUNCATE #{NAMES_TABLE_NAME}"
+database.run "TRUNCATE #{NODES_TABLE_NAME}"
+database.run "TRUNCATE #{PROTEIN_TABLE_NAME}"
+database.run "LOAD DATA INFILE '#{dir}/gi_taxid_prot.dmp' INTO TABLE #{PROTEIN_TABLE_NAME} FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n' (gi,taxonid);"
+database.run "LOAD DATA INFILE '#{dir}/names.dmp' INTO TABLE #{NAMES_TABLE_NAME} FIELDS TERMINATED BY '\t|\t' LINES TERMINATED BY '\t|\n' (taxonid, name, uniquename, class);"
+database.run "LOAD DATA INFILE '#{dir}/nodes.dmp' INTO TABLE #{NODES_TABLE_NAME} FIELDS TERMINATED BY '\t|\t' LINES TERMINATED BY '\t|\n' (taxonid, parenttaxonid,rank,embl_code,division_id,inherited_div_flag,genetic_code_id,inherited_gc_flag, mitochondrial_genetic_codeid,inherited_mgc_flag,genBank_hidden_flag,hidden_subtree_root_flag,comments);"
+end
+puts "done!"
+ftp.close

data/bin/phyta-split ADDED

@@ -0,0 +1,158 @@
+#!/usr/bin/env ruby
+require 'rubygems'
+require 'csv' #Will use FasterCSV on Ruby 1.8
+SCRIPT_NAME = "phyta-split"
+# $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
+$LOAD_PATH.unshift(File.dirname(__FILE__))
+#CSV backwards compatibility
+if CSV.const_defined? :Reader
+  require 'fastercsv'
+  INSTALLED_CSV = FasterCSV
+else
+  INSTALLED_CSV = CSV
+end
+rootpath = File.dirname(File.dirname(__FILE__))
+PHYTA_VERSION = File.new(File.join(rootpath,'VERSION')).read.chomp
+puts "Running #{SCRIPT_NAME} #{PHYTA_VERSION}"
+unless ARGV.size == 1
+  puts "Usage: #{SCRIPT_NAME} input.csv"
+  puts "This will automatically create input_clean.csv and input_contaminated.csv in the same directory."
+  exit
+end
+#Command line arguments
+settings = {}
+settings[:input_file] =  ARGV.shift
+#Set up output file
+fullpath = File.expand_path(settings[:input_file])
+suffix = File.extname(fullpath)
+dirname = File.dirname(fullpath)
+name = File.basename(fullpath, suffix)
+settings[:contaminated_file] = dirname + "/" + name + "_contaminated.csv"
+settings[:clean_file] = dirname + "/" + name + "_clean.csv"
+csv_header = ["query sequence id", "hit accession number", "sgi", "evalue", "species", "subject annotation", "subject score", "kingdom"]
+#Open input file
+if !File.file?(settings[:input_file])
+  puts "No input file at " + File.expand_path(settings[:input_file]) + "!"
+  exit
+end
+input = INSTALLED_CSV.open(settings[:input_file], "r", {
+                   :col_sep => ";",
+                   :headers => :first_row,
+                   :header_converters => :symbol})
+clean_seqs = {}
+contaminated_seqs = {}
+contaminated_filter = [
+                       "Bacteria",
+                       "Archaea",
+                       "Viruses",
+                       "NONE"
+                       #TODO is this all?
+                      ]
+warning = false;
+input.each do |current_row|
+  seqid   = current_row[:query_sequence_id]
+  kingdom = current_row[:kingdom]
+  if !warning
+    if current_row[:evalue].include? ','
+      puts "***************** Warning *****************"
+      puts "This program will produce incorrect output"
+      puts "if a comma is used as a decimal divider!"
+      puts "*******************************************"
+      warning = true
+    end
+  end
+  seq_is_in_clean            = clean_seqs.has_key?(seqid)
+  seq_is_in_contaminated     = contaminated_seqs.has_key?(seqid)
+  kingdom_is_in_contaminated = contaminated_filter.include?(kingdom)
+  if seq_is_in_clean && seq_is_in_contaminated
+    puts "Something went wrong"
+    exit
+  elsif !seq_is_in_clean && !seq_is_in_contaminated
+    #Seq is not yet in any of the lists
+    if kingdom_is_in_contaminated
+      contaminated_seqs[seqid] = current_row
+    else
+      clean_seqs[seqid] = current_row
+    end
+  elsif seq_is_in_clean
+    #Seqs go into clean when they have one hit that's not in the contaminated filter
+    #Make sure the seq in the hash has the lower evalue
+    if clean_seqs[seqid][:evalue].to_f > current_row[:evalue].to_f
+      clean_seqs[seqid] = current_row
+    end
+  elsif seq_is_in_contaminated
+    #Seqs go into clean when they have one hit that's not in the contaminated filter
+    if kingdom_is_in_contaminated
+      #Make sure the seq in the hash has the lower evalue
+      if contaminated_seqs[seqid][:evalue].to_f > current_row[:evalue].to_f
+        contaminated_seqs[seqid] = current_row
+      end
+    else
+      #One hit is not contaminated, move to clean seqs
+      if contaminated_seqs[seqid][:evalue].to_f >= current_row[:evalue].to_f
+        clean_seqs[seqid] = current_row
+      else
+        clean_seqs[seqid] = contaminated_seqs[seqid]
+      end
+      #Remove row from the list of contaminated seqs
+      contaminated_seqs.delete(seqid)
+    end
+  else
+    #This should never happen
+    puts "Something went wrong..."
+    exit
+  end
+end
+#make sure that the set of contaminated and clean seqs does not overlap
+unless (clean_seqs.keys & contaminated_seqs.keys).empty?
+  puts "Something went wrong!"
+  exit
+end
+#Output
+contaminated = INSTALLED_CSV.open(settings[:contaminated_file], "w", {
+                          :col_sep => ";",
+                          :headers => csv_header,
+                          :write_headers => true})
+clean = INSTALLED_CSV.open(settings[:clean_file], "w", {
+                   :col_sep => ";",
+                   :headers => csv_header,
+                   :write_headers => true})
+clean_seqs.each_value {|row| clean << row }
+contaminated_seqs.each_value {|row| contaminated << row }
+input.close
+clean.close
+contaminated.close

data/lib/blast_string_parser.rb ADDED

@@ -0,0 +1,51 @@
+# To change this template, choose Tools | Templates
+# and open the template in the editor.
+class BlastStringParser
+  def initialize
+  end
+  #Set up Regexps
+  #SPECIES_REGEXP2 = /^.*\[(\w* \w*).*\].*$/ #captures the first two words in square brackets
+  SPECIES_REGEXP2 = /^.*\[(.*)\].*$/ #captures everything in square brackets
+  SGI_REGEXP = /^gi\|(\d+)\|.*$/
+  #QUERY_SEQ_REGEXP = /^([a-zA-Z0-9]+)[_|\s].*$/ #This captures everything up to the 1st underscore
+  QUERY_SEQ_REGEXP = /^(\S+)\s.*$/ #This captures everything until the first whitespace (more robust)
+  #do not expect whitespace after the last | for robustness, strip later
+  SUBJ_ANNOTATION_REGEXP = /(?:.*\|)*(.*)\[.*/ #TODO check if this REGEXP captures the right stuff
+  def get_sgi_info(a_hit_id)
+    unless SGI_REGEXP.match(a_hit_id)
+      raise("Wrong hit id " + a_hit_id)
+    else
+      return SGI_REGEXP.match(a_hit_id)[1]
+    end
+  end
+  def get_species_name(a_hit_def)
+    unless SPECIES_REGEXP2.match(a_hit_def)
+      raise "No species info found!"
+    else
+      return SPECIES_REGEXP2.match(a_hit_def)[1]
+    end
+  end
+  def get_subject_annotation(a_hit_def)
+    unless SUBJ_ANNOTATION_REGEXP.match(a_hit_def)
+      puts "Can not parse subject annotation " + a_hit_def[0..20] + "...\n"
+      return a_hit_def
+    else
+      return SUBJ_ANNOTATION_REGEXP.match(a_hit_def)[1].strip
+    end
+  end
+  def get_query_seq(a_query)
+    unless QUERY_SEQ_REGEXP.match(a_query)
+      return a_query
+    else
+      return QUERY_SEQ_REGEXP.match(a_query)[1]
+    end
+  end
+end

data/lib/kingdom_db.rb ADDED

@@ -0,0 +1,140 @@
+require 'sequel'
+class KingdomDB
+  ROOT_ID = "1"
+  SCIENTIFIC_NAME = "scientific name"
+  def initialize(server, user, password, database)
+    connect_string = 'mysql://'+ server + '/' + database + '?user=' + user
+    if !password.nil?
+      connect_string = connect_string + '&password=' + password
+    end
+    if !defined?(RUBY_ENGINE)
+      #This is most likey 1.8.7
+    else
+      if RUBY_ENGINE == 'jruby'
+        #This is JRuby, using jdbc
+        connect_string = 'jdbc:' + connect_string
+      end
+    end
+    @database = Sequel.connect(connect_string)
+    @filter_hit_cache = {}
+  end
+  def id_from_name(taxon_name)
+    db_results = @database[:names].select(:taxonid, :class).filter(:name => taxon_name).all
+    if db_results.size > 1
+      #If we get more than one result, check if we got a scientific name
+      db_results.delete_if{|x| x[:class] != SCIENTIFIC_NAME}
+    end
+    if db_results.size > 1
+      raise("Results not unique: " + db_results.inspect)
+    end
+    if db_results.size == 0
+      raise("No results for taxon name " + taxon_name.to_s)
+    end
+    return db_results[0][:taxonid].to_s
+  end
+  def name_from_id(taxon_id)
+    db_results = @database[:names].filter(:taxonid => taxon_id.to_s, :class => SCIENTIFIC_NAME).map(:name)
+    if db_results.size == 0
+      raise("No results for taxon id " + taxon_id.to_s)
+    elsif db_results.size > 1
+      raise("Results not unique: " + db_results.inspect)
+    else
+      return db_results[0]
+    end
+  end
+  def parent_id_from_id(taxon_id)
+    db_results = @database[:nodes].filter(:taxonid => taxon_id.to_s).map(:parenttaxonid)
+    if db_results.size == 0
+      raise("No results for taxon id " + taxon_id.to_s)
+    elsif db_results.size > 1
+      raise("Results not unique: " + db_results.inspect)
+    else
+      return db_results[0].to_s
+    end
+  end
+  def node_rank_from_id(taxon_id)
+    db_results = @database[:nodes].filter(:taxonid => taxon_id.to_s).map(:rank)
+    if db_results.size == 0
+      raise("No results for taxon id " + taxon_id.to_s)
+    elsif db_results.size > 1
+      raise("Results not unique: " + db_results.inspect)
+    else
+      return db_results[0].to_s
+    end
+  end
+  def id_from_gi(gi_number)
+    db_results = @database[:proteingiToTaxonId].filter(:gi => gi_number).map(:taxonid)
+    if db_results.size == 0
+      raise("No results for gi " + gi_number.to_s)
+    elsif db_results.size > 1
+      raise("Results not unique: " + db_results.inspect)
+    else
+      return db_results[0].to_s
+    end
+  end
+  def name_from_gi(gi_number)
+    taxonid = id_from_gi(gi_number)
+    name_from_id(taxonid)
+  end
+  def get_filter(name_array)
+    filter_hash = Hash[name_array.collect { |taxon_name|
+                         [taxon_name, id_from_name(taxon_name)]
+                       }]
+    return filter_hash
+  end
+  def match_filter(taxon_name, filter_hash)
+    current_species_id = id_from_name(taxon_name)
+    history = []
+    while ((current_species_id.to_i > ROOT_ID.to_i)&&(!filter_hash.has_value?(current_species_id)))
+      if @filter_hit_cache.has_key? current_species_id
+        #Cache hit
+        current_species_id = @filter_hit_cache[current_species_id]
+        break
+      else
+        parent_id = parent_id_from_id(current_species_id)
+        history << current_species_id
+        current_species_id = parent_id
+      end
+    end
+    history.each { |i|
+      @filter_hit_cache[i] = current_species_id
+    }
+    if current_species_id == ROOT_ID
+      return nil
+    else
+      return name_from_id(current_species_id)
+    end
+  end
+end

data/test/helper.rb ADDED

@@ -0,0 +1,17 @@
+require 'rubygems'
+require 'bundler'
+begin
+  Bundler.setup(:default, :development)
+rescue Bundler::BundlerError => e
+  $stderr.puts e.message
+  $stderr.puts "Run `bundle install` to install missing gems"
+  exit e.status_code
+end
+require 'test/unit'
+require 'shoulda'
+$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
+$LOAD_PATH.unshift(File.dirname(__FILE__))
+class Test::Unit::TestCase
+end

data/test/test_blackbox.rb ADDED

@@ -0,0 +1,41 @@
+require 'helper'
+require 'tmpdir'
+class BlackBoxTest < Test::Unit::TestCase
+  def test_without_parameters
+    #This test does not make a whole lot of sense...
+    result = %x[bin/phyta-assign]
+    expected = "Invalid arguments, see --help for more information."
+    assert_equal expected.strip, result.strip
+  end
+  def test_small
+    Dir.mktmpdir do |dir|
+      %x[bin/phyta-assign -i test/data/in_3.xml -o #{dir}/out_3.csv]
+      result = File.open("#{dir}/out_3.csv").read
+      target = File.open("test/data/target_3.csv").read
+      assert_not_nil result
+      assert_not_nil target
+      assert_equal target, result, "Output of out_3.xml invalid"
+    end
+  end
+  def test_medium
+    Dir.mktmpdir do |dir|
+      %x[bin/phyta-assign -i test/data/in_medium.xml -o #{dir}/out_medium.csv]
+      result = File.open("#{dir}/out_medium.csv").read
+      target = File.open("test/data/target_medium.csv").read
+      assert_not_nil result
+      assert_not_nil target
+      assert_block "Output of out_medium.xml invalid." do
+        result == target
+      end
+    end
+  end
+end

data/test/test_blast_string_parser.rb ADDED

@@ -0,0 +1,23 @@
+require 'helper'
+require 'blast_string_parser'
+class BlastStringParserTest < Test::Unit::TestCase
+  def test_get_species_info
+    bsp = BlastStringParser.new()
+    assert_equal "Xenopus (Silurana) tropicalis", bsp.get_species_name("PREDICTED: uncharacterized protein K02A2.6-like [Xenopus (Silurana) tropicalis]")
+    assert_equal "Corticium_candelabrum", bsp.get_species_name("CC1c114_molpal [Corticium_candelabrum]")
+  end
+  def test_get_query_seq
+    bsp = BlastStringParser.new()
+    assert_equal "Aqu1.200003", bsp.get_query_seq("Aqu1.200003")
+    assert_equal "AW3C1", bsp.get_query_seq("AW3C1 [Astrosclera_willeyana]")
+    assert_equal "AW3C1_molpal", bsp.get_query_seq("AW3C1_molpal")
+    assert_equal "CC1c1_molpal", bsp.get_query_seq("CC1c1_molpal [Corticium_candelabrum]")
+    assert_equal "CC1c1_molpal", bsp.get_query_seq("CC1c1_molpal  [Corticium_candelabrum]")
+    assert_equal "CC1c1_molpal", bsp.get_query_seq("CC1c1_molpal \n[Corticium_candelabrum]")
+    assert_equal "CC1c1_molpal", bsp.get_query_seq("CC1c1_molpal [Corticium_candelabrum], this is a nice_sequence I found rummaging through my fridge [an older model from AEG]")
+    assert_equal "CC1c1", bsp.get_query_seq("CC1c1 (tastes really good with curry)")
+    assert_equal "CC1c1_molpal", bsp.get_query_seq("CC1c1_molpal [Corticium_candelabrum] (oh, hai!)")
+  end
+end

data/test/test_kingdom_db.rb ADDED

@@ -0,0 +1,118 @@
+require 'helper'
+require 'kingdom_db'
+class KingdomDbTest < Test::Unit::TestCase
+  def setup
+    @db = KingdomDB.new('localhost', 'root', '', 'kingdom_assignment_taxonomy')
+  end
+  def test_id_from_taxon_name
+    assert_not_nil(@db.id_from_name("Drosophila melanogaster"))
+    assert_raise RuntimeError do
+      @db.id_from_name("Sarah palin")
+    end
+    assert_raise RuntimeError do
+      @db.id_from_name("")
+    end
+    assert_raise RuntimeError do
+      @db.id_from_name("Shewanella sp")
+    end
+  end
+  def test_name_from_id
+    homo =  @db.id_from_name("Homo sapiens")
+    assert_equal "Homo sapiens", @db.name_from_id(homo)
+    assert_equal "Homo sapiens", @db.name_from_id(homo.to_s)
+    assert_equal "Homo sapiens", @db.name_from_id(homo.to_i)
+    assert_raise RuntimeError do
+      @db.name_from_id(0)
+    end
+    assert_raise RuntimeError do
+      @db.name_from_id(-1)
+    end
+    assert_raise RuntimeError do
+      @db.name_from_id(0)
+    end
+  end
+  def test_parent_id_from_id
+    assert_equal "7872", @db.parent_id_from_id("7873")
+    assert_equal "7872", @db.parent_id_from_id(7873)
+  end
+  def test_node_rank_from_id
+    assert_equal "species", @db.node_rank_from_id("7873")
+    assert_equal "species", @db.node_rank_from_id(7873)
+    assert_equal "species", @db.node_rank_from_id(@db.id_from_name("Drosophila melanogaster"))
+    assert_equal "genus", @db.node_rank_from_id("7872")
+    assert_equal "no rank", @db.node_rank_from_id(1)
+    assert_equal "no rank", @db.node_rank_from_id(@db.id_from_name("Woodchuck hepatitis virus 1"))
+  end
+  def test_name_from_gi
+    assert_equal "Oryctolagus cuniculus", @db.name_from_gi(1712)
+    assert_equal "Tribolium castaneum", @db.name_from_gi("270016927")
+    assert_equal "Clypeaster japonicus", @db.name_from_gi(124106306)
+    assert_equal "Anthocidaris crassispina", @db.name_from_gi(124106325)
+    assert_equal "Lateolabrax japonicus", @db.name_from_gi(158518390)
+    assert_raise RuntimeError do
+      @db.name_from_gi(205688854)
+    end
+  end
+  def test_match_filter
+    filter_array = [
+                    "Bacteria",
+                    "Archaea",
+                    "Metazoa",
+                    "Viruses"
+                   ]
+    filter_hash = @db.get_filter(filter_array)
+    filter_hash.each { |name, id|
+      assert_equal @db.id_from_name(name), id
+    }
+    assert_equal ["Bacteria", "Archaea", "Metazoa", "Viruses"], filter_array
+    assert_equal "Metazoa", @db.match_filter("Homo sapiens", filter_hash)
+    assert_equal "Bacteria", @db.match_filter("Escherichia coli", filter_hash)
+    assert_raise RuntimeError do
+      assert_equal nil, @db.match_filter("Hello world", filter_hash)
+    end
+    assert_equal "Bacteria", @db.match_filter("Bacteria", filter_hash)
+    assert_equal nil, @db.match_filter("root", filter_hash)
+    assert_equal nil, @db.match_filter("Zea mays", filter_hash)
+    assert_equal nil, @db.match_filter("cellular organisms", filter_hash)
+    assert_equal "Bacteria", @db.match_filter("Shewanella sp.", filter_hash)
+    assert_raise RuntimeError do
+      assert_equal nil, @db.match_filter("Homo s", filter_hash)
+    end
+    assert_raise RuntimeError do
+      assert_equal nil, @db.match_filter("sp", filter_hash)
+    end
+    assert_equal "Metazoa", @db.match_filter("Homo", filter_hash)
+    assert_equal "Viruses", @db.match_filter("Cyanophage Syn26", filter_hash)
+    assert_equal "Viruses", @db.match_filter("uncultured phage", filter_hash)
+    assert_equal "Bacteria", @db.match_filter("uncultured bacterium", filter_hash)
+    assert_equal nil, @db.match_filter("uncultured organism", filter_hash)
+    assert_equal "Metazoa", @db.match_filter("Xenopus (Silurana) tropicalis", filter_hash)
+    assert_equal "Viruses", @db.match_filter("Pseudomonas phage EL", filter_hash)
+    assert_equal "Viruses", @db.match_filter("Pseudomonas phage EL", filter_hash)
+    assert_equal "Metazoa", @db.match_filter("Canis lupus familiaris", filter_hash)
+    assert_equal "Metazoa", @db.match_filter("Canis familiaris", filter_hash)
+    assert_raise RuntimeError do
+      @db.match_filter(nil, filter_hash)
+    end
+  end
+end

metadata ADDED

@@ -0,0 +1,239 @@
+--- !ruby/object:Gem::Specification
+name: bio-phyta
+version: !ruby/object:Gem::Version
+  hash: 59
+  prerelease:
+  segments:
+  - 0
+  - 9
+  - 0
+  version: 0.9.0
+platform: ruby
+authors:
+- Philipp Comans
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2011-10-20 00:00:00 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  requirement: &id001 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 1
+        - 4
+        - 2
+        version: 1.4.2
+  version_requirements: *id001
+  name: bio
+  prerelease: false
+  type: :runtime
+- !ruby/object:Gem::Dependency
+  requirement: &id002 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 45
+        segments:
+        - 2
+        - 8
+        - 1
+        version: 2.8.1
+  version_requirements: *id002
+  name: mysql
+  prerelease: false
+  type: :runtime
+- !ruby/object:Gem::Dependency
+  requirement: &id003 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 119
+        segments:
+        - 3
+        - 28
+        - 0
+        version: 3.28.0
+  version_requirements: *id003
+  name: sequel
+  prerelease: false
+  type: :runtime
+- !ruby/object:Gem::Dependency
+  requirement: &id004 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 11
+        segments:
+        - 1
+        - 5
+        - 4
+        version: 1.5.4
+  version_requirements: *id004
+  name: fastercsv
+  prerelease: false
+  type: :runtime
+- !ruby/object:Gem::Dependency
+  requirement: &id005 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 1
+        - 5
+        - 0
+        version: 1.5.0
+  version_requirements: *id005
+  name: nokogiri
+  prerelease: false
+  type: :runtime
+- !ruby/object:Gem::Dependency
+  requirement: &id006 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 83
+        segments:
+        - 1
+        - 16
+        - 2
+        version: 1.16.2
+  version_requirements: *id006
+  name: trollop
+  prerelease: false
+  type: :runtime
+- !ruby/object:Gem::Dependency
+  requirement: &id007 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 0
+        version: "0"
+  version_requirements: *id007
+  name: shoulda
+  prerelease: false
+  type: :development
+- !ruby/object:Gem::Dependency
+  requirement: &id008 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        hash: 23
+        segments:
+        - 1
+        - 0
+        - 0
+        version: 1.0.0
+  version_requirements: *id008
+  name: bundler
+  prerelease: false
+  type: :development
+- !ruby/object:Gem::Dependency
+  requirement: &id009 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        hash: 7
+        segments:
+        - 1
+        - 6
+        - 4
+        version: 1.6.4
+  version_requirements: *id009
+  name: jeweler
+  prerelease: false
+  type: :development
+- !ruby/object:Gem::Dependency
+  requirement: &id010 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 0
+        version: "0"
+  version_requirements: *id010
+  name: rcov
+  prerelease: false
+  type: :development
+description: Coming soon
+email: philipp.comans@googlemail.com
+executables:
+- phyta-split
+- phyta-assign
+- phyta-extract
+- phyta-setup-taxonomy-db
+extensions: []
+extra_rdoc_files:
+- LICENSE.txt
+- README.rdoc
+files:
+- .document
+- Gemfile
+- LICENSE.txt
+- README.rdoc
+- Rakefile
+- VERSION
+- bin/phyta-assign
+- bin/phyta-extract
+- bin/phyta-setup-taxonomy-db
+- bin/phyta-split
+- lib/blast_string_parser.rb
+- lib/kingdom_db.rb
+- test/helper.rb
+- test/test_blackbox.rb
+- test/test_blast_string_parser.rb
+- test/test_kingdom_db.rb
+homepage: http://github.com/pcomans/bioruby-phyta
+licenses:
+- LGPL
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.10
+signing_key:
+specification_version: 3
+summary: Pipeline to remove contaminations from EST libraries
+test_files: []