RubyGems - gene_assembler - Versions diffs - 0.0.9 - Mend

gene_assembler 0.0.9

Files changed (38) hide show

data/.gitignore +22 -0
data/Gemfile +4 -0
data/LICENSE.txt +22 -0
data/README.md +29 -0
data/Rakefile +2 -0
data/bin/GeneAssembler +233 -0
data/bin/phytozome_scan +60 -0
data/gene_assembler.gemspec +25 -0
data/lib/gene_assembler.rb +5 -0
data/lib/gene_assembler/blast_type_parser.rb +41 -0
data/lib/gene_assembler/contig.rb +643 -0
data/lib/gene_assembler/dataset.rb +532 -0
data/lib/gene_assembler/exonerate_result.rb +230 -0
data/lib/gene_assembler/gff_contig.rb +67 -0
data/lib/gene_assembler/gff_dataset.rb +152 -0
data/lib/gene_assembler/gff_feature.rb +175 -0
data/lib/gene_assembler/gff_frameshift.rb +6 -0
data/lib/gene_assembler/gff_go.rb +13 -0
data/lib/gene_assembler/gff_hit.rb +53 -0
data/lib/gene_assembler/gff_hsp.rb +6 -0
data/lib/gene_assembler/gff_localization.rb +6 -0
data/lib/gene_assembler/gff_master_feature.rb +5 -0
data/lib/gene_assembler/gff_parser.rb +35 -0
data/lib/gene_assembler/gff_snp.rb +21 -0
data/lib/gene_assembler/gff_stop.rb +6 -0
data/lib/gene_assembler/go.rb +13 -0
data/lib/gene_assembler/hit.rb +191 -0
data/lib/gene_assembler/hsp.rb +100 -0
data/lib/gene_assembler/other_functions.rb +228 -0
data/lib/gene_assembler/parser.rb +25 -0
data/lib/gene_assembler/parser_blast.rb +12 -0
data/lib/gene_assembler/parser_exonerate.rb +16 -0
data/lib/gene_assembler/rebuild.rb +975 -0
data/lib/gene_assembler/report.rb +13 -0
data/lib/gene_assembler/report_gff.rb +30 -0
data/lib/gene_assembler/snp.rb +13 -0
data/lib/gene_assembler/version.rb +3 -0
metadata +149 -0

data/.gitignore ADDED

@@ -0,0 +1,22 @@
+*.gem
+*.rbc
+.bundle
+.config
+.yardoc
+Gemfile.lock
+InstalledFiles
+_yardoc
+coverage
+doc/
+lib/bundler/man
+pkg
+rdoc
+spec/reports
+test/tmp
+test/version_tmp
+tmp
+*.bundle
+*.so
+*.o
+*.a
+mkmf.log

data/Gemfile ADDED

@@ -0,0 +1,4 @@
+source 'https://rubygems.org'
+# Specify your gem's dependencies in gene_assembler.gemspec
+gemspec

data/LICENSE.txt ADDED

@@ -0,0 +1,22 @@
+Copyright (c) 2014 Pedro Seoane
+MIT License
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.md ADDED

@@ -0,0 +1,29 @@
+# GeneAssembler
+This gem builds gene models using fragmented genome information.
+## Installation
+Add this line to your application's Gemfile:
+    gem 'gene_assembler'
+And then execute:
+    $ bundle
+Or install it yourself as:
+    $ gem install gene_assembler
+## Usage
+TODO: Write usage instructions here
+## Contributing
+1. Fork it ( https://github.com/[my-github-username]/gene_assembler/fork )
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create a new Pull Request

data/Rakefile ADDED

	@@ -0,0 +1,2 @@
1	+ require "bundler/gem_tasks"
2	+

data/bin/GeneAssembler ADDED

@@ -0,0 +1,233 @@
+#!/usr/bin/env ruby
+ROOT_PATH=File.dirname(__FILE__)
+$: << File.expand_path(File.join(ROOT_PATH, "../lib/"))
+$: << File.expand_path(File.join(ROOT_PATH, "../lib/gene_assembler/"))
+require 'optparse'
+require 'scbi_fasta'
+require 'parser_blast'
+require 'parser_exonerate'
+require 'dataset'
+require 'rebuild'
+require 'other_functions'
+# INPUT PARSING
+#################################################################################################
+options = {}
+optparse = OptionParser.new do |opts|
+	options[:fasta] = 'contigsMC.fasta'
+	opts.on( '-f', '--fasta FILE', 'Fasta input file' ) do |file|
+		options[:fasta] = file
+	end
+	options[:db] = 'dual_prot.fasta'
+	opts.on( '-d', '--database FILE', 'Blast database' ) do |db|
+		options[:db] = db
+	end
+	options[:reference] = 'reference'
+	opts.on( '-r', '--reference FILE', 'Gene models reference file' ) do |ref|
+		options[:reference] = ref
+	end
+	options[:evalue] = 1.0e-3
+	opts.on( '-e', '--evalue EVALUE', 'e value threshold to consider as reliable the orthologue sequence. Default=1.0e-3' ) do |evalue|
+		options[:evalue] = evalue.to_f
+	end
+	options[:verbose] = FALSE
+	opts.on( '-v', '--verbose', 'Default=0' ) do |verbose|
+		options[:verbose] = TRUE
+		$verbose=TRUE
+	end
+	options[:overwrite] = FALSE
+	opts.on( '-o', '--overwrite', 'Default=FALSE' ) do |overwrite|
+		options[:overwrite] = TRUE
+	end
+	options[:web] = FALSE
+	opts.on( '-w', '--web', 'Default=FALSE' ) do |web|
+	options[:web] = TRUE
+	end
+	options[:index] = FALSE
+	opts.on( '-i', '--index', 'Default=FALSE' ) do |index|
+	options[:index] = TRUE
+	end
+	options[:rebuild] = TRUE
+	opts.on( '-n', '--nrebuild', 'Default=1' ) do |rebuild|
+		options[:rebuild] = FALSE
+	end
+	options[:cpu] = 1
+	opts.on( '-c', '--cpu CPU', 'Default=2' ) do |cpu|
+		options[:cpu] = cpu
+	end
+	# Set a banner, displayed at the top of the help screen.
+	opts.banner = "Usage: GeneEnsambler -f input.fasta -e evalue -c n_cpu \n\n"
+	# This displays the help screen
+	opts.on( '-h', '--help', 'Display this screen' ) do
+		puts opts
+		exit
+	end
+end # End opts
+# parse options and remove from ARGV
+optparse.parse!
+# I/O FILES
+#####################################################################################
+# comprueba si existen los ficheros de entrada
+if !File.exists?(options[:fasta])||!File.exists?(options[:db])
+  puts "File #{options[:fasta]} or #{options[:db]} doesn't exists"
+  Process.exit(-1)
+end
+path={}
+# Directories path
+path[:db]=File.join(Dir.pwd,'db') #Database folder
+path[:temp]=File.join(Dir.pwd,'temp') #Temp folder
+path[:local]=File.join(Dir.pwd,'temp/local')
+path[:ouput_files]=File.join(Dir.pwd,'output_files') #Output folder
+# Create work directories
+path.each do |key, directory|
+  if !File.exists?(directory)
+    Dir.mkdir(directory)
+  end
+end
+# Files path
+path[:fasta]=File.join(path[:ouput_files],'gene_models.fasta')
+path[:gff]=File.join(path[:ouput_files],'gene_capture.gff')
+path[:error]=File.join(path[:ouput_files],'Error.log')
+path[:html]=File.join(path[:ouput_files],'Index.html')
+path[:prime]=File.join(path[:ouput_files],'5_prime_data.txt')
+path[:fasta_prime]=File.join(path[:ouput_files],'5_prime_region.fasta')
+path[:db_prot]=options[:db] # ???
+path[:blast_db]=File.join(path[:db],File.basename(options[:db]))
+path[:blast_output]=File.join(path[:temp],File.basename(options[:db])+'.blast')
+path[:exonerate_db]=options[:db]
+path[:exonerate_output]=File.join(path[:temp],File.basename(options[:db])+'.ex')
+path[:exonerate_input_fasta]=File.join(path[:db],File.basename(options[:db])+'.fasta')
+path[:reference]= options[:reference]
+# Links path
+path[:gbrowse_link]='http://10.247.129.19/cgi-bin/gbrowse/ostra2/?name=Sequence:'
+if File.exists?(path[:gff]) #Delete gff report for creation a new one
+  FileUtils.rm(path[:gff])
+end
+# BLASTING
+#######################################################################################
+# Creating blast db
+#---------------------------------------
+if !File.exists?(path[:blast_db]+'.psq')||options[:overwrite]
+	puts 'Creating DB '
+	var="makeblastdb -in #{options[:db]} -out #{path[:blast_db]} -dbtype prot -parse_seqids" #Protein
+	system(var)
+	puts 'DB created'
+end
+# Do blast
+#---------------------------------------
+if !File.exists?(path[:blast_output]) ||options[:overwrite]
+	puts 'Start blastx'
+	cmd="blastx -query #{options[:fasta]} -db #{path[:blast_db]} -outfmt '7 qseqid sacc pident length mismatch gapopen qstart qend sstart send evalue bitscore score qframe sframe qseq sseq qlen slen stitle' -evalue #{options[:evalue]} -max_target_seqs 1 -out #{path[:blast_output]} -num_threads #{options[:cpu]}"
+	system(cmd)
+	puts "Blastx has finished"
+end
+#Parsing blast (blast to class)
+#-------------------------------------------
+puts 'Parsing blast_prot:'
+store_blast = ParserBlast.new('contig','nucleotide_match', path[:blast_output])
+store_prot_blast = store_blast.dataset
+store_prot_blast.correct_hsp_contigs('s') #Subject
+store_prot_blast.correct_hsp_contigs('q') #Query
+puts 'End parsing'
+#Save relationship contig-protein for debbuging
+#-------------------------------------------------
+if options[:index]
+  index=File.open(File.join(path[:ouput_files],'contig_index'),'w')
+  store_prot_blast.each_contig {|contig|
+    index.puts contig.name+"\t"+ contig.first_hit.name
+  }
+  index.close
+end
+# Filtering results
+#-----------------------------------------------
+puts 'FILTERING BLAST'
+store_prot_uni_hsp=Dataset.new(:prot)
+store_prot_uni_hsp.transfer_contigs(store_prot_blast.filtering)
+puts "FILTERING BLAST FINISHED\n"
+# EXONERATING
+###########################################################################################
+# Loading sequences for exonerate fasta
+#---------------------------------------
+puts 'Loading sequences for exonerate fasta'
+seqs = fasta_hash(options[:fasta])
+store_prot_blast.load_seq(seqs) #Cargar secuencia de los contigs en la clase
+store_prot_uni_hsp.load_seq(seqs)
+seqs = nil
+store_prot_blast.rev_comp #Hacer reversocomplementaria en caso de no estar la secuencia en +, de todos los contig
+store_prot_uni_hsp.rev_comp
+# Do exonerate
+#---------------------------------------
+puts 'START EXONERATE'
+if !File.exists?(path[:exonerate_output]) ||options[:overwrite]
+  store_prot_blast.fasta(path[:exonerate_input_fasta])
+  cmd="exonerate -q #{options[:db]} -t #{path[:exonerate_input_fasta]} -Q protein -T dna -m protein2genome --percent 1 --showalignment 0 --showvulgar --useaatla 1 > #{path[:exonerate_output]}"
+  system(cmd)
+end
+puts 'EXONERATE FINISHED'
+# Parsing exonerate (exonerate to class)
+#-------------------------------------------
+store_exonerate = ParserExonerate.new('contig','nucleotide_match', path[:exonerate_output])
+store_prot_exonerate = store_exonerate.dataset
+store_prot_exonerate.attrib_recover(store_prot_blast)
+store_prot_exonerate.score_correction(30) #Correccion intron penalty del exonerate (el programa tiene un bug q impide hacerlo desde el mismo)
+# Filtering results
+#-----------------------------------------------
+puts 'FILTERING EXONERATE'
+store_prot_uni_hsp.transfer_contigs(store_prot_exonerate.filtering)
+puts "Filtering exonerate finished\n"
+# CLUSTERING
+#################################################################################
+store_prot_exonerate.clustering #Clusterizado contigs que han pasado los filtros
+store_prot_exonerate.info_clusters
+store_prot_uni_hsp.clustering #Clusterizado contigs uni-hsp
+# Rescue of missed genes and contigs (store_prot_uni_hsp to store_prot_exonerate)
+#----------------------------------------------------------------------------
+store_prot_exonerate.missing_cluster_transfer(store_prot_uni_hsp)
+store_prot_exonerate.missing_contigs_transfer(store_prot_uni_hsp)
+# GENE REBUILD
+######################################################################################
+puts 'Gene rebuild start'
+store_prot_exonerate.load_references(path[:reference])
+store_prot_exonerate.sort_cont_clust #Ordenar contigs de menor a mayor en base a su primer hsp
+store_prot_uni_hsp.sort_cont_clust #Ordenar contigs uni-hsp
+store_prot_exonerate.generate_file_5_prime(path[:prime], path[:fasta_prime])
+rebuild=Rebuild.new(store_prot_exonerate,store_prot_uni_hsp,path)
+rebuild.rebuild(options)
+puts "\nGene rebuild finished"

data/bin/phytozome_scan ADDED

@@ -0,0 +1,60 @@
+#!/usr/bin/env ruby
+ROOT_PATH=File.dirname(__FILE__)
+$: << File.expand_path(File.join(ROOT_PATH, "../lib/"))
+$: << File.expand_path(File.join(ROOT_PATH, "../lib/gene_assembler/"))
+require 'gff_parser'
+#Input gff and blast result with outfmt 6
+# Load Arabidopsis gene features in hash (only CDS), file used: NCBI_Chr1.tbl downloaded of TAIR
+gff=Gff_parser.new(ARGV[0]).dataset
+# Load relationship between a pool of proteins and Arabidopsis gene's pool in hash, file used: tblastn report with outfmt 6. Name has two partsm first is gene name and second is the mRNA name. mRNA name i used like gene_name
+model_hash={}
+File.open(ARGV[1],'r').each do |line|
+	fields=line.split
+	gene_name=fields[1].split('+')
+	if !model_hash.key?(fields[0])
+		model_hash[fields[0]]=[gene_name[1].gsub('id','')]
+	else
+		model_hash[fields[0]] << gene_name[1].gsub('id','')
+	end
+end
+# Write crossover between blast and gff
+max_exones=100
+genes_exones=Array.new(max_exones,0)
+ref=File.open('references','w')
+model_hash.each do |gene_names|
+	ref.print gene_names[0]+"\t"
+	gene_names[1].each do |mRNA_name|
+		mRNA=gff.feature(mRNA_name)
+		if !mRNA.nil?
+			mRNA_structure=mRNA.cds
+			length=mRNA_structure.length-1
+			genes_exones[length]+=1
+			first_exon=mRNA_structure.first
+			origin=nil
+			if mRNA.strand =='-' # Detect reverse structures
+				origin=mRNA_structure.first.last-1
+				mRNA_structure.each do |exon|
+					exon.reverse!
+				end
+			else
+				origin=mRNA_structure.first.first-1
+			end
+			mRNA_structure.each do |exon|
+				ref.print "#{(exon[0]-=origin).abs}-#{(exon[1]-=origin).abs};"
+			end
+			ref.print '|'
+		end
+	end
+	ref.puts
+end
+ref.close
+#puts 'GENES-EXONES'
+#puts genes_exones.inspect

data/gene_assembler.gemspec ADDED

@@ -0,0 +1,25 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'gene_assembler/version'
+Gem::Specification.new do |spec|
+  spec.name          = "gene_assembler"
+  spec.version       = GeneAssembler::VERSION
+  spec.authors       = ["Pedro Seoane"]
+  spec.email         = ["seoanezonjic@hotmail.es"]
+  spec.summary       = %q{This gem builds gene models using fragmented genome information.}
+  spec.description   = %q{Use this siftware with techniques like genecapture}
+  spec.homepage      = ""
+  spec.license       = "MIT"
+  spec.files         = `git ls-files -z`.split("\x0")
+  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+  spec.require_paths = ["lib"]
+  spec.add_runtime_dependency 'scbi_blast', '>= 0.0.43'
+  spec.add_runtime_dependency 'scbi_fasta', '>= 0.1.9'
+  spec.add_development_dependency "bundler", "~> 1.6"
+  spec.add_development_dependency "rake"
+end

data/lib/gene_assembler.rb ADDED

@@ -0,0 +1,5 @@
+require "gene_assembler/version"
+module GeneAssembler
+  # Top-level namespace of the gem; nothing is defined here. The gemspec reads GeneAssembler::VERSION after requiring gene_assembler/version.
+end

data/lib/gene_assembler/blast_type_parser.rb ADDED

@@ -0,0 +1,41 @@
+require 'parser'
+class BlastTypeParser < Parser
+	def initialize(contig_type,hit_type,file,all=FALSE)
+    @file=file
+		@dataset=create_dataset
+		@all=all
+		data=parse_file(file)
+		load_dataset(data,contig_type,hit_type)
+	end
+	def load_dataset(data,contig_type,hit_type) # Introduce datos del blast en clases contig/hit/hsp
+	  data.querys.each do |item|
+	  	if item.hits.empty? #Descartamos querys q no hayan dado nigun match
+	  		next
+	  	end
+	    contig=@dataset.add_contig(item.query_def) #query_def -> nombre de la query (nuestro contig)
+	    contig.length=item.full_query_length #full_query_length -> longitud de la query
+	    contig.type=contig_type
+	    populate_extra_atributes(contig,item)
+	    last_hit_name=''
+	    hit=''
+	    item.hits.each do |ht| #Clasificacion hits del blast en hits-hsps
+	      if ht.subject_id != last_hit_name #Hit
+	      	hit=contig.add_hit(ht.subject_id, ht.full_subject_length, ht.q_frame, hit_type)
+	      end
+	      hsp=hit.add_hsp(ht.q_beg+1, ht.q_end+1, ht.s_beg+1, ht.s_end+1, ht.align_len, ht.score, ht.ident, ht.gaps) # +1 xq gema parser blast resta 1 a todo
+	      hsp.type='match_part'
+	      last_hit_name=ht.subject_id
+	    end
+	    contig.hits_sort!
+	  end
+	end
+	def populate_extra_atributes(contig,item)
+	end
+end