RubyGems - gene_assembler - Versions diffs - 0.0.9 - Mend

gene_assembler 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

data/.gitignore +22 -0
data/Gemfile +4 -0
data/LICENSE.txt +22 -0
data/README.md +29 -0
data/Rakefile +2 -0
data/bin/GeneAssembler +233 -0
data/bin/phytozome_scan +60 -0
data/gene_assembler.gemspec +25 -0
data/lib/gene_assembler.rb +5 -0
data/lib/gene_assembler/blast_type_parser.rb +41 -0
data/lib/gene_assembler/contig.rb +643 -0
data/lib/gene_assembler/dataset.rb +532 -0
data/lib/gene_assembler/exonerate_result.rb +230 -0
data/lib/gene_assembler/gff_contig.rb +67 -0
data/lib/gene_assembler/gff_dataset.rb +152 -0
data/lib/gene_assembler/gff_feature.rb +175 -0
data/lib/gene_assembler/gff_frameshift.rb +6 -0
data/lib/gene_assembler/gff_go.rb +13 -0
data/lib/gene_assembler/gff_hit.rb +53 -0
data/lib/gene_assembler/gff_hsp.rb +6 -0
data/lib/gene_assembler/gff_localization.rb +6 -0
data/lib/gene_assembler/gff_master_feature.rb +5 -0
data/lib/gene_assembler/gff_parser.rb +35 -0
data/lib/gene_assembler/gff_snp.rb +21 -0
data/lib/gene_assembler/gff_stop.rb +6 -0
data/lib/gene_assembler/go.rb +13 -0
data/lib/gene_assembler/hit.rb +191 -0
data/lib/gene_assembler/hsp.rb +100 -0
data/lib/gene_assembler/other_functions.rb +228 -0
data/lib/gene_assembler/parser.rb +25 -0
data/lib/gene_assembler/parser_blast.rb +12 -0
data/lib/gene_assembler/parser_exonerate.rb +16 -0
data/lib/gene_assembler/rebuild.rb +975 -0
data/lib/gene_assembler/report.rb +13 -0
data/lib/gene_assembler/report_gff.rb +30 -0
data/lib/gene_assembler/snp.rb +13 -0
data/lib/gene_assembler/version.rb +3 -0
metadata +149 -0

data/.gitignore ADDED

@@ -0,0 +1,22 @@
+*.gem
+*.rbc
+.bundle
+.config
+.yardoc
+Gemfile.lock
+InstalledFiles
+_yardoc
+coverage
+doc/
+lib/bundler/man
+pkg
+rdoc
+spec/reports
+test/tmp
+test/version_tmp
+tmp
+*.bundle
+*.so
+*.o
+*.a
+mkmf.log

data/Gemfile ADDED

@@ -0,0 +1,4 @@
+source 'https://rubygems.org'
+# Specify your gem's dependencies in gene_assembler.gemspec
+gemspec

data/LICENSE.txt ADDED

@@ -0,0 +1,22 @@
+Copyright (c) 2014 TODO: Write your name
+MIT License
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.md ADDED

@@ -0,0 +1,29 @@
+# GeneAssembler
+This gem builds gene models using fragmented genome information. Use it with techniques like gene capture.
+## Installation
+Add this line to your application's Gemfile:
+    gem 'gene_assembler'
+And then execute:
+    $ bundle
+Or install it yourself as:
+    $ gem install gene_assembler
+## Usage
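+Run `GeneAssembler -f contigs.fasta -d proteins.fasta -r reference -e evalue -c n_cpu`; run `GeneAssembler --help` to list every option.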
+## Contributing
+1. Fork it ( https://github.com/[my-github-username]/gene_assembler/fork )
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create a new Pull Request

data/Rakefile ADDED

@@ -0,0 +1,2 @@
+require "bundler/gem_tasks"
+

data/bin/GeneAssembler ADDED

@@ -0,0 +1,233 @@
+#!/usr/bin/env ruby
+ROOT_PATH=File.dirname(__FILE__)
+$: << File.expand_path(File.join(ROOT_PATH, "../lib/"))
+$: << File.expand_path(File.join(ROOT_PATH, "../lib/gene_assembler/"))
+require 'optparse'
+require 'scbi_fasta'
+require 'parser_blast'
+require 'parser_exonerate'
+require 'dataset'
+require 'rebuild'
+require 'other_functions'
+# INPUT PARSING
+#################################################################################################
+options = {}
+optparse = OptionParser.new do |opts|
+	options[:fasta] = 'contigsMC.fasta'
+	opts.on( '-f', '--fasta FILE', 'Fasta input file' ) do |file|
+		options[:fasta] = file
+	end
+	options[:db] = 'dual_prot.fasta'
+	opts.on( '-d', '--database FILE', 'Blast database' ) do |db|
+		options[:db] = db
+	end
+	options[:reference] = 'reference'
+	opts.on( '-r', '--reference FILE', 'Gene models reference file' ) do |ref|
+		options[:reference] = ref
+	end
+	options[:evalue] = 1.0e-3
+	opts.on( '-e', '--evalue EVALUE', 'e value threshold to consider as reliable the orthologue sequence. Default=1.0e-3' ) do |evalue|
+		options[:evalue] = evalue.to_f
+	end
+	options[:verbose] = FALSE
+	opts.on( '-v', '--verbose', 'Default=0' ) do |verbose|
+		options[:verbose] = TRUE
+		$verbose=TRUE
+	end
+	options[:overwrite] = FALSE
+	opts.on( '-o', '--overwrite', 'Default=FALSE' ) do |overwrite|
+		options[:overwrite] = TRUE
+	end
+	options[:web] = FALSE
+	opts.on( '-w', '--web', 'Default=FALSE' ) do |web|
+	options[:web] = TRUE
+	end
+	options[:index] = FALSE
+	opts.on( '-i', '--index', 'Default=FALSE' ) do |index|
+	options[:index] = TRUE
+	end
+	options[:rebuild] = TRUE
+	opts.on( '-n', '--nrebuild', 'Default=1' ) do |rebuild|
+		options[:rebuild] = FALSE
+	end
+	options[:cpu] = 1
+	opts.on( '-c', '--cpu CPU', 'Default=2' ) do |cpu|
+		options[:cpu] = cpu
+	end
+	# Set a banner, displayed at the top of the help screen.
+	opts.banner = "Usage: GeneEnsambler -f input.fasta -e evalue -c n_cpu \n\n"
+	# This displays the help screen
+	opts.on( '-h', '--help', 'Display this screen' ) do
+		puts opts
+		exit
+	end
+end # End opts
+# parse options and remove from ARGV
+optparse.parse!
+# I/O FILES
+#####################################################################################
+# comprueba si existen los ficheros de entrada
+if !File.exists?(options[:fasta])||!File.exists?(options[:db])
+  puts "File #{options[:fasta]} or #{options[:db]} doesn't exists"
+  Process.exit(-1)
+end
+path={}
+# Directories path
+path[:db]=File.join(Dir.pwd,'db') #Database folder
+path[:temp]=File.join(Dir.pwd,'temp') #Temp folder
+path[:local]=File.join(Dir.pwd,'temp/local')
+path[:ouput_files]=File.join(Dir.pwd,'output_files') #Output folder
+# Create work directories
+path.each do |key, directory|
+  if !File.exists?(directory)
+    Dir.mkdir(directory)
+  end
+end
+# Files path
+path[:fasta]=File.join(path[:ouput_files],'gene_models.fasta')
+path[:gff]=File.join(path[:ouput_files],'gene_capture.gff')
+path[:error]=File.join(path[:ouput_files],'Error.log')
+path[:html]=File.join(path[:ouput_files],'Index.html')
+path[:prime]=File.join(path[:ouput_files],'5_prime_data.txt')
+path[:fasta_prime]=File.join(path[:ouput_files],'5_prime_region.fasta')
+path[:db_prot]=options[:db] # ???
+path[:blast_db]=File.join(path[:db],File.basename(options[:db]))
+path[:blast_output]=File.join(path[:temp],File.basename(options[:db])+'.blast')
+path[:exonerate_db]=options[:db]
+path[:exonerate_output]=File.join(path[:temp],File.basename(options[:db])+'.ex')
+path[:exonerate_input_fasta]=File.join(path[:db],File.basename(options[:db])+'.fasta')
+path[:reference]= options[:reference]
+# Links path
+path[:gbrowse_link]='http://10.247.129.19/cgi-bin/gbrowse/ostra2/?name=Sequence:'
+if File.exists?(path[:gff]) #Delete gff report for creation a new one
+  FileUtils.rm(path[:gff])
+end
+# BLASTING
+#######################################################################################
+# Creating blast db
+#---------------------------------------
+if !File.exists?(path[:blast_db]+'.psq')||options[:overwrite]
+	puts 'Creating DB '
+	var="makeblastdb -in #{options[:db]} -out #{path[:blast_db]} -dbtype prot -parse_seqids" #Protein
+	system(var)
+	puts 'DB created'
+end
+# Do blast
+#---------------------------------------
+if !File.exists?(path[:blast_output]) ||options[:overwrite]
+	puts 'Start blastx'
+	cmd="blastx -query #{options[:fasta]} -db #{path[:blast_db]} -outfmt '7 qseqid sacc pident length mismatch gapopen qstart qend sstart send evalue bitscore score qframe sframe qseq sseq qlen slen stitle' -evalue #{options[:evalue]} -max_target_seqs 1 -out #{path[:blast_output]} -num_threads #{options[:cpu]}"
+	system(cmd)
+	puts "Blastx has finished"
+end
+#Parsing blast (blast to class)
+#-------------------------------------------
+puts 'Parsing blast_prot:'
+store_blast = ParserBlast.new('contig','nucleotide_match', path[:blast_output])
+store_prot_blast = store_blast.dataset
+store_prot_blast.correct_hsp_contigs('s') #Subject
+store_prot_blast.correct_hsp_contigs('q') #Query
+puts 'End parsing'
+#Save relationship contig-protein for debbuging
+#-------------------------------------------------
+if options[:index]
+  index=File.open(File.join(path[:ouput_files],'contig_index'),'w')
+  store_prot_blast.each_contig {|contig|
+    index.puts contig.name+"\t"+ contig.first_hit.name
+  }
+  index.close
+end
+# Filtering results
+#-----------------------------------------------
+puts 'FILTERING BLAST'
+store_prot_uni_hsp=Dataset.new(:prot)
+store_prot_uni_hsp.transfer_contigs(store_prot_blast.filtering)
+puts "FILTERING BLAST FINISHED\n"
+# EXONERATING
+###########################################################################################
+# Loading sequences for exonerate fasta
+#---------------------------------------
+puts 'Loading sequences for exonerate fasta'
+seqs = fasta_hash(options[:fasta])
+store_prot_blast.load_seq(seqs) #Cargar secuencia de los contigs en la clase
+store_prot_uni_hsp.load_seq(seqs)
+seqs = nil
+store_prot_blast.rev_comp #Hacer reversocomplementaria en caso de no estar la secuencia en +, de todos los contig
+store_prot_uni_hsp.rev_comp
+# Do exonerate
+#---------------------------------------
+puts 'START EXONERATE'
+if !File.exists?(path[:exonerate_output]) ||options[:overwrite]
+  store_prot_blast.fasta(path[:exonerate_input_fasta])
+  cmd="exonerate -q #{options[:db]} -t #{path[:exonerate_input_fasta]} -Q protein -T dna -m protein2genome --percent 1 --showalignment 0 --showvulgar --useaatla 1 > #{path[:exonerate_output]}"
+  system(cmd)
+end
+puts 'EXONERATE FINISHED'
+# Parsing exonerate (exonerate to class)
+#-------------------------------------------
+store_exonerate = ParserExonerate.new('contig','nucleotide_match', path[:exonerate_output])
+store_prot_exonerate = store_exonerate.dataset
+store_prot_exonerate.attrib_recover(store_prot_blast)
+store_prot_exonerate.score_correction(30) #Correccion intron penalty del exonerate (el programa tiene un bug q impide hacerlo desde el mismo)
+# Filtering results
+#-----------------------------------------------
+puts 'FILTERING EXONERATE'
+store_prot_uni_hsp.transfer_contigs(store_prot_exonerate.filtering)
+puts "Filtering exonerate finished\n"
+# CLUSTERING
+#################################################################################
+store_prot_exonerate.clustering #Cluster the contigs that passed the filters
+store_prot_exonerate.info_clusters
+store_prot_uni_hsp.clustering #Cluster the uni-hsp contigs
+# Rescue of missed genes and contigs (store_prot_uni_hsp to store_prot_exonerate)
+#----------------------------------------------------------------------------
+store_prot_exonerate.missing_cluster_transfer(store_prot_uni_hsp)
+store_prot_exonerate.missing_contigs_transfer(store_prot_uni_hsp)
+# GENE REBUILD
+######################################################################################
+puts 'Gene rebuild start'
+store_prot_exonerate.load_references(path[:reference]) # path[:reference] holds the -r/--reference option value
+store_prot_exonerate.sort_cont_clust #Sort contigs in ascending order by their first hsp
+store_prot_uni_hsp.sort_cont_clust #Sort the uni-hsp contigs
+store_prot_exonerate.generate_file_5_prime(path[:prime], path[:fasta_prime]) # Receives the 5_prime_data.txt and 5_prime_region.fasta paths
+rebuild=Rebuild.new(store_prot_exonerate,store_prot_uni_hsp,path)
+rebuild.rebuild(options) # Receives the full command-line options hash
+puts "\nGene rebuild finished"

data/bin/phytozome_scan ADDED

@@ -0,0 +1,60 @@
+#!/usr/bin/env ruby
+ROOT_PATH=File.dirname(__FILE__)
+$: << File.expand_path(File.join(ROOT_PATH, "../lib/"))
+$: << File.expand_path(File.join(ROOT_PATH, "../lib/gene_assembler/"))
+require 'gff_parser'
+#Input gff and blast result with outfmt 6
+# Load Arabidopsis gene features in hash (only CDS), file used: NCBI_Chr1.tbl downloaded of TAIR
+gff=Gff_parser.new(ARGV[0]).dataset
+# Load relationship between a pool of proteins and Arabidopsis gene's pool in hash, file used: tblastn report with outfmt 6. Name has two partsm first is gene name and second is the mRNA name. mRNA name i used like gene_name
+model_hash={}
+File.open(ARGV[1],'r').each do |line|
+	fields=line.split
+	gene_name=fields[1].split('+')
+	if !model_hash.key?(fields[0])
+		model_hash[fields[0]]=[gene_name[1].gsub('id','')]
+	else
+		model_hash[fields[0]] << gene_name[1].gsub('id','')
+	end
+end
+# Write crossover between blast and gff
+max_exones=100
+genes_exones=Array.new(max_exones,0)
+ref=File.open('references','w')
+model_hash.each do |gene_names|
+	ref.print gene_names[0]+"\t"
+	gene_names[1].each do |mRNA_name|
+		mRNA=gff.feature(mRNA_name)
+		if !mRNA.nil?
+			mRNA_structure=mRNA.cds
+			length=mRNA_structure.length-1
+			genes_exones[length]+=1
+			first_exon=mRNA_structure.first
+			origin=nil
+			if mRNA.strand =='-' # Detect reverse structures
+				origin=mRNA_structure.first.last-1
+				mRNA_structure.each do |exon|
+					exon.reverse!
+				end
+			else
+				origin=mRNA_structure.first.first-1
+			end
+			mRNA_structure.each do |exon|
+				ref.print "#{(exon[0]-=origin).abs}-#{(exon[1]-=origin).abs};"
+			end
+			ref.print '|'
+		end
+	end
+	ref.puts
+end
+ref.close
+#puts 'GENES-EXONES'
+#puts genes_exones.inspect

data/gene_assembler.gemspec ADDED

@@ -0,0 +1,25 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'gene_assembler/version'
+Gem::Specification.new do |spec|
+  spec.name          = "gene_assembler"
+  spec.version       = GeneAssembler::VERSION
+  spec.authors       = ["Pedro Seoane"]
+  spec.email         = ["seoanezonjic@hotmail.es"]
+  spec.summary       = %q{This gem builds gene models using fragmented genome information.}
+  spec.description   = %q{Use this siftware with techniques like genecapture}
+  spec.homepage      = ""
+  spec.license       = "MIT"
+  spec.files         = `git ls-files -z`.split("\x0")
+  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+  spec.require_paths = ["lib"]
+  spec.add_runtime_dependency 'scbi_blast', '>= 0.0.43'
+  spec.add_runtime_dependency 'scbi_fasta', '>= 0.1.9'
+  spec.add_development_dependency "bundler", "~> 1.6"
+  spec.add_development_dependency "rake"
+end

data/lib/gene_assembler.rb ADDED

@@ -0,0 +1,5 @@
+require "gene_assembler/version"
+module GeneAssembler
+  # Top-level namespace of the gem. Nothing is defined in this file; the gemspec reads GeneAssembler::VERSION after requiring gene_assembler/version.
+end

data/lib/gene_assembler/blast_type_parser.rb ADDED

@@ -0,0 +1,41 @@
+require 'parser'
+class BlastTypeParser < Parser
+	def initialize(contig_type,hit_type,file,all=FALSE)
+    @file=file
+		@dataset=create_dataset
+		@all=all
+		data=parse_file(file)
+		load_dataset(data,contig_type,hit_type)
+	end
+	def load_dataset(data,contig_type,hit_type) # Introduce datos del blast en clases contig/hit/hsp
+	  data.querys.each do |item|
+	  	if item.hits.empty? #Descartamos querys q no hayan dado nigun match
+	  		next
+	  	end
+	    contig=@dataset.add_contig(item.query_def) #query_def -> nombre de la query (nuestro contig)
+	    contig.length=item.full_query_length #full_query_length -> longitud de la query
+	    contig.type=contig_type
+	    populate_extra_atributes(contig,item)
+	    last_hit_name=''
+	    hit=''
+	    item.hits.each do |ht| #Clasificacion hits del blast en hits-hsps
+	      if ht.subject_id != last_hit_name #Hit
+	      	hit=contig.add_hit(ht.subject_id, ht.full_subject_length, ht.q_frame, hit_type)
+	      end
+	      hsp=hit.add_hsp(ht.q_beg+1, ht.q_end+1, ht.s_beg+1, ht.s_end+1, ht.align_len, ht.score, ht.ident, ht.gaps) # +1 xq gema parser blast resta 1 a todo
+	      hsp.type='match_part'
+	      last_hit_name=ht.subject_id
+	    end
+	    contig.hits_sort!
+	  end
+	end
+	def populate_extra_atributes(contig,item)
+	end
+end