gene_assembler 0.0.9

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38) hide show
  1. data/.gitignore +22 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +29 -0
  5. data/Rakefile +2 -0
  6. data/bin/GeneAssembler +233 -0
  7. data/bin/phytozome_scan +60 -0
  8. data/gene_assembler.gemspec +25 -0
  9. data/lib/gene_assembler.rb +5 -0
  10. data/lib/gene_assembler/blast_type_parser.rb +41 -0
  11. data/lib/gene_assembler/contig.rb +643 -0
  12. data/lib/gene_assembler/dataset.rb +532 -0
  13. data/lib/gene_assembler/exonerate_result.rb +230 -0
  14. data/lib/gene_assembler/gff_contig.rb +67 -0
  15. data/lib/gene_assembler/gff_dataset.rb +152 -0
  16. data/lib/gene_assembler/gff_feature.rb +175 -0
  17. data/lib/gene_assembler/gff_frameshift.rb +6 -0
  18. data/lib/gene_assembler/gff_go.rb +13 -0
  19. data/lib/gene_assembler/gff_hit.rb +53 -0
  20. data/lib/gene_assembler/gff_hsp.rb +6 -0
  21. data/lib/gene_assembler/gff_localization.rb +6 -0
  22. data/lib/gene_assembler/gff_master_feature.rb +5 -0
  23. data/lib/gene_assembler/gff_parser.rb +35 -0
  24. data/lib/gene_assembler/gff_snp.rb +21 -0
  25. data/lib/gene_assembler/gff_stop.rb +6 -0
  26. data/lib/gene_assembler/go.rb +13 -0
  27. data/lib/gene_assembler/hit.rb +191 -0
  28. data/lib/gene_assembler/hsp.rb +100 -0
  29. data/lib/gene_assembler/other_functions.rb +228 -0
  30. data/lib/gene_assembler/parser.rb +25 -0
  31. data/lib/gene_assembler/parser_blast.rb +12 -0
  32. data/lib/gene_assembler/parser_exonerate.rb +16 -0
  33. data/lib/gene_assembler/rebuild.rb +975 -0
  34. data/lib/gene_assembler/report.rb +13 -0
  35. data/lib/gene_assembler/report_gff.rb +30 -0
  36. data/lib/gene_assembler/snp.rb +13 -0
  37. data/lib/gene_assembler/version.rb +3 -0
  38. metadata +149 -0
@@ -0,0 +1,22 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ *.bundle
19
+ *.so
20
+ *.o
21
+ *.a
22
+ mkmf.log
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in gene_assembler.gemspec
4
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Pedro Seoane
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,29 @@
1
+ # GeneAssembler
2
+
3
+ GeneAssembler builds gene models using fragmented genome information. Use it with techniques like gene capture.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'gene_assembler'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install gene_assembler
18
+
19
+ ## Usage
20
+
21
+ TODO: Write usage instructions here
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it ( https://github.com/[my-github-username]/gene_assembler/fork )
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create a new Pull Request
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+
@@ -0,0 +1,233 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ ROOT_PATH=File.dirname(__FILE__)
4
+ $: << File.expand_path(File.join(ROOT_PATH, "../lib/"))
5
+ $: << File.expand_path(File.join(ROOT_PATH, "../lib/gene_assembler/"))
6
+
7
+ require 'optparse'
8
+ require 'scbi_fasta'
9
+ require 'parser_blast'
10
+ require 'parser_exonerate'
11
+ require 'dataset'
12
+ require 'rebuild'
13
+ require 'other_functions'
14
+
15
+
16
+ # INPUT PARSING
17
+ #################################################################################################
18
+ options = {}
19
+
20
+ optparse = OptionParser.new do |opts|
21
+ options[:fasta] = 'contigsMC.fasta'
22
+ opts.on( '-f', '--fasta FILE', 'Fasta input file' ) do |file|
23
+ options[:fasta] = file
24
+ end
25
+
26
+ options[:db] = 'dual_prot.fasta'
27
+ opts.on( '-d', '--database FILE', 'Blast database' ) do |db|
28
+ options[:db] = db
29
+ end
30
+
31
+ options[:reference] = 'reference'
32
+ opts.on( '-r', '--reference FILE', 'Gene models reference file' ) do |ref|
33
+ options[:reference] = ref
34
+ end
35
+
36
+ options[:evalue] = 1.0e-3
37
+ opts.on( '-e', '--evalue EVALUE', 'e value threshold to consider as reliable the orthologue sequence. Default=1.0e-3' ) do |evalue|
38
+ options[:evalue] = evalue.to_f
39
+ end
40
+
41
+ options[:verbose] = FALSE
42
+ opts.on( '-v', '--verbose', 'Default=0' ) do |verbose|
43
+ options[:verbose] = TRUE
44
+ $verbose=TRUE
45
+ end
46
+
47
+ options[:overwrite] = FALSE
48
+ opts.on( '-o', '--overwrite', 'Default=FALSE' ) do |overwrite|
49
+ options[:overwrite] = TRUE
50
+ end
51
+
52
+ options[:web] = FALSE
53
+ opts.on( '-w', '--web', 'Default=FALSE' ) do |web|
54
+ options[:web] = TRUE
55
+ end
56
+
57
+ options[:index] = FALSE
58
+ opts.on( '-i', '--index', 'Default=FALSE' ) do |index|
59
+ options[:index] = TRUE
60
+ end
61
+
62
+ options[:rebuild] = TRUE
63
+ opts.on( '-n', '--nrebuild', 'Default=1' ) do |rebuild|
64
+ options[:rebuild] = FALSE
65
+ end
66
+
67
+ options[:cpu] = 1
68
+ opts.on( '-c', '--cpu CPU', 'Default=2' ) do |cpu|
69
+ options[:cpu] = cpu
70
+ end
71
+
72
+ # Set a banner, displayed at the top of the help screen.
73
+ opts.banner = "Usage: GeneEnsambler -f input.fasta -e evalue -c n_cpu \n\n"
74
+
75
+ # This displays the help screen
76
+ opts.on( '-h', '--help', 'Display this screen' ) do
77
+ puts opts
78
+ exit
79
+ end
80
+ end # End opts
81
+
82
+ # parse options and remove from ARGV
83
+ optparse.parse!
84
+
85
+ # I/O FILES
86
+ #####################################################################################
87
+ # comprueba si existen los ficheros de entrada
88
+ if !File.exists?(options[:fasta])||!File.exists?(options[:db])
89
+ puts "File #{options[:fasta]} or #{options[:db]} doesn't exists"
90
+ Process.exit(-1)
91
+ end
92
+
93
+ path={}
94
+ # Directories path
95
+ path[:db]=File.join(Dir.pwd,'db') #Database folder
96
+ path[:temp]=File.join(Dir.pwd,'temp') #Temp folder
97
+ path[:local]=File.join(Dir.pwd,'temp/local')
98
+ path[:ouput_files]=File.join(Dir.pwd,'output_files') #Output folder
99
+
100
+ # Create work directories
101
+ path.each do |key, directory|
102
+ if !File.exists?(directory)
103
+ Dir.mkdir(directory)
104
+ end
105
+ end
106
+
107
+ # Files path
108
+ path[:fasta]=File.join(path[:ouput_files],'gene_models.fasta')
109
+ path[:gff]=File.join(path[:ouput_files],'gene_capture.gff')
110
+ path[:error]=File.join(path[:ouput_files],'Error.log')
111
+ path[:html]=File.join(path[:ouput_files],'Index.html')
112
+ path[:prime]=File.join(path[:ouput_files],'5_prime_data.txt')
113
+ path[:fasta_prime]=File.join(path[:ouput_files],'5_prime_region.fasta')
114
+ path[:db_prot]=options[:db] # ???
115
+ path[:blast_db]=File.join(path[:db],File.basename(options[:db]))
116
+ path[:blast_output]=File.join(path[:temp],File.basename(options[:db])+'.blast')
117
+ path[:exonerate_db]=options[:db]
118
+ path[:exonerate_output]=File.join(path[:temp],File.basename(options[:db])+'.ex')
119
+ path[:exonerate_input_fasta]=File.join(path[:db],File.basename(options[:db])+'.fasta')
120
+ path[:reference]= options[:reference]
121
+
122
+ # Links path
123
+ path[:gbrowse_link]='http://10.247.129.19/cgi-bin/gbrowse/ostra2/?name=Sequence:'
124
+
125
+ if File.exists?(path[:gff]) #Delete gff report for creation a new one
126
+ FileUtils.rm(path[:gff])
127
+ end
128
+
129
+ # BLASTING
130
+ #######################################################################################
131
+
132
+ # Creating blast db
133
+ #---------------------------------------
134
+ if !File.exists?(path[:blast_db]+'.psq')||options[:overwrite]
135
+ puts 'Creating DB '
136
+ var="makeblastdb -in #{options[:db]} -out #{path[:blast_db]} -dbtype prot -parse_seqids" #Protein
137
+ system(var)
138
+ puts 'DB created'
139
+ end
140
+
141
+ # Do blast
142
+ #---------------------------------------
143
+ if !File.exists?(path[:blast_output]) ||options[:overwrite]
144
+ puts 'Start blastx'
145
+ cmd="blastx -query #{options[:fasta]} -db #{path[:blast_db]} -outfmt '7 qseqid sacc pident length mismatch gapopen qstart qend sstart send evalue bitscore score qframe sframe qseq sseq qlen slen stitle' -evalue #{options[:evalue]} -max_target_seqs 1 -out #{path[:blast_output]} -num_threads #{options[:cpu]}"
146
+ system(cmd)
147
+ puts "Blastx has finished"
148
+ end
149
+
150
+ # Parsing blast (blast report to dataset classes)
151
+ #-------------------------------------------
152
+ puts 'Parsing blast_prot:'
153
+ store_blast = ParserBlast.new('contig','nucleotide_match', path[:blast_output])
154
+ store_prot_blast = store_blast.dataset
155
+ store_prot_blast.correct_hsp_contigs('s') #Subject
156
+ store_prot_blast.correct_hsp_contigs('q') #Query
157
+ puts 'End parsing'
158
+
159
+ # Save the contig-protein relationship for debugging (only with -i/--index)
160
+ #-------------------------------------------------
161
+ if options[:index]
162
+ index=File.open(File.join(path[:ouput_files],'contig_index'),'w')
163
+ store_prot_blast.each_contig {|contig|
164
+ index.puts contig.name+"\t"+ contig.first_hit.name
165
+ }
166
+ index.close
167
+ end
168
+
169
+ # Filtering results: whatever store_prot_blast.filtering returns is transferred into a new :prot dataset
170
+ #-----------------------------------------------
171
+ puts 'FILTERING BLAST'
172
+ store_prot_uni_hsp=Dataset.new(:prot)
173
+ store_prot_uni_hsp.transfer_contigs(store_prot_blast.filtering)
174
+ puts "FILTERING BLAST FINISHED\n"
175
+
176
+ # EXONERATING
177
+ ###########################################################################################
178
+
179
+ # Loading sequences for exonerate fasta
180
+ #---------------------------------------
181
+ puts 'Loading sequences for exonerate fasta'
182
+ seqs = fasta_hash(options[:fasta])
183
+ store_prot_blast.load_seq(seqs) #Cargar secuencia de los contigs en la clase
184
+ store_prot_uni_hsp.load_seq(seqs)
185
+ seqs = nil
186
+ store_prot_blast.rev_comp #Hacer reversocomplementaria en caso de no estar la secuencia en +, de todos los contig
187
+ store_prot_uni_hsp.rev_comp
188
+
189
+ # Do exonerate
190
+ #---------------------------------------
191
+ puts 'START EXONERATE'
192
+ if !File.exists?(path[:exonerate_output]) ||options[:overwrite]
193
+ store_prot_blast.fasta(path[:exonerate_input_fasta])
194
+ cmd="exonerate -q #{options[:db]} -t #{path[:exonerate_input_fasta]} -Q protein -T dna -m protein2genome --percent 1 --showalignment 0 --showvulgar --useaatla 1 > #{path[:exonerate_output]}"
195
+ system(cmd)
196
+ end
197
+ puts 'EXONERATE FINISHED'
198
+
199
+ # Parsing exonerate (exonerate report to dataset classes)
200
+ #-------------------------------------------
201
+ store_exonerate = ParserExonerate.new('contig','nucleotide_match', path[:exonerate_output])
202
+ store_prot_exonerate = store_exonerate.dataset
203
+ store_prot_exonerate.attrib_recover(store_prot_blast)
204
+ store_prot_exonerate.score_correction(30) # Correct exonerate's intron penalty (a bug in exonerate prevents setting it from the program itself)
205
+
206
+ # Filtering results: the filtered exonerate contigs are added to the uni-hsp dataset
207
+ #-----------------------------------------------
208
+ puts 'FILTERING EXONERATE'
209
+ store_prot_uni_hsp.transfer_contigs(store_prot_exonerate.filtering)
210
+ puts "Filtering exonerate finished\n"
211
+
212
+ # CLUSTERING
213
+ #################################################################################
214
+ store_prot_exonerate.clustering # Cluster the contigs that passed the filters
215
+ store_prot_exonerate.info_clusters
216
+ store_prot_uni_hsp.clustering # Cluster the uni-hsp contigs
217
+
218
+ # Rescue of missed genes and contigs (store_prot_uni_hsp to store_prot_exonerate)
219
+ #----------------------------------------------------------------------------
220
+ store_prot_exonerate.missing_cluster_transfer(store_prot_uni_hsp)
221
+ store_prot_exonerate.missing_contigs_transfer(store_prot_uni_hsp)
222
+
223
+ # GENE REBUILD
224
+ ######################################################################################
225
+ puts 'Gene rebuild start'
226
+ store_prot_exonerate.load_references(path[:reference])
227
+ store_prot_exonerate.sort_cont_clust # Sort contigs in ascending order based on their first hsp
228
+ store_prot_uni_hsp.sort_cont_clust # Sort the uni-hsp contigs
229
+ store_prot_exonerate.generate_file_5_prime(path[:prime], path[:fasta_prime])
230
+ rebuild=Rebuild.new(store_prot_exonerate,store_prot_uni_hsp,path)
231
+ rebuild.rebuild(options)
232
+ puts "\nGene rebuild finished"
233
+
@@ -0,0 +1,60 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ ROOT_PATH=File.dirname(__FILE__)
4
+ $: << File.expand_path(File.join(ROOT_PATH, "../lib/"))
5
+ $: << File.expand_path(File.join(ROOT_PATH, "../lib/gene_assembler/"))
6
+
7
+
8
+ require 'gff_parser'
9
+ #Input gff and blast result with outfmt 6
10
+
11
+ # Load Arabidopsis gene features in hash (only CDS), file used: NCBI_Chr1.tbl downloaded of TAIR
12
+
13
+ gff=Gff_parser.new(ARGV[0]).dataset
14
+
15
+ # Load relationship between a pool of proteins and Arabidopsis gene's pool in hash, file used: tblastn report with outfmt 6. Name has two partsm first is gene name and second is the mRNA name. mRNA name i used like gene_name
16
+ model_hash={}
17
+ File.open(ARGV[1],'r').each do |line|
18
+ fields=line.split
19
+ gene_name=fields[1].split('+')
20
+ if !model_hash.key?(fields[0])
21
+ model_hash[fields[0]]=[gene_name[1].gsub('id','')]
22
+ else
23
+ model_hash[fields[0]] << gene_name[1].gsub('id','')
24
+ end
25
+ end
26
+
27
+
28
+ # Write crossover between blast and gff
29
+ max_exones=100
30
+ genes_exones=Array.new(max_exones,0)
31
+ ref=File.open('references','w')
32
+ model_hash.each do |gene_names|
33
+ ref.print gene_names[0]+"\t"
34
+ gene_names[1].each do |mRNA_name|
35
+ mRNA=gff.feature(mRNA_name)
36
+ if !mRNA.nil?
37
+ mRNA_structure=mRNA.cds
38
+ length=mRNA_structure.length-1
39
+ genes_exones[length]+=1
40
+ first_exon=mRNA_structure.first
41
+ origin=nil
42
+ if mRNA.strand =='-' # Detect reverse structures
43
+ origin=mRNA_structure.first.last-1
44
+ mRNA_structure.each do |exon|
45
+ exon.reverse!
46
+ end
47
+ else
48
+ origin=mRNA_structure.first.first-1
49
+ end
50
+ mRNA_structure.each do |exon|
51
+ ref.print "#{(exon[0]-=origin).abs}-#{(exon[1]-=origin).abs};"
52
+ end
53
+ ref.print '|'
54
+ end
55
+ end
56
+ ref.puts
57
+ end
58
+ ref.close
59
+ #puts 'GENES-EXONES'
60
+ #puts genes_exones.inspect
@@ -0,0 +1,25 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'gene_assembler/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "gene_assembler"
8
+ spec.version = GeneAssembler::VERSION
9
+ spec.authors = ["Pedro Seoane"]
10
+ spec.email = ["seoanezonjic@hotmail.es"]
11
+ spec.summary = %q{This gem builds gene models using fragmented genome information.}
12
+ spec.description = %q{Use this siftware with techniques like genecapture}
13
+ spec.homepage = ""
14
+ spec.license = "MIT"
15
+
16
+ spec.files = `git ls-files -z`.split("\x0")
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
+ spec.require_paths = ["lib"]
20
+
21
+ spec.add_runtime_dependency 'scbi_blast', '>= 0.0.43'
22
+ spec.add_runtime_dependency 'scbi_fasta', '>= 0.1.9'
23
+ spec.add_development_dependency "bundler", "~> 1.6"
24
+ spec.add_development_dependency "rake"
25
+ end
@@ -0,0 +1,5 @@
1
+ require "gene_assembler/version"
2
+
3
+ module GeneAssembler
4
+ # Namespace of the gem; this file defines nothing here and only requires "gene_assembler/version".
5
+ end
@@ -0,0 +1,41 @@
1
+ require 'parser'
2
+
3
+ class BlastTypeParser < Parser
4
+
5
+ def initialize(contig_type,hit_type,file,all=FALSE)
6
+ @file=file
7
+ @dataset=create_dataset
8
+ @all=all
9
+ data=parse_file(file)
10
+ load_dataset(data,contig_type,hit_type)
11
+ end
12
+
13
+ def load_dataset(data,contig_type,hit_type) # Introduce datos del blast en clases contig/hit/hsp
14
+ data.querys.each do |item|
15
+ if item.hits.empty? #Descartamos querys q no hayan dado nigun match
16
+ next
17
+ end
18
+ contig=@dataset.add_contig(item.query_def) #query_def -> nombre de la query (nuestro contig)
19
+ contig.length=item.full_query_length #full_query_length -> longitud de la query
20
+ contig.type=contig_type
21
+ populate_extra_atributes(contig,item)
22
+
23
+ last_hit_name=''
24
+ hit=''
25
+ item.hits.each do |ht| #Clasificacion hits del blast en hits-hsps
26
+ if ht.subject_id != last_hit_name #Hit
27
+ hit=contig.add_hit(ht.subject_id, ht.full_subject_length, ht.q_frame, hit_type)
28
+ end
29
+ hsp=hit.add_hsp(ht.q_beg+1, ht.q_end+1, ht.s_beg+1, ht.s_end+1, ht.align_len, ht.score, ht.ident, ht.gaps) # +1 xq gema parser blast resta 1 a todo
30
+ hsp.type='match_part'
31
+ last_hit_name=ht.subject_id
32
+ end
33
+ contig.hits_sort!
34
+ end
35
+ end
36
+
37
+ def populate_extra_atributes(contig,item)
38
+
39
+ end
40
+
41
+ end