gene_assembler 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. data/.gitignore +22 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +29 -0
  5. data/Rakefile +2 -0
  6. data/bin/GeneAssembler +233 -0
  7. data/bin/phytozome_scan +60 -0
  8. data/gene_assembler.gemspec +25 -0
  9. data/lib/gene_assembler.rb +5 -0
  10. data/lib/gene_assembler/blast_type_parser.rb +41 -0
  11. data/lib/gene_assembler/contig.rb +643 -0
  12. data/lib/gene_assembler/dataset.rb +532 -0
  13. data/lib/gene_assembler/exonerate_result.rb +230 -0
  14. data/lib/gene_assembler/gff_contig.rb +67 -0
  15. data/lib/gene_assembler/gff_dataset.rb +152 -0
  16. data/lib/gene_assembler/gff_feature.rb +175 -0
  17. data/lib/gene_assembler/gff_frameshift.rb +6 -0
  18. data/lib/gene_assembler/gff_go.rb +13 -0
  19. data/lib/gene_assembler/gff_hit.rb +53 -0
  20. data/lib/gene_assembler/gff_hsp.rb +6 -0
  21. data/lib/gene_assembler/gff_localization.rb +6 -0
  22. data/lib/gene_assembler/gff_master_feature.rb +5 -0
  23. data/lib/gene_assembler/gff_parser.rb +35 -0
  24. data/lib/gene_assembler/gff_snp.rb +21 -0
  25. data/lib/gene_assembler/gff_stop.rb +6 -0
  26. data/lib/gene_assembler/go.rb +13 -0
  27. data/lib/gene_assembler/hit.rb +191 -0
  28. data/lib/gene_assembler/hsp.rb +100 -0
  29. data/lib/gene_assembler/other_functions.rb +228 -0
  30. data/lib/gene_assembler/parser.rb +25 -0
  31. data/lib/gene_assembler/parser_blast.rb +12 -0
  32. data/lib/gene_assembler/parser_exonerate.rb +16 -0
  33. data/lib/gene_assembler/rebuild.rb +975 -0
  34. data/lib/gene_assembler/report.rb +13 -0
  35. data/lib/gene_assembler/report_gff.rb +30 -0
  36. data/lib/gene_assembler/snp.rb +13 -0
  37. data/lib/gene_assembler/version.rb +3 -0
  38. metadata +149 -0
@@ -0,0 +1,22 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ *.bundle
19
+ *.so
20
+ *.o
21
+ *.a
22
+ mkmf.log
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in gene_assembler.gemspec
4
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Pedro Seoane
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,29 @@
1
+ # GeneAssembler
2
+
3
+ This gem builds gene models using fragmented genome information. Use it with techniques such as gene capture.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'gene_assembler'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install gene_assembler
18
+
19
+ ## Usage
20
+
21
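+ Run `GeneAssembler -f input.fasta -d protein_db.fasta -r reference_file -e evalue -c n_cpu`; `GeneAssembler --help` lists every option.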
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it ( https://github.com/[my-github-username]/gene_assembler/fork )
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create a new Pull Request
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+
@@ -0,0 +1,233 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ ROOT_PATH=File.dirname(__FILE__)
4
+ $: << File.expand_path(File.join(ROOT_PATH, "../lib/"))
5
+ $: << File.expand_path(File.join(ROOT_PATH, "../lib/gene_assembler/"))
6
+
7
+ require 'optparse'
8
+ require 'scbi_fasta'
9
+ require 'parser_blast'
10
+ require 'parser_exonerate'
11
+ require 'dataset'
12
+ require 'rebuild'
13
+ require 'other_functions'
14
+
15
+
16
+ # INPUT PARSING
17
+ #################################################################################################
18
+ options = {}
19
+
20
+ optparse = OptionParser.new do |opts|
21
+ options[:fasta] = 'contigsMC.fasta'
22
+ opts.on( '-f', '--fasta FILE', 'Fasta input file' ) do |file|
23
+ options[:fasta] = file
24
+ end
25
+
26
+ options[:db] = 'dual_prot.fasta'
27
+ opts.on( '-d', '--database FILE', 'Blast database' ) do |db|
28
+ options[:db] = db
29
+ end
30
+
31
+ options[:reference] = 'reference'
32
+ opts.on( '-r', '--reference FILE', 'Gene models reference file' ) do |ref|
33
+ options[:reference] = ref
34
+ end
35
+
36
+ options[:evalue] = 1.0e-3
37
+ opts.on( '-e', '--evalue EVALUE', 'e value threshold to consider as reliable the orthologue sequence. Default=1.0e-3' ) do |evalue|
38
+ options[:evalue] = evalue.to_f
39
+ end
40
+
41
+ options[:verbose] = FALSE
42
+ opts.on( '-v', '--verbose', 'Default=0' ) do |verbose|
43
+ options[:verbose] = TRUE
44
+ $verbose=TRUE
45
+ end
46
+
47
+ options[:overwrite] = FALSE
48
+ opts.on( '-o', '--overwrite', 'Default=FALSE' ) do |overwrite|
49
+ options[:overwrite] = TRUE
50
+ end
51
+
52
+ options[:web] = FALSE
53
+ opts.on( '-w', '--web', 'Default=FALSE' ) do |web|
54
+ options[:web] = TRUE
55
+ end
56
+
57
+ options[:index] = FALSE
58
+ opts.on( '-i', '--index', 'Default=FALSE' ) do |index|
59
+ options[:index] = TRUE
60
+ end
61
+
62
+ options[:rebuild] = TRUE
63
+ opts.on( '-n', '--nrebuild', 'Default=1' ) do |rebuild|
64
+ options[:rebuild] = FALSE
65
+ end
66
+
67
+ options[:cpu] = 1
68
+ opts.on( '-c', '--cpu CPU', 'Default=2' ) do |cpu|
69
+ options[:cpu] = cpu
70
+ end
71
+
72
+ # Set a banner, displayed at the top of the help screen.
73
+ opts.banner = "Usage: GeneEnsambler -f input.fasta -e evalue -c n_cpu \n\n"
74
+
75
+ # This displays the help screen
76
+ opts.on( '-h', '--help', 'Display this screen' ) do
77
+ puts opts
78
+ exit
79
+ end
80
+ end # End opts
81
+
82
+ # parse options and remove from ARGV
83
+ optparse.parse!
84
+
85
+ # I/O FILES
86
+ #####################################################################################
87
+ # comprueba si existen los ficheros de entrada
88
+ if !File.exists?(options[:fasta])||!File.exists?(options[:db])
89
+ puts "File #{options[:fasta]} or #{options[:db]} doesn't exists"
90
+ Process.exit(-1)
91
+ end
92
+
93
+ path={}
94
+ # Directories path
95
+ path[:db]=File.join(Dir.pwd,'db') #Database folder
96
+ path[:temp]=File.join(Dir.pwd,'temp') #Temp folder
97
+ path[:local]=File.join(Dir.pwd,'temp/local')
98
+ path[:ouput_files]=File.join(Dir.pwd,'output_files') #Output folder
99
+
100
+ # Create work directories
101
+ path.each do |key, directory|
102
+ if !File.exists?(directory)
103
+ Dir.mkdir(directory)
104
+ end
105
+ end
106
+
107
+ # Files path
108
+ path[:fasta]=File.join(path[:ouput_files],'gene_models.fasta')
109
+ path[:gff]=File.join(path[:ouput_files],'gene_capture.gff')
110
+ path[:error]=File.join(path[:ouput_files],'Error.log')
111
+ path[:html]=File.join(path[:ouput_files],'Index.html')
112
+ path[:prime]=File.join(path[:ouput_files],'5_prime_data.txt')
113
+ path[:fasta_prime]=File.join(path[:ouput_files],'5_prime_region.fasta')
114
+ path[:db_prot]=options[:db] # ???
115
+ path[:blast_db]=File.join(path[:db],File.basename(options[:db]))
116
+ path[:blast_output]=File.join(path[:temp],File.basename(options[:db])+'.blast')
117
+ path[:exonerate_db]=options[:db]
118
+ path[:exonerate_output]=File.join(path[:temp],File.basename(options[:db])+'.ex')
119
+ path[:exonerate_input_fasta]=File.join(path[:db],File.basename(options[:db])+'.fasta')
120
+ path[:reference]= options[:reference]
121
+
122
+ # Links path
123
+ path[:gbrowse_link]='http://10.247.129.19/cgi-bin/gbrowse/ostra2/?name=Sequence:'
124
+
125
+ if File.exists?(path[:gff]) #Delete gff report for creation a new one
126
+ FileUtils.rm(path[:gff])
127
+ end
128
+
129
+ # BLASTING
130
+ #######################################################################################
131
+
132
+ # Creating blast db
133
+ #---------------------------------------
134
+ if !File.exists?(path[:blast_db]+'.psq')||options[:overwrite]
135
+ puts 'Creating DB '
136
+ var="makeblastdb -in #{options[:db]} -out #{path[:blast_db]} -dbtype prot -parse_seqids" #Protein
137
+ system(var)
138
+ puts 'DB created'
139
+ end
140
+
141
+ # Do blast
142
+ #---------------------------------------
143
+ if !File.exists?(path[:blast_output]) ||options[:overwrite]
144
+ puts 'Start blastx'
145
+ cmd="blastx -query #{options[:fasta]} -db #{path[:blast_db]} -outfmt '7 qseqid sacc pident length mismatch gapopen qstart qend sstart send evalue bitscore score qframe sframe qseq sseq qlen slen stitle' -evalue #{options[:evalue]} -max_target_seqs 1 -out #{path[:blast_output]} -num_threads #{options[:cpu]}"
146
+ system(cmd)
147
+ puts "Blastx has finished"
148
+ end
149
+
150
+ #Parsing blast (blast to class)
151
+ #-------------------------------------------
152
+ puts 'Parsing blast_prot:'
153
+ store_blast = ParserBlast.new('contig','nucleotide_match', path[:blast_output])
154
+ store_prot_blast = store_blast.dataset
155
+ store_prot_blast.correct_hsp_contigs('s') #Subject
156
+ store_prot_blast.correct_hsp_contigs('q') #Query
157
+ puts 'End parsing'
158
+
159
+ #Save relationship contig-protein for debbuging
160
+ #-------------------------------------------------
161
+ if options[:index]
162
+ index=File.open(File.join(path[:ouput_files],'contig_index'),'w')
163
+ store_prot_blast.each_contig {|contig|
164
+ index.puts contig.name+"\t"+ contig.first_hit.name
165
+ }
166
+ index.close
167
+ end
168
+
169
+ # Filtering results
170
+ #-----------------------------------------------
171
+ puts 'FILTERING BLAST'
172
+ store_prot_uni_hsp=Dataset.new(:prot)
173
+ store_prot_uni_hsp.transfer_contigs(store_prot_blast.filtering)
174
+ puts "FILTERING BLAST FINISHED\n"
175
+
176
+ # EXONERATING
177
+ ###########################################################################################
178
+
179
+ # Loading sequences for exonerate fasta
180
+ #---------------------------------------
181
+ puts 'Loading sequences for exonerate fasta'
182
+ seqs = fasta_hash(options[:fasta])
183
+ store_prot_blast.load_seq(seqs) #Cargar secuencia de los contigs en la clase
184
+ store_prot_uni_hsp.load_seq(seqs)
185
+ seqs = nil
186
+ store_prot_blast.rev_comp #Hacer reversocomplementaria en caso de no estar la secuencia en +, de todos los contig
187
+ store_prot_uni_hsp.rev_comp
188
+
189
+ # Do exonerate
190
+ #---------------------------------------
191
+ puts 'START EXONERATE'
192
+ if !File.exists?(path[:exonerate_output]) ||options[:overwrite]
193
+ store_prot_blast.fasta(path[:exonerate_input_fasta])
194
+ cmd="exonerate -q #{options[:db]} -t #{path[:exonerate_input_fasta]} -Q protein -T dna -m protein2genome --percent 1 --showalignment 0 --showvulgar --useaatla 1 > #{path[:exonerate_output]}"
195
+ system(cmd)
196
+ end
197
+ puts 'EXONERATE FINISHED'
198
+
199
+ # Parsing exonerate (exonerate to class)
200
+ #-------------------------------------------
201
+ store_exonerate = ParserExonerate.new('contig','nucleotide_match', path[:exonerate_output])
202
+ store_prot_exonerate = store_exonerate.dataset
203
+ store_prot_exonerate.attrib_recover(store_prot_blast)
204
+ store_prot_exonerate.score_correction(30) # Correct exonerate's intron penalty (a bug in the program prevents doing it from exonerate itself)
205
+
206
+ # Filtering results
207
+ #-----------------------------------------------
208
+ puts 'FILTERING EXONERATE'
209
+ store_prot_uni_hsp.transfer_contigs(store_prot_exonerate.filtering)
210
+ puts "Filtering exonerate finished\n"
211
+
212
+ # CLUSTERING
213
+ #################################################################################
214
+ store_prot_exonerate.clustering # Cluster the contigs that passed the filters
215
+ store_prot_exonerate.info_clusters
216
+ store_prot_uni_hsp.clustering # Cluster the single-HSP contigs
217
+
218
+ # Rescue of missed genes and contigs (store_prot_uni_hsp to store_prot_exonerate)
219
+ #----------------------------------------------------------------------------
220
+ store_prot_exonerate.missing_cluster_transfer(store_prot_uni_hsp)
221
+ store_prot_exonerate.missing_contigs_transfer(store_prot_uni_hsp)
222
+
223
+ # GENE REBUILD
224
+ ######################################################################################
225
+ puts 'Gene rebuild start'
226
+ store_prot_exonerate.load_references(path[:reference])
227
+ store_prot_exonerate.sort_cont_clust # Sort contigs in ascending order by their first HSP
228
+ store_prot_uni_hsp.sort_cont_clust # Sort the single-HSP contigs
229
+ store_prot_exonerate.generate_file_5_prime(path[:prime], path[:fasta_prime])
230
+ rebuild=Rebuild.new(store_prot_exonerate,store_prot_uni_hsp,path) # `path` is the hash of directory/file locations built earlier in this script
231
+ rebuild.rebuild(options)
232
+ puts "\nGene rebuild finished"
233
+
@@ -0,0 +1,60 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ ROOT_PATH=File.dirname(__FILE__)
4
+ $: << File.expand_path(File.join(ROOT_PATH, "../lib/"))
5
+ $: << File.expand_path(File.join(ROOT_PATH, "../lib/gene_assembler/"))
6
+
7
+
8
+ require 'gff_parser'
9
+ # Input: a GFF file (ARGV[0]) and a blast result in outfmt 6 (ARGV[1])
10
+
11
+ # Load Arabidopsis gene features into a hash (CDS only); file used: NCBI_Chr1.tbl, downloaded from TAIR
12
+
13
+ gff=Gff_parser.new(ARGV[0]).dataset
14
+
15
+ # Load relationship between a pool of proteins and Arabidopsis gene's pool in hash, file used: tblastn report with outfmt 6. Name has two partsm first is gene name and second is the mRNA name. mRNA name i used like gene_name
16
+ model_hash={}
17
+ File.open(ARGV[1],'r').each do |line|
18
+ fields=line.split
19
+ gene_name=fields[1].split('+')
20
+ if !model_hash.key?(fields[0])
21
+ model_hash[fields[0]]=[gene_name[1].gsub('id','')]
22
+ else
23
+ model_hash[fields[0]] << gene_name[1].gsub('id','')
24
+ end
25
+ end
26
+
27
+
28
+ # Write crossover between blast and gff
29
+ max_exones=100
30
+ genes_exones=Array.new(max_exones,0)
31
+ ref=File.open('references','w')
32
+ model_hash.each do |gene_names|
33
+ ref.print gene_names[0]+"\t"
34
+ gene_names[1].each do |mRNA_name|
35
+ mRNA=gff.feature(mRNA_name)
36
+ if !mRNA.nil?
37
+ mRNA_structure=mRNA.cds
38
+ length=mRNA_structure.length-1
39
+ genes_exones[length]+=1
40
+ first_exon=mRNA_structure.first
41
+ origin=nil
42
+ if mRNA.strand =='-' # Detect reverse structures
43
+ origin=mRNA_structure.first.last-1
44
+ mRNA_structure.each do |exon|
45
+ exon.reverse!
46
+ end
47
+ else
48
+ origin=mRNA_structure.first.first-1
49
+ end
50
+ mRNA_structure.each do |exon|
51
+ ref.print "#{(exon[0]-=origin).abs}-#{(exon[1]-=origin).abs};"
52
+ end
53
+ ref.print '|'
54
+ end
55
+ end
56
+ ref.puts
57
+ end
58
+ ref.close
59
+ #puts 'GENES-EXONES'
60
+ #puts genes_exones.inspect
@@ -0,0 +1,25 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'gene_assembler/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "gene_assembler"
8
+ spec.version = GeneAssembler::VERSION
9
+ spec.authors = ["Pedro Seoane"]
10
+ spec.email = ["seoanezonjic@hotmail.es"]
11
+ spec.summary = %q{This gem builds gene models using fragmented genome information.}
12
+ spec.description = %q{Use this siftware with techniques like genecapture}
13
+ spec.homepage = ""
14
+ spec.license = "MIT"
15
+
16
+ spec.files = `git ls-files -z`.split("\x0")
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
+ spec.require_paths = ["lib"]
20
+
21
+ spec.add_runtime_dependency 'scbi_blast', '>= 0.0.43'
22
+ spec.add_runtime_dependency 'scbi_fasta', '>= 0.1.9'
23
+ spec.add_development_dependency "bundler", "~> 1.6"
24
+ spec.add_development_dependency "rake"
25
+ end
@@ -0,0 +1,5 @@
1
+ require "gene_assembler/version"
2
+
3
+ module GeneAssembler
4
+ # Top-level namespace of the gem. Nothing is defined here; the gemspec reads GeneAssembler::VERSION, presumably defined in gene_assembler/version (required above; not verified here).
5
+ end
@@ -0,0 +1,41 @@
1
+ require 'parser'
2
+
3
+ class BlastTypeParser < Parser
4
+
5
+ def initialize(contig_type,hit_type,file,all=FALSE)
6
+ @file=file
7
+ @dataset=create_dataset
8
+ @all=all
9
+ data=parse_file(file)
10
+ load_dataset(data,contig_type,hit_type)
11
+ end
12
+
13
+ def load_dataset(data,contig_type,hit_type) # Introduce datos del blast en clases contig/hit/hsp
14
+ data.querys.each do |item|
15
+ if item.hits.empty? #Descartamos querys q no hayan dado nigun match
16
+ next
17
+ end
18
+ contig=@dataset.add_contig(item.query_def) #query_def -> nombre de la query (nuestro contig)
19
+ contig.length=item.full_query_length #full_query_length -> longitud de la query
20
+ contig.type=contig_type
21
+ populate_extra_atributes(contig,item)
22
+
23
+ last_hit_name=''
24
+ hit=''
25
+ item.hits.each do |ht| #Clasificacion hits del blast en hits-hsps
26
+ if ht.subject_id != last_hit_name #Hit
27
+ hit=contig.add_hit(ht.subject_id, ht.full_subject_length, ht.q_frame, hit_type)
28
+ end
29
+ hsp=hit.add_hsp(ht.q_beg+1, ht.q_end+1, ht.s_beg+1, ht.s_end+1, ht.align_len, ht.score, ht.ident, ht.gaps) # +1 xq gema parser blast resta 1 a todo
30
+ hsp.type='match_part'
31
+ last_hit_name=ht.subject_id
32
+ end
33
+ contig.hits_sort!
34
+ end
35
+ end
36
+
37
+ def populate_extra_atributes(contig,item)
38
+
39
+ end
40
+
41
+ end