bio-polyploid-tools 0.7.3 → 0.8.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (66) hide show
  1. checksums.yaml +5 -5
  2. data/.travis.yml +17 -0
  3. data/Gemfile +10 -7
  4. data/README.md +44 -0
  5. data/Rakefile +14 -14
  6. data/VERSION +1 -1
  7. data/bin/bfr.rb +2 -2
  8. data/bin/blast_triads.rb +166 -0
  9. data/bin/blast_triads_promoters.rb +192 -0
  10. data/bin/find_homoeologue_variations.rb +385 -0
  11. data/bin/get_longest_hsp_blastx_triads.rb +66 -0
  12. data/bin/hexaploid_primers.rb +2 -2
  13. data/bin/homokaryot_primers.rb +2 -2
  14. data/bin/mafft_triads.rb +120 -0
  15. data/bin/mafft_triads_promoters.rb +403 -0
  16. data/bin/polymarker.rb +73 -17
  17. data/bin/polymarker_capillary.rb +416 -0
  18. data/bin/snp_position_to_polymarker.rb +5 -3
  19. data/bin/snps_between_bams.rb +0 -29
  20. data/bin/vcfLineToTable.rb +56 -0
  21. data/bio-polyploid-tools.gemspec +74 -32
  22. data/lib/bio/BFRTools.rb +1 -0
  23. data/lib/bio/PolyploidTools/ChromosomeArm.rb +2 -6
  24. data/lib/bio/PolyploidTools/ExonContainer.rb +31 -8
  25. data/lib/bio/PolyploidTools/NoSNPSequence.rb +286 -0
  26. data/lib/bio/PolyploidTools/PrimerRegion.rb +9 -1
  27. data/lib/bio/PolyploidTools/SNP.rb +58 -18
  28. data/lib/bio/PolyploidTools/SNPMutant.rb +5 -3
  29. data/lib/bio/db/blast.rb +112 -0
  30. data/lib/bio/db/exonerate.rb +4 -5
  31. data/lib/bio/db/primer3.rb +83 -14
  32. data/test/data/BS00068396_51_blast.tab +4 -0
  33. data/test/data/BS00068396_51_contigs.nhr +0 -0
  34. data/test/data/BS00068396_51_contigs.nin +0 -0
  35. data/test/data/BS00068396_51_contigs.nsq +0 -0
  36. data/test/data/BS00068396_51_for_polymarker.fa +1 -0
  37. data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
  38. data/test/data/S22380157.vcf +67 -0
  39. data/test/data/S58861868/LIB1716.bam +0 -0
  40. data/test/data/S58861868/LIB1716.sam +651 -0
  41. data/test/data/S58861868/LIB1719.bam +0 -0
  42. data/test/data/S58861868/LIB1719.sam +805 -0
  43. data/test/data/S58861868/LIB1721.bam +0 -0
  44. data/test/data/S58861868/LIB1721.sam +1790 -0
  45. data/test/data/S58861868/LIB1722.bam +0 -0
  46. data/test/data/S58861868/LIB1722.sam +1271 -0
  47. data/test/data/S58861868/S58861868.fa +16 -0
  48. data/test/data/S58861868/S58861868.fa.fai +1 -0
  49. data/test/data/S58861868/S58861868.vcf +76 -0
  50. data/test/data/S58861868/header.txt +9 -0
  51. data/test/data/S58861868/merged.bam +0 -0
  52. data/test/data/S58861868/merged_reheader.bam +0 -0
  53. data/test/data/S58861868/merged_reheader.bam.bai +0 -0
  54. data/test/data/bfr_out_test.csv +5 -5
  55. data/test/data/headerMergeed.txt +9 -0
  56. data/test/data/headerS2238015 +1 -0
  57. data/test/data/mergedLibs.bam +0 -0
  58. data/test/data/mergedLibsReheader.bam +0 -0
  59. data/test/data/mergedLibsSorted.bam +0 -0
  60. data/test/data/mergedLibsSorted.bam.bai +0 -0
  61. data/test/test_bfr.rb +26 -34
  62. data/test/test_blast.rb +47 -0
  63. data/test/test_exonearate.rb +4 -9
  64. data/test/test_snp_parsing.rb +42 -22
  65. metadata +81 -20
  66. data/Gemfile.lock +0 -67
@@ -0,0 +1,385 @@
1
+ #!/usr/bin/env ruby
2
+ require 'bio'
3
+ require 'rubygems'
4
+ require 'pathname'
5
+ require 'bio-samtools'
6
+ require 'optparse'
7
+ require 'set'
8
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
9
+ $: << File.expand_path('.')
10
+ path= File.expand_path(File.dirname(__FILE__) + '/../lib/bioruby-polyploid-tools.rb')
11
+ require path
12
+
13
# Prints +msg+ to standard output, prefixed with the current local time
# (millisecond resolution) and a colon.
#
# @param msg [#to_s] text to report
# @return [nil]
def log(msg)
  stamp = Time.now.strftime("%Y-%m-%d %H:%M:%S.%L")
  puts format("%s: %s", stamp, msg)
end
17
+
18
+
19
# Script-local reopening of ExonContainer that (re)defines add_alignments.
# NOTE(review): the class itself is presumably defined by the library loaded
# at the top of this script; only this method is visible here.
class Bio::PolyploidTools::ExonContainer
  # Reads an exonerate result file line by line and, for every alignment that
  # passes the identity threshold, registers the exon covering each SNP of the
  # aligned query on that SNP.
  #
  # @param opts [Hash]
  #   :exonerate_file - path of the file to read
  #   :min_identity   - minimum alignment identity, inclusive (default 90)
  #   :arm_selection  - callable mapping a target id to the label passed to
  #                     snp.add_exon; defaults to the first three characters
  def add_alignments(opts=Hash.new)
    opts = { :min_identity=>90 }.merge!(opts)
    exonerate_filename = opts[:exonerate_file]
    arm_selection = opts[:arm_selection] || lambda { |contig_name| contig_name[0,3] }

    File.open(exonerate_filename) do |f|
      f.each_line do |line|
        record = Bio::DB::Exonerate::Alignment.parse_custom(line)
        next unless record and record.identity >= opts[:min_identity]

        snps = @snp_map[record.query_id]
        next if snps.nil?

        snps.each do |snp|
          # Only SNPs that fall inside the aligned part of the query are used.
          next unless snp != nil and snp.position.between?( (record.query_start + 1) , record.query_end)
          begin
            exon = record.exon_on_gene_position(snp.position)
            snp.add_exon(exon, arm_selection.call(record.target_id))
          rescue Bio::DB::Exonerate::ExonerateException
            # Best effort: report the failed position and carry on.
            $stderr.puts "Failed for the range #{record.query_start}-#{record.query_end} for position #{snp.position}"
          end
        end
      end
    end
  end
end
54
+
55
# Script-local reopening of Primer3::SNP that overrides its string form.
class Bio::DB::Primer3::SNP
  # @return [String] "<gene>:<chromosome of snp_from>"
  def to_s
    chromosome = snp_from.chromosome
    "#{gene}:#{chromosome}"
  end
end
62
# Script-local reopening of Primer3Record that replaces primer-pair selection,
# header parsing, scoring and string conversion for this script.
class Bio::DB::Primer3::Primer3Record

  # Picks (and memoizes) a primer pair from @primerPairs.
  #
  # A candidate replaces the current choice when it has fewer capital letters
  # (counted over left + right sequences) or when its size is smaller than the
  # current choice's size. @total_caps is updated to the capital count of the
  # chosen pair and stays at 100 when there are no pairs.
  #
  # @return the selected pair, or nil when @primerPairs is empty
  def best_pair
    return @best_pair if @best_pair
    @best_pair = nil
    @total_caps = 100
    @primerPairs.each do |candidate|
      caps = "#{candidate.left.sequence}#{candidate.right.sequence}".scan(/[A-Z]/).size
      # The first candidate is always taken; later ones must beat the current
      # choice on capital count or on size.
      if @best_pair.nil? || caps < @total_caps || candidate.size < @best_pair.size
        @best_pair = candidate
        @total_caps = caps
      end
    end
    @best_pair
  end

  # Splits sequence_id on spaces into, in order: snp, line, type, in,
  # polymorphism, chromosome and orientation.
  #
  # Example header carried over from the original comment:
  #   CL3339Contig1:T509C AvocetS chromosome_specific exon 4D forward
  # NOTE(review): that example has six fields while seven are read here, and
  # the missing-@in branch sets @exon rather than @in - confirm both against
  # the library's Primer3Record.
  def parse_header
    @snp, @line, @type, @in, @polymorphism, @chromosome, @orientation = self.sequence_id.split(" ")
    @type = @type.to_sym
    if @in
      @in = @in.to_sym == :exon
    else
      @exon = false
    end

    @homoeologous = (@polymorphism.to_sym == :homoeologous)
    @parsed = true
    @orientation = @orientation.to_sym
  end

  # Score of this record: the configured score for its type, plus the :exon
  # score when exon? holds, minus 10 per capital letter in the best pair and
  # minus the product length.
  def score
    best_pair # makes sure @total_caps is populated
    total = 0
    total += @scores[type]
    total += @scores[:exon] if exon?
    total - (@total_caps * 10) - product_length
  end

  # @return [String] "<gene>:<chromosome of snp_from>"
  def to_s
    "#{gene}:#{snp_from.chromosome}"
  end

  # Returns a fresh copy of left_primer. The +snp+ argument is not used: the
  # logic that swapped the primer's last base between the original and SNP
  # alleles (honouring orientation) was commented out in the original script.
  def left_primer_snp(snp)
    String.new(left_primer)
  end

end
149
+
150
# Named strategies that turn a contig/scaffold name into the label used to
# group alignments.
arm_selection_functions = {
  # First two characters of the name.
  :arm_selection_first_two => lambda { |contig_name| contig_name[0, 2] },

  # Parses names like "IWGSC_CSS_1AL_scaff_110" (first two characters of the
  # third underscore-separated field), pseudomolecule headers like "1A" (first
  # two characters of the name), and the special case where 3B is named with
  # the prefix "v443". Anything else yields "U".
  :arm_selection_embl => lambda do |contig_name|
    parts = contig_name.split('_')
    if parts.size >= 3
      parts[2][0, 2]
    elsif parts.size == 2 and parts[0] == "v443"
      "3B"
    elsif parts.size == 1
      parts[0][0, 2]
    else
      "U"
    end
  end,

  # Second underscore-separated field of the text before the first ':'.
  :arm_selection_morex => lambda { |contig_name| contig_name.split(':')[0].split("_")[1] },

  # The name itself, unchanged.
  :scaffold => lambda { |contig_name| contig_name }
}
181
+
182
markers = nil

# Default settings; the command-line parser below overrides some of them.
options = {
  :model => "est2genome",
  :min_identity => 90,
  :extract_found_contigs => false,
  :arm_selection => arm_selection_functions[:arm_selection_embl],
  :genomes_count => 3,
  # Settings handed to Bio::DB::Primer3.prepare_input_file further down.
  :primer_3_preferences => {
    :primer_product_size_range => "50-150" ,
    :primer_max_size => 25 ,
    :primer_lib_ambiguity_codes_consensus => 1,
    :primer_liberal_base => 1,
    :primer_num_return=>5,
    :primer_explain_flag => 1,
    :primer_thermodynamic_parameters_path=>File.expand_path(File.dirname(__FILE__) + '../../conf/primer3_config/') + '/'
  }
}
201
+
202
+
203
# Command-line interface: each switch stores its value in +options+.
OptionParser.new do |opts|

  opts.banner = "Usage: find_homoeologue_variations.rb [options]"

  # Help text fixed: it previously read "searc".
  opts.on("-c", "--sequences FASTA", "Sequence of the region to search") do |o|
    options[:sequences] = o
  end
  opts.on("-r", "--reference FASTA", "reference with the contigs") do |o|
    options[:reference] = o
  end
  opts.on("-o", "--output DIR", "Directory to write the output") do |o|
    options[:output] = o
  end

  opts.on("-g", "--genomes_count INT", "Number of genomes (default 3, for hexaploid)") do |o|
    options[:genomes_count] = o.to_i
  end

  # Flag without a value: its presence alone enables the extra output.
  opts.on("-x", "--extract_found_contigs", "If present, save in a separate file the contigs with matches. Useful to debug.") do |o|
    options[:extract_found_contigs] = true
  end

end.parse!
226
+ #reference="/Users/ramirezr/Documents/TGAC/references/Triticum_aestivum.IWGSP1.21.dna_rm.genome.fa"
227
# Required arguments. The original wrapped each raise in a `throw` that could
# never run and raised a bare Exception; ArgumentError states the problem and
# stays within StandardError.
reference = options[:reference]
raise ArgumentError, "Reference has to be provided" unless reference
sequences = options[:sequences]
raise ArgumentError, "Fasta file with sequences has to be provided" unless sequences
output_folder = options[:output]
raise ArgumentError, "An output directory has to be provided" unless output_folder
model = options[:model]
# Dir.mkdir raises if the directory already exists, so an earlier run's
# output is never written over.
Dir.mkdir(output_folder)
min_identity = options[:min_identity]

# Files produced inside the output directory.
exonerate_file = "#{output_folder}/exonerate_tmp.tab"
temp_contigs = "#{output_folder}/contigs_tmp.fa"
primer_3_input = "#{output_folder}/primer_3_input_temp"
primer_3_output = "#{output_folder}/primer_3_output_temp"
exons_filename = "#{output_folder}/exons_genes_and_contigs.fa"
output_primers = "#{output_folder}/primers.csv"
output_to_order = "#{output_folder}/primers_to_order.csv"
244
+
245
# Open the reference and load its .fai entries so that matched contigs can be
# fetched by name later on.
fasta_file = Bio::DB::Fasta::FastaFile.new({:fasta=>reference})
fasta_file.load_fai_entries

# Labels of the two "lines" used for the parentals and the output columns.
original_name = "A"
snp_in = "B"

# Fall back to the first three characters of the contig name when no
# arm-selection function was configured.
arm_selection = options[:arm_selection] || lambda { |contig_name| contig_name[0,3] }
259
# Main pipeline: read the query sequences, align them against the reference
# with exonerate, hand the alignments to an ExonContainer, run primer3 and
# write the primer tables and per-marker FASTA files.
begin
  log "Reading exons"
  exons = []
  Bio::FlatFile.auto(sequences) do |ff|
    ff.each do |entry|
      # NoSNPSequence.parse takes one comma-separated line:
      # definition, arm label, sequence.
      fields = [entry.definition, arm_selection.call(entry.definition), entry.seq]
      snp = Bio::PolyploidTools::NoSNPSequence.parse(fields.join(","))
      snp.genomes_count = options[:genomes_count]
      exons << snp
    end
  end

  log "Searching markers in genome"
  found_contigs = Set.new
  exo_f = File.open(exonerate_file, "w")
  contigs_f = File.open(temp_contigs, "w") if options[:extract_found_contigs]
  begin
    Bio::DB::Exonerate.align({:query=>sequences, :target=>reference, :model=>model}) do |aln|
      next unless aln.identity > min_identity
      exo_f.puts aln.line
      # We only add once each contig. Should reduce the size of the output file.
      next if found_contigs.include?(aln.target_id)
      found_contigs.add(aln.target_id)
      entry = fasta_file.index.region_for_entry(aln.target_id)
      # Fixed: the message used an undefined local (target_id) and an
      # unqualified exception class, so this path raised NameError instead.
      if entry.nil?
        raise Bio::DB::Exonerate::ExonerateException,
              "Entry not found! #{aln.target_id}. Make sure that the #{reference}.fai was generated properly."
      end
      region = entry.get_full_region
      seq = fasta_file.fetch_sequence(region)
      contigs_f.puts(">#{aln.target_id}\n#{seq}") if options[:extract_found_contigs]
    end
  ensure
    # Close the output files even when the alignment step fails.
    exo_f.close
    contigs_f.close if contigs_f
  end

  log "Reading best alignment on each chromosome"

  container = Bio::PolyploidTools::ExonContainer.new
  # NOTE(review): :flanking_size and :variation_free_region are never set by
  # this script's options, so nil is passed for both - confirm that is intended.
  container.flanking_size = options[:flanking_size]
  container.gene_models(sequences)
  container.chromosomes(temp_contigs)
  container.add_parental({:name=>"A"})
  container.add_parental({:name=>"B"})
  exons.each do |exon|
    exon.container = container
    exon.flanking_size = 50
    exon.variation_free_region = options[:variation_free_region]
    container.add_snp(exon)
  end
  container.add_alignments({:exonerate_file=>exonerate_file, :arm_selection=>options[:arm_selection] , :min_identity=>min_identity})

  #4.1 generating primer3 file
  log "Running primer3"
  File.open(exons_filename, "w") { |file| container.print_fasta_snp_exones(file) }

  added_exons = File.open(primer_3_input, "w") do |file|
    Bio::DB::Primer3.prepare_input_file(file, options[:primer_3_preferences])
    container.print_primer_3_exons(file, nil, snp_in)
  end

  # primer3 is only run when at least one exon was written to its input.
  Bio::DB::Primer3.run({:in=>primer_3_input, :out=>primer_3_output}) if added_exons > 0

  #5. Pick the best primer and make the primer3 output
  log "Selecting best primers"
  kasp_container = Bio::DB::Primer3::KASPContainer.new
  kasp_container.line_1 = original_name
  kasp_container.line_2 = snp_in

  # NOTE(review): no command-line switch sets :scoring in this script.
  if options[:scoring] == :het_dels
    kasp_container.scores = Hash.new
    kasp_container.scores[:chromosome_specific] = 0
    kasp_container.scores[:chromosome_semispecific] = 1000
    kasp_container.scores[:chromosome_nonspecific] = 100
  end

  exons.each { |snp| kasp_container.add_snp(snp) }

  kasp_container.add_primers_file(primer_3_output) if added_exons > 0
  header = "Marker,SNP,RegionSize,chromosome,total_contigs,contig_regions,SNP_type,#{original_name},#{snp_in},common,primer_type,orientation,#{original_name}_TM,#{snp_in}_TM,common_TM,selected_from,product_size,errors"
  File.open(output_primers, 'w') { |f| f.write("#{header}\n#{kasp_container.print_primers}") }

  # One FASTA file per marker with its realigned primers.
  kasp_container.snp_hash.each_pair do |name, kaspSNP|
    out_fasta_products = "#{output_folder}/#{name}.fa"
    File.open(out_fasta_products, 'w') { |f| f.write(kaspSNP.realigned_primers_fasta) }
  end

  File.open(output_to_order, "w") { |io| io.write(kasp_container.print_primers_with_tails()) }

  log "DONE"
rescue Exception => e
  # The original had separate StandardError and Exception clauses with the
  # same body. Everything is logged and then re-raised, so nothing is swallowed.
  log "ERROR\t#{e.message}"
  $stderr.puts e.backtrace
  raise
end
377
+ #puts container.inspect
378
+
379
+ #container.snp_map.each do | gene, snp_array|
380
+ # snp_array.each do |e|
381
+ # puts e.inspect
382
+ # puts e.aligned_sequences_fasta
383
+ # end
384
+ #end
385
+
@@ -0,0 +1,66 @@
1
+ #!/usr/bin/env ruby
2
+ require 'optparse'
3
+ require 'bio'
4
+ require 'csv'
5
+ #$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
6
+ #$: << File.expand_path('.')
7
+ #path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polyploid-tools.rb')
8
+ #require path
9
+
10
# Defaults; "-" for :blastx means "read the report from ARGF".
options = {
  :identity => 50,
  :min_bases => 200,
  :blastx => "-"
}

parser = OptionParser.new do |opts|

  # Fixed: the banner named a different script (filter_blat.rb).
  opts.banner = "Usage: get_longest_hsp_blastx_triads.rb [options]"

  opts.on("-p", "--blastx FILE", "BLAST XML file") do |o|
    options[:blastx] = o
  end
  opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
    options[:identity] = o.to_f
  end
  opts.on("-c", "--min_bases int", "Minimum alignment length (default 200)") do |o|
    options[:min_bases] = o.to_i
  end

  opts.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |o|
    options[:triads] = o
  end

end
parser.parse!
34
+
35
# Lookup tables built from the triads CSV (columns 'A', 'B' and 'D'):
# A -> B, A -> D and B -> D.
valid_pairs_A_B = {}
valid_pairs_A_D = {}
valid_pairs_B_D = {}

# Without this check a missing --triads surfaces as a TypeError from CSV.
raise ArgumentError, "A CSV file with the triads has to be provided (--triads)" unless options[:triads]

CSV.foreach(options[:triads], headers: true) do |row|
  a, b, d = row['A'], row['B'], row['D']
  valid_pairs_A_B[a] = b
  valid_pairs_A_D[a] = d
  valid_pairs_B_D[b] = d
end
44
+
45
# Read the BLAST report from ARGF when --blastx is "-", otherwise from the
# named file. Fixed: IO.open expects a file descriptor, not a path.
read_from_argf = options[:blastx] == "-"
stream = read_from_argf ? ARGF : File.open(options[:blastx])
puts "Loaded #{valid_pairs_B_D.length} triads"
$stdout.flush

begin
  blast_report = Bio::FlatFile.new(Bio::Blast::Report, stream)

  blast_report.each_entry do |report|
    puts "Hits for " + report.query_def + " against " + report.db
    $stdout.flush
    report.each do |hit|
      # Ids are compared on the part before the first "-".
      query = hit.query_id.split("-")[0]
      target = hit.target_id.split("-")[0]
      # Keep only hits whose query/target form one of the known pairs.
      next unless [valid_pairs_A_B, valid_pairs_A_D, valid_pairs_B_D].any? { |pairs| pairs[query] == target }

      # Fixed: puts with several arguments printed each one on its own line;
      # emit a single tab-separated line instead.
      puts "#{hit.target_id}\t#{hit.evalue}" if hit.evalue < 0.001
      puts hit.inspect
    end
  end
ensure
  # Fixed: the original tested options[:blat_file], a key that is never set.
  stream.close unless read_from_argf
end