bio-polyploid-tools 0.7.3 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. checksums.yaml +5 -5
  2. data/.travis.yml +17 -0
  3. data/Gemfile +10 -7
  4. data/README.md +44 -0
  5. data/Rakefile +14 -14
  6. data/VERSION +1 -1
  7. data/bin/bfr.rb +2 -2
  8. data/bin/blast_triads.rb +166 -0
  9. data/bin/blast_triads_promoters.rb +192 -0
  10. data/bin/find_homoeologue_variations.rb +385 -0
  11. data/bin/get_longest_hsp_blastx_triads.rb +66 -0
  12. data/bin/hexaploid_primers.rb +2 -2
  13. data/bin/homokaryot_primers.rb +2 -2
  14. data/bin/mafft_triads.rb +120 -0
  15. data/bin/mafft_triads_promoters.rb +403 -0
  16. data/bin/polymarker.rb +73 -17
  17. data/bin/polymarker_capillary.rb +416 -0
  18. data/bin/snp_position_to_polymarker.rb +5 -3
  19. data/bin/snps_between_bams.rb +0 -29
  20. data/bin/vcfLineToTable.rb +56 -0
  21. data/bio-polyploid-tools.gemspec +74 -32
  22. data/lib/bio/BFRTools.rb +1 -0
  23. data/lib/bio/PolyploidTools/ChromosomeArm.rb +2 -6
  24. data/lib/bio/PolyploidTools/ExonContainer.rb +31 -8
  25. data/lib/bio/PolyploidTools/NoSNPSequence.rb +286 -0
  26. data/lib/bio/PolyploidTools/PrimerRegion.rb +9 -1
  27. data/lib/bio/PolyploidTools/SNP.rb +58 -18
  28. data/lib/bio/PolyploidTools/SNPMutant.rb +5 -3
  29. data/lib/bio/db/blast.rb +112 -0
  30. data/lib/bio/db/exonerate.rb +4 -5
  31. data/lib/bio/db/primer3.rb +83 -14
  32. data/test/data/BS00068396_51_blast.tab +4 -0
  33. data/test/data/BS00068396_51_contigs.nhr +0 -0
  34. data/test/data/BS00068396_51_contigs.nin +0 -0
  35. data/test/data/BS00068396_51_contigs.nsq +0 -0
  36. data/test/data/BS00068396_51_for_polymarker.fa +1 -0
  37. data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
  38. data/test/data/S22380157.vcf +67 -0
  39. data/test/data/S58861868/LIB1716.bam +0 -0
  40. data/test/data/S58861868/LIB1716.sam +651 -0
  41. data/test/data/S58861868/LIB1719.bam +0 -0
  42. data/test/data/S58861868/LIB1719.sam +805 -0
  43. data/test/data/S58861868/LIB1721.bam +0 -0
  44. data/test/data/S58861868/LIB1721.sam +1790 -0
  45. data/test/data/S58861868/LIB1722.bam +0 -0
  46. data/test/data/S58861868/LIB1722.sam +1271 -0
  47. data/test/data/S58861868/S58861868.fa +16 -0
  48. data/test/data/S58861868/S58861868.fa.fai +1 -0
  49. data/test/data/S58861868/S58861868.vcf +76 -0
  50. data/test/data/S58861868/header.txt +9 -0
  51. data/test/data/S58861868/merged.bam +0 -0
  52. data/test/data/S58861868/merged_reheader.bam +0 -0
  53. data/test/data/S58861868/merged_reheader.bam.bai +0 -0
  54. data/test/data/bfr_out_test.csv +5 -5
  55. data/test/data/headerMergeed.txt +9 -0
  56. data/test/data/headerS2238015 +1 -0
  57. data/test/data/mergedLibs.bam +0 -0
  58. data/test/data/mergedLibsReheader.bam +0 -0
  59. data/test/data/mergedLibsSorted.bam +0 -0
  60. data/test/data/mergedLibsSorted.bam.bai +0 -0
  61. data/test/test_bfr.rb +26 -34
  62. data/test/test_blast.rb +47 -0
  63. data/test/test_exonearate.rb +4 -9
  64. data/test/test_snp_parsing.rb +42 -22
  65. metadata +81 -20
  66. data/Gemfile.lock +0 -67
@@ -0,0 +1,385 @@
1
+ #!/usr/bin/env ruby
2
+ require 'bio'
3
+ require 'rubygems'
4
+ require 'pathname'
5
+ require 'bio-samtools'
6
+ require 'optparse'
7
+ require 'set'
8
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
9
+ $: << File.expand_path('.')
10
+ path= File.expand_path(File.dirname(__FILE__) + '/../lib/bioruby-polyploid-tools.rb')
11
+ require path
12
+
13
# Writes +msg+ to standard output, prefixed with the current local time
# (formatted as "YYYY-MM-DD HH:MM:SS.mmm") followed by ": ".
#
# @param msg [#to_s] text to report
# @return [nil]
def log(msg)
  stamp = Time.now.strftime("%Y-%m-%d %H:%M:%S.%L")
  puts format("%s: %s", stamp, msg)
end
17
+
18
+
19
class Bio::PolyploidTools::ExonContainer
  # Reads exonerate alignments from a file and attaches the matching exon to
  # every SNP registered in @snp_map whose position falls inside the aligned
  # part of the query.
  #
  # Recognised keys in +opts+:
  # * :exonerate_file - path of the file holding one exonerate record per line.
  # * :min_identity   - records with a lower identity are skipped (default 90).
  # * :arm_selection  - callable mapping a target (contig) id to the label
  #   handed to snp.add_exon; defaults to the first three characters of the id.
  #
  # Positions that the record cannot map to an exon are reported on $stderr
  # and skipped.
  def add_alignments(opts=Hash.new)
    opts = { :min_identity=>90 }.merge!(opts)
    alignments_path = opts[:exonerate_file]
    arm_selection = opts[:arm_selection] || lambda { |contig_name| contig_name[0,3] }

    File.open(alignments_path) do |io|
      io.each_line do |line|
        record = Bio::DB::Exonerate::Alignment.parse_custom(line)
        next unless record && record.identity >= opts[:min_identity]

        snps = @snp_map[record.query_id]
        next if snps.nil?

        snps.each do |snp|
          next if snp.nil?
          # query_start is shifted by one before the inclusive range check.
          next unless snp.position.between?((record.query_start + 1), record.query_end)

          begin
            exon = record.exon_on_gene_position(snp.position)
            snp.add_exon(exon, arm_selection.call(record.target_id))
          rescue Bio::DB::Exonerate::ExonerateException
            $stderr.puts "Failed for the range #{record.query_start}-#{record.query_end} for position #{snp.position}"
          end
        end
      end
    end
  end
end
54
+
55
class Bio::DB::Primer3::SNP

  # Short label for this entry: the gene followed by the chromosome of the
  # originating SNP, separated by a colon.
  def to_s
    format("%s:%s", gene, snp_from.chromosome)
  end

end
62
class Bio::DB::Primer3::Primer3Record

  # Returns the preferred primer pair, memoised in @best_pair.
  #
  # Walking @primerPairs in order, a candidate replaces the current choice
  # when there is no choice yet, when its two primer sequences contain fewer
  # upper-case letters than the current choice, or when its size is smaller
  # than the current choice. @total_caps ends up holding the upper-case count
  # of the chosen pair (100 when there are no pairs at all).
  def best_pair
    return @best_pair if @best_pair
    @best_pair = nil
    @total_caps = 100
    @primerPairs.each do |candidate|
      caps = "#{candidate.left.sequence}#{candidate.right.sequence}".scan(/[A-Z]/).size
      better = @best_pair.nil? || caps < @total_caps || candidate.size < @best_pair.size
      next unless better
      @best_pair = candidate
      @total_caps = caps
    end
    #@best_pair = @primerPairs.min
    @best_pair
  end

  # Splits the space-separated sequence_id into the record's header fields.
  #
  # Example carried over from the original comment:
  #   CL3339Contig1:T509C AvocetS chromosome_specific exon 4D forward
  # NOTE(review): that example has six fields while seven variables are
  # unpacked below, so it may predate the polymorphism field - confirm.
  def parse_header
    @snp, @line, @type, @in, @polymorphism, @chromosome, @orientation = self.sequence_id.split(" ")
    @type = @type.to_sym
    if @in
      @in = (@in.to_sym == :exon)
    else
      # NOTE(review): this branch sets @exon while the other one sets @in.
      @exon = false
    end

    @homoeologous = (@polymorphism.to_sym == :homoeologous)
    @parsed = true
    @orientation = @orientation.to_sym
  end

  # Score of this record: the configured score for its type, plus the :exon
  # bonus when exon? holds, minus ten points per upper-case base counted for
  # the best pair, minus the product length.
  def score
    best_pair # makes sure @total_caps has been computed
    # puts "score"
    # puts self.inspect
    total = 0 + @scores[type]
    total += @scores[:exon] if exon?
    total - (@total_caps * 10) - product_length
  end

  # Short label: the gene followed by the chromosome of the originating SNP.
  def to_s
    format("%s:%s", gene, snp_from.chromosome)
  end

  # Returns a fresh copy of the left primer. +snp+ is currently unused: the
  # original code that swapped the last base between the two alleles
  # (depending on the orientation) is disabled in this override.
  def left_primer_snp(snp)
    String.new(left_primer)
  end

end
149
+
150
# Registry of strategies that turn a contig/scaffold name into the label used
# to group alignments (chromosome, chromosome arm or the scaffold itself).
arm_selection_functions = {}

# First two characters of the contig name.
arm_selection_functions[:arm_selection_first_two] = lambda { |contig_name| contig_name[0, 2] }

# Function to parse names like "IWGSC_CSS_1AL_scaff_110" (first two characters
# of the third '_'-separated field), or the first two characters of the contig
# name, to deal with pseudomolecules that start with headers like "1A".
# Two-field names are "3B" when the prefix is "v443"; anything else is "U".
arm_selection_functions[:arm_selection_embl] = lambda do |contig_name|
  parts = contig_name.split('_')
  if parts.size >= 3
    parts[2][0, 2]
  elsif parts.size == 2 && parts[0] == "v443"
    "3B"
  elsif parts.size == 1
    parts[0][0, 2]
  else
    "U"
  end
end

# Second '_'-separated field of whatever precedes the first ':'.
arm_selection_functions[:arm_selection_morex] = lambda do |contig_name|
  contig_name.split(':').first.split("_")[1]
end

# The contig name itself, unchanged.
arm_selection_functions[:scaffold] = lambda { |contig_name| contig_name }
181
+
182
markers = nil

# Default settings; the command-line parser further down overrides some of them.
options = {}
options[:model] = "est2genome"
options[:min_identity] = 90
options[:extract_found_contigs] = false
options[:arm_selection] = arm_selection_functions[:arm_selection_embl]
options[:genomes_count] = 3

# Settings handed to Bio::DB::Primer3.prepare_input_file.
#
# The thermodynamic parameters directory is resolved relative to the directory
# holding this script (<script dir>/../conf/primer3_config/). The previous
# expression concatenated dirname and '../../conf/...' without a separator,
# which only worked by accident and pointed inside the current directory when
# the script was started as `ruby find_homoeologue_variations.rb` (dirname ".").
options[:primer_3_preferences] = {
  :primer_product_size_range => "50-150" ,
  :primer_max_size => 25 ,
  :primer_lib_ambiguity_codes_consensus => 1,
  :primer_liberal_base => 1,
  :primer_num_return=>5,
  :primer_explain_flag => 1,
  :primer_thermodynamic_parameters_path=>File.expand_path('../conf/primer3_config', File.dirname(__FILE__)) + '/'
}
201
+
202
+
203
# Command-line interface: every switch stores its value in +options+, and
# parse! removes the recognised switches from ARGV.
cli = OptionParser.new
cli.banner = "Usage: find_homoeologue_variations.rb [options]"

cli.on("-c", "--sequences FASTA", "Sequence of the region to searc") { |value| options[:sequences] = value }
cli.on("-r", "--reference FASTA", "reference with the contigs") { |value| options[:reference] = value }
cli.on("-o", "--output DIR", "Directory to write the output") { |value| options[:output] = value }

# The count is converted with to_i, so non-numeric input becomes 0.
cli.on("-g", "--genomes_count INT", "Number of genomes (default 3, for hexaploid)") { |value| options[:genomes_count] = value.to_i }

cli.on("-x", "--extract_found_contigs", "If present, save in a separate file the contigs with matches. Useful to debug.") { |_flag| options[:extract_found_contigs] = true }

cli.parse!
226
+ #reference="/Users/ramirezr/Documents/TGAC/references/Triticum_aestivum.IWGSP1.21.dna_rm.genome.fa"
227
# Mandatory arguments. Each one aborts the script with an ArgumentError when
# missing (the former `throw raise Exception.new(), msg` never reached `throw`
# and raised a bare Exception).
reference = options[:reference]
raise ArgumentError, "Reference has to be provided" unless reference
sequences = options[:sequences]
raise ArgumentError, "Fasta file with sequences has to be provided" unless sequences
output_folder = options[:output]
raise ArgumentError, "An output directory has to be provided" unless output_folder
model = options[:model]
# Dir.mkdir raises if the directory already exists, so an earlier run is
# never overwritten.
Dir.mkdir(output_folder)
min_identity = options[:min_identity]

# Files produced inside the output directory.
exonerate_file = "#{output_folder}/exonerate_tmp.tab"
temp_contigs = "#{output_folder}/contigs_tmp.fa"
primer_3_input = "#{output_folder}/primer_3_input_temp"
primer_3_output = "#{output_folder}/primer_3_output_temp"
exons_filename = "#{output_folder}/exons_genes_and_contigs.fa"
output_primers = "#{output_folder}/primers.csv"
output_to_order = "#{output_folder}/primers_to_order.csv"
244
+
245
# Reference FASTA with its .fai entries loaded; it is used later to look up
# and fetch the contigs that have alignments.
fasta_file = Bio::DB::Fasta::FastaFile.new({:fasta=>reference})
fasta_file.load_fai_entries

# Names of the two lines; they also label the columns of the primer tables.
original_name = "A"
snp_in = "B"

# Fall back to the first three characters of the contig name when no arm
# selection strategy was configured.
arm_selection = options[:arm_selection] || lambda { |contig_name| contig_name[0, 3] }
259
# Main pipeline: read the query sequences, align them to the reference with
# exonerate, collect the alignments per sequence, run primer3 and write the
# primer tables. Any failure is logged with its backtrace and re-raised.
begin
  log "Reading exons"
  exons = Array.new
  Bio::FlatFile.auto(sequences) do |ff|
    ff.each do |entry|
      # NoSNPSequence.parse takes a comma-separated line: name, arm label, sequence.
      fields = [entry.definition, arm_selection.call(entry.definition), entry.seq]
      snp = Bio::PolyploidTools::NoSNPSequence.parse(fields.join(","))
      snp.genomes_count = options[:genomes_count]
      exons << snp
    end
  end

  log "Searching markers in genome"
  found_contigs = Set.new
  # Block form closes the exonerate output even when the alignment fails.
  File.open(exonerate_file, "w") do |exo_f|
    contigs_f = File.open(temp_contigs, "w") if options[:extract_found_contigs]
    begin
      Bio::DB::Exonerate.align({:query=>sequences, :target=>reference, :model=>model}) do |aln|
        next unless aln.identity > min_identity
        exo_f.puts aln.line
        # We only add once each contig. Should reduce the size of the output file.
        next if found_contigs.include?(aln.target_id)
        found_contigs.add(aln.target_id)
        entry = fasta_file.index.region_for_entry(aln.target_id)
        if entry.nil?
          # The original message interpolated an undefined local (target_id)
          # and used an unqualified exception constant, so it failed with a
          # NameError instead of reporting the missing index entry.
          raise Bio::DB::Exonerate::ExonerateException, "Entry not found! #{aln.target_id}. Make sure that the #{reference}.fai was generated properly."
        end
        region = entry.get_full_region
        seq = fasta_file.fetch_sequence(region)
        contigs_f.puts(">#{aln.target_id}\n#{seq}") if contigs_f
      end
    ensure
      contigs_f.close if contigs_f
    end
  end

  log "Reading best alignment on each chromosome"

  container = Bio::PolyploidTools::ExonContainer.new
  # NOTE(review): :flanking_size and :variation_free_region are never set in
  # this script, so nil is passed along - confirm the library defaults.
  container.flanking_size = options[:flanking_size]
  container.gene_models(sequences)
  # NOTE(review): temp_contigs is only written when --extract_found_contigs is
  # given; confirm that chromosomes() tolerates a missing file.
  container.chromosomes(temp_contigs)
  container.add_parental({:name=>"A"})
  container.add_parental({:name=>"B"})
  exons.each do |exon|
    exon.container = container
    exon.flanking_size = 50
    exon.variation_free_region = options[:variation_free_region]
    # puts exon.inspect
    container.add_snp(exon)
  end
  container.add_alignments({:exonerate_file=>exonerate_file, :arm_selection=>options[:arm_selection] , :min_identity=>min_identity})

  #4.1 generating primer3 file
  log "Running primer3"
  File.open(exons_filename, "w") { |file| container.print_fasta_snp_exones(file) }

  added_exons = File.open(primer_3_input, "w") do |file|
    Bio::DB::Primer3.prepare_input_file(file, options[:primer_3_preferences])
    container.print_primer_3_exons(file, nil, snp_in)
  end

  Bio::DB::Primer3.run({:in=>primer_3_input, :out=>primer_3_output}) if added_exons > 0

  #5. Pick the best primer and make the primer3 output
  log "Selecting best primers"
  kasp_container = Bio::DB::Primer3::KASPContainer.new
  kasp_container.line_1 = original_name
  kasp_container.line_2 = snp_in

  # NOTE(review): no command-line switch sets :scoring in this script.
  if options[:scoring] == :het_dels
    kasp_container.scores = Hash.new
    kasp_container.scores[:chromosome_specific] = 0
    kasp_container.scores[:chromosome_semispecific] = 1000
    kasp_container.scores[:chromosome_nonspecific] = 100
  end

  exons.each { |snp| kasp_container.add_snp(snp) }

  kasp_container.add_primers_file(primer_3_output) if added_exons > 0
  header = "Marker,SNP,RegionSize,chromosome,total_contigs,contig_regions,SNP_type,#{original_name},#{snp_in},common,primer_type,orientation,#{original_name}_TM,#{snp_in}_TM,common_TM,selected_from,product_size,errors"
  File.open(output_primers, 'w') { |f| f.write("#{header}\n#{kasp_container.print_primers}") }

  # One FASTA file per marker with its realigned primers.
  kasp_container.snp_hash.each_pair do |name, kaspSNP|
    #puts kaspSNP.snp_from.surrounding_exon_sequences.inspect
    #puts kaspSNP.first_product
    #puts kaspSNP.realigned_primers
    out_fasta_products = "#{output_folder}/#{name}.fa"
    File.open(out_fasta_products, 'w') { |f| f.write(kaspSNP.realigned_primers_fasta) }
  end

  File.open(output_to_order, "w") { |io| io.write(kasp_container.print_primers_with_tails()) }

  log "DONE"
rescue Exception => e
  # Deliberately broad: the original handled StandardError and Exception with
  # identical clauses. The error is only logged here and always re-raised.
  log "ERROR\t#{e.message}"
  $stderr.puts e.backtrace
  raise
end
377
+ #puts container.inspect
378
+
379
+ #container.snp_map.each do | gene, snp_array|
380
+ # snp_array.each do |e|
381
+ # puts e.inspect
382
+ # puts e.aligned_sequences_fasta
383
+ # end
384
+ #end
385
+
@@ -0,0 +1,66 @@
1
+ #!/usr/bin/env ruby
2
+ require 'optparse'
3
+ require 'bio'
4
+ require 'csv'
5
+ #$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
6
+ #$: << File.expand_path('.')
7
+ #path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polyploid-tools.rb')
8
+ #require path
9
+
10
# Prints the BLAST hits whose query and target belong to the same triad.
#
# Defaults; "-" as the BLAST input means "read from ARGF".
# NOTE(review): :identity and :min_bases are parsed but not used in the code
# visible here.
options = {}
options[:identity] = 50
options[:min_bases] = 200
options[:blastx] = "-"

OptionParser.new do |opts|

  opts.banner = "Usage: get_longest_hsp_blastx_triads.rb [options]"

  opts.on("-p", "--blastx FILE", "BLAST XML file") do |o|
    options[:blastx] = o
  end
  opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
    options[:identity] = o.to_f
  end
  opts.on("-c", "--min_bases int", "Minimum alignment length (default 200)") do |o|
    options[:min_bases] = o.to_i
  end

  opts.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |o|
    options[:triads] = o
  end

end.parse!

# Without this check a missing --triads ends in an obscure error from CSV.foreach(nil).
raise ArgumentError, "A CSV file with the triads has to be provided (--triads)" unless options[:triads]

# Lookup tables built from the triads file: A => B, A => D and B => D.
valid_pairs_A_B = Hash.new
valid_pairs_A_D = Hash.new
valid_pairs_B_D = Hash.new

CSV.foreach(options[:triads], headers:true ) do |row|
  valid_pairs_A_B[row['A']] = row['B']
  valid_pairs_A_D[row['A']] = row['D']
  valid_pairs_B_D[row['B']] = row['D']
end

# File.open takes a path; IO.open expects a file descriptor and raised a
# TypeError whenever --blastx was given.
read_from_argf = options[:blastx] == "-"
stream = read_from_argf ? ARGF : File.open(options[:blastx])

begin
  puts "Loaded #{valid_pairs_B_D.length} triads"
  $stdout.flush

  blast_report = Bio::FlatFile.new(Bio::Blast::Report, stream)

  blast_report.each_entry do |report|
    puts "Hits for " + report.query_def + " against " + report.db
    $stdout.flush
    report.each do |hit|
      # Only the part of each id before the first "-" is compared with the triad names.
      query = hit.query_id.split("-")[0]
      target = hit.target_id.split("-")[0]
      if valid_pairs_A_B[query] == target or valid_pairs_A_D[query] == target or valid_pairs_B_D[query] == target
        # NOTE(review): puts with several arguments prints each one on its own
        # line; a single tab-separated line was probably intended - confirm
        # before changing the output format.
        puts hit.target_id, "\t", hit.evalue, "\n" if hit.evalue < 0.001
        puts hit.inspect
      end

    end
  end
ensure
  # The original tested options[:blat_file], a key that is never set, so the
  # guard never applied; only a file opened by this script is closed now.
  stream.close unless read_from_argf
end