bio-polymarker 1.3.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (177) hide show
  1. checksums.yaml +7 -0
  2. data/.travis.yml +24 -0
  3. data/Gemfile +23 -0
  4. data/README.md +205 -0
  5. data/Rakefile +61 -0
  6. data/SECURITY.md +16 -0
  7. data/VERSION +1 -0
  8. data/bin/bfr.rb +128 -0
  9. data/bin/blast_triads.rb +166 -0
  10. data/bin/blast_triads_promoters.rb +192 -0
  11. data/bin/count_variations.rb +36 -0
  12. data/bin/filter_blat_by_target_coverage.rb +69 -0
  13. data/bin/filter_exonerate_by_identity.rb +38 -0
  14. data/bin/find_best_blat_hit.rb +33 -0
  15. data/bin/find_best_exonerate.rb +17 -0
  16. data/bin/get_longest_hsp_blastx_triads.rb +66 -0
  17. data/bin/hexaploid_primers.rb +168 -0
  18. data/bin/homokaryot_primers.rb +183 -0
  19. data/bin/mafft_triads.rb +120 -0
  20. data/bin/mafft_triads_promoters.rb +403 -0
  21. data/bin/map_markers_to_contigs.rb +66 -0
  22. data/bin/marker_to_vcf.rb +241 -0
  23. data/bin/markers_in_region.rb +42 -0
  24. data/bin/mask_triads.rb +169 -0
  25. data/bin/polymarker.rb +410 -0
  26. data/bin/polymarker_capillary.rb +443 -0
  27. data/bin/polymarker_deletions.rb +350 -0
  28. data/bin/snp_position_to_polymarker.rb +101 -0
  29. data/bin/snps_between_bams.rb +107 -0
  30. data/bin/tag_stats.rb +75 -0
  31. data/bin/vcfLineToTable.rb +56 -0
  32. data/bin/vcfToPolyMarker.rb +82 -0
  33. data/bio-polymarker.gemspec +227 -0
  34. data/conf/defaults.rb +1 -0
  35. data/conf/primer3_config/dangle.dh +128 -0
  36. data/conf/primer3_config/dangle.ds +128 -0
  37. data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
  38. data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
  39. data/conf/primer3_config/interpretations/loops_i.dh +34 -0
  40. data/conf/primer3_config/interpretations/loops_i.ds +31 -0
  41. data/conf/primer3_config/interpretations/stack_i.dh +257 -0
  42. data/conf/primer3_config/interpretations/stack_i.ds +256 -0
  43. data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
  44. data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
  45. data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
  46. data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
  47. data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
  48. data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
  49. data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
  50. data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
  51. data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
  52. data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
  53. data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
  54. data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
  55. data/conf/primer3_config/loops.dh +30 -0
  56. data/conf/primer3_config/loops.ds +30 -0
  57. data/conf/primer3_config/stack.dh +256 -0
  58. data/conf/primer3_config/stack.ds +256 -0
  59. data/conf/primer3_config/stackmm.dh +256 -0
  60. data/conf/primer3_config/stackmm.ds +256 -0
  61. data/conf/primer3_config/tetraloop.dh +77 -0
  62. data/conf/primer3_config/tetraloop.ds +77 -0
  63. data/conf/primer3_config/triloop.dh +16 -0
  64. data/conf/primer3_config/triloop.ds +16 -0
  65. data/conf/primer3_config/tstack.dh +256 -0
  66. data/conf/primer3_config/tstack2.dh +256 -0
  67. data/conf/primer3_config/tstack2.ds +256 -0
  68. data/conf/primer3_config/tstack_tm_inf.ds +256 -0
  69. data/lib/bio/BFRTools.rb +465 -0
  70. data/lib/bio/BIOExtensions.rb +153 -0
  71. data/lib/bio/PolyploidTools/ChromosomeArm.rb +63 -0
  72. data/lib/bio/PolyploidTools/ExonContainer.rb +245 -0
  73. data/lib/bio/PolyploidTools/Marker.rb +175 -0
  74. data/lib/bio/PolyploidTools/Mask.rb +116 -0
  75. data/lib/bio/PolyploidTools/NoSNPSequence.rb +292 -0
  76. data/lib/bio/PolyploidTools/PrimerRegion.rb +30 -0
  77. data/lib/bio/PolyploidTools/SNP.rb +804 -0
  78. data/lib/bio/PolyploidTools/SNPMutant.rb +86 -0
  79. data/lib/bio/PolyploidTools/SNPSequence.rb +55 -0
  80. data/lib/bio/db/blast.rb +114 -0
  81. data/lib/bio/db/exonerate.rb +333 -0
  82. data/lib/bio/db/primer3.rb +820 -0
  83. data/lib/bio-polymarker.rb +28 -0
  84. data/test/data/7B_amplicon_test.fa +12 -0
  85. data/test/data/7B_amplicon_test.fa.fai +1 -0
  86. data/test/data/7B_amplicon_test_reference.fa +110 -0
  87. data/test/data/7B_amplicon_test_reference.fa.fai +3 -0
  88. data/test/data/7B_marker_test.txt +1 -0
  89. data/test/data/BS00068396_51.fa +2 -0
  90. data/test/data/BS00068396_51_blast.tab +4 -0
  91. data/test/data/BS00068396_51_contigs.aln +1412 -0
  92. data/test/data/BS00068396_51_contigs.dnd +7 -0
  93. data/test/data/BS00068396_51_contigs.fa +8 -0
  94. data/test/data/BS00068396_51_contigs.fa.fai +4 -0
  95. data/test/data/BS00068396_51_contigs.fa.nhr +0 -0
  96. data/test/data/BS00068396_51_contigs.fa.nin +0 -0
  97. data/test/data/BS00068396_51_contigs.fa.nsq +0 -0
  98. data/test/data/BS00068396_51_contigs.nhr +0 -0
  99. data/test/data/BS00068396_51_contigs.nin +0 -0
  100. data/test/data/BS00068396_51_contigs.nsq +0 -0
  101. data/test/data/BS00068396_51_exonerate.tab +6 -0
  102. data/test/data/BS00068396_51_for_polymarker.txt +1 -0
  103. data/test/data/BS00068396_51_genes.txt +14 -0
  104. data/test/data/IWGSC_CSS_1AL_scaff_1455974.fa +112 -0
  105. data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa +2304 -0
  106. data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
  107. data/test/data/LIB1716.bam +0 -0
  108. data/test/data/LIB1716.bam.bai +0 -0
  109. data/test/data/LIB1719.bam +0 -0
  110. data/test/data/LIB1719.bam.bai +0 -0
  111. data/test/data/LIB1721.bam +0 -0
  112. data/test/data/LIB1721.bam.bai +0 -0
  113. data/test/data/LIB1722.bam +0 -0
  114. data/test/data/LIB1722.bam.bai +0 -0
  115. data/test/data/PST130_7067.csv +1 -0
  116. data/test/data/PST130_7067.fa +2 -0
  117. data/test/data/PST130_7067.fa.fai +1 -0
  118. data/test/data/PST130_7067.fa.ndb +0 -0
  119. data/test/data/PST130_7067.fa.nhr +0 -0
  120. data/test/data/PST130_7067.fa.nin +0 -0
  121. data/test/data/PST130_7067.fa.not +0 -0
  122. data/test/data/PST130_7067.fa.nsq +0 -0
  123. data/test/data/PST130_7067.fa.ntf +0 -0
  124. data/test/data/PST130_7067.fa.nto +0 -0
  125. data/test/data/PST130_reverse_primer.csv +1 -0
  126. data/test/data/S22380157.fa +16 -0
  127. data/test/data/S22380157.fa.fai +1 -0
  128. data/test/data/S22380157.vcf +67 -0
  129. data/test/data/S58861868/LIB1716.bam +0 -0
  130. data/test/data/S58861868/LIB1716.sam +651 -0
  131. data/test/data/S58861868/LIB1719.bam +0 -0
  132. data/test/data/S58861868/LIB1719.sam +805 -0
  133. data/test/data/S58861868/LIB1721.bam +0 -0
  134. data/test/data/S58861868/LIB1721.sam +1790 -0
  135. data/test/data/S58861868/LIB1722.bam +0 -0
  136. data/test/data/S58861868/LIB1722.sam +1271 -0
  137. data/test/data/S58861868/S58861868.fa +16 -0
  138. data/test/data/S58861868/S58861868.fa.fai +1 -0
  139. data/test/data/S58861868/S58861868.vcf +76 -0
  140. data/test/data/S58861868/header.txt +9 -0
  141. data/test/data/S58861868/merged.bam +0 -0
  142. data/test/data/S58861868/merged_reheader.bam +0 -0
  143. data/test/data/S58861868/merged_reheader.bam.bai +0 -0
  144. data/test/data/Test3Aspecific.csv +2 -0
  145. data/test/data/Test3Aspecific_contigs.fa +6 -0
  146. data/test/data/bfr_out_test.csv +5 -0
  147. data/test/data/chr1A_C1145499T/chr1A_C1145499T.csv +1 -0
  148. data/test/data/chr1A_G540414846C/chr1A_G540414846C.csv +1 -0
  149. data/test/data/chr1A_G540414846C/chr1A_G540414846C.fa +2 -0
  150. data/test/data/chr1A_T517634750C/chr1A_T517634750C.csv +1 -0
  151. data/test/data/chr2D_C112180134A/chr2D_C112180134A.csv +1 -0
  152. data/test/data/chr4D_C14473543T/chr4D_C14473543T.csv +1 -0
  153. data/test/data/chr4D_C14473543T/chr4D_C14473543T.fa +2 -0
  154. data/test/data/headerMergeed.txt +9 -0
  155. data/test/data/headerS2238015 +1 -0
  156. data/test/data/mergedLibs.bam +0 -0
  157. data/test/data/mergedLibsReheader.bam +0 -0
  158. data/test/data/mergedLibsSorted.bam +0 -0
  159. data/test/data/mergedLibsSorted.bam.bai +0 -0
  160. data/test/data/patological_cases5D.csv +1 -0
  161. data/test/data/primer_3_input_header_test +5 -0
  162. data/test/data/short_primer_design_test.csv +10 -0
  163. data/test/data/some_tests/some_tests.csv +201 -0
  164. data/test/data/test_from_mutant.csv +3 -0
  165. data/test/data/test_iselect.csv +196 -0
  166. data/test/data/test_iselect_reference.fa +1868 -0
  167. data/test/data/test_iselect_reference.fa.fai +934 -0
  168. data/test/data/test_primer3_error.csv +4 -0
  169. data/test/data/test_primer3_error_contigs.fa +10 -0
  170. data/test/test_bfr.rb +135 -0
  171. data/test/test_blast.rb +47 -0
  172. data/test/test_exon_container.rb +17 -0
  173. data/test/test_exonearate.rb +48 -0
  174. data/test/test_integration.rb +76 -0
  175. data/test/test_snp_parsing.rb +121 -0
  176. data/test/test_wrong_selection.sh +5 -0
  177. metadata +356 -0
@@ -0,0 +1,350 @@
1
+ #!/usr/bin/env ruby
2
+ require 'bio'
3
+ require 'rubygems'
4
+ require 'pathname'
5
+ require 'bio-samtools-wrapper'
6
+ require 'optparse'
7
+ require 'set'
8
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
9
+ $: << File.expand_path('.')
10
+ path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
11
+ require path
12
+
13
# Writes +msg+ to standard output, prefixed with the current local time
# (millisecond resolution) and a colon.
def log(msg)
  stamp = Time.now.strftime("%Y-%m-%d %H:%M:%S.%L")
  $stdout.puts format("%s: %s", stamp, msg)
end
17
+
18
+
19
class Bio::PolyploidTools::ExonContainer
  # Reads an alignment table line by line and, for every registered SNP whose
  # position lies inside the aligned query range, attaches the exon covering
  # that position.
  #
  # opts[:exonerate_file]:: path of the alignment table to read.
  # opts[:min_identity]::   alignments below this identity are ignored (default 90).
  # opts[:arm_selection]::  callable mapping a target id to a chromosome name;
  #                         when absent, the first three characters of the id are used.
  def add_alignments(opts=Hash.new)
    opts = { :min_identity => 90 }.merge(opts)
    arm_selection = opts[:arm_selection] || lambda { |contig_name| contig_name[0, 3] }

    File.open(opts[:exonerate_file]) do |f|
      f.each_line do |line|
        record = Bio::DB::Exonerate::Alignment.parse_custom(line)
        next unless record && record.identity >= opts[:min_identity]

        snps = @snp_map[record.query_id]
        next if snps.nil?

        snps.each do |snp|
          next if snp.nil?
          # query_start is shifted by one before comparing with the SNP position.
          next unless snp.position.between?(record.query_start + 1, record.query_end)

          begin
            exon = record.exon_on_gene_position(snp.position)
            snp.add_exon(exon, arm_selection.call(record.target_id))
          rescue Bio::DB::Exonerate::ExonerateException
            # Best effort: report the failing range and carry on with the next SNP.
            $stderr.puts "Failed for the range #{record.query_start}-#{record.query_end} for position #{snp.position}"
          end
        end
      end
    end
  end
end
54
+
55
class Bio::DB::Primer3::SNP
  # Short label for the SNP: the gene name and the chromosome of +snp_from+,
  # separated by a colon.
  def to_s
    format("%s:%s", gene, snp_from.chromosome)
  end
end
60
+
61
class Bio::DB::Primer3::Primer3Record

  # Returns the memoised preferred primer pair, computing it on first use.
  # A candidate replaces the current choice when it has fewer uppercase
  # characters (left + right sequence) than the count recorded for the current
  # choice, or when its size is smaller than the current choice's size.
  # Returns nil when there are no primer pairs.
  def best_pair
    return @best_pair if @best_pair

    @best_pair = nil
    @total_caps = 100
    @primerPairs.each do |candidate|
      caps = "#{candidate.left.sequence}#{candidate.right.sequence}".count("A-Z")
      next unless @best_pair.nil? || caps < @total_caps || candidate.size < @best_pair.size

      @best_pair = candidate
      @total_caps = caps
    end

    @best_pair
  end

  # Splits +sequence_id+ on spaces into its seven fields and stores them.
  # Example header:
  #CL3339Contig1:T509C AvocetS chromosome_specific exon 4D forward
  def parse_header
    @snp, @line, @type, @in, @polymorphism, @chromosome, @orientation = self.sequence_id.split(" ")
    @type = @type.to_sym
    if @in
      @in = (@in.to_sym == :exon)
    else
      @exon = false
    end

    @homoeologous = (@polymorphism.to_sym == :homoeologous)
    @parsed = true
    @orientation = @orientation.to_sym
  end

  # Score of the record: the configured score for its type, plus the exon
  # bonus when applicable, minus 10 per uppercase character in the best
  # primer pair, minus the product length.
  def score
    pair = best_pair
    caps = "#{pair.left.sequence}#{pair.right.sequence}".scan(/[A-Z]/).length
    total = 0 + @scores[type]
    total += @scores[:exon] if exon?
    total -= caps * 10
    total - product_length
  end

  # Gene name and chromosome of +snp_from+, joined by a colon.
  def to_s
    format("%s:%s", gene, snp_from.chromosome)
  end

  # Returns a fresh copy of the left primer; the +snp+ argument is not used.
  def left_primer_snp(snp)
    String.new(left_primer)
  end

end
129
+
130
# Directory holding the primer3 thermodynamic parameters shipped with the gem.
# Resolved relative to this script's directory with an explicit base, instead
# of concatenating '../../conf' onto the directory name without a separator
# (which resolved to the wrong place when the script was run from its own
# directory, i.e. when the directory name is ".").
primer3_config_dir = File.expand_path('../conf/primer3_config', File.dirname(__FILE__)) + '/'

# Defaults; the command line options below override some of them.
options = {
  :aligner => :blast,
  :model => "est2genome",
  :min_identity => 90,
  :extract_found_contigs => true,
  :arm_selection => Bio::PolyploidTools::ChromosomeArm.getArmSelection("nrgene"),
  :genomes_count => 3,
  :variation_free_region => 0,
  :primer_3_preferences => {
    :primer_product_size_range => "50-150",
    :primer_max_size => 25,
    :primer_lib_ambiguity_codes_consensus => 1,
    :primer_liberal_base => 1,
    :primer_num_return => 5,
    :primer_explain_flag => 1,
    :primer_thermodynamic_parameters_path => primer3_config_dir
  },
  # false means "not given"; the script later falls back to the reference.
  :database => false
}

OptionParser.new do |opts|

  opts.banner = "Usage: polymarker_deletions.rb [options]"

  opts.on("-m", "--sequences FASTA", "Sequence of the region to search") do |o|
    options[:sequences] = o
  end
  opts.on("-r", "--reference FASTA", "reference with the contigs") do |o|
    options[:reference] = o
  end
  opts.on("-o", "--output DIR", "Directory to write the output") do |o|
    options[:output] = o
  end

  opts.on("-g", "--genomes_count INT", "Number of genomes (default 3, for hexaploid)") do |o|
    options[:genomes_count] = o.to_i
  end

  opts.on("-x", "--extract_found_contigs", "If present, save in a separate file the contigs with matches. Useful to debug.") do |o|
    options[:extract_found_contigs] = true
  end

  opts.on("-d", "--database PREFIX", "Path to the blast database. Only used if the aligner is blast. The default is the name of the contigs file without extension.") do |o|
    options[:database] = o
  end

  opts.on("-a", "--arm_selection #{Bio::PolyploidTools::ChromosomeArm.getValidFunctions.join('|')}", "Function to decide the chromosome arm") do |o|
    options[:arm_selection] = Bio::PolyploidTools::ChromosomeArm.getArmSelection(o)
  end

end.parse!
186
+ #reference="/Users/ramirezr/Documents/TGAC/references/Triticum_aestivum.IWGSP1.21.dna_rm.genome.fa"
187
# Mandatory arguments. The original `throw raise Exception.new(), msg` never
# reached `throw` and raised a bare Exception; ArgumentError states the problem
# and is a StandardError.
reference = options[:reference]
raise ArgumentError, "Reference has to be provided" unless reference
sequences = options[:sequences]
raise ArgumentError, "Fasta file with sequences has to be provided" unless sequences
output_folder = options[:output]
raise ArgumentError, "An output directory has to be provided" unless output_folder
model = options[:model]

# Without an explicit --database the reference itself is used as the target.
options[:database] ||= reference

# Raises if the directory already exists.
Dir.mkdir(output_folder)
min_identity = options[:min_identity]

# Files written inside the output directory.
exonerate_file  = File.join(output_folder, "exonerate_tmp.tab")
primer_3_input  = File.join(output_folder, "primer_3_input_temp")
primer_3_output = File.join(output_folder, "primer_3_output_temp")
exons_filename  = File.join(output_folder, "exons_genes_and_contigs.fa")
output_primers  = File.join(output_folder, "primers.csv")
output_to_order = File.join(output_folder, "primers_to_order.csv")

fasta_file = Bio::DB::Fasta::FastaFile.new({:fasta=>reference})
fasta_file.load_fai_entries

# Names given to the two lines in the primer tables.
original_name = "A"
snp_in = "B"

arm_selection = options[:arm_selection]
215
+
216
# Writes the raw line of +aln+ to +exo_f+ when its identity is above
# +min_identity+, and records its target in +found_contigs+ the first time it
# is seen. Raises Bio::DB::Exonerate::ExonerateException when the target is
# not present in the index of +fasta_file+.
#
# The original referenced an undefined +target_id+ and an unqualified
# +ExonerateException+, so the failure path raised NameError instead of the
# intended error.
def do_align(aln, exo_f, found_contigs, min_identity, fasta_file, options)
  return unless aln.identity > min_identity

  exo_f.puts aln.line
  # Each contig is only checked once.
  return if found_contigs.include?(aln.target_id)

  found_contigs.add(aln.target_id)
  entry = fasta_file.index.region_for_entry(aln.target_id)
  return unless entry.nil?

  raise Bio::DB::Exonerate::ExonerateException.new, "Entry not found! #{aln.target_id}. Make sure that the #{options[:reference]}.fai was generated properly."
end

begin
  log "Reading exons"
  exons = Array.new
  Bio::FlatFile.auto(sequences) do |ff|
    ff.each do |entry|
      # NoSNPSequence.parse takes a comma separated line: name, chromosome, sequence.
      fields = [entry.definition, arm_selection.call(entry.definition), entry.seq]
      snp = Bio::PolyploidTools::NoSNPSequence.parse(fields.join(","))
      snp.genomes_count = options[:genomes_count]
      exons << snp
    end
  end

  log "Searching markers in genome"
  found_contigs = Set.new
  # Block form so the table is flushed and closed even if an aligner fails.
  File.open(exonerate_file, "w") do |exo_f|
    case options[:aligner]
    when :blast
      Bio::DB::Blast.align({:query=>sequences, :target=>options[:database], :model=>model, :max_hits=>options[:max_hits]}) do |aln|
        do_align(aln, exo_f, found_contigs, min_identity, fasta_file, options)
      end
    when :exonerate
      # The original passed an undefined local `target`; the reference is the
      # file being searched. NOTE(review): confirm this is the intended target.
      Bio::DB::Exonerate.align({:query=>sequences, :target=>reference, :model=>model}) do |aln|
        do_align(aln, exo_f, found_contigs, min_identity, fasta_file, options)
      end
    end
  end

  log "Reading best alignment on each chromosome"

  container = Bio::PolyploidTools::ExonContainer.new
  container.flanking_size = options[:flanking_size]
  container.gene_models(sequences)
  container.chromosomes(reference)
  container.add_parental({:name=>"A"})
  container.add_parental({:name=>"B"})
  exons.each do |exon|
    exon.container = container
    exon.flanking_size = 200
    exon.variation_free_region = options[:variation_free_region]
    container.add_snp(exon)
  end
  container.add_alignments(
    {:exonerate_file=>exonerate_file,
     :arm_selection=>options[:arm_selection],
     :min_identity=>min_identity})

  #4.1 generating primer3 file
  log "Running primer3"
  File.open(exons_filename, "w") do |file|
    container.print_fasta_snp_exones(file)
  end

  added_exons = File.open(primer_3_input, "w") do |file|
    Bio::DB::Primer3.prepare_input_file(file, options[:primer_3_preferences])
    container.print_primer_3_exons(file, nil, snp_in)
  end

  # primer3 is only run when at least one exon was written to its input.
  Bio::DB::Primer3.run({:in=>primer_3_input, :out=>primer_3_output}) if added_exons > 0

  #5. Pick the best primer and make the primer3 output
  log "Selecting best primers"
  kasp_container = Bio::DB::Primer3::KASPContainer.new
  kasp_container.line_1 = original_name
  kasp_container.line_2 = snp_in

  if options[:scoring] == :het_dels
    kasp_container.scores = Hash.new
    kasp_container.scores[:chromosome_specific] = 0
    kasp_container.scores[:chromosome_semispecific] = 1000
    kasp_container.scores[:chromosome_nonspecific] = 100
  end

  exons.each { |snp| kasp_container.add_snp(snp) }

  kasp_container.add_primers_file(primer_3_output) if added_exons > 0
  header = "Marker,SNP,RegionSize,chromosome,total_contigs,contig_regions,SNP_type,#{original_name},#{snp_in},common,primer_type,orientation,#{original_name}_TM,#{snp_in}_TM,common_TM,selected_from,product_size,errors,repetitive,blast_hits"
  File.open(output_primers, 'w') { |f| f.write("#{header}\n#{kasp_container.print_primers}") }

  out_fasta_products = "#{output_folder}/products.fa"
  File.open(out_fasta_products, 'w') do |f|
    kasp_container.snp_hash.each_value do |kasp_snp|
      f.write(kasp_snp.realigned_primers_fasta)
    end
  end

  File.open(output_to_order, "w") { |io| io.write(kasp_container.print_primers_with_tails()) }

  log "DONE"
rescue Exception => e
  # Every failure is logged and then re-raised unchanged, so rescuing the
  # widest class here does not swallow anything. The original had two
  # identical clauses (StandardError and Exception).
  log "ERROR\t#{e.message}"
  $stderr.puts e.backtrace
  raise
end
342
+ #puts container.inspect
343
+
344
+ #container.snp_map.each do | gene, snp_array|
345
+ # snp_array.each do |e|
346
+ # puts e.inspect
347
+ # puts e.aligned_sequences_fasta
348
+ # end
349
+ #end
350
+
@@ -0,0 +1,101 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ #This script converts a file with SNPs and positions with the header:
4
+ #GENE,BASE,POS,SNP,Chromosome
5
+ # snp.gene, snp.original, snp.position, snp.snp, snp.chromosome
6
+ #To the input expected by polymarker
7
+ #ID, Chromosome, sequence
8
+ #With sequence containing the SNP in the notation "[A/T]"
9
+ require 'bio'
10
+ require 'optparse'
11
+
12
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
13
+ $: << File.expand_path('.')
14
+ path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
15
+ require path
16
+
17
+
18
# Prints +msg+ on standard output preceded by a millisecond-resolution
# timestamp and a colon.
def log(msg)
  now = Time.now
  $stdout.puts format("%s: %s", now.strftime("%Y-%m-%d %H:%M:%S.%L"), msg)
end
22
+
23
options = { :flanking_size => 100 }
# Path of the file to convert; set by whichever of --snp_file / --mutant_list
# appears last on the command line.
input_file = nil

OptionParser.new do |opts|

  opts.banner = "Usage: snp_position_to_polymarker.rb [options]"

  opts.on("-s", "--snp_file CSV", "CSV file with the following columns:\nID,Allele_1,position,Allele_2,target_chromosome") do |o|
    options[:snp_file] = o
    input_file = o
  end
  opts.on("-r", "--reference FASTA", "reference with the genes/contigs/marker sequence") do |o|
    options[:reference] = o
  end
  opts.on("-o", "--out CSV", "Output file ") do |o|
    options[:output] = o
  end
  opts.on("-f", "--flanking_size INT", "Flanking size around the SNP") do |o|
    options[:flanking_size] = o.to_i
  end

  opts.on("-t", "--mutant_list FILE", "File with the list of positions with mutation and the mutation line. Example: IWGSC_CSS_1AL_scaff_1455974,Kronos2281,127,C,T\n" +
    "requires --reference to get the sequence using a position") do |o|
    options[:mutant_list] = o
    input_file = o
  end

end.parse!

# Fail early with a clear message instead of a later, obscure error from the
# FASTA index or from File.open('').
raise ArgumentError, "A reference FASTA has to be provided (--reference)" unless options[:reference]
raise ArgumentError, "Either --snp_file or --mutant_list has to be provided" unless input_file

fasta_reference_db = Bio::DB::Fasta::FastaFile.new({:fasta=>options[:reference]})
fasta_reference_db.load_fai_entries

out = options[:output] ? File.open(options[:output], "w") : $stdout
begin
  File.open(input_file) do |f|
    f.each_line do |line|
      # --snp_file takes precedence when both inputs were given.
      if options[:snp_file]
        snp = Bio::PolyploidTools::SNP.parse(line)
        lookup_id = snp.gene
      else
        snp = Bio::PolyploidTools::SNPMutant.parse(line)
        lookup_id = snp.contig
      end

      entry = fasta_reference_db.index.region_for_entry(lookup_id)
      unless entry
        # Report the identifier that was actually looked up.
        $stderr.puts "ERROR: Unable to find entry for #{lookup_id}"
        next
      end

      region = entry.get_full_region
      snp_name = snp.snp_id_in_seq

      # Narrow the region to the flanked window around the SNP before fetching.
      start, total, new_position = snp.to_polymarker_coordinates(options[:flanking_size])
      region.start = start
      region.end = start + total
      local_template = fasta_reference_db.fetch_sequence(region)

      snp.position = new_position
      snp.template_sequence = local_template

      out.puts "#{snp.gene}_#{snp_name},#{snp.chromosome},#{snp.to_polymarker_sequence(options[:flanking_size])}"
    end
  end
ensure
  # Only close handles this script opened; never close $stdout.
  out.close if options[:output]
end
101
+
@@ -0,0 +1,107 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'bio'
4
+ require 'rubygems'
5
+ require 'pathname'
6
+ require 'bio-samtools-wrapper'
7
+
8
+ require 'set'
9
+
10
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
11
+ $: << File.expand_path('.')
12
+ path=File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
13
+ $stderr.puts "Loading: #{path}"
14
+ require path
15
+
16
+
17
+
18
# Positional arguments:
#   0 reference FASTA   1 first BAM   2 second BAM   3 output prefix
#   4 minimum coverage (default 10)   5 chunk number   6 chunk size
#
# The reference is passed as {:fasta=>...}, matching the other scripts of this
# package that build a FastaFile.
fasta_db = Bio::DB::Fasta::FastaFile.new({:fasta=>ARGV[0]})
fasta_db.load_fai_entries
bam1 = Bio::DB::Sam.new({:fasta=>ARGV[0], :bam=>ARGV[1]})
bam2 = Bio::DB::Sam.new({:fasta=>ARGV[0], :bam=>ARGV[2]})

output_prefix = ARGV[3]

block_size = 1000

# `ARGV[4].to_i ? ... : 10` was always truthy (0 is truthy in Ruby), so the
# default of 10 was never used; test for the presence of the argument instead.
min_cov = ARGV[4] ? ARGV[4].to_i : 10
# Missing chunk arguments become 0 through nil.to_i.
chunk = ARGV[5].to_i
chunk_size = ARGV[6].to_i

main_table = "#{output_prefix}_#{block_size}_#{min_cov}_table.#{chunk}.csv"
consensus_fasta = "#{output_prefix}_#{min_cov}.#{chunk}.fa"

# Histograms keyed by the truncated SNP density of each sample.
hist_1 = Hash.new(0)
hist_2 = Hash.new(0)

# Only index entries in [first_entry, last_entry) are processed by this chunk.
first_entry = chunk * chunk_size
last_entry = first_entry + chunk_size

File.open(main_table, "w") do |table_file|
  File.open(consensus_fasta, "w") do |fasta_file|
    table_file.puts "gene\tlength\tsnps_1\tcalled_1\tsnps_per_#{block_size}_1\tsnps_2\tcalled_2\tsnps_per_#{block_size}_2\tsnps_tot\tsnps_per_1k_tot"

    fasta_db.index.entries.each_with_index do |r, i|
      next if i < first_entry || i >= last_entry

      region = r.get_full_region

      begin
        reg_a = bam1.fetch_region({:region=>region, :min_cov=>min_cov, :A=>1})
        reg_b = bam2.fetch_region({:region=>region, :min_cov=>min_cov, :A=>1})
        cons_1 = reg_a.consensus
        cons_2 = reg_b.consensus

        snps_1 = cons_1.count_ambiguities
        snps_2 = cons_2.count_ambiguities

        called_1 = reg_a.called
        called_2 = reg_b.called

        snps_tot = Bio::Sequence.snps_between(cons_1, cons_2)

        # Densities are normalised to block_size bases.
        snps_per_1k_1 = (block_size * snps_1.to_f) / region.size
        snps_per_1k_2 = (block_size * snps_2.to_f) / region.size
        snps_per_1k_tot = (block_size * snps_tot.to_f) / region.size

        hist_1[snps_per_1k_1.to_i] += 1
        hist_2[snps_per_1k_2.to_i] += 1

        table_file.puts [r.id, region.size,
                         snps_1, called_1, snps_per_1k_1,
                         snps_2, called_2, snps_per_1k_2,
                         snps_tot, snps_per_1k_tot].join("\t")
        fasta_file.puts ">#{r.id}_1"
        fasta_file.puts "#{cons_1}"
        fasta_file.puts ">#{r.id}_2"
        fasta_file.puts "#{cons_2}"
      rescue StandardError => e
        # Best effort per region. StandardError rather than Exception, so that
        # an interrupt still stops the run.
        $stderr.puts "Unable to process #{region}: #{e}"
      end
    end
  end
end

hist_table = "#{output_prefix}_#{block_size}_#{min_cov}_hist.#{chunk}.csv"
File.open(hist_table, "w") do |hist_file|
  # Sorted union of the keys; avoids SortedSet, which is no longer part of the
  # set standard library in Ruby 3.
  all_keys = (hist_1.keys | hist_2.keys).sort
  hist_file.puts "SNPs/#{block_size}\thist_1\thist_2\n"
  all_keys.each do |k|
    hist_file.puts "#{k}\t#{hist_1[k]}\t#{hist_2[k]}"
  end
end
105
+
106
+
107
+
data/bin/tag_stats.rb ADDED
@@ -0,0 +1,75 @@
1
+ #!/usr/bin/env ruby
2
+ require 'optparse'
3
+
4
+ require 'csv'
5
+ require 'fileutils'
6
+ require 'tmpdir'
7
+ require 'bio-samtools-wrapper'
8
+ require 'bio'
9
+ require 'descriptive_statistics'
10
+
11
class Bio::DB::Tag
  # Fills the tag from its text form using fixed offsets: characters 0-1 are
  # the tag name, character 3 the type and everything from character 5 on the
  # value. The value is converted to an Integer when the type is "i".
  def set(str)
    @tag, @type, @value = str[0, 2], str[3], str[5..-1]
    @value = @value.to_i if @type == "i"
  end
end
19
+
20
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
21
+ $: << File.expand_path('.')
22
+ path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
23
+ require path
24
opts = { :tag => "NH", :bam => nil, :out => nil, :ref => nil }

OptionParser.new do |parser|
  parser.banner = "Usage: tag_stats.rb [options]"

  parser.on("-t", "--tag str", "The tag to extract (default NH)") do |v|
    opts[:tag] = v
  end

  parser.on("-b", "--bam FILE" , "BAM file with the alignments ") do |v|
    opts[:bam] = v
  end

  parser.on("-o", "--out_file CHAR", "File to save the stats") do |v|
    opts[:out] = v
  end

  parser.on("-r", "--reference FILE", "Fasta file with the reference") do |v|
    opts[:ref] = v
  end
end.parse!

# File.basename(nil) would otherwise fail with an unhelpful TypeError.
raise ArgumentError, "A BAM file has to be provided (--bam)" unless opts[:bam]

bam = Bio::DB::Sam.new(fasta: opts[:ref], bam: opts[:bam])
tag = opts[:tag]

sample = File.basename(opts[:bam], '.sorted.bam')
to_print = [:sum, :min, :max, :mean, :mode, :median, :q1, :q2, :q3]
# The 90, 95, 97.5 and 99 percentiles are reported after the summary statistics.
percentiles = [90, 95, 97.5, 99]

out = opts[:out] ? File.open(opts[:out], "w") : $stdout

# Prints one tab separated row per statistic for the values collected on +ref+.
report = lambda do |ref, values|
  desc_stats = values.descriptive_statistics
  to_print.each { |e| out.puts [sample, ref, e, desc_stats[e]].join("\t") }
  percentiles.each { |e| out.puts [sample, ref, "P#{e}", values.percentile(e)].join("\t") }
  out.puts [sample, ref, "N", values.length].join("\t")
end

begin
  last_ref = ""
  values = []
  bam.view do |aln|
    if last_ref != aln.rname
      # A new reference starts: report the one just finished.
      report.call(last_ref, values) if last_ref != ""
      values.clear
      last_ref = aln.rname
    end
    values << aln.tags[tag].value
  end
  # The last reference has no following alignment to trigger its report, so it
  # has to be flushed here; previously its statistics were silently dropped.
  report.call(last_ref, values) if last_ref != ""
ensure
  out.close if opts[:out]
end
@@ -0,0 +1,56 @@
1
+ require 'bio-samtools-wrapper'
2
+ require 'optparse'
3
+
4
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
5
+ $: << File.expand_path('.')
6
+ path=File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
7
+
8
+
9
+
10
+
11
# Parses a VCF "##INFO" meta line such as
#   ##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data">
#
# Returns a Hash with the keys :id, :number, :type and :desc (all Strings), or
# nil when +head_line+ is not a well formed INFO line.
#
# ID, Number and Type stop at the next comma and the description stops at its
# closing quote (backslash-escaped characters are skipped), so optional
# trailing fields such as Source="..." are not captured into :desc and an
# empty description is accepted.
def parseVCFheader(head_line="")
  m = /##INFO=<ID=([^,]+),Number=([^,]+),Type=([^,]+),Description="((?:[^"\\]|\\.)*)"(?=[,>])/.match(head_line)
  return nil unless m

  {:id=>m[1], :number=>m[2], :type=>m[3], :desc=>m[4]}
end
18
+
19
+
20
# Description of every INFO field, keyed by its ID, collected from the header.
header_info = Hash.new
# Sample column names from the #CHROM line. This has to live outside the block:
# a variable first assigned inside the block is local to a single iteration,
# so it was always nil by the time a data line was parsed.
sample_names = nil

ARGF.each_line do |line|
  if line.start_with? "##"
    if line.start_with? "##INFO"
      h = parseVCFheader(line)
      header_info[h[:id]] = h[:desc] if h
    end
    next
  end

  if line.start_with? "#CHROM"
    # The first nine columns are skipped; the rest are taken as sample names.
    sample_names = line.split.drop(9)
    next
  end

  line.chomp!

  vcf = Bio::DB::Vcf.new(line, sample_names)
  vcf.info.each_pair { |name, val| puts "#{name}\t#{val}\t#{header_info[name]}" }

  puts "____"
  vcf.samples.each_with_index do |sample, i|
    # sample[1] holds the per-sample fields; print their names once per record.
    puts sample[1].keys.join("\t") if i == 0
    puts sample[1].values.join("\t")
  end

end