bio-polymarker 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177) hide show
  1. checksums.yaml +7 -0
  2. data/.travis.yml +24 -0
  3. data/Gemfile +23 -0
  4. data/README.md +205 -0
  5. data/Rakefile +61 -0
  6. data/SECURITY.md +16 -0
  7. data/VERSION +1 -0
  8. data/bin/bfr.rb +128 -0
  9. data/bin/blast_triads.rb +166 -0
  10. data/bin/blast_triads_promoters.rb +192 -0
  11. data/bin/count_variations.rb +36 -0
  12. data/bin/filter_blat_by_target_coverage.rb +69 -0
  13. data/bin/filter_exonerate_by_identity.rb +38 -0
  14. data/bin/find_best_blat_hit.rb +33 -0
  15. data/bin/find_best_exonerate.rb +17 -0
  16. data/bin/get_longest_hsp_blastx_triads.rb +66 -0
  17. data/bin/hexaploid_primers.rb +168 -0
  18. data/bin/homokaryot_primers.rb +183 -0
  19. data/bin/mafft_triads.rb +120 -0
  20. data/bin/mafft_triads_promoters.rb +403 -0
  21. data/bin/map_markers_to_contigs.rb +66 -0
  22. data/bin/marker_to_vcf.rb +241 -0
  23. data/bin/markers_in_region.rb +42 -0
  24. data/bin/mask_triads.rb +169 -0
  25. data/bin/polymarker.rb +410 -0
  26. data/bin/polymarker_capillary.rb +443 -0
  27. data/bin/polymarker_deletions.rb +350 -0
  28. data/bin/snp_position_to_polymarker.rb +101 -0
  29. data/bin/snps_between_bams.rb +107 -0
  30. data/bin/tag_stats.rb +75 -0
  31. data/bin/vcfLineToTable.rb +56 -0
  32. data/bin/vcfToPolyMarker.rb +82 -0
  33. data/bio-polymarker.gemspec +227 -0
  34. data/conf/defaults.rb +1 -0
  35. data/conf/primer3_config/dangle.dh +128 -0
  36. data/conf/primer3_config/dangle.ds +128 -0
  37. data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
  38. data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
  39. data/conf/primer3_config/interpretations/loops_i.dh +34 -0
  40. data/conf/primer3_config/interpretations/loops_i.ds +31 -0
  41. data/conf/primer3_config/interpretations/stack_i.dh +257 -0
  42. data/conf/primer3_config/interpretations/stack_i.ds +256 -0
  43. data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
  44. data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
  45. data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
  46. data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
  47. data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
  48. data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
  49. data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
  50. data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
  51. data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
  52. data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
  53. data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
  54. data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
  55. data/conf/primer3_config/loops.dh +30 -0
  56. data/conf/primer3_config/loops.ds +30 -0
  57. data/conf/primer3_config/stack.dh +256 -0
  58. data/conf/primer3_config/stack.ds +256 -0
  59. data/conf/primer3_config/stackmm.dh +256 -0
  60. data/conf/primer3_config/stackmm.ds +256 -0
  61. data/conf/primer3_config/tetraloop.dh +77 -0
  62. data/conf/primer3_config/tetraloop.ds +77 -0
  63. data/conf/primer3_config/triloop.dh +16 -0
  64. data/conf/primer3_config/triloop.ds +16 -0
  65. data/conf/primer3_config/tstack.dh +256 -0
  66. data/conf/primer3_config/tstack2.dh +256 -0
  67. data/conf/primer3_config/tstack2.ds +256 -0
  68. data/conf/primer3_config/tstack_tm_inf.ds +256 -0
  69. data/lib/bio/BFRTools.rb +465 -0
  70. data/lib/bio/BIOExtensions.rb +153 -0
  71. data/lib/bio/PolyploidTools/ChromosomeArm.rb +63 -0
  72. data/lib/bio/PolyploidTools/ExonContainer.rb +245 -0
  73. data/lib/bio/PolyploidTools/Marker.rb +175 -0
  74. data/lib/bio/PolyploidTools/Mask.rb +116 -0
  75. data/lib/bio/PolyploidTools/NoSNPSequence.rb +292 -0
  76. data/lib/bio/PolyploidTools/PrimerRegion.rb +30 -0
  77. data/lib/bio/PolyploidTools/SNP.rb +804 -0
  78. data/lib/bio/PolyploidTools/SNPMutant.rb +86 -0
  79. data/lib/bio/PolyploidTools/SNPSequence.rb +55 -0
  80. data/lib/bio/db/blast.rb +114 -0
  81. data/lib/bio/db/exonerate.rb +333 -0
  82. data/lib/bio/db/primer3.rb +820 -0
  83. data/lib/bio-polymarker.rb +28 -0
  84. data/test/data/7B_amplicon_test.fa +12 -0
  85. data/test/data/7B_amplicon_test.fa.fai +1 -0
  86. data/test/data/7B_amplicon_test_reference.fa +110 -0
  87. data/test/data/7B_amplicon_test_reference.fa.fai +3 -0
  88. data/test/data/7B_marker_test.txt +1 -0
  89. data/test/data/BS00068396_51.fa +2 -0
  90. data/test/data/BS00068396_51_blast.tab +4 -0
  91. data/test/data/BS00068396_51_contigs.aln +1412 -0
  92. data/test/data/BS00068396_51_contigs.dnd +7 -0
  93. data/test/data/BS00068396_51_contigs.fa +8 -0
  94. data/test/data/BS00068396_51_contigs.fa.fai +4 -0
  95. data/test/data/BS00068396_51_contigs.fa.nhr +0 -0
  96. data/test/data/BS00068396_51_contigs.fa.nin +0 -0
  97. data/test/data/BS00068396_51_contigs.fa.nsq +0 -0
  98. data/test/data/BS00068396_51_contigs.nhr +0 -0
  99. data/test/data/BS00068396_51_contigs.nin +0 -0
  100. data/test/data/BS00068396_51_contigs.nsq +0 -0
  101. data/test/data/BS00068396_51_exonerate.tab +6 -0
  102. data/test/data/BS00068396_51_for_polymarker.txt +1 -0
  103. data/test/data/BS00068396_51_genes.txt +14 -0
  104. data/test/data/IWGSC_CSS_1AL_scaff_1455974.fa +112 -0
  105. data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa +2304 -0
  106. data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
  107. data/test/data/LIB1716.bam +0 -0
  108. data/test/data/LIB1716.bam.bai +0 -0
  109. data/test/data/LIB1719.bam +0 -0
  110. data/test/data/LIB1719.bam.bai +0 -0
  111. data/test/data/LIB1721.bam +0 -0
  112. data/test/data/LIB1721.bam.bai +0 -0
  113. data/test/data/LIB1722.bam +0 -0
  114. data/test/data/LIB1722.bam.bai +0 -0
  115. data/test/data/PST130_7067.csv +1 -0
  116. data/test/data/PST130_7067.fa +2 -0
  117. data/test/data/PST130_7067.fa.fai +1 -0
  118. data/test/data/PST130_7067.fa.ndb +0 -0
  119. data/test/data/PST130_7067.fa.nhr +0 -0
  120. data/test/data/PST130_7067.fa.nin +0 -0
  121. data/test/data/PST130_7067.fa.not +0 -0
  122. data/test/data/PST130_7067.fa.nsq +0 -0
  123. data/test/data/PST130_7067.fa.ntf +0 -0
  124. data/test/data/PST130_7067.fa.nto +0 -0
  125. data/test/data/PST130_reverse_primer.csv +1 -0
  126. data/test/data/S22380157.fa +16 -0
  127. data/test/data/S22380157.fa.fai +1 -0
  128. data/test/data/S22380157.vcf +67 -0
  129. data/test/data/S58861868/LIB1716.bam +0 -0
  130. data/test/data/S58861868/LIB1716.sam +651 -0
  131. data/test/data/S58861868/LIB1719.bam +0 -0
  132. data/test/data/S58861868/LIB1719.sam +805 -0
  133. data/test/data/S58861868/LIB1721.bam +0 -0
  134. data/test/data/S58861868/LIB1721.sam +1790 -0
  135. data/test/data/S58861868/LIB1722.bam +0 -0
  136. data/test/data/S58861868/LIB1722.sam +1271 -0
  137. data/test/data/S58861868/S58861868.fa +16 -0
  138. data/test/data/S58861868/S58861868.fa.fai +1 -0
  139. data/test/data/S58861868/S58861868.vcf +76 -0
  140. data/test/data/S58861868/header.txt +9 -0
  141. data/test/data/S58861868/merged.bam +0 -0
  142. data/test/data/S58861868/merged_reheader.bam +0 -0
  143. data/test/data/S58861868/merged_reheader.bam.bai +0 -0
  144. data/test/data/Test3Aspecific.csv +2 -0
  145. data/test/data/Test3Aspecific_contigs.fa +6 -0
  146. data/test/data/bfr_out_test.csv +5 -0
  147. data/test/data/chr1A_C1145499T/chr1A_C1145499T.csv +1 -0
  148. data/test/data/chr1A_G540414846C/chr1A_G540414846C.csv +1 -0
  149. data/test/data/chr1A_G540414846C/chr1A_G540414846C.fa +2 -0
  150. data/test/data/chr1A_T517634750C/chr1A_T517634750C.csv +1 -0
  151. data/test/data/chr2D_C112180134A/chr2D_C112180134A.csv +1 -0
  152. data/test/data/chr4D_C14473543T/chr4D_C14473543T.csv +1 -0
  153. data/test/data/chr4D_C14473543T/chr4D_C14473543T.fa +2 -0
  154. data/test/data/headerMergeed.txt +9 -0
  155. data/test/data/headerS2238015 +1 -0
  156. data/test/data/mergedLibs.bam +0 -0
  157. data/test/data/mergedLibsReheader.bam +0 -0
  158. data/test/data/mergedLibsSorted.bam +0 -0
  159. data/test/data/mergedLibsSorted.bam.bai +0 -0
  160. data/test/data/patological_cases5D.csv +1 -0
  161. data/test/data/primer_3_input_header_test +5 -0
  162. data/test/data/short_primer_design_test.csv +10 -0
  163. data/test/data/some_tests/some_tests.csv +201 -0
  164. data/test/data/test_from_mutant.csv +3 -0
  165. data/test/data/test_iselect.csv +196 -0
  166. data/test/data/test_iselect_reference.fa +1868 -0
  167. data/test/data/test_iselect_reference.fa.fai +934 -0
  168. data/test/data/test_primer3_error.csv +4 -0
  169. data/test/data/test_primer3_error_contigs.fa +10 -0
  170. data/test/test_bfr.rb +135 -0
  171. data/test/test_blast.rb +47 -0
  172. data/test/test_exon_container.rb +17 -0
  173. data/test/test_exonearate.rb +48 -0
  174. data/test/test_integration.rb +76 -0
  175. data/test/test_snp_parsing.rb +121 -0
  176. data/test/test_wrong_selection.sh +5 -0
  177. metadata +356 -0
@@ -0,0 +1,350 @@
1
+ #!/usr/bin/env ruby
2
+ require 'bio'
3
+ require 'rubygems'
4
+ require 'pathname'
5
+ require 'bio-samtools-wrapper'
6
+ require 'optparse'
7
+ require 'set'
8
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
9
+ $: << File.expand_path('.')
10
+ path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
11
+ require path
12
+
13
# Writes +msg+ to standard output, prefixed with the current local time
# (millisecond resolution) and a colon.
def log(msg)
  stamp = Time.now.strftime("%Y-%m-%d %H:%M:%S.%L")
  puts [stamp, ": ", msg].map(&:to_s).join
end
17
+
18
+
19
class Bio::PolyploidTools::ExonContainer
  # Reads an alignment file line by line (each line parsed with
  # Bio::DB::Exonerate::Alignment.parse_custom) and, for every SNP registered
  # in @snp_map under the alignment's query id whose position lies inside the
  # aligned query range, adds the exon covering that position to the SNP.
  #
  # opts - Hash of options:
  #        :exonerate_file - path of the alignment file to read.
  #        :min_identity   - records with a lower identity are skipped
  #                          (default 90).
  #        :arm_selection  - callable that receives the target id and returns
  #                          the label passed to SNP#add_exon; defaults to the
  #                          first three characters of the target id.
  def add_alignments(opts=Hash.new)
    settings = { :min_identity=>90 }.merge(opts)
    exonerate_filename = settings[:exonerate_file]
    arm_selection = settings[:arm_selection] || lambda { |contig_name| contig_name[0, 3] }

    File.open(exonerate_filename) do |f|
      f.each_line do |line|
        record = Bio::DB::Exonerate::Alignment.parse_custom(line)
        next unless record && record.identity >= settings[:min_identity]

        snp_array = @snp_map[record.query_id]
        next if snp_array.nil?

        snp_array.each do |snp|
          next if snp.nil?
          next unless snp.position.between?(record.query_start + 1, record.query_end)

          begin
            exon = record.exon_on_gene_position(snp.position)
            snp.add_exon(exon, arm_selection.call(record.target_id))
          rescue Bio::DB::Exonerate::ExonerateException
            # Best effort: report the failing range and carry on with the next SNP.
            $stderr.puts "Failed for the range #{record.query_start}-#{record.query_end} for position #{snp.position}"
          end
        end
      end
    end
  end
end
54
+
55
class Bio::DB::Primer3::SNP
  # String form of the SNP: the gene, a colon, and the chromosome of snp_from.
  def to_s
    format("%s:%s", gene, snp_from.chromosome)
  end
end
60
+
61
class Bio::DB::Primer3::Primer3Record

  # Returns the preferred primer pair among @primerPairs, memoized in
  # @best_pair. A pair replaces the current choice when it has fewer
  # upper-case bases (left + right sequences) than the recorded count, or when
  # its size is smaller than the size of the current choice. @total_caps holds
  # the upper-case count of the pair that was selected last.
  def best_pair
    return @best_pair if @best_pair

    @best_pair = nil
    @total_caps = 100
    @primerPairs.each do |candidate|
      caps = "#{candidate.left.sequence}#{candidate.right.sequence}".scan(/[A-Z]/).length
      if @best_pair.nil?
        # The first pair is always taken as the starting point.
        @best_pair, @total_caps = candidate, caps
        next
      end
      @best_pair, @total_caps = candidate, caps if caps < @total_caps
      @best_pair, @total_caps = candidate, caps if candidate.size < @best_pair.size
    end

    @best_pair
  end

  # Splits sequence_id on spaces into its seven fields and caches them in
  # instance variables. Example header:
  #   CL3339Contig1:T509C AvocetS chromosome_specific exon 4D forward
  def parse_header
    @snp, @line, @type, @in, @polymorphism, @chromosome, @orientation = sequence_id.split(" ")
    @type = @type.to_sym
    if @in
      @in = (@in.to_sym == :exon)
    else
      # NOTE(review): this branch writes @exon while the branch above writes
      # @in; confirm against the exon? accessor which variable is intended.
      @exon = false
    end

    @homoeologous = (@polymorphism.to_sym == :homoeologous)
    @parsed = true
    @orientation = @orientation.to_sym
  end

  # Score of the record: the configured score for its type, plus the :exon
  # score when exon? holds, minus 10 per upper-case base in the best pair and
  # minus the product length.
  def score
    pair = best_pair
    caps = "#{pair.left.sequence}#{pair.right.sequence}".scan(/[A-Z]/).length
    points = 0 + @scores[type]
    points += @scores[:exon] if exon?
    points - (caps * 10) - product_length
  end

  # String form of the record: the gene, a colon, and the chromosome of snp_from.
  def to_s
    format("%s:%s", gene, snp_from.chromosome)
  end

  # Returns a fresh copy of the left primer; +snp+ is accepted but not used.
  def left_primer_snp(snp)
    String.new(left_primer)
  end

end
129
+
130
# Default settings for the run; the command-line switches parsed below
# override them.
options = {
  :aligner => :blast,
  :model => "est2genome",
  :min_identity => 90,
  :extract_found_contigs => true,
  :arm_selection => Bio::PolyploidTools::ChromosomeArm.getArmSelection("nrgene"),
  :genomes_count => 3,
  :variation_free_region => 0
}

# The primer3 thermodynamic tables live in conf/primer3_config, one level above
# this script's directory. The path is resolved relative to the script's
# directory (rather than by string concatenation without a separator) so it is
# also correct when File.dirname(__FILE__) is ".".
options[:primer_3_preferences] = {
  :primer_product_size_range => "50-150",
  :primer_max_size => 25,
  :primer_lib_ambiguity_codes_consensus => 1,
  :primer_liberal_base => 1,
  :primer_num_return => 5,
  :primer_explain_flag => 1,
  :primer_thermodynamic_parameters_path => File.expand_path('../conf/primer3_config', File.dirname(__FILE__)) + '/'
}

# false means "not given"; the reference is used as the database later on.
options[:database] = false

OptionParser.new do |opts|
  opts.banner = "Usage: polymarker_deletions.rb [options]"

  opts.on("-m", "--sequences FASTA", "Sequence of the region to search") do |o|
    options[:sequences] = o
  end

  opts.on("-r", "--reference FASTA", "reference with the contigs") do |o|
    options[:reference] = o
  end

  opts.on("-o", "--output DIR", "Directory to write the output") do |o|
    options[:output] = o
  end

  opts.on("-g", "--genomes_count INT", "Number of genomes (default 3, for hexaploid)") do |o|
    options[:genomes_count] = o.to_i
  end

  opts.on("-x", "--extract_found_contigs", "If present, save in a separate file the contigs with matches. Useful to debug.") do
    options[:extract_found_contigs] = true
  end

  opts.on("-d", "--database PREFIX", "Path to the blast database. Only used if the aligner is blast. The default is the name of the contigs file without extension.") do |o|
    options[:database] = o
  end

  opts.on("-a", "--arm_selection #{Bio::PolyploidTools::ChromosomeArm.getValidFunctions.join('|')}", "Function to decide the chromome arm") do |o|
    options[:arm_selection] = Bio::PolyploidTools::ChromosomeArm.getArmSelection(o)
  end
end.parse!
186
+ #reference="/Users/ramirezr/Documents/TGAC/references/Triticum_aestivum.IWGSP1.21.dna_rm.genome.fa"
187
# Validate the mandatory arguments before touching the file system.
reference = options[:reference]
raise ArgumentError, "Reference has to be provided" unless reference
sequences = options[:sequences]
raise ArgumentError, "Fasta file with sequences has to be provided" unless sequences
output_folder = options[:output]
raise ArgumentError, "An output directory has to be provided" unless output_folder
model = options[:model]

# Without an explicit database the reference itself is used as the target.
options[:database] = options[:reference] unless options[:database]

# Dir.mkdir raises a SystemCallError when the directory cannot be created,
# e.g. because it already exists.
Dir.mkdir(output_folder)
min_identity = options[:min_identity]

# Every intermediate and final file is written inside the output folder.
exonerate_file = "#{output_folder}/exonerate_tmp.tab"

primer_3_input = "#{output_folder}/primer_3_input_temp"
primer_3_output = "#{output_folder}/primer_3_output_temp"
exons_filename = "#{output_folder}/exons_genes_and_contigs.fa"
output_primers = "#{output_folder}/primers.csv"
output_to_order = "#{output_folder}/primers_to_order.csv"

fasta_file = Bio::DB::Fasta::FastaFile.new({:fasta=>reference})
fasta_file.load_fai_entries

# Labels of the two lines used in the primer3 input and in the output header.
original_name = "A"
snp_in = "B"

arm_selection = options[:arm_selection]
215
+
216
# Records one alignment: when its identity is above +min_identity+ its line is
# appended to +exo_f+, and the first time a target contig is seen it is looked
# up in the index of +fasta_file+.
#
# aln           - alignment responding to identity, line and target_id.
# exo_f         - IO the accepted alignment lines are written to.
# found_contigs - Set of target ids already checked; updated in place.
# min_identity  - identity threshold (exclusive).
# fasta_file    - indexed reference used to check that the contig exists.
# options       - option hash; :reference names the index in the error message.
#
# Raises Bio::DB::Exonerate::ExonerateException when the target contig is not
# in the index.
def do_align(aln, exo_f, found_contigs, min_identity, fasta_file, options)
  return unless aln.identity > min_identity

  exo_f.puts aln.line
  # Each contig is only checked once.
  return if found_contigs.include?(aln.target_id)

  found_contigs.add(aln.target_id)
  entry = fasta_file.index.region_for_entry(aln.target_id)
  return unless entry.nil?

  raise Bio::DB::Exonerate::ExonerateException.new, "Entry not found! #{aln.target_id}. Make sure that the #{options[:reference]}.fai was generated properly."
end

begin
  log "Reading exons"
  exons = []
  Bio::FlatFile.auto(sequences) do |ff|
    ff.each do |entry|
      # Build the "name,chromosome,sequence" line expected by NoSNPSequence.parse.
      line = [entry.definition, arm_selection.call(entry.definition), entry.seq].join(",")
      snp = Bio::PolyploidTools::NoSNPSequence.parse(line)
      snp.genomes_count = options[:genomes_count]
      exons << snp
    end
  end

  log "Searching markers in genome"
  found_contigs = Set.new
  # Block form so the alignment file is closed even when an aligner fails.
  File.open(exonerate_file, "w") do |exo_f|
    if options[:aligner] == :blast
      Bio::DB::Blast.align({:query=>sequences, :target=>options[:database], :model=>model, :max_hits=>options[:max_hits]}) do |aln|
        do_align(aln, exo_f, found_contigs, min_identity, fasta_file, options)
      end
    end

    if options[:aligner] == :exonerate
      # The reference is the alignment target for exonerate.
      Bio::DB::Exonerate.align({:query=>sequences, :target=>reference, :model=>model}) do |aln|
        do_align(aln, exo_f, found_contigs, min_identity, fasta_file, options)
      end
    end
  end

  log "Reading best alignment on each chromosome"

  container = Bio::PolyploidTools::ExonContainer.new
  # NOTE(review): no visible option sets :flanking_size, so this assigns nil;
  # each exon gets its own flanking size of 200 below.
  container.flanking_size = options[:flanking_size]
  container.gene_models(sequences)
  container.chromosomes(reference)
  container.add_parental({:name=>"A"})
  container.add_parental({:name=>"B"})
  exons.each do |exon|
    exon.container = container
    exon.flanking_size = 200
    exon.variation_free_region = options[:variation_free_region]
    container.add_snp(exon)
  end
  container.add_alignments(
    {:exonerate_file=>exonerate_file,
     :arm_selection=>options[:arm_selection],
     :min_identity=>min_identity})

  #4.1 generating primer3 file
  log "Running primer3"
  File.open(exons_filename, "w") { |file| container.print_fasta_snp_exones(file) }

  added_exons = File.open(primer_3_input, "w") do |file|
    Bio::DB::Primer3.prepare_input_file(file, options[:primer_3_preferences])
    container.print_primer_3_exons(file, nil, snp_in)
  end

  # primer3 is only run when at least one exon was written to its input.
  Bio::DB::Primer3.run({:in=>primer_3_input, :out=>primer_3_output}) if added_exons > 0

  #5. Pick the best primer and make the primer3 output
  log "Selecting best primers"
  kasp_container = Bio::DB::Primer3::KASPContainer.new
  kasp_container.line_1 = original_name
  kasp_container.line_2 = snp_in

  if options[:scoring] == :het_dels
    kasp_container.scores = {
      :chromosome_specific => 0,
      :chromosome_semispecific => 1000,
      :chromosome_nonspecific => 100
    }
  end

  exons.each { |snp| kasp_container.add_snp(snp) }

  kasp_container.add_primers_file(primer_3_output) if added_exons > 0
  header = "Marker,SNP,RegionSize,chromosome,total_contigs,contig_regions,SNP_type,#{original_name},#{snp_in},common,primer_type,orientation,#{original_name}_TM,#{snp_in}_TM,common_TM,selected_from,product_size,errors,repetitive,blast_hits"
  File.open(output_primers, 'w') { |f| f.write("#{header}\n#{kasp_container.print_primers}") }

  out_fasta_products = "#{output_folder}/products.fa"
  File.open(out_fasta_products, 'w') do |f|
    kasp_container.snp_hash.each_pair do |_name, kasp_snp|
      f.write(kasp_snp.realigned_primers_fasta)
    end
  end

  File.open(output_to_order, "w") { |io| io.write(kasp_container.print_primers_with_tails()) }

  log "DONE"
rescue Exception => e
  # Deliberately broad: every failure is logged with its backtrace and then
  # re-raised unchanged, so nothing is swallowed.
  log "ERROR\t#{e.message}"
  $stderr.puts e.backtrace
  raise
end
342
+ #puts container.inspect
343
+
344
+ #container.snp_map.each do | gene, snp_array|
345
+ # snp_array.each do |e|
346
+ # puts e.inspect
347
+ # puts e.aligned_sequences_fasta
348
+ # end
349
+ #end
350
+
@@ -0,0 +1,101 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ #This script converts a file with SNPs and positions with the header:
4
+ #GENE,BASE,POS,SNP,Chromosome
5
+ # snp.gene, snp.original, snp.position, snp.snp, snp.chromosome
6
+ #To the input expected by polymarker
7
+ #ID, Chromosome, sequence
8
+ #With sequence containing the SNP in the notation "[A/T]"
9
+ require 'bio'
10
+ require 'optparse'
11
+
12
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
13
+ $: << File.expand_path('.')
14
+ path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
15
+ require path
16
+
17
+
18
# Prints +msg+ on standard output preceded by a millisecond-resolution
# timestamp of the current local time.
def log(msg)
  now = Time.now
  puts now.strftime("%Y-%m-%d %H:%M:%S.%L") + ": #{msg}"
end
22
+
23
options = {}
options[:flanking_size] = 100
# Path of the input list; set by whichever of --snp_file / --mutant_list is given.
test_file = ''
OptionParser.new do |opts|
  opts.banner = "Usage: snp_postion_to_polymarker.rb [options]"

  opts.on("-s", "--snp_file CSV", "CSV file with the following columnns:\nID,Allele_1,position,Allele_1,target_chromosome") do |o|
    options[:snp_file] = o
    test_file = o
  end
  opts.on("-r", "--reference FASTA", "reference with the genes/contings/marker seuqnece") do |o|
    options[:reference] = o
  end
  opts.on("-o", "--out CSV", "Output file ") do |o|
    options[:output] = o
  end
  opts.on("-f", "--flanking_size INT", "Flanking size around the SNP") do |o|
    options[:flanking_size] = o.to_i
  end

  opts.on("-t", "--mutant_list FILE", "File with the list of positions with mutation and the mutation line. Example: IWGSC_CSS_1AL_scaff_1455974,Kronos2281,127,C,T\n" \
    "requires --reference to get the sequence using a position") do |o|
    options[:mutant_list] = o
    test_file = o
  end
end.parse!

# Fail early with a clear message instead of crashing on a nil reference or
# on File.open('').
raise ArgumentError, "A reference FASTA has to be provided (--reference)" unless options[:reference]
raise ArgumentError, "Either --snp_file or --mutant_list has to be provided" if test_file.empty?

fasta_reference = options[:reference]
fasta_reference_db = Bio::DB::Fasta::FastaFile.new({:fasta=>fasta_reference})
fasta_reference_db.load_fai_entries

# Results go to --out when given, otherwise to standard output.
out = options[:output] ? File.open(options[:output], "w") : $stdout
begin
  File.open(test_file) do |f|
    f.each_line do |line|
      snp = nil
      entry = nil
      # --snp_file takes precedence when both input options are present.
      if options[:snp_file]
        snp = Bio::PolyploidTools::SNP.parse(line)
        entry = fasta_reference_db.index.region_for_entry(snp.gene)
      elsif options[:mutant_list]
        snp = Bio::PolyploidTools::SNPMutant.parse(line)
        entry = fasta_reference_db.index.region_for_entry(snp.contig)
      end

      if entry
        region = entry.get_full_region
        # Taken before the position is rewritten below.
        snp_name = snp.snp_id_in_seq

        # Narrow the region to the window around the SNP and move the SNP
        # position into that window's coordinates.
        start, total, new_position = snp.to_polymarker_coordinates(options[:flanking_size])
        region.start = start
        region.end = start + total
        local_template = fasta_reference_db.fetch_sequence(region)

        snp.position = new_position
        snp.template_sequence = local_template

        out.puts "#{snp.gene}_#{snp_name},#{snp.chromosome},#{snp.to_polymarker_sequence(options[:flanking_size])}"
      else
        $stderr.puts "ERROR: Unable to find entry for #{snp.gene}"
      end
    end
  end
ensure
  # Only close handles opened here; $stdout is left alone.
  out.close if options[:output]
end
101
+
@@ -0,0 +1,107 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'bio'
4
+ require 'rubygems'
5
+ require 'pathname'
6
+ require 'bio-samtools-wrapper'
7
+
8
+ require 'set'
9
+
10
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
11
+ $: << File.expand_path('.')
12
+ path=File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
13
+ $stderr.puts "Loading: #{path}"
14
+ require path
15
+
16
+
17
+
18
# Positional arguments:
#   ARGV[0] reference FASTA      ARGV[1] first BAM      ARGV[2] second BAM
#   ARGV[3] output prefix        ARGV[4] minimum coverage (default 10)
#   ARGV[5] chunk index          ARGV[6] chunk size
# Only the reference entries with index in [chunk * chunk_size,
# chunk * chunk_size + chunk_size) are processed.

# FastaFile is given an option hash, like every other call site in this package.
fasta_db = Bio::DB::Fasta::FastaFile.new({:fasta=>ARGV[0]})
fasta_db.load_fai_entries
bam1 = Bio::DB::Sam.new({:fasta=>ARGV[0], :bam=>ARGV[1]})
bam2 = Bio::DB::Sam.new({:fasta=>ARGV[0], :bam=>ARGV[2]})

output_prefix = ARGV[3]

block_size = 1000

# nil.to_i is 0, which is truthy in Ruby, so the presence of the argument
# itself has to be tested for the default to apply.
min_cov = ARGV[4] ? ARGV[4].to_i : 10
chunk = ARGV[5].to_i
chunk_size = ARGV[6].to_i

main_table = "#{output_prefix}_#{block_size}_#{min_cov}_table.#{chunk}.csv"

hist_1 = Hash.new(0)
hist_2 = Hash.new(0)

min = chunk * chunk_size
max = min + chunk_size

# Block forms guarantee both output files are closed even if a region fails hard.
File.open(main_table, "w") do |table_file|
  table_file.puts "gene\tlength\tsnps_1\tcalled_1\tsnps_per_#{block_size}_1\tsnps_2\tcalled_2\tsnps_per_#{block_size}_2\tsnps_tot\tsnps_per_1k_tot"

  File.open("#{output_prefix}_#{min_cov}.#{chunk}.fa", "w") do |fasta_file|
    fasta_db.index.entries.each_with_index do |r, i|
      next if i < min || i >= max

      region = r.get_full_region

      begin
        reg_a = bam1.fetch_region({:region=>region, :min_cov=>min_cov, :A=>1})
        reg_b = bam2.fetch_region({:region=>region, :min_cov=>min_cov, :A=>1})
        cons_1 = reg_a.consensus
        cons_2 = reg_b.consensus

        snps_1 = cons_1.count_ambiguities
        snps_2 = cons_2.count_ambiguities

        called_1 = reg_a.called
        called_2 = reg_b.called

        snps_tot = Bio::Sequence.snps_between(cons_1, cons_2)

        # Densities are normalised to block_size bases.
        snps_per_1k_1 = (block_size * snps_1.to_f) / region.size
        snps_per_1k_2 = (block_size * snps_2.to_f) / region.size
        snps_per_1k_tot = (block_size * snps_tot.to_f) / region.size

        hist_1[snps_per_1k_1.to_i] += 1
        hist_2[snps_per_1k_2.to_i] += 1

        table_file.print "#{r.id}\t#{region.size}\t"
        table_file.print "#{snps_1}\t#{called_1}\t#{snps_per_1k_1}\t"
        table_file.print "#{snps_2}\t#{called_2}\t#{snps_per_1k_2}\t"
        table_file.print "#{snps_tot}\t#{snps_per_1k_tot}\n"
        fasta_file.puts ">#{r.id}_1"
        fasta_file.puts "#{cons_1}"
        fasta_file.puts ">#{r.id}_2"
        fasta_file.puts "#{cons_2}"
      rescue StandardError => e
        # Best effort per region: report and continue. StandardError (not
        # Exception) so that Ctrl-C and exit requests still stop the run.
        $stderr.puts "Unable to process #{region}: #{e.to_s}"
      end
    end
  end
end

hist_table = "#{output_prefix}_#{block_size}_#{min_cov}_hist.#{chunk}.csv"
File.open(hist_table, "w") do |hist_file|
  hist_file.puts "SNPs/#{block_size}\thist_1\thist_2\n"
  # Sorted union of the integer bins of both histograms (SortedSet is no
  # longer part of the set library in Ruby 3).
  (hist_1.keys | hist_2.keys).sort.each do |k|
    hist_file.puts "#{k}\t#{hist_1[k]}\t#{hist_2[k]}"
  end
end
105
+
106
+
107
+
data/bin/tag_stats.rb ADDED
@@ -0,0 +1,75 @@
1
+ #!/usr/bin/env ruby
2
+ require 'optparse'
3
+
4
+ require 'csv'
5
+ require 'fileutils'
6
+ require 'tmpdir'
7
+ require 'bio-samtools-wrapper'
8
+ require 'bio'
9
+ require 'descriptive_statistics'
10
+
11
class Bio::DB::Tag
  # Fills the tag from a fixed-layout string such as "NH:i:5": characters 0-1
  # are the tag name, character 3 the type and everything from character 5 on
  # the value. The value is converted to an Integer when the type is "i".
  def set(str)
    @tag = str[0, 2]
    @type = str.slice(3)
    @value = str.slice(5..-1)
    if @type == "i"
      @value = @value.to_i
    end
  end
end
19
+
20
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
21
+ $: << File.expand_path('.')
22
+ path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
23
+ require path
24
opts = {}
opts[:tag] = "NH"
opts[:bam] = nil
opts[:out] = nil
opts[:ref] = nil

OptionParser.new do |parser|
  parser.banner = "Usage: tag_stats.rb [options]"

  parser.on("-t", "--tag str", "The tag to extract (default NH)") do |v|
    opts[:tag] = v
  end

  parser.on("-b", "--bam FILE" , "BAM file with the alignments ") do |v|
    opts[:bam] = v
  end

  parser.on("-o", "--out_file CHAR", "File to save the stats") do |v|
    opts[:out] = v
  end

  parser.on("-r", "--reference FILE", "Fasta file with the reference") do |v|
    opts[:ref] = v
  end
end.parse!

# Fail early with a clear message; File.basename(nil) below would otherwise
# raise a TypeError.
raise ArgumentError, "A BAM file has to be provided (--bam)" unless opts[:bam]

bam = Bio::DB::Sam.new(fasta: opts[:ref], bam: opts[:bam])
tag = opts[:tag]

sample = File.basename(opts[:bam], '.sorted.bam')
to_print = [:sum, :min, :max, :mean, :mode, :median, :q1, :q2, :q3]
#Add the 90, 95, 97.5 and 99 percentiles.
percentiles = [90, 95, 97.5, 99]

# Statistics go to --out_file when given, otherwise to standard output.
out = opts[:out] ? File.open(opts[:out], "w") : $stdout

# Writes one tab-separated line per statistic for the values collected on +ref+.
print_stats = lambda do |ref, values|
  desc_stats = values.descriptive_statistics
  to_print.each { |e| out.puts [sample, ref, e, desc_stats[e]].join("\t") }
  percentiles.each { |e| out.puts [sample, ref, "P#{e}", values.percentile(e)].join("\t") }
  out.puts [sample, ref, "N", values.length].join("\t")
end

begin
  last_ref = ""
  values = []
  bam.view do |aln|
    if last_ref != aln.rname
      # A new reference starts: flush what was collected for the previous one.
      print_stats.call(last_ref, values) if last_ref != ""
      values.clear
      last_ref = aln.rname
    end
    values << aln.tags[tag].value
  end
  # The last reference has no successor to trigger the flush above, so its
  # statistics are written here.
  print_stats.call(last_ref, values) if last_ref != ""
ensure
  # Only close handles opened here; $stdout is left alone.
  out.close if opts[:out]
end
@@ -0,0 +1,56 @@
1
+ require 'bio-samtools-wrapper'
2
+ require 'optparse'
3
+
4
+ $: << File.expand_path(File.dirname(__FILE__) + '/../lib')
5
+ $: << File.expand_path('.')
6
+ path=File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
7
+
8
+
9
+
10
+
11
# Parses one VCF "##INFO=<...>" meta-information line, e.g.
#   ##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data">
#
# head_line - the header line to parse (default "").
#
# Returns a Hash with the keys :id, :number, :type and :desc (all Strings), or
# nil when the line is not a well-formed INFO declaration. Fields after the
# description (such as Source= or Version=) are accepted and ignored.
def parseVCFheader(head_line="")
  m = /##INFO=<ID=([^,]+),Number=([^,]+),Type=([^,]+),Description="(.*?)"(?:,\w+=.*)?>/.match(head_line)
  # Callers test the result for truthiness, so "no match" is reported as nil.
  return nil unless m

  { :id => m[1], :number => m[2], :type => m[3], :desc => m[4] }
end
18
+
19
+
20
# Maps each INFO id declared in the header to its description.
header_info = {}
# Sample names from the #CHROM line. Declared outside the loop: a variable
# first assigned inside the block would be reset to nil on every line.
sample_names = nil

ARGF.each_line do |line|
  if line.start_with?("##INFO")
    h = parseVCFheader(line)
    header_info[h[:id]] = h[:desc] if h
  end
  next if line.start_with?("##")

  if line.start_with?("#CHROM")
    # The first nine columns are dropped; the remaining ones are the samples.
    sample_names = line.split.drop(9)
    next
  end

  line.chomp!

  vcf = Bio::DB::Vcf.new(line, sample_names)
  vcf.info.each_pair { |name, val| puts "#{name}\t#{val}\t#{header_info[name]}" }

  puts "____"
  vcf.samples.each_with_index do |sample, i|
    # The field names are printed once, before the first sample's values.
    puts sample[1].keys.join("\t") if i == 0
    puts sample[1].values.join("\t")
  end
end