bio-polymarker 1.3.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.travis.yml +24 -0
- data/Gemfile +23 -0
- data/README.md +205 -0
- data/Rakefile +61 -0
- data/SECURITY.md +16 -0
- data/VERSION +1 -0
- data/bin/bfr.rb +128 -0
- data/bin/blast_triads.rb +166 -0
- data/bin/blast_triads_promoters.rb +192 -0
- data/bin/count_variations.rb +36 -0
- data/bin/filter_blat_by_target_coverage.rb +69 -0
- data/bin/filter_exonerate_by_identity.rb +38 -0
- data/bin/find_best_blat_hit.rb +33 -0
- data/bin/find_best_exonerate.rb +17 -0
- data/bin/get_longest_hsp_blastx_triads.rb +66 -0
- data/bin/hexaploid_primers.rb +168 -0
- data/bin/homokaryot_primers.rb +183 -0
- data/bin/mafft_triads.rb +120 -0
- data/bin/mafft_triads_promoters.rb +403 -0
- data/bin/map_markers_to_contigs.rb +66 -0
- data/bin/marker_to_vcf.rb +241 -0
- data/bin/markers_in_region.rb +42 -0
- data/bin/mask_triads.rb +169 -0
- data/bin/polymarker.rb +410 -0
- data/bin/polymarker_capillary.rb +443 -0
- data/bin/polymarker_deletions.rb +350 -0
- data/bin/snp_position_to_polymarker.rb +101 -0
- data/bin/snps_between_bams.rb +107 -0
- data/bin/tag_stats.rb +75 -0
- data/bin/vcfLineToTable.rb +56 -0
- data/bin/vcfToPolyMarker.rb +82 -0
- data/bio-polymarker.gemspec +227 -0
- data/conf/defaults.rb +1 -0
- data/conf/primer3_config/dangle.dh +128 -0
- data/conf/primer3_config/dangle.ds +128 -0
- data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
- data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
- data/conf/primer3_config/interpretations/loops_i.dh +34 -0
- data/conf/primer3_config/interpretations/loops_i.ds +31 -0
- data/conf/primer3_config/interpretations/stack_i.dh +257 -0
- data/conf/primer3_config/interpretations/stack_i.ds +256 -0
- data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
- data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
- data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
- data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
- data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
- data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
- data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
- data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
- data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
- data/conf/primer3_config/loops.dh +30 -0
- data/conf/primer3_config/loops.ds +30 -0
- data/conf/primer3_config/stack.dh +256 -0
- data/conf/primer3_config/stack.ds +256 -0
- data/conf/primer3_config/stackmm.dh +256 -0
- data/conf/primer3_config/stackmm.ds +256 -0
- data/conf/primer3_config/tetraloop.dh +77 -0
- data/conf/primer3_config/tetraloop.ds +77 -0
- data/conf/primer3_config/triloop.dh +16 -0
- data/conf/primer3_config/triloop.ds +16 -0
- data/conf/primer3_config/tstack.dh +256 -0
- data/conf/primer3_config/tstack2.dh +256 -0
- data/conf/primer3_config/tstack2.ds +256 -0
- data/conf/primer3_config/tstack_tm_inf.ds +256 -0
- data/lib/bio/BFRTools.rb +465 -0
- data/lib/bio/BIOExtensions.rb +153 -0
- data/lib/bio/PolyploidTools/ChromosomeArm.rb +63 -0
- data/lib/bio/PolyploidTools/ExonContainer.rb +245 -0
- data/lib/bio/PolyploidTools/Marker.rb +175 -0
- data/lib/bio/PolyploidTools/Mask.rb +116 -0
- data/lib/bio/PolyploidTools/NoSNPSequence.rb +292 -0
- data/lib/bio/PolyploidTools/PrimerRegion.rb +30 -0
- data/lib/bio/PolyploidTools/SNP.rb +804 -0
- data/lib/bio/PolyploidTools/SNPMutant.rb +86 -0
- data/lib/bio/PolyploidTools/SNPSequence.rb +55 -0
- data/lib/bio/db/blast.rb +114 -0
- data/lib/bio/db/exonerate.rb +333 -0
- data/lib/bio/db/primer3.rb +820 -0
- data/lib/bio-polymarker.rb +28 -0
- data/test/data/7B_amplicon_test.fa +12 -0
- data/test/data/7B_amplicon_test.fa.fai +1 -0
- data/test/data/7B_amplicon_test_reference.fa +110 -0
- data/test/data/7B_amplicon_test_reference.fa.fai +3 -0
- data/test/data/7B_marker_test.txt +1 -0
- data/test/data/BS00068396_51.fa +2 -0
- data/test/data/BS00068396_51_blast.tab +4 -0
- data/test/data/BS00068396_51_contigs.aln +1412 -0
- data/test/data/BS00068396_51_contigs.dnd +7 -0
- data/test/data/BS00068396_51_contigs.fa +8 -0
- data/test/data/BS00068396_51_contigs.fa.fai +4 -0
- data/test/data/BS00068396_51_contigs.fa.nhr +0 -0
- data/test/data/BS00068396_51_contigs.fa.nin +0 -0
- data/test/data/BS00068396_51_contigs.fa.nsq +0 -0
- data/test/data/BS00068396_51_contigs.nhr +0 -0
- data/test/data/BS00068396_51_contigs.nin +0 -0
- data/test/data/BS00068396_51_contigs.nsq +0 -0
- data/test/data/BS00068396_51_exonerate.tab +6 -0
- data/test/data/BS00068396_51_for_polymarker.txt +1 -0
- data/test/data/BS00068396_51_genes.txt +14 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974.fa +112 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa +2304 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
- data/test/data/LIB1716.bam +0 -0
- data/test/data/LIB1716.bam.bai +0 -0
- data/test/data/LIB1719.bam +0 -0
- data/test/data/LIB1719.bam.bai +0 -0
- data/test/data/LIB1721.bam +0 -0
- data/test/data/LIB1721.bam.bai +0 -0
- data/test/data/LIB1722.bam +0 -0
- data/test/data/LIB1722.bam.bai +0 -0
- data/test/data/PST130_7067.csv +1 -0
- data/test/data/PST130_7067.fa +2 -0
- data/test/data/PST130_7067.fa.fai +1 -0
- data/test/data/PST130_7067.fa.ndb +0 -0
- data/test/data/PST130_7067.fa.nhr +0 -0
- data/test/data/PST130_7067.fa.nin +0 -0
- data/test/data/PST130_7067.fa.not +0 -0
- data/test/data/PST130_7067.fa.nsq +0 -0
- data/test/data/PST130_7067.fa.ntf +0 -0
- data/test/data/PST130_7067.fa.nto +0 -0
- data/test/data/PST130_reverse_primer.csv +1 -0
- data/test/data/S22380157.fa +16 -0
- data/test/data/S22380157.fa.fai +1 -0
- data/test/data/S22380157.vcf +67 -0
- data/test/data/S58861868/LIB1716.bam +0 -0
- data/test/data/S58861868/LIB1716.sam +651 -0
- data/test/data/S58861868/LIB1719.bam +0 -0
- data/test/data/S58861868/LIB1719.sam +805 -0
- data/test/data/S58861868/LIB1721.bam +0 -0
- data/test/data/S58861868/LIB1721.sam +1790 -0
- data/test/data/S58861868/LIB1722.bam +0 -0
- data/test/data/S58861868/LIB1722.sam +1271 -0
- data/test/data/S58861868/S58861868.fa +16 -0
- data/test/data/S58861868/S58861868.fa.fai +1 -0
- data/test/data/S58861868/S58861868.vcf +76 -0
- data/test/data/S58861868/header.txt +9 -0
- data/test/data/S58861868/merged.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam.bai +0 -0
- data/test/data/Test3Aspecific.csv +2 -0
- data/test/data/Test3Aspecific_contigs.fa +6 -0
- data/test/data/bfr_out_test.csv +5 -0
- data/test/data/chr1A_C1145499T/chr1A_C1145499T.csv +1 -0
- data/test/data/chr1A_G540414846C/chr1A_G540414846C.csv +1 -0
- data/test/data/chr1A_G540414846C/chr1A_G540414846C.fa +2 -0
- data/test/data/chr1A_T517634750C/chr1A_T517634750C.csv +1 -0
- data/test/data/chr2D_C112180134A/chr2D_C112180134A.csv +1 -0
- data/test/data/chr4D_C14473543T/chr4D_C14473543T.csv +1 -0
- data/test/data/chr4D_C14473543T/chr4D_C14473543T.fa +2 -0
- data/test/data/headerMergeed.txt +9 -0
- data/test/data/headerS2238015 +1 -0
- data/test/data/mergedLibs.bam +0 -0
- data/test/data/mergedLibsReheader.bam +0 -0
- data/test/data/mergedLibsSorted.bam +0 -0
- data/test/data/mergedLibsSorted.bam.bai +0 -0
- data/test/data/patological_cases5D.csv +1 -0
- data/test/data/primer_3_input_header_test +5 -0
- data/test/data/short_primer_design_test.csv +10 -0
- data/test/data/some_tests/some_tests.csv +201 -0
- data/test/data/test_from_mutant.csv +3 -0
- data/test/data/test_iselect.csv +196 -0
- data/test/data/test_iselect_reference.fa +1868 -0
- data/test/data/test_iselect_reference.fa.fai +934 -0
- data/test/data/test_primer3_error.csv +4 -0
- data/test/data/test_primer3_error_contigs.fa +10 -0
- data/test/test_bfr.rb +135 -0
- data/test/test_blast.rb +47 -0
- data/test/test_exon_container.rb +17 -0
- data/test/test_exonearate.rb +48 -0
- data/test/test_integration.rb +76 -0
- data/test/test_snp_parsing.rb +121 -0
- data/test/test_wrong_selection.sh +5 -0
- metadata +356 -0
@@ -0,0 +1,350 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'bio'
|
3
|
+
require 'rubygems'
|
4
|
+
require 'pathname'
|
5
|
+
require 'bio-samtools-wrapper'
|
6
|
+
require 'optparse'
|
7
|
+
require 'set'
|
8
|
+
$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
|
9
|
+
$: << File.expand_path('.')
|
10
|
+
path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
|
11
|
+
require path
|
12
|
+
|
13
|
+
# Prints +msg+ on standard output, prefixed with the current local time
# formatted as "YYYY-MM-DD HH:MM:SS.mmm: ".
#
# @param msg [#to_s] text to report
# @return [nil]
def log(msg)
  timestamp = Time.now.strftime("%Y-%m-%d %H:%M:%S.%L")
  puts "#{timestamp}: #{msg}"
end
|
17
|
+
|
18
|
+
|
19
|
+
# Script-local extension of the library's ExonContainer.
class Bio::PolyploidTools::ExonContainer
  # Reads an alignment table and, for every registered SNP whose position is
  # covered by an alignment, attaches the exon found at that position.
  #
  # Recognised keys of +opts+:
  # * :exonerate_file - path of the file to read, one alignment per line,
  #   parsed with Bio::DB::Exonerate::Alignment.parse_custom.
  # * :arm_selection  - callable receiving the target id of the alignment;
  #   its result is passed as second argument to snp.add_exon. Defaults to
  #   the first three characters of the name.
  # * :min_identity   - alignments with a lower identity are skipped
  #   (default 90).
  #
  # Positions for which the exon cannot be computed are reported on stderr
  # and skipped.
  def add_alignments(opts=Hash.new)
    opts = { :min_identity=>90 }.merge!(opts)
    exonerate_filename = opts[:exonerate_file]
    arm_selection = opts[:arm_selection] || lambda { |contig_name| contig_name[0, 3] }

    File.open(exonerate_filename) do |f|
      f.each_line do |line|
        record = Bio::DB::Exonerate::Alignment.parse_custom(line)
        next unless record && record.identity >= opts[:min_identity]

        snp_array = @snp_map[record.query_id]
        next if snp_array.nil?

        snp_array.each do |snp|
          next if snp.nil?
          # query_start is shifted by one before being compared with the SNP position.
          next unless snp.position.between?(record.query_start + 1, record.query_end)

          begin
            exon = record.exon_on_gene_position(snp.position)
            snp.add_exon(exon, arm_selection.call(record.target_id))
          rescue Bio::DB::Exonerate::ExonerateException
            $stderr.puts "Failed for the range #{record.query_start}-#{record.query_end} for position #{snp.position}"
          end
        end
      end
    end
  end
end
|
54
|
+
|
55
|
+
# Script-local extension of the library's Primer3 SNP.
class Bio::DB::Primer3::SNP
  # @return [String] label of the form "<gene>:<chromosome of snp_from>"
  def to_s
    "#{gene}:#{snp_from.chromosome}"
  end
end
|
60
|
+
|
61
|
+
# Script-local overrides for the library's Primer3Record.
class Bio::DB::Primer3::Primer3Record

  # Picks one pair out of @primerPairs and memoises it in @best_pair.
  #
  # Pairs are visited in order. A pair replaces the current choice when it is
  # the first one seen, when its two primer sequences contain fewer
  # upper-case letters than the current choice, or when its +size+ is smaller
  # than the size of the current choice. @total_caps tracks the upper-case
  # count of the selected pair.
  #
  # @return the selected pair, or nil when @primerPairs is empty
  def best_pair
    return @best_pair if @best_pair

    @best_pair = nil
    @total_caps = 100
    @primerPairs.each do |primer|
      capital_count = "#{primer.left.sequence}#{primer.right.sequence}".scan(/[A-Z]/).length
      if @best_pair.nil? || capital_count < @total_caps || primer.size < @best_pair.size
        @best_pair = primer
        @total_caps = capital_count
      end
    end

    @best_pair
  end

  # Splits sequence_id on spaces into, in this order:
  #   snp, line, type, in, polymorphism, chromosome, orientation
  #
  # Historical example kept from the original source:
  #   CL3339Contig1:T509C AvocetS chromosome_specific exon 4D forward
  # NOTE(review): that example has six fields while seven are expected here
  # (it has no polymorphism field) - confirm the current header format.
  #
  # @return [Symbol] the parsed orientation
  # @raise [NoMethodError] when type, polymorphism or orientation is missing
  def parse_header
    @snp, @line, @type, @in, @polymorphism, @chromosome, @orientation = self.sequence_id.split(" ")
    @type = @type.to_sym
    if @in
      @in = @in.to_sym == :exon
    else
      # Keep @in a strict boolean; @exon is still cleared as before.
      @in = false
      @exon = false
    end

    @homoeologous = @polymorphism.to_sym == :homoeologous
    @orientation = @orientation.to_sym
    # Only flag the record as parsed once every field was converted.
    @parsed = true
    @orientation
  end

  # Score of the record: @scores[type], plus @scores[:exon] when exon? holds,
  # minus 10 for every upper-case letter in the best pair's primers, minus
  # product_length.
  def score
    pair = best_pair
    total_caps = "#{pair.left.sequence}#{pair.right.sequence}".scan(/[A-Z]/).length
    ret = 0
    ret += @scores[type]
    ret += @scores[:exon] if exon?
    ret -= total_caps * 10
    ret -= product_length
    ret
  end

  # @return [String] label of the form "<gene>:<chromosome of snp_from>"
  def to_s
    "#{gene}:#{snp_from.chromosome}"
  end

  # Returns a fresh copy of left_primer. The +snp+ argument is accepted but
  # not used by this override.
  def left_primer_snp(snp)
    String.new(left_primer)
  end

end
|
129
|
+
|
130
|
+
markers = nil
|
131
|
+
|
132
|
+
options = {}
|
133
|
+
options[:aligner] = :blast
|
134
|
+
options[:model] = "est2genome"
|
135
|
+
options[:min_identity] = 90
|
136
|
+
options[:extract_found_contigs] = true
|
137
|
+
options[:arm_selection] = Bio::PolyploidTools::ChromosomeArm.getArmSelection("nrgene")
|
138
|
+
options[:genomes_count] = 3
|
139
|
+
options[:variation_free_region] =0
|
140
|
+
|
141
|
+
options[:primer_3_preferences] = {
|
142
|
+
:primer_product_size_range => "50-150" ,
|
143
|
+
:primer_max_size => 25 ,
|
144
|
+
:primer_lib_ambiguity_codes_consensus => 1,
|
145
|
+
:primer_liberal_base => 1,
|
146
|
+
:primer_num_return=>5,
|
147
|
+
:primer_explain_flag => 1,
|
148
|
+
:primer_thermodynamic_parameters_path=>File.expand_path(File.dirname(__FILE__) + '../../conf/primer3_config/') + '/'
|
149
|
+
}
|
150
|
+
|
151
|
+
|
152
|
+
options[:database] = false
|
153
|
+
|
154
|
+
|
155
|
+
OptionParser.new do |opts|
|
156
|
+
|
157
|
+
opts.banner = "Usage: polymarker_deletions.rb [options]"
|
158
|
+
|
159
|
+
opts.on("-m", "--sequences FASTA", "Sequence of the region to search") do |o|
|
160
|
+
options[:sequences] = o
|
161
|
+
end
|
162
|
+
opts.on("-r", "--reference FASTA", "reference with the contigs") do |o|
|
163
|
+
options[:reference] = o
|
164
|
+
end
|
165
|
+
opts.on("-o", "--output DIR", "Directory to write the output") do |o|
|
166
|
+
options[:output] = o
|
167
|
+
end
|
168
|
+
|
169
|
+
opts.on("-g", "--genomes_count INT", "Number of genomes (default 3, for hexaploid)") do |o|
|
170
|
+
options[:genomes_count] = o.to_i
|
171
|
+
end
|
172
|
+
|
173
|
+
opts.on("-x", "--extract_found_contigs", "If present, save in a separate file the contigs with matches. Useful to debug.") do |o|
|
174
|
+
options[:extract_found_contigs] = true
|
175
|
+
end
|
176
|
+
|
177
|
+
opts.on("-d", "--database PREFIX", "Path to the blast database. Only used if the aligner is blast. The default is the name of the contigs file without extension.") do |o|
|
178
|
+
options[:database] = o
|
179
|
+
end
|
180
|
+
|
181
|
+
opts.on("-a", "--arm_selection #{Bio::PolyploidTools::ChromosomeArm.getValidFunctions.join('|')}", "Function to decide the chromome arm") do |o|
|
182
|
+
options[:arm_selection] = Bio::PolyploidTools::ChromosomeArm.getArmSelection(o)
|
183
|
+
end
|
184
|
+
|
185
|
+
end.parse!
|
186
|
+
#reference="/Users/ramirezr/Documents/TGAC/references/Triticum_aestivum.IWGSP1.21.dna_rm.genome.fa"
|
187
|
+
# Mandatory command-line arguments. Abort early with a clear message when one
# of them is missing.
reference = options[:reference]
raise ArgumentError, "Reference has to be provided" unless reference
sequences = options[:sequences]
raise ArgumentError, "Fasta file with sequences has to be provided" unless sequences
output_folder = options[:output]
raise ArgumentError, "An output directory has to be provided" unless output_folder
|
193
|
+
model=options[:model]
|
194
|
+
|
195
|
+
options[:database] = options[:reference] unless options[:database]
|
196
|
+
|
197
|
+
Dir.mkdir(output_folder)
|
198
|
+
min_identity= options[:min_identity]
|
199
|
+
|
200
|
+
exonerate_file="#{output_folder}/exonerate_tmp.tab"
|
201
|
+
|
202
|
+
primer_3_input="#{output_folder}/primer_3_input_temp"
|
203
|
+
primer_3_output="#{output_folder}/primer_3_output_temp"
|
204
|
+
exons_filename="#{output_folder}/exons_genes_and_contigs.fa"
|
205
|
+
output_primers="#{output_folder}/primers.csv"
|
206
|
+
output_to_order="#{output_folder}/primers_to_order.csv"
|
207
|
+
|
208
|
+
fasta_file = Bio::DB::Fasta::FastaFile.new({:fasta=>reference})
|
209
|
+
fasta_file.load_fai_entries
|
210
|
+
|
211
|
+
original_name="A"
|
212
|
+
snp_in="B"
|
213
|
+
|
214
|
+
arm_selection = options[:arm_selection]
|
215
|
+
|
216
|
+
begin
|
217
|
+
log "Reading exons"
|
218
|
+
exons = Array.new
|
219
|
+
Bio::FlatFile.auto(sequences) do |ff|
|
220
|
+
ff.each do |entry|
|
221
|
+
fields = Array.new
|
222
|
+
fields << entry.definition
|
223
|
+
fields << arm_selection.call(entry.definition)
|
224
|
+
fields << entry.seq
|
225
|
+
|
226
|
+
line = fields.join(",")
|
227
|
+
snp = Bio::PolyploidTools::NoSNPSequence.parse(line)
|
228
|
+
snp.genomes_count = options[:genomes_count]
|
229
|
+
exons << snp
|
230
|
+
|
231
|
+
end
|
232
|
+
end
|
233
|
+
|
234
|
+
|
235
|
+
|
236
|
+
log "Searching markers in genome"
|
237
|
+
found_contigs = Set.new
|
238
|
+
exo_f = File.open(exonerate_file, "w")
|
239
|
+
|
240
|
+
# Records one alignment when its identity is above the threshold.
#
# aln           - alignment responding to identity, line and target_id.
# exo_f         - writable IO; the raw alignment line is appended to it.
# found_contigs - Set of target ids already checked; updated in place.
# min_identity  - alignments whose identity is not strictly greater are ignored.
# fasta_file    - object whose index.region_for_entry(id) returns nil for
#                 targets missing from the reference index.
# options       - script options; only options[:reference] is read, to name
#                 the index file in the error message.
#
# Returns nil. Raises Bio::DB::Exonerate::ExonerateException when the target
# of an accepted alignment is not present in the reference index.
def do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options)
  return unless aln.identity > min_identity

  exo_f.puts aln.line
  # Each contig is looked up only once, however many alignments hit it.
  return if found_contigs.include?(aln.target_id)

  found_contigs.add(aln.target_id)
  entry = fasta_file.index.region_for_entry(aln.target_id)
  return unless entry.nil?

  raise Bio::DB::Exonerate::ExonerateException, "Entry not found! #{aln.target_id}. Make sure that the #{options[:reference]}.fai was generated properly."
end
|
251
|
+
|
252
|
+
# Run the configured aligner and hand every hit to do_align.
case options[:aligner]
when :blast
  Bio::DB::Blast.align({:query=>sequences, :target=>options[:database], :model=>model, :max_hits=>options[:max_hits]}) do |aln|
    do_align(aln, exo_f, found_contigs, min_identity, fasta_file, options)
  end
when :exonerate
  # The target is the reference FASTA given with --reference; the previous
  # code passed an undefined local here.
  Bio::DB::Exonerate.align({:query=>sequences, :target=>reference, :model=>model}) do |aln|
    do_align(aln, exo_f, found_contigs, min_identity, fasta_file, options)
  end
end
|
259
|
+
|
260
|
+
exo_f.close()
|
261
|
+
|
262
|
+
|
263
|
+
|
264
|
+
log "Reading best alignment on each chromosome"
|
265
|
+
|
266
|
+
container= Bio::PolyploidTools::ExonContainer.new
|
267
|
+
container.flanking_size=options[:flanking_size]
|
268
|
+
container.gene_models(sequences)
|
269
|
+
container.chromosomes(reference)
|
270
|
+
container.add_parental({:name=>"A"})
|
271
|
+
container.add_parental({:name=>"B"})
|
272
|
+
exons.each do |exon|
|
273
|
+
exon.container = container
|
274
|
+
exon.flanking_size = 200
|
275
|
+
exon.variation_free_region = options[:variation_free_region]
|
276
|
+
#puts exon.inspect
|
277
|
+
container.add_snp(exon)
|
278
|
+
|
279
|
+
end
|
280
|
+
container.add_alignments(
|
281
|
+
{:exonerate_file=>exonerate_file,
|
282
|
+
:arm_selection=>options[:arm_selection] ,
|
283
|
+
:min_identity=>min_identity})
|
284
|
+
|
285
|
+
|
286
|
+
|
287
|
+
|
288
|
+
#4.1 generating primer3 file
|
289
|
+
log "Running primer3"
|
290
|
+
file = File.open(exons_filename, "w")
|
291
|
+
container.print_fasta_snp_exones(file)
|
292
|
+
file.close
|
293
|
+
|
294
|
+
file = File.open(primer_3_input, "w")
|
295
|
+
|
296
|
+
Bio::DB::Primer3.prepare_input_file(file, options[:primer_3_preferences])
|
297
|
+
added_exons = container.print_primer_3_exons(file, nil, snp_in)
|
298
|
+
file.close
|
299
|
+
|
300
|
+
Bio::DB::Primer3.run({:in=>primer_3_input, :out=>primer_3_output}) if added_exons > 0
|
301
|
+
|
302
|
+
#5. Pick the best primer and make the primer3 output
|
303
|
+
log "Selecting best primers"
|
304
|
+
kasp_container=Bio::DB::Primer3::KASPContainer.new
|
305
|
+
kasp_container.line_1= original_name
|
306
|
+
kasp_container.line_2= snp_in
|
307
|
+
|
308
|
+
if options[:scoring] == :het_dels
|
309
|
+
kasp_container.scores = Hash.new
|
310
|
+
kasp_container.scores[:chromosome_specific] = 0
|
311
|
+
kasp_container.scores[:chromosome_semispecific] = 1000
|
312
|
+
kasp_container.scores[:chromosome_nonspecific] = 100
|
313
|
+
end
|
314
|
+
|
315
|
+
exons.each do |snp|
|
316
|
+
snpk = kasp_container.add_snp(snp)
|
317
|
+
end
|
318
|
+
|
319
|
+
kasp_container.add_primers_file(primer_3_output) if added_exons > 0
|
320
|
+
header = "Marker,SNP,RegionSize,chromosome,total_contigs,contig_regions,SNP_type,#{original_name},#{snp_in},common,primer_type,orientation,#{original_name}_TM,#{snp_in}_TM,common_TM,selected_from,product_size,errors,repetitive,blast_hits"
|
321
|
+
File.open(output_primers, 'w') { |f| f.write("#{header}\n#{kasp_container.print_primers}") }
|
322
|
+
|
323
|
+
out_fasta_products = "#{output_folder}/products.fa"
|
324
|
+
File.open(out_fasta_products, 'w') do |f|
|
325
|
+
kasp_container.snp_hash.each_pair do |name, kaspSNP|
|
326
|
+
f.write(kaspSNP.realigned_primers_fasta)
|
327
|
+
end
|
328
|
+
end
|
329
|
+
|
330
|
+
File.open(output_to_order, "w") { |io| io.write(kasp_container.print_primers_with_tails()) }
|
331
|
+
|
332
|
+
log "DONE"
|
333
|
+
rescue StandardError => e
|
334
|
+
log "ERROR\t#{e.message}"
|
335
|
+
$stderr.puts e.backtrace
|
336
|
+
raise e
|
337
|
+
rescue Exception => e
|
338
|
+
log "ERROR\t#{e.message}"
|
339
|
+
$stderr.puts e.backtrace
|
340
|
+
raise e
|
341
|
+
end
|
342
|
+
#puts container.inspect
|
343
|
+
|
344
|
+
#container.snp_map.each do | gene, snp_array|
|
345
|
+
# snp_array.each do |e|
|
346
|
+
# puts e.inspect
|
347
|
+
# puts e.aligned_sequences_fasta
|
348
|
+
# end
|
349
|
+
#end
|
350
|
+
|
@@ -0,0 +1,101 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
#This script converts a file with SNPs and positions with the header:
|
4
|
+
#GENE,BASE,POS,SNP,Chromosome
|
5
|
+
# snp.gene, snp.original, snp.position, snp.snp, snp.chromosome
|
6
|
+
#To the input expected by polymarker
|
7
|
+
#ID, Chromosome, sequence
|
8
|
+
#With sequence containing the SNP in the notation "[A/T]"
|
9
|
+
require 'bio'
|
10
|
+
require 'optparse'
|
11
|
+
|
12
|
+
$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
|
13
|
+
$: << File.expand_path('.')
|
14
|
+
path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
|
15
|
+
require path
|
16
|
+
|
17
|
+
|
18
|
+
# Prints +msg+ on standard output, prefixed with the current local time
# formatted as "YYYY-MM-DD HH:MM:SS.mmm: ".
#
# @param msg [#to_s] text to report
# @return [nil]
def log(msg)
  timestamp = Time.now.strftime("%Y-%m-%d %H:%M:%S.%L")
  puts "#{timestamp}: #{msg}"
end
|
22
|
+
|
23
|
+
markers = nil
|
24
|
+
|
25
|
+
options = {}
|
26
|
+
options[:flanking_size] = 100
|
27
|
+
test_file=''
|
28
|
+
OptionParser.new do |opts|
|
29
|
+
|
30
|
+
opts.banner = "Usage: snp_postion_to_polymarker.rb [options]"
|
31
|
+
|
32
|
+
opts.on("-s", "--snp_file CSV", "CSV file with the following columnns:\nID,Allele_1,position,Allele_1,target_chromosome") do |o|
|
33
|
+
options[:snp_file] = o
|
34
|
+
test_file = o
|
35
|
+
end
|
36
|
+
opts.on("-r", "--reference FASTA", "reference with the genes/contings/marker seuqnece") do |o|
|
37
|
+
options[:reference] = o
|
38
|
+
end
|
39
|
+
opts.on("-o", "--out CSV", "Output file ") do |o|
|
40
|
+
options[:output] = o
|
41
|
+
end
|
42
|
+
opts.on("-f", "--flanking_size INT", "Flanking size around the SNP") do |o|
|
43
|
+
options[:flanking_size] = o.to_i
|
44
|
+
end
|
45
|
+
|
46
|
+
opts.on("-t", "--mutant_list FILE", "File with the list of positions with mutation and the mutation line. Example: IWGSC_CSS_1AL_scaff_1455974,Kronos2281,127,C,T\n\
|
47
|
+
requires --reference to get the sequence using a position") do |o|
|
48
|
+
options[:mutant_list] = o
|
49
|
+
test_file = o
|
50
|
+
end
|
51
|
+
|
52
|
+
end.parse!
|
53
|
+
#reference="/Users/ramirezr/Documents/TGAC/references/Triticum_aestivum.IWGSP1.21.dna_rm.genome.fa"
|
54
|
+
|
55
|
+
fasta_reference = options[:reference] if options[:reference]
|
56
|
+
fasta_reference_db = Bio::DB::Fasta::FastaFile.new({:fasta=>fasta_reference})
|
57
|
+
fasta_reference_db.load_fai_entries
|
58
|
+
|
59
|
+
out = $stdout
|
60
|
+
lastRegion = nil
|
61
|
+
lastTemplate = nil
|
62
|
+
out = File.open(options[:output], "w") if options[:output]
|
63
|
+
File.open(test_file) do | f |
|
64
|
+
f.each_line do | line |
|
65
|
+
snp = nil
|
66
|
+
entry = nil
|
67
|
+
if options[:snp_file]
|
68
|
+
snp = Bio::PolyploidTools::SNP.parse(line)
|
69
|
+
entry = fasta_reference_db.index.region_for_entry(snp.gene)
|
70
|
+
elsif options[:mutant_list]
|
71
|
+
snp = Bio::PolyploidTools::SNPMutant.parse(line)
|
72
|
+
entry = fasta_reference_db.index.region_for_entry(snp.contig)
|
73
|
+
end
|
74
|
+
#puts line
|
75
|
+
if entry
|
76
|
+
region = entry.get_full_region
|
77
|
+
snp_name = snp.snp_id_in_seq
|
78
|
+
|
79
|
+
#if region != lastRegion
|
80
|
+
# lastTemplate = fasta_reference_db.fetch_sequence(region)
|
81
|
+
#end
|
82
|
+
start, total, new_position = snp.to_polymarker_coordinates(options[:flanking_size])
|
83
|
+
region.start = start
|
84
|
+
region.end = start + total
|
85
|
+
#puts region
|
86
|
+
local_template = fasta_reference_db.fetch_sequence(region)
|
87
|
+
|
88
|
+
snp.position = new_position
|
89
|
+
|
90
|
+
snp.template_sequence = local_template
|
91
|
+
lastRegion = region
|
92
|
+
|
93
|
+
out.puts "#{snp.gene}_#{snp_name},#{snp.chromosome},#{snp.to_polymarker_sequence(options[:flanking_size])}"
|
94
|
+
else
|
95
|
+
$stderr.puts "ERROR: Unable to find entry for #{snp.gene}"
|
96
|
+
end
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
100
|
+
out.close if options[:output]
|
101
|
+
|
@@ -0,0 +1,107 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'bio'
|
4
|
+
require 'rubygems'
|
5
|
+
require 'pathname'
|
6
|
+
require 'bio-samtools-wrapper'
|
7
|
+
|
8
|
+
require 'set'
|
9
|
+
|
10
|
+
$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
|
11
|
+
$: << File.expand_path('.')
|
12
|
+
path=File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
|
13
|
+
$stderr.puts "Loading: #{path}"
|
14
|
+
require path
|
15
|
+
|
16
|
+
|
17
|
+
|
18
|
+
# Positional arguments: ARGV[0] reference FASTA, ARGV[1] and ARGV[2] the two
# BAM files to compare.
# The reference is passed as {:fasta=>path}, the form used by the other
# scripts of this package when building a FastaFile.
# NOTE(review): FastaFile's constructor is not visible here - confirm it
# expects the options hash rather than a bare path.
fasta_db = Bio::DB::Fasta::FastaFile.new({:fasta=>ARGV[0]})
fasta_db.load_fai_entries
bam1 = Bio::DB::Sam.new({:fasta=>ARGV[0], :bam=>ARGV[1]})
bam2 = Bio::DB::Sam.new({:fasta=>ARGV[0], :bam=>ARGV[2]})
|
22
|
+
|
23
|
+
|
24
|
+
output_prefix = ARGV[3]
|
25
|
+
|
26
|
+
block_size=1000
|
27
|
+
|
28
|
+
# Minimum coverage (fifth positional argument), 10 when omitted.
# The presence of the argument is tested rather than its integer value:
# nil.to_i is 0 and 0 is truthy in Ruby, so testing ARGV[4].to_i could never
# select the default.
min_cov = ARGV[4] ? ARGV[4].to_i : 10
|
29
|
+
chunk = ARGV[5].to_i
|
30
|
+
chunk_size = ARGV[6].to_i
|
31
|
+
|
32
|
+
|
33
|
+
|
34
|
+
|
35
|
+
main_table="#{output_prefix}_#{block_size}_#{min_cov}_table.#{chunk}.csv"
|
36
|
+
|
37
|
+
table_file = File.open(main_table, "w")
|
38
|
+
table_file.puts "gene\tlength\tsnps_1\tcalled_1\tsnps_per_#{block_size}_1\tsnps_2\tcalled_2\tsnps_per_#{block_size}_2\tsnps_tot\tsnps_per_1k_tot"
|
39
|
+
|
40
|
+
hist_1= Hash.new(0)
|
41
|
+
hist_2= Hash.new(0)
|
42
|
+
|
43
|
+
fasta_file = File.open("#{output_prefix}_#{min_cov}.#{chunk}.fa", "w")
|
44
|
+
i = -1
|
45
|
+
min = chunk * chunk_size
|
46
|
+
max = min + chunk_size
|
47
|
+
|
48
|
+
fasta_db.index.entries.each do | r |
|
49
|
+
i = i + 1
|
50
|
+
next if i < min or i >= max
|
51
|
+
#Np r.get_full_region
|
52
|
+
#container.process_region( { :region => r.get_full_region.to_s, :output_file => output_file } )
|
53
|
+
region=r.get_full_region
|
54
|
+
|
55
|
+
|
56
|
+
begin
|
57
|
+
reg_a = bam1.fetch_region({:region=>region, :min_cov=>min_cov, :A=>1})
|
58
|
+
reg_b = bam2.fetch_region({:region=>region, :min_cov=>min_cov, :A=>1})
|
59
|
+
cons_1 = reg_a.consensus
|
60
|
+
cons_2 = reg_b.consensus
|
61
|
+
|
62
|
+
|
63
|
+
snps_1 = cons_1.count_ambiguities
|
64
|
+
snps_2 = cons_2.count_ambiguities
|
65
|
+
|
66
|
+
called_1 = reg_a.called
|
67
|
+
called_2 = reg_b.called
|
68
|
+
|
69
|
+
snps_tot = Bio::Sequence.snps_between(cons_1, cons_2)
|
70
|
+
|
71
|
+
snps_per_1k_1 = (block_size * snps_1.to_f ) / region.size
|
72
|
+
snps_per_1k_2 = (block_size * snps_2.to_f ) / region.size
|
73
|
+
snps_per_1k_tot = (block_size * snps_tot.to_f ) / region.size
|
74
|
+
|
75
|
+
hist_1[snps_per_1k_1.to_i] += 1
|
76
|
+
hist_2[snps_per_1k_2.to_i] += 1
|
77
|
+
|
78
|
+
table_file.print "#{r.id}\t#{region.size}\t"
|
79
|
+
table_file.print "#{snps_1}\t#{called_1}\t#{snps_per_1k_1}\t"
|
80
|
+
table_file.print "#{snps_2}\t#{called_2}\t#{snps_per_1k_2}\t"
|
81
|
+
table_file.print "#{snps_tot}\t#{snps_per_1k_tot}\n"
|
82
|
+
fasta_file.puts ">#{r.id}_1"
|
83
|
+
fasta_file.puts "#{cons_1}"
|
84
|
+
fasta_file.puts ">#{r.id}_2"
|
85
|
+
fasta_file.puts "#{cons_2}"
|
86
|
+
|
87
|
+
rescue Exception => e
|
88
|
+
$stderr.puts "Unable to process #{region}: #{e.to_s}"
|
89
|
+
end
|
90
|
+
end
|
91
|
+
fasta_file.close
|
92
|
+
table_file.close
|
93
|
+
|
94
|
+
# Histogram of SNP densities: one row per bin seen in either sample.
hist_table = "#{output_prefix}_#{block_size}_#{min_cov}_hist.#{chunk}.csv"

# Sorted union of the bins of both histograms. A plain sorted Array is used
# because SortedSet is not shipped with the `set` library on Ruby >= 3.0.
all_keys = (hist_1.keys | hist_2.keys).sort

# The block form closes the file even if writing fails.
File.open(hist_table, "w") do |hist_file|
  hist_file.puts "SNPs/#{block_size}\thist_1\thist_2\n"
  all_keys.each do |k|
    hist_file.puts "#{k}\t#{hist_1[k]}\t#{hist_2[k]}"
  end
end
|
105
|
+
|
106
|
+
|
107
|
+
|
data/bin/tag_stats.rb
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'optparse'
|
3
|
+
|
4
|
+
require 'csv'
|
5
|
+
require 'fileutils'
|
6
|
+
require 'tmpdir'
|
7
|
+
require 'bio-samtools-wrapper'
|
8
|
+
require 'bio'
|
9
|
+
require 'descriptive_statistics'
|
10
|
+
|
11
|
+
# Script-local override of the tag parser.
class Bio::DB::Tag
  # Fills the tag from a string laid out as "TG:t:value": characters 0-1 are
  # the tag name, character 3 the type and everything from character 5 on the
  # value. The value is converted to Integer when the type is "i"; any other
  # type keeps the raw String.
  #
  # @param str [String] textual tag
  def set(str)
    @tag   = str[0..1]
    @type  = str[3]
    @value = str[5..-1]
    @value = @value.to_i if @type == "i"
  end
end
|
19
|
+
|
20
|
+
$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
|
21
|
+
$: << File.expand_path('.')
|
22
|
+
path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
|
23
|
+
require path
|
24
|
+
opts = {}
|
25
|
+
opts[:tag] = "NH"
|
26
|
+
opts[:bam] = nil
|
27
|
+
opts[:out] = nil
|
28
|
+
opts[:ref] = nil
|
29
|
+
|
30
|
+
out = $stdout
|
31
|
+
|
32
|
+
OptionParser.new do |o|
|
33
|
+
o.banner = "Usage: tag_stats.rb [options]"
|
34
|
+
|
35
|
+
o.on("-t", "--tag str", "The tag to extract (default NH)") do |o|
|
36
|
+
opts[:tag] = o
|
37
|
+
end
|
38
|
+
|
39
|
+
o.on("-b", "--bam FILE" , "BAM file with the alignments ") do |o|
|
40
|
+
opts[:bam] = o
|
41
|
+
end
|
42
|
+
|
43
|
+
o.on("-o", "--out_file CHAR", "File to save the stats") do |o|
|
44
|
+
opts[:out] = o
|
45
|
+
end
|
46
|
+
|
47
|
+
o.on("-r", "--reference FILE", "Fasta file with the reference") do |o|
|
48
|
+
opts[:ref] = o
|
49
|
+
end
|
50
|
+
end.parse!
|
51
|
+
|
52
|
+
bam = Bio::DB::Sam.new(fasta: opts[:ref], bam: opts[:bam])
tag = opts[:tag]

sample = File.basename(opts[:bam], '.sorted.bam')
last_ref = ""
values = []
to_print = [:sum, :min, :max, :mean, :mode, :median, :q1, :q2, :q3]
# The 90, 95, 97.5 and 99 percentiles are reported in addition to to_print.
percentiles = [90, 95, 97.5, 99]
out = File.open(opts[:out], "w") if opts[:out]

# Writes the statistics of the values accumulated for reference +ref+:
# one tab-separated row per statistic, per percentile, and the count "N".
report = lambda do |ref|
  desc_stats = values.descriptive_statistics
  to_print.each    { |e| out.puts [sample, ref, e, desc_stats[e]].join("\t") }
  percentiles.each { |e| out.puts [sample, ref, "P#{e}", values.percentile(e)].join("\t") }
  out.puts [sample, ref, "N", values.length].join("\t")
end

begin
  # Values are grouped by runs of consecutive alignments sharing the same
  # reference name; a group is reported when the name changes.
  bam.view do |aln|
    if last_ref != aln.rname
      report.call(last_ref) unless last_ref == ""
      values.clear
      last_ref = aln.rname
    end
    values << aln.tags[tag].value
  end
  # The last group has no following name change, so it is reported here;
  # previously it was silently dropped.
  report.call(last_ref) unless last_ref == ""
ensure
  out.close if opts[:out]
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require 'bio-samtools-wrapper'
|
2
|
+
require 'optparse'
|
3
|
+
|
4
|
+
$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
|
5
|
+
$: << File.expand_path('.')
|
6
|
+
path=File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
|
7
|
+
|
8
|
+
|
9
|
+
|
10
|
+
|
11
|
+
# Parses a VCF "##INFO" meta-information line such as
#   ##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data">
#
# ID, Number and Type are read up to the next comma; Description is the
# quoted text following it, so attributes placed after the description
# (e.g. Source="...") are not mixed into it.
#
# @param head_line [String] one header line
# @return [Hash, nil] {:id, :number, :type, :desc} with String values, or nil
#   when the line does not have the expected layout
def parseVCFheader(head_line="")
  m = /##INFO=<ID=([^,]+),Number=([^,]+),Type=([^,]+),Description="((?:[^"\\]|\\.)*)"/.match(head_line)
  return nil unless m

  {:id=>m[1],:number=>m[2],:type=>m[3],:desc=>m[4]}
end
|
18
|
+
|
19
|
+
|
20
|
+
# INFO id => description, collected from the "##INFO" header lines.
header_info = Hash.new
# Sample column names taken from the "#CHROM" line. Declared outside the
# block on purpose: a variable first assigned inside the block would be a
# fresh nil local on every line, so the names would never reach the records.
sample_names = nil

ARGF.each_line do |line|
  if line.start_with? "##"
    h = parseVCFheader(line) if line.start_with? "##INFO"
    header_info[h[:id]] = h[:desc] if h
    next
  end

  if line.start_with? "#CHROM"
    # Everything after the nine leading columns of the "#CHROM" line.
    sample_names = line.split.drop(9)
    next
  end

  line.chomp!

  vcf = Bio::DB::Vcf.new(line, sample_names)
  # One row per INFO field: name, value and the description from the header.
  vcf.info.each_pair { |name, val| puts "#{name}\t#{val}\t#{header_info[name]}" }

  puts "____"
  # One row per sample, preceded once by the field names of the first sample.
  vcf.samples.each_with_index do |sample, i|
    puts sample[1].keys.join("\t") if i == 0
    puts sample[1].values.join("\t")
  end

end
|