bio-polymarker 1.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.travis.yml +24 -0
- data/Gemfile +23 -0
- data/README.md +205 -0
- data/Rakefile +61 -0
- data/SECURITY.md +16 -0
- data/VERSION +1 -0
- data/bin/bfr.rb +128 -0
- data/bin/blast_triads.rb +166 -0
- data/bin/blast_triads_promoters.rb +192 -0
- data/bin/count_variations.rb +36 -0
- data/bin/filter_blat_by_target_coverage.rb +69 -0
- data/bin/filter_exonerate_by_identity.rb +38 -0
- data/bin/find_best_blat_hit.rb +33 -0
- data/bin/find_best_exonerate.rb +17 -0
- data/bin/get_longest_hsp_blastx_triads.rb +66 -0
- data/bin/hexaploid_primers.rb +168 -0
- data/bin/homokaryot_primers.rb +183 -0
- data/bin/mafft_triads.rb +120 -0
- data/bin/mafft_triads_promoters.rb +403 -0
- data/bin/map_markers_to_contigs.rb +66 -0
- data/bin/marker_to_vcf.rb +241 -0
- data/bin/markers_in_region.rb +42 -0
- data/bin/mask_triads.rb +169 -0
- data/bin/polymarker.rb +410 -0
- data/bin/polymarker_capillary.rb +443 -0
- data/bin/polymarker_deletions.rb +350 -0
- data/bin/snp_position_to_polymarker.rb +101 -0
- data/bin/snps_between_bams.rb +107 -0
- data/bin/tag_stats.rb +75 -0
- data/bin/vcfLineToTable.rb +56 -0
- data/bin/vcfToPolyMarker.rb +82 -0
- data/bio-polymarker.gemspec +227 -0
- data/conf/defaults.rb +1 -0
- data/conf/primer3_config/dangle.dh +128 -0
- data/conf/primer3_config/dangle.ds +128 -0
- data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
- data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
- data/conf/primer3_config/interpretations/loops_i.dh +34 -0
- data/conf/primer3_config/interpretations/loops_i.ds +31 -0
- data/conf/primer3_config/interpretations/stack_i.dh +257 -0
- data/conf/primer3_config/interpretations/stack_i.ds +256 -0
- data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
- data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
- data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
- data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
- data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
- data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
- data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
- data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
- data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
- data/conf/primer3_config/loops.dh +30 -0
- data/conf/primer3_config/loops.ds +30 -0
- data/conf/primer3_config/stack.dh +256 -0
- data/conf/primer3_config/stack.ds +256 -0
- data/conf/primer3_config/stackmm.dh +256 -0
- data/conf/primer3_config/stackmm.ds +256 -0
- data/conf/primer3_config/tetraloop.dh +77 -0
- data/conf/primer3_config/tetraloop.ds +77 -0
- data/conf/primer3_config/triloop.dh +16 -0
- data/conf/primer3_config/triloop.ds +16 -0
- data/conf/primer3_config/tstack.dh +256 -0
- data/conf/primer3_config/tstack2.dh +256 -0
- data/conf/primer3_config/tstack2.ds +256 -0
- data/conf/primer3_config/tstack_tm_inf.ds +256 -0
- data/lib/bio/BFRTools.rb +465 -0
- data/lib/bio/BIOExtensions.rb +153 -0
- data/lib/bio/PolyploidTools/ChromosomeArm.rb +63 -0
- data/lib/bio/PolyploidTools/ExonContainer.rb +245 -0
- data/lib/bio/PolyploidTools/Marker.rb +175 -0
- data/lib/bio/PolyploidTools/Mask.rb +116 -0
- data/lib/bio/PolyploidTools/NoSNPSequence.rb +292 -0
- data/lib/bio/PolyploidTools/PrimerRegion.rb +30 -0
- data/lib/bio/PolyploidTools/SNP.rb +804 -0
- data/lib/bio/PolyploidTools/SNPMutant.rb +86 -0
- data/lib/bio/PolyploidTools/SNPSequence.rb +55 -0
- data/lib/bio/db/blast.rb +114 -0
- data/lib/bio/db/exonerate.rb +333 -0
- data/lib/bio/db/primer3.rb +820 -0
- data/lib/bio-polymarker.rb +28 -0
- data/test/data/7B_amplicon_test.fa +12 -0
- data/test/data/7B_amplicon_test.fa.fai +1 -0
- data/test/data/7B_amplicon_test_reference.fa +110 -0
- data/test/data/7B_amplicon_test_reference.fa.fai +3 -0
- data/test/data/7B_marker_test.txt +1 -0
- data/test/data/BS00068396_51.fa +2 -0
- data/test/data/BS00068396_51_blast.tab +4 -0
- data/test/data/BS00068396_51_contigs.aln +1412 -0
- data/test/data/BS00068396_51_contigs.dnd +7 -0
- data/test/data/BS00068396_51_contigs.fa +8 -0
- data/test/data/BS00068396_51_contigs.fa.fai +4 -0
- data/test/data/BS00068396_51_contigs.fa.nhr +0 -0
- data/test/data/BS00068396_51_contigs.fa.nin +0 -0
- data/test/data/BS00068396_51_contigs.fa.nsq +0 -0
- data/test/data/BS00068396_51_contigs.nhr +0 -0
- data/test/data/BS00068396_51_contigs.nin +0 -0
- data/test/data/BS00068396_51_contigs.nsq +0 -0
- data/test/data/BS00068396_51_exonerate.tab +6 -0
- data/test/data/BS00068396_51_for_polymarker.txt +1 -0
- data/test/data/BS00068396_51_genes.txt +14 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974.fa +112 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa +2304 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
- data/test/data/LIB1716.bam +0 -0
- data/test/data/LIB1716.bam.bai +0 -0
- data/test/data/LIB1719.bam +0 -0
- data/test/data/LIB1719.bam.bai +0 -0
- data/test/data/LIB1721.bam +0 -0
- data/test/data/LIB1721.bam.bai +0 -0
- data/test/data/LIB1722.bam +0 -0
- data/test/data/LIB1722.bam.bai +0 -0
- data/test/data/PST130_7067.csv +1 -0
- data/test/data/PST130_7067.fa +2 -0
- data/test/data/PST130_7067.fa.fai +1 -0
- data/test/data/PST130_7067.fa.ndb +0 -0
- data/test/data/PST130_7067.fa.nhr +0 -0
- data/test/data/PST130_7067.fa.nin +0 -0
- data/test/data/PST130_7067.fa.not +0 -0
- data/test/data/PST130_7067.fa.nsq +0 -0
- data/test/data/PST130_7067.fa.ntf +0 -0
- data/test/data/PST130_7067.fa.nto +0 -0
- data/test/data/PST130_reverse_primer.csv +1 -0
- data/test/data/S22380157.fa +16 -0
- data/test/data/S22380157.fa.fai +1 -0
- data/test/data/S22380157.vcf +67 -0
- data/test/data/S58861868/LIB1716.bam +0 -0
- data/test/data/S58861868/LIB1716.sam +651 -0
- data/test/data/S58861868/LIB1719.bam +0 -0
- data/test/data/S58861868/LIB1719.sam +805 -0
- data/test/data/S58861868/LIB1721.bam +0 -0
- data/test/data/S58861868/LIB1721.sam +1790 -0
- data/test/data/S58861868/LIB1722.bam +0 -0
- data/test/data/S58861868/LIB1722.sam +1271 -0
- data/test/data/S58861868/S58861868.fa +16 -0
- data/test/data/S58861868/S58861868.fa.fai +1 -0
- data/test/data/S58861868/S58861868.vcf +76 -0
- data/test/data/S58861868/header.txt +9 -0
- data/test/data/S58861868/merged.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam.bai +0 -0
- data/test/data/Test3Aspecific.csv +2 -0
- data/test/data/Test3Aspecific_contigs.fa +6 -0
- data/test/data/bfr_out_test.csv +5 -0
- data/test/data/chr1A_C1145499T/chr1A_C1145499T.csv +1 -0
- data/test/data/chr1A_G540414846C/chr1A_G540414846C.csv +1 -0
- data/test/data/chr1A_G540414846C/chr1A_G540414846C.fa +2 -0
- data/test/data/chr1A_T517634750C/chr1A_T517634750C.csv +1 -0
- data/test/data/chr2D_C112180134A/chr2D_C112180134A.csv +1 -0
- data/test/data/chr4D_C14473543T/chr4D_C14473543T.csv +1 -0
- data/test/data/chr4D_C14473543T/chr4D_C14473543T.fa +2 -0
- data/test/data/headerMergeed.txt +9 -0
- data/test/data/headerS2238015 +1 -0
- data/test/data/mergedLibs.bam +0 -0
- data/test/data/mergedLibsReheader.bam +0 -0
- data/test/data/mergedLibsSorted.bam +0 -0
- data/test/data/mergedLibsSorted.bam.bai +0 -0
- data/test/data/patological_cases5D.csv +1 -0
- data/test/data/primer_3_input_header_test +5 -0
- data/test/data/short_primer_design_test.csv +10 -0
- data/test/data/some_tests/some_tests.csv +201 -0
- data/test/data/test_from_mutant.csv +3 -0
- data/test/data/test_iselect.csv +196 -0
- data/test/data/test_iselect_reference.fa +1868 -0
- data/test/data/test_iselect_reference.fa.fai +934 -0
- data/test/data/test_primer3_error.csv +4 -0
- data/test/data/test_primer3_error_contigs.fa +10 -0
- data/test/test_bfr.rb +135 -0
- data/test/test_blast.rb +47 -0
- data/test/test_exon_container.rb +17 -0
- data/test/test_exonearate.rb +48 -0
- data/test/test_integration.rb +76 -0
- data/test/test_snp_parsing.rb +121 -0
- data/test/test_wrong_selection.sh +5 -0
- metadata +356 -0
data/bin/polymarker.rb
ADDED
@@ -0,0 +1,410 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'bio'
|
3
|
+
require 'rubygems'
|
4
|
+
require 'pathname'
|
5
|
+
require 'bio-samtools-wrapper'
|
6
|
+
require 'optparse'
|
7
|
+
require 'set'
|
8
|
+
$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
|
9
|
+
$: << File.expand_path('.')
|
10
|
+
path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
|
11
|
+
require path
|
12
|
+
|
13
|
+
|
14
|
+
|
15
|
+
# Checks that every input file named in the options hash can be read.
#
# Looks at the :path_to_contigs, :marker_list, :snp_list, :mutant_list and
# :reference entries of +o+. Entries that are nil (option not given) are
# skipped, and array values are flattened before checking.
#
# o - Hash of parsed command-line options.
#
# Raises IOError naming the first path that is missing or not readable.
def validate_files(o)
  keys = %i[path_to_contigs marker_list snp_list mutant_list reference]
  o.values_at(*keys).flatten.compact.each do |f|
    # File.readable? is also false for a missing path, so this single check
    # covers both "does not exist" and "exists but cannot be opened", which
    # is what the error message promises.
    raise IOError.new "Unable to read #{f}" unless File.readable? f
  end
end
|
26
|
+
|
27
|
+
# Default settings. The command-line flags parsed further down overwrite
# individual entries of this hash.
options = {
  :path_to_contigs       => "/tgac/references/external/projects/iwgsc/css/IWGSC_CSS_all_scaff_v1.fa",
  :chunks                => 1,
  :bucket_size           => 0,
  :bucket                => 1,
  :model                 => "est2genome",
  :arm_selection         => Bio::PolyploidTools::ChromosomeArm.getArmSelection("nrgene"),
  :flanking_size         => 150,
  :variation_free_region => 0,
  :extract_found_contigs => false,
  :genomes_count         => 3,
  :min_identity          => 90,
  :scoring               => :genome_specific,
  :database              => false,
  :filter_best           => false,
  :aligner               => :blast,
  :max_hits              => 8,
  :max_specific_primers  => 20
}

# Preferences forwarded to primer3. A file given with --primer_3_preferences
# is merged on top of these.
options[:primer_3_preferences] = {
  :primer_product_size_range => "50-150" ,
  :primer_max_size => 25 ,
  :primer_lib_ambiguity_codes_consensus => 1,
  :primer_liberal_base => 1,
  :primer_num_return=>5,
  :primer_explain_flag => 1,
  # The separator before ".." is required: without it the directory name and
  # ".." fuse into a single path component, which only resolves to the right
  # folder by accident and breaks when the script directory is ".".
  # expand_path drops the trailing slash, so it is appended again afterwards.
  :primer_thermodynamic_parameters_path=>File.expand_path(File.dirname(__FILE__) + '/../conf/primer3_config/') + '/'
}
|
54
|
+
|
55
|
+
|
56
|
+
|
57
|
+
# Command-line interface. Each handler stores its value in +options+,
# replacing the default set earlier in the script.
OptionParser.new do |opts|
  opts.banner = "Usage: polymarker.rb [options]"

  opts.on("-c", "--contigs FILE", "File with contigs to use as database") do |o|
    options[:path_to_contigs] = o
  end

  opts.on("-m", "--marker_list FILE", "File with the list of markers to search from") do |o|
    options[:marker_list] = o
  end

  opts.on("-g", "--genomes_count INT", "Number of genomes (default 3, for hexaploid)") do |o|
    options[:genomes_count] = o.to_i
  end

  opts.on("-b", "--filter_best", "If set, only keep the best alignment for each chromosome") do
    options[:filter_best] = true
  end

  opts.on("-s", "--snp_list FILE", "File with the list of snps to search from, requires --reference to get the sequence using a position") do |o|
    options[:snp_list] = o
  end

  opts.on("-t", "--mutant_list FILE", "File with the list of positions with mutation and the mutation line.\nrequires --reference to get the sequence using a position") do |o|
    options[:mutant_list] = o
  end

  opts.on("-r", "--reference FILE", "Fasta file with the sequence for the markers (to complement --snp_list)") do |o|
    options[:reference] = o
  end

  opts.on("-i", "--min_identity INT", "Minimum identity to consider a hit (default 90)") do |o|
    options[:min_identity] = o.to_i
  end

  opts.on("-o", "--output FOLDER", "Output folder") do |o|
    options[:output_folder] = o
  end

  opts.on("-e", "--exonerate_model MODEL", "Model to be used in exonerate to search for the contigs") do |o|
    options[:model] = o
  end

  opts.on("-a", "--arm_selection #{Bio::PolyploidTools::ChromosomeArm.getValidFunctions.join('|')}", "Function to decide the chromome arm") do |o|
    options[:arm_selection] = Bio::PolyploidTools::ChromosomeArm.getArmSelection(o)
  end

  # The file is merged on top of the built-in primer3 defaults.
  opts.on("-p", "--primer_3_preferences FILE", "file with preferences to be sent to primer3") do |o|
    options[:primer_3_preferences] = Bio::DB::Primer3.read_primer_preferences(o, options[:primer_3_preferences] )
  end

  opts.on("-v", "--variation_free_region INT", "If present, avoid generating the common primer if there are homoeologous SNPs within the specified distance") do |o|
    options[:variation_free_region] = o.to_i
  end

  opts.on("-x", "--extract_found_contigs", "If present, save in a separate file the contigs with matches. Useful to debug.") do |o|
    options[:extract_found_contigs] = true
  end

  opts.on("-P", "--primers_to_order", "If present, save a separate file with the primers with the KASP tails")do
    #TODO: have a string with the tails, optional.
    options[:primers_to_order] = true
  end

  # Long form only. "-H" used to be registered both here and for --max_hits;
  # OptionParser keeps the last registration of a short flag, so "-H" always
  # meant --max_hits. The short flag is dropped here to make that explicit.
  opts.on("--het_dels", "If present, change the scoring to give priority to: semi-specific, specific, non-specific") do
    options[:scoring] = :het_dels
  end

  opts.on("-A", "--aligner exonerate|blast", "Select the aligner to use. Default: #{options[:aligner]}") do |o|
    raise "Invalid aligner" unless o == "exonerate" || o == "blast"
    options[:aligner] = o.to_sym
  end

  opts.on("-d", "--database PREFIX", "Path to the blast database. Only used if the aligner is blast. The default is the name of the contigs file without extension.") do |o|
    options[:database] = o
  end

  opts.on("-H", "--max_hits INT", "Maximum number of hits to the reference. If there are more hits than this value, the marker is ignored") do |o|
    options[:max_hits] = o.to_i
  end

  opts.on("-S", "--max_specific_primers INT", "Maximum number of candidate primers to attempt to design. Default: #{options[:max_specific_primers]} ") do |o|
    options[:max_specific_primers] = o.to_i
  end

end.parse!
|
145
|
+
|
146
|
+
|
147
|
+
# Stop early if any input file named on the command line cannot be found.
validate_files(options)

# Without an explicit --database, the contigs path doubles as the database.
options[:database] ||= options[:path_to_contigs]

# The flanking region is sized to the largest product primer3 may design.
size_range = options[:primer_3_preferences][:primer_product_size_range]
if size_range
  bounds = size_range.split("-")
  lower  = bounds[0].to_i
  upper  = bounds[1].to_i
  unless upper > lower
    raise Bio::DB::Exonerate::ExonerateException.new "Range #{size_range} is invalid!"
  end
  options[:flanking_size] = upper
end

#p options
#p ARGV

#TODO: Use temporary files somewhere in the file system and add traps to delete them/forward them as a result.
#TODO: Make all this parameters

path_to_contigs = options[:path_to_contigs]

# Labels for the two alleles/lines used in the primer3 records and the
# output header.
original_name = "A"
snp_in        = "B"

#test_file="/Users/ramirezr/Dropbox/JIC/PrimersToTest/test_primers_nick_and_james_1.csv"
# When more than one list is given the precedence is: mutant list, then SNP
# list, then marker list.
test_file = options[:mutant_list] || options[:snp_list] || options[:marker_list]
fasta_reference = options[:reference]

# Output goes to --output when given, otherwise to a time-stamped folder
# named after the input list.
output_folder = options[:output_folder] ||
                "#{test_file}_primer_design_#{Time.now.strftime('%Y%m%d-%H%M%S')}"
Dir.mkdir(output_folder) unless Dir.exist?(output_folder)

#TODO Make this tmp files
temp_fasta_query = "#{output_folder}/to_align.fa"
temp_contigs     = "#{output_folder}/contigs_tmp.fa"
exonerate_file   = "#{output_folder}/exonerate_tmp.tab"
primer_3_input   = "#{output_folder}/primer_3_input_temp"
primer_3_output  = "#{output_folder}/primer_3_output_temp"
exons_filename   = "#{output_folder}/exons_genes_and_contigs.fa"
output_primers   = "#{output_folder}/primers.csv"
output_to_order  = "#{output_folder}/primers_to_order.csv"
min_identity     = options[:min_identity]

# Progress log appended to by write_status.
@status_file = "#{output_folder}/status.txt"

primer_3_config = File.expand_path(File.dirname(__FILE__) + '/../conf/primer3_config')
model = options[:model]
|
197
|
+
|
198
|
+
# Appends one "<timestamp>,<status>" line to the file named by @status_file.
#
# status - text to record; it is interpolated into the line unchanged.
#
# Returns nil.
def write_status(status)
  # The block form closes the handle even when the write raises, which the
  # previous open / puts / close sequence did not guarantee.
  File.open(@status_file, "a") do |f|
    f.puts "#{Time.now.to_s},#{status}"
  end
end
|
203
|
+
|
204
|
+
# On SIGABRT: record the failure in the status file, put the default handler
# back and send the signal again (pid 0) so the default action takes place.
Signal.trap("ABRT") {
  write_status "ERROR: Job aborted. Please try a small number of primers."
  Signal.trap("SIGABRT", "DEFAULT") # restore handler
  Process.kill("ABRT", 0)
}

# On SIGTERM: record the failure in the status file, put the default handler
# back and leave through the normal exit path.
Signal.trap("TERM") {
  write_status "ERROR: Job terminated. Please try a small number of primers."
  Signal.trap("SIGTERM", "DEFAULT") # restore handler
  exit
}
|
215
|
+
|
216
|
+
snps = Array.new
|
217
|
+
|
218
|
+
begin
|
219
|
+
|
220
|
+
write_status "Loading Reference"
|
221
|
+
#0. Load the fasta index
|
222
|
+
fasta_reference_db = nil
|
223
|
+
if fasta_reference
|
224
|
+
fasta_reference_db = Bio::DB::Fasta::FastaFile.new({:fasta=>fasta_reference})
|
225
|
+
fasta_reference_db.load_fai_entries
|
226
|
+
write_status "Fasta reference: #{fasta_reference}"
|
227
|
+
end
|
228
|
+
|
229
|
+
#1. Read all the SNP files
|
230
|
+
#chromosome = nil
|
231
|
+
write_status "Reading SNPs"
|
232
|
+
File.open(test_file) do | f |
|
233
|
+
f.each_line do | line |
|
234
|
+
# p line.chomp!
|
235
|
+
snp = nil
|
236
|
+
if options[:marker_list] #List with Sequence
|
237
|
+
snp = Bio::PolyploidTools::SNPSequence.parse(line)
|
238
|
+
elsif options[:snp_list] and options[:reference] #List and fasta file
|
239
|
+
snp = Bio::PolyploidTools::SNP.parse(line)
|
240
|
+
entry = fasta_reference_db.index.region_for_entry(snp.gene)
|
241
|
+
if entry
|
242
|
+
region = fasta_reference_db.index.region_for_entry(snp.gene).get_full_region
|
243
|
+
snp.template_sequence = fasta_reference_db.fetch_sequence(region)
|
244
|
+
else
|
245
|
+
write_status "WARN: Unable to find entry for #{snp.gene}"
|
246
|
+
end
|
247
|
+
elsif options[:mutant_list] and options[:reference] #List and fasta file
|
248
|
+
snp = Bio::PolyploidTools::SNPMutant.parse(line)
|
249
|
+
entry = fasta_reference_db.index.region_for_entry(snp.contig)
|
250
|
+
if entry
|
251
|
+
region = fasta_reference_db.index.region_for_entry(snp.contig).get_full_region
|
252
|
+
snp.full_sequence = fasta_reference_db.fetch_sequence(region)
|
253
|
+
else
|
254
|
+
write_status "WARN: Unable to find entry for #{snp.gene}"
|
255
|
+
end
|
256
|
+
else
|
257
|
+
raise Bio::DB::Exonerate::ExonerateException.new "Wrong number of arguments. "
|
258
|
+
end
|
259
|
+
raise Bio::DB::Exonerate::ExonerateException.new "No SNP for line '#{line}'" if snp == nil
|
260
|
+
snp.max_hits = options[:max_hits]
|
261
|
+
snp.genomes_count = options[:genomes_count]
|
262
|
+
snp.snp_in = snp_in
|
263
|
+
snp.original_name = original_name
|
264
|
+
if snp.position
|
265
|
+
snps << snp
|
266
|
+
else
|
267
|
+
$stderr.puts "ERROR: #{snp.gene} doesn't contain a SNP"
|
268
|
+
end
|
269
|
+
end
|
270
|
+
end
|
271
|
+
|
272
|
+
#1.1 Close fasta file
|
273
|
+
#fasta_reference_db.close() if fasta_reference_db
|
274
|
+
#2. Generate all the fasta files
|
275
|
+
write_status "Writing sequences to align"
|
276
|
+
written_seqs = Set.new
|
277
|
+
file = File.open(temp_fasta_query, "w")
|
278
|
+
snps.each do |snp|
|
279
|
+
unless written_seqs.include?(snp.gene)
|
280
|
+
written_seqs << snp.gene
|
281
|
+
file.puts snp.to_fasta
|
282
|
+
end
|
283
|
+
end
|
284
|
+
file.close
|
285
|
+
|
286
|
+
#3. Run exonerate on each of the possible chromosomes for the SNP
|
287
|
+
#puts chromosome
|
288
|
+
#chr_group = chromosome[0]
|
289
|
+
write_status "Searching markers in genome"
|
290
|
+
exo_f = File.open(exonerate_file, "w")
|
291
|
+
contigs_f = nil
|
292
|
+
contigs_f = File.open(temp_contigs, "w") if options[:extract_found_contigs]
|
293
|
+
filename=path_to_contigs
|
294
|
+
#puts filename
|
295
|
+
target=filename
|
296
|
+
|
297
|
+
fasta_file = Bio::DB::Fasta::FastaFile.new(fasta: target)
|
298
|
+
fasta_file.load_fai_entries
|
299
|
+
|
300
|
+
found_contigs = Set.new
|
301
|
+
|
302
|
+
|
303
|
+
# Records one alignment and, optionally, the sequence of the contig it hit.
#
# aln           - alignment; must respond to identity, line and target_id.
# exo_f         - IO that receives the alignment line.
# found_contigs - Set of target ids already handled; updated in place.
# min_identity  - alignments with identity not strictly above this are ignored.
# fasta_file    - indexed fasta; must respond to index.region_for_entry and
#                 fetch_sequence.
# options       - Hash; :extract_found_contigs turns on contig extraction and
#                 :path_to_contigs is used in the error message.
# contigs_f     - IO that receives extracted contigs (needed only when
#                 :extract_found_contigs is set).
#
# Raises Bio::DB::Exonerate::ExonerateException when the target of the
# alignment is not present in the fasta index.
# Returns nil.
def do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options, contigs_f: nil)
  return unless aln.identity > min_identity

  exo_f.puts aln.line

  # We only add once each contig. Should reduce the size of the output file.
  return if found_contigs.include?(aln.target_id)
  found_contigs.add(aln.target_id)

  entry = fasta_file.index.region_for_entry(aln.target_id)
  if entry.nil?
    # The exception class is fully qualified and the message names the contigs
    # file: the previous code referenced an undefined constant and an
    # undefined local here, so it failed with NameError instead.
    raise Bio::DB::Exonerate::ExonerateException, "Entry not found! #{aln.target_id}. Make sure that the #{options[:path_to_contigs]}.fai was generated properly."
  end

  return unless options[:extract_found_contigs]

  seq = fasta_file.fetch_sequence(entry.get_full_region)
  contigs_f.puts(">#{aln.target_id}\n#{seq}")
end
|
319
|
+
|
320
|
+
Bio::DB::Blast.align({:query=>temp_fasta_query, :target=>options[:database], :model=>model, :max_hits=>options[:max_hits]}) do |aln|
|
321
|
+
do_align(aln, exo_f, found_contigs,min_identity, fasta_file,options, contigs_f: contigs_f)
|
322
|
+
end if options[:aligner] == :blast
|
323
|
+
|
324
|
+
Bio::DB::Exonerate.align({:query=>temp_fasta_query, :target=>target, :model=>model}) do |aln|
|
325
|
+
do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options, contigs_f: contigs_f)
|
326
|
+
end if options[:aligner] == :exonerate
|
327
|
+
|
328
|
+
exo_f.close()
|
329
|
+
|
330
|
+
|
331
|
+
|
332
|
+
exo_f.close()
|
333
|
+
contigs_f.close() if options[:extract_found_contigs]
|
334
|
+
|
335
|
+
#4. Load all the results from exonerate and get the input filename for primer3
|
336
|
+
#Custom arm selection function that only uses the first two characters. Maybe
|
337
|
+
#we want to make it a bit more cleaver
|
338
|
+
write_status "Reading best alignment on each chromosome"
|
339
|
+
|
340
|
+
|
341
|
+
container= Bio::PolyploidTools::ExonContainer.new
|
342
|
+
container.flanking_size=options[:flanking_size]
|
343
|
+
container.gene_models(temp_fasta_query)
|
344
|
+
container.chromosomes(target)
|
345
|
+
container.add_parental({:name=>snp_in})
|
346
|
+
container.add_parental({:name=>original_name})
|
347
|
+
container.max_hits = options[:max_hits]
|
348
|
+
snps.each do |snp|
|
349
|
+
snp.container = container
|
350
|
+
snp.flanking_size = container.flanking_size
|
351
|
+
snp.variation_free_region = options[:variation_free_region]
|
352
|
+
container.add_snp(snp)
|
353
|
+
end
|
354
|
+
container.add_alignments({
|
355
|
+
:exonerate_file=>exonerate_file,
|
356
|
+
:arm_selection=>options[:arm_selection],
|
357
|
+
:min_identity=>min_identity,
|
358
|
+
:filter_best=>options[:filter_best]})
|
359
|
+
|
360
|
+
|
361
|
+
#4.1 generating primer3 file
|
362
|
+
write_status "Finding genome-specific positions"
|
363
|
+
file = File.open(exons_filename, "w")
|
364
|
+
container.print_fasta_snp_exones(file)
|
365
|
+
file.close
|
366
|
+
write_status "Running primer3"
|
367
|
+
|
368
|
+
file = File.open(primer_3_input, "w")
|
369
|
+
|
370
|
+
Bio::DB::Primer3.prepare_input_file(file, options[:primer_3_preferences])
|
371
|
+
added_exons = container.print_primer_3_exons(file, nil, snp_in, max_specific_primers: options[:max_specific_primers] )
|
372
|
+
file.close
|
373
|
+
|
374
|
+
Bio::DB::Primer3.run({:in=>primer_3_input, :out=>primer_3_output}) if added_exons > 0
|
375
|
+
|
376
|
+
#5. Pick the best primer and make the primer3 output
|
377
|
+
write_status "Selecting best primers"
|
378
|
+
kasp_container=Bio::DB::Primer3::KASPContainer.new
|
379
|
+
|
380
|
+
|
381
|
+
|
382
|
+
kasp_container.line_1= original_name
|
383
|
+
kasp_container.line_2= snp_in
|
384
|
+
|
385
|
+
if options[:scoring] == :het_dels
|
386
|
+
kasp_container.scores = Hash.new
|
387
|
+
kasp_container.scores[:chromosome_specific] = 0
|
388
|
+
kasp_container.scores[:chromosome_semispecific] = 1000
|
389
|
+
kasp_container.scores[:chromosome_nonspecific] = 100
|
390
|
+
end
|
391
|
+
|
392
|
+
snps.each do |snp|
|
393
|
+
snpk = kasp_container.add_snp(snp)
|
394
|
+
|
395
|
+
|
396
|
+
end
|
397
|
+
|
398
|
+
kasp_container.add_primers_file(primer_3_output) if added_exons > 0
|
399
|
+
header = "Marker,SNP,RegionSize,chromosome,total_contigs,contig_regions,SNP_type,#{original_name},#{snp_in},common,primer_type,orientation,#{original_name}_TM,#{snp_in}_TM,common_TM,selected_from,product_size,errors,repetitive,total_hits"
|
400
|
+
File.open(output_primers, 'w') { |f| f.write("#{header}\n#{kasp_container.print_primers}") }
|
401
|
+
File.open(output_to_order, "w") { |io| io.write(kasp_container.print_primers_with_tails())}
|
402
|
+
|
403
|
+
write_status "DONE"
|
404
|
+
rescue StandardError => e
|
405
|
+
write_status "ERROR\t#{e.message}"
|
406
|
+
raise e
|
407
|
+
rescue Exception => e
|
408
|
+
write_status "ERROR\t#{e.message}"
|
409
|
+
raise e
|
410
|
+
end
|