bio-polymarker 1.3.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.travis.yml +24 -0
- data/Gemfile +23 -0
- data/README.md +205 -0
- data/Rakefile +61 -0
- data/SECURITY.md +16 -0
- data/VERSION +1 -0
- data/bin/bfr.rb +128 -0
- data/bin/blast_triads.rb +166 -0
- data/bin/blast_triads_promoters.rb +192 -0
- data/bin/count_variations.rb +36 -0
- data/bin/filter_blat_by_target_coverage.rb +69 -0
- data/bin/filter_exonerate_by_identity.rb +38 -0
- data/bin/find_best_blat_hit.rb +33 -0
- data/bin/find_best_exonerate.rb +17 -0
- data/bin/get_longest_hsp_blastx_triads.rb +66 -0
- data/bin/hexaploid_primers.rb +168 -0
- data/bin/homokaryot_primers.rb +183 -0
- data/bin/mafft_triads.rb +120 -0
- data/bin/mafft_triads_promoters.rb +403 -0
- data/bin/map_markers_to_contigs.rb +66 -0
- data/bin/marker_to_vcf.rb +241 -0
- data/bin/markers_in_region.rb +42 -0
- data/bin/mask_triads.rb +169 -0
- data/bin/polymarker.rb +410 -0
- data/bin/polymarker_capillary.rb +443 -0
- data/bin/polymarker_deletions.rb +350 -0
- data/bin/snp_position_to_polymarker.rb +101 -0
- data/bin/snps_between_bams.rb +107 -0
- data/bin/tag_stats.rb +75 -0
- data/bin/vcfLineToTable.rb +56 -0
- data/bin/vcfToPolyMarker.rb +82 -0
- data/bio-polymarker.gemspec +227 -0
- data/conf/defaults.rb +1 -0
- data/conf/primer3_config/dangle.dh +128 -0
- data/conf/primer3_config/dangle.ds +128 -0
- data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
- data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
- data/conf/primer3_config/interpretations/loops_i.dh +34 -0
- data/conf/primer3_config/interpretations/loops_i.ds +31 -0
- data/conf/primer3_config/interpretations/stack_i.dh +257 -0
- data/conf/primer3_config/interpretations/stack_i.ds +256 -0
- data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
- data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
- data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
- data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
- data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
- data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
- data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
- data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
- data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
- data/conf/primer3_config/loops.dh +30 -0
- data/conf/primer3_config/loops.ds +30 -0
- data/conf/primer3_config/stack.dh +256 -0
- data/conf/primer3_config/stack.ds +256 -0
- data/conf/primer3_config/stackmm.dh +256 -0
- data/conf/primer3_config/stackmm.ds +256 -0
- data/conf/primer3_config/tetraloop.dh +77 -0
- data/conf/primer3_config/tetraloop.ds +77 -0
- data/conf/primer3_config/triloop.dh +16 -0
- data/conf/primer3_config/triloop.ds +16 -0
- data/conf/primer3_config/tstack.dh +256 -0
- data/conf/primer3_config/tstack2.dh +256 -0
- data/conf/primer3_config/tstack2.ds +256 -0
- data/conf/primer3_config/tstack_tm_inf.ds +256 -0
- data/lib/bio/BFRTools.rb +465 -0
- data/lib/bio/BIOExtensions.rb +153 -0
- data/lib/bio/PolyploidTools/ChromosomeArm.rb +63 -0
- data/lib/bio/PolyploidTools/ExonContainer.rb +245 -0
- data/lib/bio/PolyploidTools/Marker.rb +175 -0
- data/lib/bio/PolyploidTools/Mask.rb +116 -0
- data/lib/bio/PolyploidTools/NoSNPSequence.rb +292 -0
- data/lib/bio/PolyploidTools/PrimerRegion.rb +30 -0
- data/lib/bio/PolyploidTools/SNP.rb +804 -0
- data/lib/bio/PolyploidTools/SNPMutant.rb +86 -0
- data/lib/bio/PolyploidTools/SNPSequence.rb +55 -0
- data/lib/bio/db/blast.rb +114 -0
- data/lib/bio/db/exonerate.rb +333 -0
- data/lib/bio/db/primer3.rb +820 -0
- data/lib/bio-polymarker.rb +28 -0
- data/test/data/7B_amplicon_test.fa +12 -0
- data/test/data/7B_amplicon_test.fa.fai +1 -0
- data/test/data/7B_amplicon_test_reference.fa +110 -0
- data/test/data/7B_amplicon_test_reference.fa.fai +3 -0
- data/test/data/7B_marker_test.txt +1 -0
- data/test/data/BS00068396_51.fa +2 -0
- data/test/data/BS00068396_51_blast.tab +4 -0
- data/test/data/BS00068396_51_contigs.aln +1412 -0
- data/test/data/BS00068396_51_contigs.dnd +7 -0
- data/test/data/BS00068396_51_contigs.fa +8 -0
- data/test/data/BS00068396_51_contigs.fa.fai +4 -0
- data/test/data/BS00068396_51_contigs.fa.nhr +0 -0
- data/test/data/BS00068396_51_contigs.fa.nin +0 -0
- data/test/data/BS00068396_51_contigs.fa.nsq +0 -0
- data/test/data/BS00068396_51_contigs.nhr +0 -0
- data/test/data/BS00068396_51_contigs.nin +0 -0
- data/test/data/BS00068396_51_contigs.nsq +0 -0
- data/test/data/BS00068396_51_exonerate.tab +6 -0
- data/test/data/BS00068396_51_for_polymarker.txt +1 -0
- data/test/data/BS00068396_51_genes.txt +14 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974.fa +112 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa +2304 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
- data/test/data/LIB1716.bam +0 -0
- data/test/data/LIB1716.bam.bai +0 -0
- data/test/data/LIB1719.bam +0 -0
- data/test/data/LIB1719.bam.bai +0 -0
- data/test/data/LIB1721.bam +0 -0
- data/test/data/LIB1721.bam.bai +0 -0
- data/test/data/LIB1722.bam +0 -0
- data/test/data/LIB1722.bam.bai +0 -0
- data/test/data/PST130_7067.csv +1 -0
- data/test/data/PST130_7067.fa +2 -0
- data/test/data/PST130_7067.fa.fai +1 -0
- data/test/data/PST130_7067.fa.ndb +0 -0
- data/test/data/PST130_7067.fa.nhr +0 -0
- data/test/data/PST130_7067.fa.nin +0 -0
- data/test/data/PST130_7067.fa.not +0 -0
- data/test/data/PST130_7067.fa.nsq +0 -0
- data/test/data/PST130_7067.fa.ntf +0 -0
- data/test/data/PST130_7067.fa.nto +0 -0
- data/test/data/PST130_reverse_primer.csv +1 -0
- data/test/data/S22380157.fa +16 -0
- data/test/data/S22380157.fa.fai +1 -0
- data/test/data/S22380157.vcf +67 -0
- data/test/data/S58861868/LIB1716.bam +0 -0
- data/test/data/S58861868/LIB1716.sam +651 -0
- data/test/data/S58861868/LIB1719.bam +0 -0
- data/test/data/S58861868/LIB1719.sam +805 -0
- data/test/data/S58861868/LIB1721.bam +0 -0
- data/test/data/S58861868/LIB1721.sam +1790 -0
- data/test/data/S58861868/LIB1722.bam +0 -0
- data/test/data/S58861868/LIB1722.sam +1271 -0
- data/test/data/S58861868/S58861868.fa +16 -0
- data/test/data/S58861868/S58861868.fa.fai +1 -0
- data/test/data/S58861868/S58861868.vcf +76 -0
- data/test/data/S58861868/header.txt +9 -0
- data/test/data/S58861868/merged.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam.bai +0 -0
- data/test/data/Test3Aspecific.csv +2 -0
- data/test/data/Test3Aspecific_contigs.fa +6 -0
- data/test/data/bfr_out_test.csv +5 -0
- data/test/data/chr1A_C1145499T/chr1A_C1145499T.csv +1 -0
- data/test/data/chr1A_G540414846C/chr1A_G540414846C.csv +1 -0
- data/test/data/chr1A_G540414846C/chr1A_G540414846C.fa +2 -0
- data/test/data/chr1A_T517634750C/chr1A_T517634750C.csv +1 -0
- data/test/data/chr2D_C112180134A/chr2D_C112180134A.csv +1 -0
- data/test/data/chr4D_C14473543T/chr4D_C14473543T.csv +1 -0
- data/test/data/chr4D_C14473543T/chr4D_C14473543T.fa +2 -0
- data/test/data/headerMergeed.txt +9 -0
- data/test/data/headerS2238015 +1 -0
- data/test/data/mergedLibs.bam +0 -0
- data/test/data/mergedLibsReheader.bam +0 -0
- data/test/data/mergedLibsSorted.bam +0 -0
- data/test/data/mergedLibsSorted.bam.bai +0 -0
- data/test/data/patological_cases5D.csv +1 -0
- data/test/data/primer_3_input_header_test +5 -0
- data/test/data/short_primer_design_test.csv +10 -0
- data/test/data/some_tests/some_tests.csv +201 -0
- data/test/data/test_from_mutant.csv +3 -0
- data/test/data/test_iselect.csv +196 -0
- data/test/data/test_iselect_reference.fa +1868 -0
- data/test/data/test_iselect_reference.fa.fai +934 -0
- data/test/data/test_primer3_error.csv +4 -0
- data/test/data/test_primer3_error_contigs.fa +10 -0
- data/test/test_bfr.rb +135 -0
- data/test/test_blast.rb +47 -0
- data/test/test_exon_container.rb +17 -0
- data/test/test_exonearate.rb +48 -0
- data/test/test_integration.rb +76 -0
- data/test/test_snp_parsing.rb +121 -0
- data/test/test_wrong_selection.sh +5 -0
- metadata +356 -0
@@ -0,0 +1,66 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'bio'
|
3
|
+
require 'optparse'
|
4
|
+
|
5
|
+
$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
|
6
|
+
$: << File.expand_path('.')
|
7
|
+
path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
|
8
|
+
require path
|
9
|
+
|
10
|
+
|
11
|
+
# Prints +msg+ to standard output, prefixed with the current local time
# (millisecond resolution) followed by ": ".
def log(msg)
  timestamp = Time.now.strftime("%Y-%m-%d %H:%M:%S.%L")
  $stdout.puts("#{timestamp}: #{msg}")
end
|
15
|
+
|
16
|
+
# Command-line options are collected into +options+:
#   :chromosome - chromosome name, upper-cased (-c)
#   :reference  - FASTA file with the contigs (-r)
#   :map        - CSV file with the map and the marker sequences (-m)
options = {}
OptionParser.new do |opts|
  # The banner names this script so that --help shows the right usage line.
  opts.banner = "Usage: map_markers_to_contigs.rb [options]"

  opts.on("-c", "--chromosome CHR", "chromosome (1A, 3B, etc)") do |o|
    options[:chromosome] = o.upcase
  end
  opts.on("-r", "--reference FASTA", "reference with the contigs") do |o|
    options[:reference] = o
  end
  opts.on("-m", "--map CSV", "File with the map and sequence \n Header: INDEX_90K,SNP_ID,SNP_NAME,CHR,COORDINATES_CHR,MAP_ORDER,CHR_ARM,DISTANCE_CM,SEQUENCE") do |o|
    options[:map] = o
  end
end.parse!
|
34
|
+
#reference="/Users/ramirezr/Documents/TGAC/references/Triticum_aestivum.IWGSP1.21.dna_rm.genome.fa"
|
35
|
+
# The reference FASTA is mandatory: stop with a clear error when
# -r/--reference was not given.
reference = options[:reference]
raise ArgumentError, "Reference has to be provided" unless reference
|
37
|
+
|
38
|
+
# Build the arm map for the requested chromosome on top of the reference,
# then keep only the markers of the map file that sit on that chromosome,
# indexed by SNP name.
map = Bio::PolyploidTools::ArmMap.new
map.chromosome = options[:chromosome]
map.global_reference(reference)
log "Reading markers file"
Bio::PolyploidTools::Marker.parse(options[:map]) do |marker|
  next unless options[:chromosome] == marker.chr
  map.markers[marker.snp_name] = marker
end
|
47
|
+
|
48
|
+
|
49
|
+
|
50
|
+
# Every intermediate and output file is named after the chromosome.
chromosome       = options[:chromosome]
fasta_tmp        = "markers_#{chromosome}.fa"
contigs_tmp      = "contigs_#{chromosome}.fa"
aln_tmp          = "align_#{chromosome}.psl"
contigs_map      = "contigs_map_#{chromosome}.fa"
map_with_contigs = "contigs_map_#{chromosome}.csv"

# The steps run strictly in this order; each one is logged, then executed
# on the map with its own output file.
[
  ["Writing markers: #{fasta_tmp}", :print_fasta_markers, fasta_tmp],
  ["Writing contigs: #{contigs_tmp}", :print_fasta_contigs_from_reference, contigs_tmp],
  ["Aligning markers #{aln_tmp}", :align_markers, aln_tmp],
  ["printing contigs with markers #{contigs_map}", :print_fasta_contigs_for_markers, contigs_map],
  ["printing map with contigs #{map_with_contigs}", :print_map_with_contigs, map_with_contigs]
].each do |message, step, file|
  log message
  map.public_send(step, file)
end
|
@@ -0,0 +1,241 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'bio'
|
3
|
+
require 'rubygems'
|
4
|
+
require 'pathname'
|
5
|
+
require 'bio-samtools-wrapper'
|
6
|
+
require 'optparse'
|
7
|
+
require 'set'
|
8
|
+
$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
|
9
|
+
$: << File.expand_path('.')
|
10
|
+
path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
|
11
|
+
require path
|
12
|
+
|
13
|
+
# Defaults for the command-line options; each may be overridden below.
options = {}
options[:min_identity] = 90
options[:filter_best] = false
options[:debug] = false

OptionParser.new do |opts|
  opts.banner = "Usage: marker_to_vcf.rb [options]"

  opts.on("-c", "--contigs FILE", "File with contigs to use as database") do |o|
    options[:path_to_contigs] = o
  end

  opts.on("-m", "--marker_list FILE", "File with the list of markers to search from") do |o|
    options[:marker_list] = o
  end

  opts.on("-b", "--filter_best", "If set, only keep the best alignment for each chromosome") do
    # Giving the flag switches the filter on (the default above is false).
    options[:filter_best] = true
  end

  opts.on("-D", "--debug", "Validate that the flanking sequences are correct") do
    options[:debug] = true
  end

  opts.on("-i", "--min_identity INT", "Minimum identity to consider a hit (default 90)") do |o|
    options[:min_identity] = o.to_i
  end

  opts.on("-o", "--output FOLDER", "Output folder") do |o|
    options[:output_folder] = o
  end

  opts.on("-a", "--arm_selection #{Bio::PolyploidTools::ChromosomeArm.getValidFunctions.join('|')}", "Function to decide the chromosome arm") do |o|
    options[:arm_selection] = Bio::PolyploidTools::ChromosomeArm.getArmSelection(o)
  end

  opts.on("-A", "--aligner exonerate|blast", "Select the aligner to use. Default: blast") do |o|
    raise "Invalid aligner" unless %w[exonerate blast].include?(o)
    options[:aligner] = o.to_sym
  end

  opts.on("-d", "--database PREFIX", "Path to the blast database. Only used if the aligner is blast. The default is the name of the contigs file without extension.") do |o|
    options[:database] = o
  end
end.parse!
|
59
|
+
# Fall back to the contigs file only when no database was given with
# -d/--database; an unconditional assignment would discard the user's choice.
options[:database] ||= options[:path_to_contigs]
# Echo the effective options and the remaining arguments.
p options
p ARGV
|
62
|
+
|
63
|
+
|
64
|
+
path_to_contigs = options[:path_to_contigs]

# Fixed labels assigned to every parsed SNP further down.
original_name, snp_in = "A", "B"

fasta_reference = nil
test_file = options[:marker_list]

# The output folder is the one given with -o, or a time-stamped folder named
# after the marker list. It is created here, so it must not exist yet.
output_folder = options[:output_folder] ||
                "#{test_file}_primer_design_#{Time.now.strftime('%Y%m%d-%H%M%S')}"
Dir.mkdir(output_folder)

# All working files live inside the output folder.
temp_fasta_query = "#{output_folder}/to_align.fa"
temp_contigs     = "#{output_folder}/contigs_tmp.fa"
exonerate_file   = "#{output_folder}/exonerate_tmp.tab"
vcf_file         = "#{output_folder}/snp_positions.vcf"

min_identity = options[:min_identity]

# Read by write_status to know where to append the progress lines.
@status_file = "#{output_folder}/status.txt"
|
84
|
+
|
85
|
+
|
86
|
+
# Appends one line, "<current time>,<status>", to the file named by
# @status_file. The block form of File.open closes the file even if the
# write raises.
def write_status(status)
  File.open(@status_file, "a") do |f|
    f.puts "#{Time.now.to_s},#{status}"
  end
end
|
91
|
+
|
92
|
+
|
93
|
+
# Markers parsed from the input list, indexed by gene name.
snps = Hash.new

# The indexed reference is always loaded (the debug-only guard is disabled).
write_status "Loading Reference"
fasta_reference_db = Bio::DB::Fasta::FastaFile.new({:fasta=>path_to_contigs})
fasta_reference_db.load_fai_entries
# Report the file that was actually loaded; +fasta_reference+ is never
# assigned in this script, so interpolating it always logged an empty name.
write_status "Fasta reference: #{path_to_contigs}"
|
103
|
+
|
104
|
+
# 1. Read the marker list: one SNP per line. SNPs without a position are
#    reported on stderr and skipped; the others are stored by gene name.
write_status "Reading SNPs"

File.foreach(test_file) do |line|
  snp = Bio::PolyploidTools::SNPSequence.parse(line)
  snp.genomes_count = options[:genomes_count]
  snp.snp_in = snp_in
  snp.original_name = original_name
  unless snp.position
    $stderr.puts "ERROR: #{snp.gene} doesn't contain a SNP"
    next
  end
  snps[snp.gene] = snp
end
|
121
|
+
|
122
|
+
# 2. Write the sequences to align, one FASTA record per gene.
write_status "Writing sequences to align"
written_seqs = Set.new
# The block form closes the query file even if writing a record raises.
File.open(temp_fasta_query, "w") do |file|
  snps.each_pair do |_name, snp|
    # Set#add? returns nil when the gene was already written.
    file.puts snp.to_fasta if written_seqs.add?(snp.gene)
  end
end
|
133
|
+
|
134
|
+
|
135
|
+
# 3. Search the markers in the genome. The hits are written to
#    +exonerate_file+ (the name is historical; the search further down is
#    run through Bio::DB::Blast).
write_status "Searching markers in genome"
exo_f = File.open(exonerate_file, "w")
# Only opened when :extract_found_contigs is set in the options.
contigs_f = options[:extract_found_contigs] ? File.open(temp_contigs, "w") : nil
target = filename = path_to_contigs

fasta_file = Bio::DB::Fasta::FastaFile.new({ :fasta => target })
fasta_file.load_fai_entries
found_contigs = Set.new
|
148
|
+
|
149
|
+
# Writes the raw line of +aln+ to +exo_f+ when the identity of the alignment
# is strictly greater than +min_identity+. The remaining parameters
# (+found_contigs+, +fasta_file+, +options+) are accepted but not used.
def do_align(aln, exo_f, found_contigs, min_identity, fasta_file, options)
  return unless aln.identity > min_identity

  exo_f.puts aln.line
end
|
154
|
+
|
155
|
+
# Align the query sequences against the database and keep the hits that
# pass the identity filter of do_align; then close the hits file.
blast_args = { :query => temp_fasta_query, :target => options[:database] }
Bio::DB::Blast.align(blast_args) { |aln| do_align(aln, exo_f, found_contigs, min_identity, fasta_file, options) }

exo_f.close
|
160
|
+
|
161
|
+
# Writes one VCF data line to +out+ for every alignment in
# +exonerate_filename+ that covers the SNP position of its marker.
#
# min_identity::       alignments with a lower identity are skipped.
# filter_best::        accepted, but not used by the current implementation.
# exonerate_filename:: file with one alignment per line, parsed with
#                      Bio::DB::Exonerate::Alignment.parse_custom.
# snps::               markers indexed by the query id of the alignments.
# reference::          object answering +fetch_sequence(region)+.
# out::                IO that receives the VCF lines.
#
# The ID column is "<query id>.path<n>", where n counts the lines already
# written for that marker, starting at 1. Alignments for which the position
# cannot be projected raise Bio::DB::Exonerate::ExonerateException; these
# are reported on stderr and skipped.
def print_positions(min_identity:90, filter_best:false, exonerate_filename:"test.exo", snps:{}, reference:nil, out:$stdout)
  marker_count = Hash.new { |h, k| h[k] = 1 }
  File.open(exonerate_filename) do |f|
    f.each_line do |line|
      record = Bio::DB::Exonerate::Alignment.parse_custom(line)
      next unless record && record.identity >= min_identity
      snp = snps[record.query_id]
      # The alignment has to span the SNP on the query.
      next unless snp && snp.position.between?(record.query_start + 1, record.query_end)
      begin
        position = record.query_position_on_target(snp.position)

        # Trim the target region so that it finishes at the SNP: the last
        # base of the fetched sequence is then the reference allele.
        region = record.exon_on_gene_position(snp.position).target_region
        if region.orientation == :forward
          region.end = position
        else
          region.start = position
        end
        target_seq = reference.fetch_sequence(region)
        target_seq[-1] = target_seq[-1].upcase
        ref_base = target_seq[-1]
        # MA keeps the allele as fetched, before any complementing below.
        ma = ref_base

        # ALT is the marker allele that differs from the reference; when
        # neither allele matches the reference, both are listed.
        alt_base = [snp.snp, snp.original].join(",")
        if snp.original == ref_base
          alt_base = snp.snp
        elsif snp.snp == ref_base
          alt_base = snp.original
        end

        if record.target_strand == :reverse
          alt_base = Bio::Sequence::NA.new(alt_base)
          ref_base = Bio::Sequence::NA.new(ref_base)
          alt_base.complement!.upcase!
          ref_base.complement!.upcase!
        end

        info = ["OR=#{record.target_strand}"]
        info << "SC=#{record.score}"
        info << "PI=#{record.pi}"
        info << "MA=#{ma}"
        info << "TS=#{target_seq}"
        vcf_line="#{record.target_id}\t#{position}\t#{record.query_id}.path#{marker_count[record.query_id]}\t#{ref_base}\t#{alt_base}\t#{record.pi}\t.\t#{info.join(";")}"
        out.puts(vcf_line)

        marker_count[record.query_id] += 1
      rescue Bio::DB::Exonerate::ExonerateException
        $stderr.puts "Failed for the range #{record.query_start}-#{record.query_end} for position #{snp.position}"
      end
    end
  end
end
|
221
|
+
|
222
|
+
|
223
|
+
# Write the VCF file: the header first, then one line per located marker.
# The block form of File.open closes the file even if print_positions raises.
write_status "Printing VCF file"
File.open(vcf_file, "w") do |out|
  out.puts "##fileformat=VCFv4.2"
  out.puts "##fileDate=#{Time.now.strftime("%Y%m%d")}"
  out.puts "##source=#{$0}"
  out.puts "##reference=file://#{options[:path_to_contigs]}"
  out.puts "##INFO=<ID=OR,Number=1,Type=String,Description=\"Orientation of the alignment of the marker\">"
  out.puts "##INFO=<ID=SC,Number=1,Type=Float,Description=\"Alignment score of the marker\">"
  out.puts "##INFO=<ID=PI,Number=1,Type=Float,Description=\"Percentage of identity of the alignment to the marker\">"
  out.puts "##INFO=<ID=PS,Number=1,Type=String,Description=\"SNP sequence for PolyMarker\">"
  out.puts "##INFO=<ID=MA,Number=1,Type=String,Description=\"Allele based on the original marker sequence\">"
  out.puts "##INFO=<ID=TS,Number=1,Type=String,Description=\"Target sequence before the SNP from the reference\">"
  out.puts "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"
  # NOTE(review): the identity threshold is fixed at 95 here and ignores
  # -i/--min_identity; confirm whether that is intended.
  print_positions(exonerate_filename:exonerate_file, min_identity:95, snps:snps, reference: fasta_reference_db, out:out)
end
write_status "DONE"
|
240
|
+
|
241
|
+
|
@@ -0,0 +1,42 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
#This uses the map output from map_markers_to_contigs.rb
|
4
|
+
#You need a reference with the name of the contigs, containing the chromosome
|
5
|
+
#arm and a list of sequences to map. The algorithm creates a smaller reference
|
6
|
+
#file, so the search only spans across the contigs in the region. This should
|
7
|
+
#allow the use of a refined mapping algorithm.
|
8
|
+
require 'bio'
|
9
|
+
require 'optparse'
|
10
|
+
|
11
|
+
$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
|
12
|
+
$: << File.expand_path('.')
|
13
|
+
path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
|
14
|
+
require path
|
15
|
+
|
16
|
+
|
17
|
+
# Prints +msg+ to standard output, prefixed with the current local time
# (millisecond resolution) followed by ": ".
def log(msg)
  timestamp = Time.now.strftime("%Y-%m-%d %H:%M:%S.%L")
  $stdout.puts("#{timestamp}: #{msg}")
end
|
21
|
+
|
22
|
+
markers = nil

# Command-line options are collected into +options+:
#   :chromosome - chromosome name, upper-cased (-c)
#   :reference  - FASTA file with the contigs (-r)
#   :map        - CSV file with the map and the marker sequences (-m)
options = {}
parser = OptionParser.new
parser.banner = "Usage: markers_in_region.rb [options]"
parser.on("-c", "--chromosome CHR", "chromosome (1A, 3B, etc)") { |chr| options[:chromosome] = chr.upcase }
parser.on("-r", "--reference FASTA", "reference with the contigs") { |fasta| options[:reference] = fasta }
parser.on("-m", "--map CSV", "File with the map and sequence \n Header: INDEX_90K,SNP_ID,SNP_NAME,CHR,COORDINATES_CHR,MAP_ORDER,CHR_ARM,DISTANCE_CM,SEQUENCE") { |csv| options[:map] = csv }
parser.parse!
|
40
|
+
#reference="/Users/ramirezr/Documents/TGAC/references/Triticum_aestivum.IWGSP1.21.dna_rm.genome.fa"
|
41
|
+
# The reference FASTA is mandatory: stop with a clear error when
# -r/--reference was not given.
reference = options[:reference]
raise ArgumentError, "Reference has to be provided" unless reference
|
data/bin/mask_triads.rb
ADDED
@@ -0,0 +1,169 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'optparse'
|
3
|
+
|
4
|
+
require 'csv'
|
5
|
+
require 'fileutils'
|
6
|
+
require 'tmpdir'
|
7
|
+
require 'bio-samtools-wrapper'
|
8
|
+
require 'bio'
|
9
|
+
|
10
|
+
$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
|
11
|
+
$: << File.expand_path('.')
|
12
|
+
path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
|
13
|
+
require path
|
14
|
+
# Defaults for the command-line options; the switches below override them.
opts = {}
opts[:identity] = 50
opts[:min_bases] = 200
opts[:split_token] = "."
opts[:tmp_folder] = Dir.mktmpdir
opts[:random_sample] = 0
opts[:output_folder] = "."

# The parser and the option values use distinct names, so the handlers no
# longer shadow the parser variable.
OptionParser.new do |parser|
  parser.banner = "Usage: mask_triads.rb [options]"

  parser.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |value|
    opts[:triads] = value
  end

  parser.on("-f", "--fasta FILE" , "FASTA file containing all the possible peptide sequences. ") do |value|
    opts[:fasta] = value
  end

  parser.on("-s", "--split_token CHAR", "Character used to split the sequence name. The name will be everything before this token on the name of the sequences") do |value|
    opts[:split_token] = value
  end

  parser.on("-o", "--output_folder DIR", "Location to save the alignment masks. If the alignment exists, it is recycled to avoid calling MAFFT again") do |value|
    opts[:output_folder] = value
  end
end.parse!
|
42
|
+
|
43
|
+
|
44
|
+
split_token = opts[:split_token]
reference_name = File.basename opts[:fasta]
output_folder = opts[:output_folder]

@fasta_reference_db = Bio::DB::Fasta::FastaFile.new(fasta: opts[:fasta])
@fasta_reference_db.load_fai_entries

# For every gene (the part of the sequence id before the split token) keep
# the longest entry of the index; on a tie the first one seen is kept.
@cannonical = Hash.new
@fasta_reference_db.index.entries.each do |entry|
  gene = entry.id.split(split_token).first
  longest = @cannonical[gene]
  @cannonical[gene] = entry if longest.nil? || entry.length > longest.length
end

$stderr.puts "#Loaded #{@cannonical.length} canonical sequences from #{@fasta_reference_db.index.size} in reference"

$stderr.puts "TMP dir: #{opts[:tmp_folder]}"
|
60
|
+
|
61
|
+
# Writes +sequences+ (anything answering +each_pair+ with name/sequence
# pairs) to +filename+ in FASTA format: a ">name" line followed by the
# sequence on one line. An existing file is overwritten. The block form of
# File.open closes the file even if a write raises. Returns nil.
def write_fasta_from_hash(sequences, filename)
  File.open(filename, "w") do |out|
    sequences.each_pair do |chromosome, exon_seq|
      out.puts ">#{chromosome}\n#{exon_seq}\n"
    end
  end
  nil
end
|
68
|
+
|
69
|
+
# Aligns the canonical sequences of the three genes +a+, +b+ and +d+ with
# MAFFT and returns the resulting alignment.
#
# The sequences are looked up in @cannonical and fetched from
# @fasta_reference_db, so both must be loaded before calling this method.
#
# mafft:: optional aligner answering +query_alignment+. Locals of the script
#         body are not visible inside a method, so when no aligner is passed
#         one is created on first use (and cached in @mafft) with the options
#         --maxiterate 1000 --localpair --quiet.
def mafft_align(a, b, d, mafft = nil)
  mafft ||= (@mafft ||= Bio::MAFFT.new("mafft", ['--maxiterate', '1000', '--localpair', '--quiet']))
  to_align = Bio::Alignment::SequenceHash.new
  [a, b, d].each do |gene|
    to_align[gene] = @fasta_reference_db.fetch_sequence(@cannonical[gene].get_full_region)
  end
  mafft.query_alignment(to_align).alignment
end
|
81
|
+
|
82
|
+
# Reads the FASTA file at +path+ and returns a Bio::Alignment::SequenceHash
# holding its first three entries, keyed by entry id. Entries after the
# third are read but not stored.
def read_alignment(path)
  aln = Bio::Alignment::SequenceHash.new
  seen = 0
  Bio::FlatFile.open(Bio::FastaFormat, path) do |fasta_file|
    fasta_file.each do |entry|
      seen += 1
      next if seen > 3

      aln[entry.entry_id] = entry.seq
    end
  end
  aln
end
|
93
|
+
|
94
|
+
|
95
|
+
# Zlib::GzipWriter is used below; require it explicitly instead of relying
# on another library having loaded it.
require 'zlib'

mafft_opts = ['--maxiterate', '1000', '--localpair', '--quiet']
mafft = Bio::MAFFT.new( "mafft" , mafft_opts)
header_printed = false
stats = File.open("#{output_folder}/#{reference_name}.identity_stats.csv", "w")
# Binary mode: the gzip stream must not go through newline translation.
distances = File.open("#{output_folder}/#{reference_name}.distance_between_snps.csv.gz", "wb")
gz = Zlib::GzipWriter.new(distances)
gz.write "triad,gene,genome,reference,type,distance\n"
|
103
|
+
|
104
|
+
# Writes one CSV line per element of +distances+ to +out+, in the column
# order triad,gene,genome,reference,type,distance.
def write_distances(distances, triad, gene, genome, reference, type, out)
  prefix = "#{triad},#{gene},#{genome},#{reference},#{type}"
  distances.each do |distance|
    out.write "#{prefix},#{distance}\n"
  end
end
|
107
|
+
|
108
|
+
# Process every triad of the CSV that is 1:1:1 and HC-only: align (or reuse
# the saved alignment of) its three genes, add the A/B/D masks to the
# alignment, write the distances and the identity statistics, and save the
# masked alignment.
processed = 0
CSV.foreach(opts[:triads], headers: true) do |row|
  next if row["cardinality_abs"] != "1:1:1" || row["HC.LC"] != "HC-only"

  # Genome label => gene name, in the order A, B, D used for all outputs.
  genes = { "A" => row['A'], "B" => row['B'], "D" => row['D'] }
  triad = row['group_id']
  cent_triad = triad.to_i / 100

  # An alignment saved by a previous run is read back instead of calling
  # MAFFT again; in that case the new output goes to "alignments_new".
  folder = "#{output_folder}/alignments/#{reference_name}/#{cent_triad}/"
  save_cds = "#{folder}/#{triad}.fa"
  aligned = File.file?(save_cds)
  aln = if aligned
          read_alignment(save_cds)
        else
          mafft_align(genes["A"], genes["B"], genes["D"])
        end
  folder = "#{output_folder}/alignments_new/#{reference_name}/#{cent_triad}/" if aligned
  FileUtils.mkdir_p folder
  save_cds = "#{folder}/#{triad}.fa"

  aln2 = Bio::Alignment.new aln
  seq_start = Bio::PolyploidTools::Mask.find_start(aln)
  seq_end = Bio::PolyploidTools::Mask.find_end(aln)

  genes.each do |genome, gene|
    aln2.add_seq(Bio::PolyploidTools::Mask.get(aln, seq_start: seq_start, seq_end: seq_end, target: gene), genome)
  end

  stats_by_genome = {}
  genes.each do |genome, gene|
    stats_by_genome[genome] = Bio::PolyploidTools::Mask.stats(aln2[genome], triad, gene, genome, reference_name)
  end

  # All the specific distances first, then all the semispecific ones.
  [:specific, :semispecific].each do |kind|
    genes.each do |genome, gene|
      write_distances(stats_by_genome[genome][kind], triad, gene, genome, reference_name, kind.to_s, gz)
    end
  end

  # The distance lists are not part of the statistics table.
  stats_by_genome.each_value do |genome_stats|
    genome_stats.delete(:semispecific)
    genome_stats.delete(:specific)
  end

  genes.each do |genome, gene|
    stats_by_genome[genome][:length] = @cannonical[gene].length
  end

  stats.puts stats_by_genome["A"].keys.join(",") unless header_printed
  stats_by_genome.each_value { |genome_stats| stats.puts genome_stats.values.join(",") }
  header_printed = true

  write_fasta_from_hash(aln2, save_cds)
  processed += 1
end
gz.close
distances.close
stats.close
|