bio-polymarker 1.3.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.travis.yml +24 -0
- data/Gemfile +23 -0
- data/README.md +205 -0
- data/Rakefile +61 -0
- data/SECURITY.md +16 -0
- data/VERSION +1 -0
- data/bin/bfr.rb +128 -0
- data/bin/blast_triads.rb +166 -0
- data/bin/blast_triads_promoters.rb +192 -0
- data/bin/count_variations.rb +36 -0
- data/bin/filter_blat_by_target_coverage.rb +69 -0
- data/bin/filter_exonerate_by_identity.rb +38 -0
- data/bin/find_best_blat_hit.rb +33 -0
- data/bin/find_best_exonerate.rb +17 -0
- data/bin/get_longest_hsp_blastx_triads.rb +66 -0
- data/bin/hexaploid_primers.rb +168 -0
- data/bin/homokaryot_primers.rb +183 -0
- data/bin/mafft_triads.rb +120 -0
- data/bin/mafft_triads_promoters.rb +403 -0
- data/bin/map_markers_to_contigs.rb +66 -0
- data/bin/marker_to_vcf.rb +241 -0
- data/bin/markers_in_region.rb +42 -0
- data/bin/mask_triads.rb +169 -0
- data/bin/polymarker.rb +410 -0
- data/bin/polymarker_capillary.rb +443 -0
- data/bin/polymarker_deletions.rb +350 -0
- data/bin/snp_position_to_polymarker.rb +101 -0
- data/bin/snps_between_bams.rb +107 -0
- data/bin/tag_stats.rb +75 -0
- data/bin/vcfLineToTable.rb +56 -0
- data/bin/vcfToPolyMarker.rb +82 -0
- data/bio-polymarker.gemspec +227 -0
- data/conf/defaults.rb +1 -0
- data/conf/primer3_config/dangle.dh +128 -0
- data/conf/primer3_config/dangle.ds +128 -0
- data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
- data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
- data/conf/primer3_config/interpretations/loops_i.dh +34 -0
- data/conf/primer3_config/interpretations/loops_i.ds +31 -0
- data/conf/primer3_config/interpretations/stack_i.dh +257 -0
- data/conf/primer3_config/interpretations/stack_i.ds +256 -0
- data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
- data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
- data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
- data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
- data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
- data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
- data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
- data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
- data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
- data/conf/primer3_config/loops.dh +30 -0
- data/conf/primer3_config/loops.ds +30 -0
- data/conf/primer3_config/stack.dh +256 -0
- data/conf/primer3_config/stack.ds +256 -0
- data/conf/primer3_config/stackmm.dh +256 -0
- data/conf/primer3_config/stackmm.ds +256 -0
- data/conf/primer3_config/tetraloop.dh +77 -0
- data/conf/primer3_config/tetraloop.ds +77 -0
- data/conf/primer3_config/triloop.dh +16 -0
- data/conf/primer3_config/triloop.ds +16 -0
- data/conf/primer3_config/tstack.dh +256 -0
- data/conf/primer3_config/tstack2.dh +256 -0
- data/conf/primer3_config/tstack2.ds +256 -0
- data/conf/primer3_config/tstack_tm_inf.ds +256 -0
- data/lib/bio/BFRTools.rb +465 -0
- data/lib/bio/BIOExtensions.rb +153 -0
- data/lib/bio/PolyploidTools/ChromosomeArm.rb +63 -0
- data/lib/bio/PolyploidTools/ExonContainer.rb +245 -0
- data/lib/bio/PolyploidTools/Marker.rb +175 -0
- data/lib/bio/PolyploidTools/Mask.rb +116 -0
- data/lib/bio/PolyploidTools/NoSNPSequence.rb +292 -0
- data/lib/bio/PolyploidTools/PrimerRegion.rb +30 -0
- data/lib/bio/PolyploidTools/SNP.rb +804 -0
- data/lib/bio/PolyploidTools/SNPMutant.rb +86 -0
- data/lib/bio/PolyploidTools/SNPSequence.rb +55 -0
- data/lib/bio/db/blast.rb +114 -0
- data/lib/bio/db/exonerate.rb +333 -0
- data/lib/bio/db/primer3.rb +820 -0
- data/lib/bio-polymarker.rb +28 -0
- data/test/data/7B_amplicon_test.fa +12 -0
- data/test/data/7B_amplicon_test.fa.fai +1 -0
- data/test/data/7B_amplicon_test_reference.fa +110 -0
- data/test/data/7B_amplicon_test_reference.fa.fai +3 -0
- data/test/data/7B_marker_test.txt +1 -0
- data/test/data/BS00068396_51.fa +2 -0
- data/test/data/BS00068396_51_blast.tab +4 -0
- data/test/data/BS00068396_51_contigs.aln +1412 -0
- data/test/data/BS00068396_51_contigs.dnd +7 -0
- data/test/data/BS00068396_51_contigs.fa +8 -0
- data/test/data/BS00068396_51_contigs.fa.fai +4 -0
- data/test/data/BS00068396_51_contigs.fa.nhr +0 -0
- data/test/data/BS00068396_51_contigs.fa.nin +0 -0
- data/test/data/BS00068396_51_contigs.fa.nsq +0 -0
- data/test/data/BS00068396_51_contigs.nhr +0 -0
- data/test/data/BS00068396_51_contigs.nin +0 -0
- data/test/data/BS00068396_51_contigs.nsq +0 -0
- data/test/data/BS00068396_51_exonerate.tab +6 -0
- data/test/data/BS00068396_51_for_polymarker.txt +1 -0
- data/test/data/BS00068396_51_genes.txt +14 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974.fa +112 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa +2304 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
- data/test/data/LIB1716.bam +0 -0
- data/test/data/LIB1716.bam.bai +0 -0
- data/test/data/LIB1719.bam +0 -0
- data/test/data/LIB1719.bam.bai +0 -0
- data/test/data/LIB1721.bam +0 -0
- data/test/data/LIB1721.bam.bai +0 -0
- data/test/data/LIB1722.bam +0 -0
- data/test/data/LIB1722.bam.bai +0 -0
- data/test/data/PST130_7067.csv +1 -0
- data/test/data/PST130_7067.fa +2 -0
- data/test/data/PST130_7067.fa.fai +1 -0
- data/test/data/PST130_7067.fa.ndb +0 -0
- data/test/data/PST130_7067.fa.nhr +0 -0
- data/test/data/PST130_7067.fa.nin +0 -0
- data/test/data/PST130_7067.fa.not +0 -0
- data/test/data/PST130_7067.fa.nsq +0 -0
- data/test/data/PST130_7067.fa.ntf +0 -0
- data/test/data/PST130_7067.fa.nto +0 -0
- data/test/data/PST130_reverse_primer.csv +1 -0
- data/test/data/S22380157.fa +16 -0
- data/test/data/S22380157.fa.fai +1 -0
- data/test/data/S22380157.vcf +67 -0
- data/test/data/S58861868/LIB1716.bam +0 -0
- data/test/data/S58861868/LIB1716.sam +651 -0
- data/test/data/S58861868/LIB1719.bam +0 -0
- data/test/data/S58861868/LIB1719.sam +805 -0
- data/test/data/S58861868/LIB1721.bam +0 -0
- data/test/data/S58861868/LIB1721.sam +1790 -0
- data/test/data/S58861868/LIB1722.bam +0 -0
- data/test/data/S58861868/LIB1722.sam +1271 -0
- data/test/data/S58861868/S58861868.fa +16 -0
- data/test/data/S58861868/S58861868.fa.fai +1 -0
- data/test/data/S58861868/S58861868.vcf +76 -0
- data/test/data/S58861868/header.txt +9 -0
- data/test/data/S58861868/merged.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam.bai +0 -0
- data/test/data/Test3Aspecific.csv +2 -0
- data/test/data/Test3Aspecific_contigs.fa +6 -0
- data/test/data/bfr_out_test.csv +5 -0
- data/test/data/chr1A_C1145499T/chr1A_C1145499T.csv +1 -0
- data/test/data/chr1A_G540414846C/chr1A_G540414846C.csv +1 -0
- data/test/data/chr1A_G540414846C/chr1A_G540414846C.fa +2 -0
- data/test/data/chr1A_T517634750C/chr1A_T517634750C.csv +1 -0
- data/test/data/chr2D_C112180134A/chr2D_C112180134A.csv +1 -0
- data/test/data/chr4D_C14473543T/chr4D_C14473543T.csv +1 -0
- data/test/data/chr4D_C14473543T/chr4D_C14473543T.fa +2 -0
- data/test/data/headerMergeed.txt +9 -0
- data/test/data/headerS2238015 +1 -0
- data/test/data/mergedLibs.bam +0 -0
- data/test/data/mergedLibsReheader.bam +0 -0
- data/test/data/mergedLibsSorted.bam +0 -0
- data/test/data/mergedLibsSorted.bam.bai +0 -0
- data/test/data/patological_cases5D.csv +1 -0
- data/test/data/primer_3_input_header_test +5 -0
- data/test/data/short_primer_design_test.csv +10 -0
- data/test/data/some_tests/some_tests.csv +201 -0
- data/test/data/test_from_mutant.csv +3 -0
- data/test/data/test_iselect.csv +196 -0
- data/test/data/test_iselect_reference.fa +1868 -0
- data/test/data/test_iselect_reference.fa.fai +934 -0
- data/test/data/test_primer3_error.csv +4 -0
- data/test/data/test_primer3_error_contigs.fa +10 -0
- data/test/test_bfr.rb +135 -0
- data/test/test_blast.rb +47 -0
- data/test/test_exon_container.rb +17 -0
- data/test/test_exonearate.rb +48 -0
- data/test/test_integration.rb +76 -0
- data/test/test_snp_parsing.rb +121 -0
- data/test/test_wrong_selection.sh +5 -0
- metadata +356 -0
@@ -0,0 +1,183 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'bio'
|
3
|
+
require 'rubygems'
|
4
|
+
require 'pathname'
|
5
|
+
require 'bio-samtools-wrapper'
|
6
|
+
|
7
|
+
require 'set'
|
8
|
+
|
9
|
+
$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
|
10
|
+
$: << File.expand_path('.')
|
11
|
+
path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
|
12
|
+
require path
|
13
|
+
|
14
|
+
|
15
|
+
#@snp_map=Hash.new
|
16
|
+
|
17
|
+
# ExonContainer specialisation used by this script: SNPs are loaded from a
# plain list file and the primer3 input is produced directly from each SNP.
class HomokaryotContainer < Bio::PolyploidTools::ExonContainer
  # Reads +filename+ line by line, parses one SNP per line and stores every
  # SNP whose position is greater than zero in @snp_map, keyed by snp.gene.
  #
  # The parser is chosen from the number of command-line arguments:
  # one argument   -> SNPSequence (sequence embedded in the line, no reference)
  # two arguments  -> SNP (sequence taken from the reference)
  #
  # filename      - path of the SNP list to read.
  # chromosome    - value assigned to snp.chromosome.
  # snp_in        - value assigned to snp.snp_in.
  # original_name - value assigned to snp.original_name.
  #
  # Raises Bio::DB::Exonerate::ExonerateException when the argument count is
  # neither 1 nor 2 (previously this surfaced as a NoMethodError on nil).
  def add_snp_file(filename, chromosome, snp_in, original_name)
    flanking_size = 100
    File.open(filename) do |f|
      f.each_line do |line|
        case ARGV.size
        when 1 # List with sequence
          snp = Bio::PolyploidTools::SNPSequence.parse(line)
          snp.use_reference = false
        when 2 # List and fasta file
          snp = Bio::PolyploidTools::SNP.parse(line)
          snp.use_reference = true
        else
          raise Bio::DB::Exonerate::ExonerateException.new "Wrong number of arguments. "
        end

        snp.flanking_size = flanking_size
        next unless snp.position > 0

        snp.container = self
        snp.chromosome = chromosome
        snp.snp_in = snp_in
        snp.original_name = original_name

        (@snp_map[snp.gene] ||= []) << snp
      end
    end
  end

  # Writes the primer3 record of every stored SNP to +file+, skipping SNPs
  # whose record is empty.
  #
  # file              - IO-like object that receives the records (via puts).
  # target_chromosome - accepted for interface compatibility; not used here,
  #                     each SNP's own chromosome is passed instead.
  # parental          - forwarded to snp.primer_3_string.
  def print_primer_3_exons(file, target_chromosome, parental)
    @snp_map.each do |_gene, snp_array|
      snp_array.each do |snp|
        string = snp.primer_3_string(snp.chromosome, parental)
        file.puts string if string.size > 0
      end
    end
  end
end
|
60
|
+
|
61
|
+
# Script-local overrides of Bio::PolyploidTools::SNP: no real alignment is
# performed, the parental sequences are used directly.
class Bio::PolyploidTools::SNP
  # NOTE(review): this sets an instance variable on the class object itself,
  # not on SNP instances - confirm that this is what is intended.
  @aligned = false

  # The SNP position in the "alignment" is simply the local position.
  def aligned_snp_position
    local_position
  end

  # Takes the parental sequences and writes the original base into parental
  # "A" and the SNP base into parental "B" at the local position. The result
  # is kept in @aligned_sequences and returned.
  def aligned_sequences
    @aligned_sequences = parental_sequences
    @aligned_sequences["A"][local_position] = original
    @aligned_sequences["B"][local_position] = snp
    @aligned_sequences
  end
end
|
78
|
+
|
79
|
+
|
80
|
+
|
81
|
+
|
82
|
+
|
83
|
+
# Command-line arguments: the SNP list and, optionally, a FASTA reference.
snp_file = ARGV[0]
reference_file = ARGV[1]

snp_in = "A"
original_name = "B"
snps = []

# 0. Load the fasta index (only when a reference file was given)
fasta_reference_db = nil
if reference_file
  fasta_reference_db = Bio::DB::Fasta::FastaFile.new({:fasta=>reference_file})
  fasta_reference_db.load_fai_entries
  p "Fasta reference: #{reference_file}"
end

# 1. Read all the SNP files
# All the SNPs should be on the same chromosome as the first SNP.
chromosome = nil
File.open(snp_file) do |f|
  f.each_line do |line|
    snp = nil
    if ARGV.size == 1 # List with Sequence
      snp = Bio::PolyploidTools::SNPSequence.parse(line)
    elsif ARGV.size == 2 # List and fasta file
      snp = Bio::PolyploidTools::SNP.parse(line)
      region = fasta_reference_db.index.region_for_entry(snp.gene).get_full_region
      snp.template_sequence = fasta_reference_db.fetch_sequence(region)
    else
      raise Bio::DB::Exonerate::ExonerateException.new "Wrong number of arguments. "
    end
    raise Bio::DB::Exonerate::ExonerateException.new "No SNP for line '#{line}'" if snp == nil

    snp.snp_in = snp_in
    snp.original_name = original_name
    snps << snp
    chromosome = snp.chromosome unless chromosome
    raise Bio::DB::Exonerate::ExonerateException.new "All the snps should come from the same chromosome" if chromosome != snp.chromosome
  end
end

# Every run writes into its own time-stamped folder next to the SNP file.
output_folder = "#{snp_file}_primer_design_#{Time.now.strftime('%Y%m%d-%H%M%S')}/"
Dir.mkdir(output_folder)
seqs_file = output_folder + "sequences.fa"
written_seqs = Set.new
# Without an explicit reference, the sequences written below act as reference.
reference_file = seqs_file unless reference_file

# Write each gene once. The block form closes the file even if a SNP fails
# to serialise, and guarantees it is flushed before gene_models reads it.
File.open(seqs_file, "w") do |file|
  snps.each do |snp|
    file.puts snp.to_fasta if written_seqs.add?(snp.gene)
  end
end

container = HomokaryotContainer.new
container.add_parental({:name=>snp_in})
container.add_parental({:name=>original_name})
container.gene_models(reference_file) if reference_file

primer_3_input = "#{output_folder}primer_3_input_temp"
primer_3_output = "#{output_folder}primer_3_output_temp"
container.add_snp_file(snp_file, "PST130", snp_in, original_name)
primer_3_config = File.expand_path(File.dirname(__FILE__) + '/../conf/primer3_config')
output_primers = "#{output_folder}primers.csv"

# Global primer3 settings followed by one record per SNP. The file must be
# closed before primer3 runs, which the block form ensures.
File.open(primer_3_input, "w") do |file|
  file.puts("PRIMER_PRODUCT_SIZE_RANGE=50-150")
  file.puts("PRIMER_MAX_SIZE=25")
  file.puts("PRIMER_LIB_AMBIGUITY_CODES_CONSENSUS=1")
  file.puts("PRIMER_LIBERAL_BASE=1")
  file.puts("PRIMER_NUM_RETURN=5")
  file.puts("PRIMER_THERMODYNAMIC_PARAMETERS_PATH=#{primer_3_config}/")

  container.print_primer_3_exons(file, "PST130", snp_in)
end

Bio::DB::Primer3.run({:in=>primer_3_input, :out=>primer_3_output})

# 2. Pick the best primer and make the primer3 output
kasp_container = Bio::DB::Primer3::KASPContainer.new
kasp_container.line_1 = original_name
kasp_container.line_2 = snp_in

snps.each do |snp|
  kasp_container.add_snp(snp)
end

kasp_container.add_primers_file(primer_3_output)
header = "Marker,SNP,RegionSize,SNP_type,#{snp_in},#{original_name},common,primer_type,orientation,#{snp_in}_TM,#{original_name}_TM,common_TM,selected_from,product_size"
File.open(output_primers, 'w') { |f| f.write("#{header}\n#{kasp_container.print_primers}") }
|
data/bin/mafft_triads.rb
ADDED
@@ -0,0 +1,120 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'optparse'
|
3
|
+
require 'bio'
|
4
|
+
require 'csv'
|
5
|
+
require 'bio-blastxmlparser'
|
6
|
+
require 'fileutils'
|
7
|
+
require 'tmpdir'
|
8
|
+
|
9
|
+
|
10
|
+
# Default settings; command-line switches below override them.
options = {}
options[:identity] = 50
options[:min_bases] = 200
options[:split_token] = "-"
options[:tmp_folder] = Dir.mktmpdir
options[:program] = "blastn"
options[:random_sample] = 0

# The parser is kept in a local so it can be reused or inspected.
option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: mafft_triads.rb [options]"

  opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
    options[:identity] = o.to_f
  end

  opts.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |o|
    options[:triads] = o
  end

  opts.on("-f", "--pep FILE" , "FASTA file containing all the possible peptide sequences. ") do |o|
    options[:pep] = o
  end

  # "-s" was declared for both --cds and --split_token. The later
  # declaration won, so --cds had no usable short form; it now gets "-c"
  # and "-s" keeps selecting --split_token as before.
  opts.on("-c", "--cds FILE" , "FASTA file containing all the possible CDS sequences. ") do |o|
    options[:cds] = o
  end

  opts.on("-s", "--split_token CHAR", "Character used to split the sequence name. The name will be evarything before this token on the name of the sequences") do |o|
    options[:split_token] = o
  end
end
option_parser.parse!
|
43
|
+
|
44
|
+
|
45
|
+
# Aligns the given sequences with the "mafft" executable (iterative local-pair
# mode, quiet) and returns the alignment taken from the MAFFT report.
def peptide_alignment(sequences_to_align)
  mafft_args = ['--maxiterate', '1000', '--localpair', '--quiet']
  aligner = Bio::MAFFT.new("mafft", mafft_args)
  aligner.query_align(sequences_to_align).alignment
end
|
51
|
+
|
52
|
+
|
53
|
+
split_token = options[:split_token]

# Peptides: keep, for every gene name (the part of the entry id before the
# split token), the longest entry found in the FASTA file.
pep_seq = {}
pep_seq_count = 0
Bio::FlatFile.open(Bio::FastaFormat, options[:pep]) do |fasta_file|
  fasta_file.each do |entry|
    gene_name = entry.entry_id.split(split_token).first
    longest = pep_seq[gene_name]
    pep_seq[gene_name] = entry if longest.nil? || entry.length > longest.length
    pep_seq_count += 1
  end
end
$stderr.puts "#Loaded #{pep_seq.length} genes from #{pep_seq_count} pep_seq"

# CDS: same selection rule as for the peptides.
cds_seq = {}
cds_seq_count = 0
Bio::FlatFile.open(Bio::FastaFormat, options[:cds]) do |fasta_file|
  fasta_file.each do |entry|
    gene_name = entry.entry_id.split(split_token).first
    longest = cds_seq[gene_name]
    cds_seq[gene_name] = entry if longest.nil? || entry.length > longest.length
    cds_seq_count += 1
  end
end
$stderr.puts "#Loaded #{cds_seq.length} genes from #{cds_seq_count} cds_seq"

$stderr.puts "TMP dir: #{options[:tmp_folder]}"
|
81
|
+
|
82
|
+
# Writes +sequences+ to +filename+ in FASTA format, one record per pair:
# a ">key" header line followed by the value on the next line. An existing
# file is overwritten.
#
# sequences - object responding to each_pair (name => sequence).
# filename  - path of the file to create.
#
# The block form of File.open closes the handle even when writing fails.
def write_fasta_from_hash(sequences, filename)
  File.open(filename, "w") do |out|
    sequences.each_pair do |chromosome, exon_seq|
      out.puts ">#{chromosome}\n#{exon_seq}\n"
    end
  end
end
|
90
|
+
|
91
|
+
|
92
|
+
# For every triad row: align the three peptides and save both the peptide
# alignment and the (unaligned) CDS sequences under alignments/<group_id / 100>/.
CSV.foreach(options[:triads], headers: true) do |row|
  genes = [row['A'], row['B'], row['D']]
  triad = row['group_id']

  to_align = Bio::Alignment::SequenceHash.new
  genes.each { |gene| to_align[gene] = pep_seq[gene] }

  cds_seqs = Bio::Alignment::SequenceHash.new
  genes.each { |gene| cds_seqs[gene] = cds_seq[gene].to_biosequence }

  # Output folders are bucketed by hundreds of the group id.
  folder = "alignments/#{triad.to_i / 100}/"
  FileUtils.mkdir_p folder

  pep_aln = peptide_alignment(to_align)

  write_fasta_from_hash(pep_aln, "#{folder}/#{triad}.pep.fa")
  write_fasta_from_hash(cds_seqs, "#{folder}/#{triad}.cds.fa")
end
|
@@ -0,0 +1,403 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'optparse'
|
3
|
+
require 'bio'
|
4
|
+
require 'csv'
|
5
|
+
require 'bio-blastxmlparser'
|
6
|
+
require 'fileutils'
|
7
|
+
require 'tmpdir'
|
8
|
+
|
9
|
+
|
10
|
+
# Default settings; command-line switches below override them.
options = {}
options[:identity] = 50
options[:min_bases] = 200
options[:split_token] = "-"
options[:output_folder] = "."
options[:program] = "blastn"
options[:random_sample] = 0

# The parser is kept in a local so it can be reused or inspected.
option_parser = OptionParser.new do |opts|
  # The banner used to name a different script (filter_blat.rb).
  opts.banner = "Usage: mafft_triads_promoters.rb [options]"

  opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
    options[:identity] = o.to_f
  end
  opts.on("-c", "--min_bases int", "Minimum alignment length (default 200)") do |o|
    options[:min_bases] = o.to_i
  end

  opts.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |o|
    options[:triads] = o
  end

  opts.on("-f", "--sequences FILE" , "FASTA file containing all the possible sequences. ") do |o|
    options[:fasta] = o
  end

  opts.on("-s", "--split_token CHAR", "Character used to split the sequence name. The name will be evarything before this token on the name of the sequences") do |o|
    options[:split_token] = o
  end

  opts.on("-p", "--program blastn|blastp", "The program to use in the alignments. Currntly only supported blastn and blastp") do |o|
    options[:program] = o
  end

  opts.on("-o", "--output_folder DIR", "Folder to save the output") do |o|
    options[:output_folder] = o
  end
end
option_parser.parse!
|
51
|
+
|
52
|
+
module Bio
  module Alignment
    # Script-local additions to the alignment mixin: column iteration,
    # sum-of-pairs / identity scoring, block selection and sliding windows.
    # The methods rely on the including object providing keys, [] and
    # each_pair (and get_all_property for cut_alignment).
    module EnumerableExtension
      # Returns a new Bio::Alignment::SequenceHash holding, for every
      # sequence, the slice str[start, length]. A nil sequence is stored as
      # an empty string. Properties are copied from the receiver.
      def cut_alignment(start, length)
        a = Bio::Alignment::SequenceHash.new
        a.set_all_property(get_all_property)
        each_pair do |key, str|
          a.store(key, str.nil? ? "" : str[start, length])
        end
        a
      end

      # Finds the stretch of columns with the highest cumulative sum-of-pairs
      # score. The running score restarts from the next column whenever it
      # drops below zero.
      #
      # Returns [start, length, len - start - length, len - start].
      def best_block
        best_start = 0
        best_score = 0
        best_length = 0
        current_start = 0
        current_score = 0
        current_length = 0

        each_base_alignment_with_index do |bases, i|
          current_start = i if current_length == 0
          current_length += 1
          current_score += sum_of_pair(bases)
          if current_score > best_score
            best_score = current_score
            best_length = current_length
            best_start = current_start
          end

          if current_score < 0
            current_length = 0
            current_score = 0
          end
        end

        [best_start, best_length, len - best_start - best_length, len - best_start]
      end

      # Yields, for every column index below len, the array with the
      # character of each sequence at that column, and the index itself.
      def each_base_alignment_with_index
        names = keys
        i = 0
        while i < len
          yield names.map { |chr| self[chr][i] }, i
          i += 1
        end
      end

      # Same as each_base_alignment_with_index, yielding only the column.
      def each_base_alignment
        each_base_alignment_with_index do |bases, _i|
          yield bases
        end
      end

      # Sum of sum_of_pair over all columns. Computed once and cached.
      def sum_of_all_pairs
        return @sum_of_all_pairs if @sum_of_all_pairs

        @sum_of_all_pairs = 0
        each_base_alignment do |bases|
          @sum_of_all_pairs += sum_of_pair(bases)
        end
        @sum_of_all_pairs
      end

      # Sum of s_o_i over all columns. Computed once and cached.
      def sum_of_identities
        return @sum_of_identities if @sum_of_identities

        @sum_of_identities = 0
        each_base_alignment do |bases|
          @sum_of_identities += s_o_i(bases)
        end
        @sum_of_identities
      end

      # Length of the first sequence, or 0 when there is no first sequence.
      # Computed once and cached.
      def len
        return @len if @len

        first_name = keys[0]
        first_seq = first_name && self[first_name]
        @len = first_seq ? first_seq.length : 0
      end

      # Number of distinct sequence pairs: n * (n - 1) / 2.
      def pairwise_comparaisons
        n = keys.size
        n * (n - 1) / 2
      end

      # sum_of_identities divided by len * pairwise_comparaisons (Float).
      def identity
        max_score = len * pairwise_comparaisons
        sum_of_identities.to_f / max_score
      end

      # sum_of_all_pairs divided by len * pairwise_comparaisons (Float).
      def normalized_sum_of_all_pairs
        max_score = len * pairwise_comparaisons
        sum_of_all_pairs.to_f / max_score
      end

      # Scores one column by comparing every pair of entries:
      #   both "-", both "N" or both "n" ->  0
      #   exactly one "-"                -> -2
      #   equal                          -> +1
      #   different                      -> -1
      def sum_of_pair(bases)
        last = bases.length - 1
        total = 0
        (0..last).each do |i|
          ((i + 1)..last).each do |j|
            left = bases[i]
            right = bases[j]
            if (left == "-" && right == "-") ||
               (left == "N" && right == "N") ||
               (left == "n" && right == "n")
              next # neutral pair, contributes nothing
            elsif left == "-" || right == "-"
              total -= 2
            elsif left == right
              total += 1
            else
              total -= 1
            end
          end
        end
        total
      end

      # Counts the pairs of equal entries in one column (gaps included).
      def s_o_i(bases)
        last = bases.length - 1
        total = 0
        (0..last).each do |i|
          ((i + 1)..last).each do |j|
            total += 1 if bases[i] == bases[j]
          end
        end
        total
      end

      # Scores fixed-size windows of the alignment. Window ends are taken
      # every +offset+ columns (shifted by len % offset) starting from the
      # end of the alignment; the reported coordinates count up from 0 in
      # that same order.
      #
      # Each row is [i * offset, i * offset + window_size, sum_of_all_pairs,
      # normalized_sum_of_all_pairs, sum_of_identities, identity].
      #
      # NOTE(review): for the last windows the start (end - window_size) is
      # negative, so the slice is taken relative to the end of the sequence
      # (or is nil when out of range) - confirm this is intended.
      def window_identities(window_size=100, offset=25)
        window_ends = (0..len).step(offset).map { |pos| pos + len % offset }.reverse
        window_ends.each_with_index.map do |window_end, i|
          window = cut_alignment(window_end - window_size, window_size)
          [
            i * offset,
            i * offset + window_size,
            window.sum_of_all_pairs,
            window.normalized_sum_of_all_pairs,
            window.sum_of_identities,
            window.identity
          ]
        end
      end
    end
  end
end
|
223
|
+
|
224
|
+
# Aligns the given sequences with the "mafft" executable (iterative local-pair
# mode, quiet) and returns the alignment from the MAFFT report.
#
# When any of the sequences is nil, nothing is aligned and the argument is
# returned unchanged. The MAFFT wrapper is created once and reused.
def promoter_alignment(sequences_to_align)
  return sequences_to_align if sequences_to_align.each_value.any?(&:nil?)

  # Alternative: ['--maxiterate', '1000', '--ep', '0', '--genafpair', '--quiet']
  mafft_args = ['--maxiterate', '1000', '--localpair', '--quiet']
  @mafft ||= Bio::MAFFT.new("mafft", mafft_args)
  @mafft.query_align(sequences_to_align).alignment
end
|
234
|
+
|
235
|
+
# Writes +sequences+ to +filename+ in FASTA format, one record per pair:
# a ">key" header line followed by the value on the next line. An existing
# file is overwritten.
#
# sequences - object responding to each_pair (name => sequence).
# filename  - path of the file to create.
#
# The block form of File.open closes the handle even when writing fails.
def write_fasta_from_hash(sequences, filename)
  File.open(filename, "w") do |out|
    sequences.each_pair do |chromosome, exon_seq|
      out.puts ">#{chromosome}\n#{exon_seq}\n"
    end
  end
end
|
242
|
+
|
243
|
+
# Finds the longest region in which every sequence has a non-gap character,
# tolerating runs of up to +max_gap+ consecutive columns where at least one
# sequence has a gap ("-").
#
# aln     - hash-like alignment (name => sequence string). The length of the
#           first sequence is used as the alignment length.
# max_gap - a run of more than this many consecutive gap columns ends the
#           current region.
#
# Returns [start, length, len - start - length, len - start]. The length
# includes the gap columns inside the region. When no fully covered column
# exists the start is -1 and the length 0.
#
# Gap columns seen before a region starts are not counted into its length
# (they used to be, which over-reported the length when the alignment began
# with a short gap). A nil sequence is treated as not covering any column
# instead of raising.
def get_longest_aln(aln, max_gap: 10)
  names = aln.keys
  first_seq = names[0] && aln[names[0]]
  len = first_seq ? first_seq.length : 0
  total_alignments = names.size

  longest_start = -1
  longest_length = 0
  longest_gaps = 0
  current_start = -1
  current_length = 0 # fully covered columns in the current region
  current_gap = 0    # consecutive gap columns just seen
  gaps = 0           # gap columns inside the current region

  len.times do |i|
    cov = names.count { |chr| aln[chr] && aln[chr][i] != "-" }
    if cov == total_alignments
      current_start = i if current_length == 0
      current_length += 1
      current_gap = 0
    else
      gaps += 1 if current_length > 0
      current_gap += 1
    end

    if current_length > longest_length
      longest_length = current_length
      longest_start = current_start
      # Trailing gap columns are not part of the region.
      longest_gaps = gaps - current_gap
    end

    if current_gap > max_gap
      current_length = 0
      gaps = 0
    end
  end

  longest_length += longest_gaps
  [longest_start, longest_length, len - longest_start - longest_length, len - longest_start]
end
|
288
|
+
|
289
|
+
split_token = options[:split_token]

# Reads a FASTA file and keeps, for every gene name (the part of the entry
# id before +split_token+), the longest entry.
#
# Returns [entries_by_gene_name, number_of_entries_read].
def read_alignments(fasta_path, split_token)
  longest_by_gene = {}
  entries_read = 0
  Bio::FlatFile.open(Bio::FastaFormat, fasta_path) do |fasta_file|
    fasta_file.each do |entry|
      gene_name = entry.entry_id.split(split_token).first
      known = longest_by_gene[gene_name]
      longest_by_gene[gene_name] = entry if known.nil? || entry.length > known.length
      entries_read += 1
    end
  end
  [longest_by_gene, entries_read]
end
|
305
|
+
|
306
|
+
sequences, sequence_count = read_alignments(options[:fasta], split_token)

$stderr.puts "#Loaded #{sequences.length} genes from #{sequence_count} sequences"
output_folder = options[:output_folder]

FileUtils.mkdir_p output_folder
summary_file = "#{output_folder}/identities.txt"
long_table_file = "#{output_folder}/sliding_window_identities.txt"

# Nested arrays are fine here: Array#join flattens them.
header = ["triad", "total_aln_length"]
header << ["longest_start", "longest_length", "longest_start_from_CDS","longest_end_from_CDS", "longest_sum_of_all_pairs","longest_norm_sum_of_all_pairs","longest_sum_of_identities", "longest_identity"]
header << ["best_start", "best_length" , "best_start_from_CDS","best_end_from_CDS", "best_sum_of_all_pairs","best_norm_sum_of_all_pairs","best_sum_of_identities", "best_identity"]

# Block forms close both report files even when a triad fails.
File.open(summary_file, "w") do |out|
  File.open(long_table_file, "w") do |long_table|
    out.puts header.join("\t")
    long_table.puts(["triad", "type", "start_from_CDS", "end_from_cds" , "sum_of_all_pairs","norm_sum_of_all_pairs","sum_of_identities", "identity"].join("\t"))

    CSV.foreach(options[:triads], headers: true) do |row|
      a = row['A']
      b = row['B']
      d = row['D']
      triad = row['group_id']

      # Output folders are bucketed by hundreds of the group id.
      cent_triad = triad.to_i / 100
      folder = "#{output_folder}/prom_aln/#{cent_triad}/"
      save_prom = "#{folder}/#{triad}.prom.fa"

      to_align = Bio::Alignment::SequenceHash.new
      to_align[a] = sequences[a]
      to_align[b] = sequences[b]
      to_align[d] = sequences[d]

      # Reuse a previously saved alignment when there is one.
      prom_aln =
        if File.file?(save_prom)
          cached, _cached_count = read_alignments(save_prom, split_token)
          Bio::Alignment.new(cached)
        else
          promoter_alignment(to_align)
        end

      print_arr = [triad, prom_aln.len]

      # Longest region covered by all three sequences.
      aln_stats = get_longest_aln(prom_aln)
      print_arr << aln_stats
      cut_seqs = prom_aln.cut_alignment(aln_stats[0], aln_stats[1])
      print_arr << cut_seqs.sum_of_all_pairs
      print_arr << cut_seqs.normalized_sum_of_all_pairs
      print_arr << cut_seqs.sum_of_identities
      print_arr << cut_seqs.identity

      # Best scoring block.
      best_aln_stats = prom_aln.best_block
      best_aln_cut = prom_aln.cut_alignment(best_aln_stats[0], best_aln_stats[1])
      print_arr << best_aln_stats
      print_arr << best_aln_cut.sum_of_all_pairs
      print_arr << best_aln_cut.normalized_sum_of_all_pairs
      print_arr << best_aln_cut.sum_of_identities
      print_arr << best_aln_cut.identity

      # Sliding-window rows for the two cuts and the full alignment.
      [
        ["cut_longest_region", cut_seqs],
        ["cut_best_region", best_aln_cut],
        ["full_promoter", prom_aln]
      ].each do |label, aln|
        base = [triad, label]
        aln.window_identities.each do |e|
          long_table.puts([base, e].flatten.join("\t"))
        end
      end

      out.puts print_arr.join("\t")

      FileUtils.mkdir_p folder

      write_fasta_from_hash(prom_aln, save_prom) unless File.file?(save_prom)

      # The guard used to test save_prom, which exists by this point, so the
      # cut alignment was never written. It now tests the cut file itself.
      save_prom_cut = "#{folder}/#{triad}.prom.cut.fa"
      write_fasta_from_hash(cut_seqs, save_prom_cut) unless File.file?(save_prom_cut)

      save_prom_cut_best = "#{folder}/#{triad}.prom.cut.best.fa"
      write_fasta_from_hash(best_aln_cut, save_prom_cut_best)
    end
  end
end
|