bio-polymarker 1.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.travis.yml +24 -0
- data/Gemfile +23 -0
- data/README.md +205 -0
- data/Rakefile +61 -0
- data/SECURITY.md +16 -0
- data/VERSION +1 -0
- data/bin/bfr.rb +128 -0
- data/bin/blast_triads.rb +166 -0
- data/bin/blast_triads_promoters.rb +192 -0
- data/bin/count_variations.rb +36 -0
- data/bin/filter_blat_by_target_coverage.rb +69 -0
- data/bin/filter_exonerate_by_identity.rb +38 -0
- data/bin/find_best_blat_hit.rb +33 -0
- data/bin/find_best_exonerate.rb +17 -0
- data/bin/get_longest_hsp_blastx_triads.rb +66 -0
- data/bin/hexaploid_primers.rb +168 -0
- data/bin/homokaryot_primers.rb +183 -0
- data/bin/mafft_triads.rb +120 -0
- data/bin/mafft_triads_promoters.rb +403 -0
- data/bin/map_markers_to_contigs.rb +66 -0
- data/bin/marker_to_vcf.rb +241 -0
- data/bin/markers_in_region.rb +42 -0
- data/bin/mask_triads.rb +169 -0
- data/bin/polymarker.rb +410 -0
- data/bin/polymarker_capillary.rb +443 -0
- data/bin/polymarker_deletions.rb +350 -0
- data/bin/snp_position_to_polymarker.rb +101 -0
- data/bin/snps_between_bams.rb +107 -0
- data/bin/tag_stats.rb +75 -0
- data/bin/vcfLineToTable.rb +56 -0
- data/bin/vcfToPolyMarker.rb +82 -0
- data/bio-polymarker.gemspec +227 -0
- data/conf/defaults.rb +1 -0
- data/conf/primer3_config/dangle.dh +128 -0
- data/conf/primer3_config/dangle.ds +128 -0
- data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
- data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
- data/conf/primer3_config/interpretations/loops_i.dh +34 -0
- data/conf/primer3_config/interpretations/loops_i.ds +31 -0
- data/conf/primer3_config/interpretations/stack_i.dh +257 -0
- data/conf/primer3_config/interpretations/stack_i.ds +256 -0
- data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
- data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
- data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
- data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
- data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
- data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
- data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
- data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
- data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
- data/conf/primer3_config/loops.dh +30 -0
- data/conf/primer3_config/loops.ds +30 -0
- data/conf/primer3_config/stack.dh +256 -0
- data/conf/primer3_config/stack.ds +256 -0
- data/conf/primer3_config/stackmm.dh +256 -0
- data/conf/primer3_config/stackmm.ds +256 -0
- data/conf/primer3_config/tetraloop.dh +77 -0
- data/conf/primer3_config/tetraloop.ds +77 -0
- data/conf/primer3_config/triloop.dh +16 -0
- data/conf/primer3_config/triloop.ds +16 -0
- data/conf/primer3_config/tstack.dh +256 -0
- data/conf/primer3_config/tstack2.dh +256 -0
- data/conf/primer3_config/tstack2.ds +256 -0
- data/conf/primer3_config/tstack_tm_inf.ds +256 -0
- data/lib/bio/BFRTools.rb +465 -0
- data/lib/bio/BIOExtensions.rb +153 -0
- data/lib/bio/PolyploidTools/ChromosomeArm.rb +63 -0
- data/lib/bio/PolyploidTools/ExonContainer.rb +245 -0
- data/lib/bio/PolyploidTools/Marker.rb +175 -0
- data/lib/bio/PolyploidTools/Mask.rb +116 -0
- data/lib/bio/PolyploidTools/NoSNPSequence.rb +292 -0
- data/lib/bio/PolyploidTools/PrimerRegion.rb +30 -0
- data/lib/bio/PolyploidTools/SNP.rb +804 -0
- data/lib/bio/PolyploidTools/SNPMutant.rb +86 -0
- data/lib/bio/PolyploidTools/SNPSequence.rb +55 -0
- data/lib/bio/db/blast.rb +114 -0
- data/lib/bio/db/exonerate.rb +333 -0
- data/lib/bio/db/primer3.rb +820 -0
- data/lib/bio-polymarker.rb +28 -0
- data/test/data/7B_amplicon_test.fa +12 -0
- data/test/data/7B_amplicon_test.fa.fai +1 -0
- data/test/data/7B_amplicon_test_reference.fa +110 -0
- data/test/data/7B_amplicon_test_reference.fa.fai +3 -0
- data/test/data/7B_marker_test.txt +1 -0
- data/test/data/BS00068396_51.fa +2 -0
- data/test/data/BS00068396_51_blast.tab +4 -0
- data/test/data/BS00068396_51_contigs.aln +1412 -0
- data/test/data/BS00068396_51_contigs.dnd +7 -0
- data/test/data/BS00068396_51_contigs.fa +8 -0
- data/test/data/BS00068396_51_contigs.fa.fai +4 -0
- data/test/data/BS00068396_51_contigs.fa.nhr +0 -0
- data/test/data/BS00068396_51_contigs.fa.nin +0 -0
- data/test/data/BS00068396_51_contigs.fa.nsq +0 -0
- data/test/data/BS00068396_51_contigs.nhr +0 -0
- data/test/data/BS00068396_51_contigs.nin +0 -0
- data/test/data/BS00068396_51_contigs.nsq +0 -0
- data/test/data/BS00068396_51_exonerate.tab +6 -0
- data/test/data/BS00068396_51_for_polymarker.txt +1 -0
- data/test/data/BS00068396_51_genes.txt +14 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974.fa +112 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa +2304 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
- data/test/data/LIB1716.bam +0 -0
- data/test/data/LIB1716.bam.bai +0 -0
- data/test/data/LIB1719.bam +0 -0
- data/test/data/LIB1719.bam.bai +0 -0
- data/test/data/LIB1721.bam +0 -0
- data/test/data/LIB1721.bam.bai +0 -0
- data/test/data/LIB1722.bam +0 -0
- data/test/data/LIB1722.bam.bai +0 -0
- data/test/data/PST130_7067.csv +1 -0
- data/test/data/PST130_7067.fa +2 -0
- data/test/data/PST130_7067.fa.fai +1 -0
- data/test/data/PST130_7067.fa.ndb +0 -0
- data/test/data/PST130_7067.fa.nhr +0 -0
- data/test/data/PST130_7067.fa.nin +0 -0
- data/test/data/PST130_7067.fa.not +0 -0
- data/test/data/PST130_7067.fa.nsq +0 -0
- data/test/data/PST130_7067.fa.ntf +0 -0
- data/test/data/PST130_7067.fa.nto +0 -0
- data/test/data/PST130_reverse_primer.csv +1 -0
- data/test/data/S22380157.fa +16 -0
- data/test/data/S22380157.fa.fai +1 -0
- data/test/data/S22380157.vcf +67 -0
- data/test/data/S58861868/LIB1716.bam +0 -0
- data/test/data/S58861868/LIB1716.sam +651 -0
- data/test/data/S58861868/LIB1719.bam +0 -0
- data/test/data/S58861868/LIB1719.sam +805 -0
- data/test/data/S58861868/LIB1721.bam +0 -0
- data/test/data/S58861868/LIB1721.sam +1790 -0
- data/test/data/S58861868/LIB1722.bam +0 -0
- data/test/data/S58861868/LIB1722.sam +1271 -0
- data/test/data/S58861868/S58861868.fa +16 -0
- data/test/data/S58861868/S58861868.fa.fai +1 -0
- data/test/data/S58861868/S58861868.vcf +76 -0
- data/test/data/S58861868/header.txt +9 -0
- data/test/data/S58861868/merged.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam.bai +0 -0
- data/test/data/Test3Aspecific.csv +2 -0
- data/test/data/Test3Aspecific_contigs.fa +6 -0
- data/test/data/bfr_out_test.csv +5 -0
- data/test/data/chr1A_C1145499T/chr1A_C1145499T.csv +1 -0
- data/test/data/chr1A_G540414846C/chr1A_G540414846C.csv +1 -0
- data/test/data/chr1A_G540414846C/chr1A_G540414846C.fa +2 -0
- data/test/data/chr1A_T517634750C/chr1A_T517634750C.csv +1 -0
- data/test/data/chr2D_C112180134A/chr2D_C112180134A.csv +1 -0
- data/test/data/chr4D_C14473543T/chr4D_C14473543T.csv +1 -0
- data/test/data/chr4D_C14473543T/chr4D_C14473543T.fa +2 -0
- data/test/data/headerMergeed.txt +9 -0
- data/test/data/headerS2238015 +1 -0
- data/test/data/mergedLibs.bam +0 -0
- data/test/data/mergedLibsReheader.bam +0 -0
- data/test/data/mergedLibsSorted.bam +0 -0
- data/test/data/mergedLibsSorted.bam.bai +0 -0
- data/test/data/patological_cases5D.csv +1 -0
- data/test/data/primer_3_input_header_test +5 -0
- data/test/data/short_primer_design_test.csv +10 -0
- data/test/data/some_tests/some_tests.csv +201 -0
- data/test/data/test_from_mutant.csv +3 -0
- data/test/data/test_iselect.csv +196 -0
- data/test/data/test_iselect_reference.fa +1868 -0
- data/test/data/test_iselect_reference.fa.fai +934 -0
- data/test/data/test_primer3_error.csv +4 -0
- data/test/data/test_primer3_error_contigs.fa +10 -0
- data/test/test_bfr.rb +135 -0
- data/test/test_blast.rb +47 -0
- data/test/test_exon_container.rb +17 -0
- data/test/test_exonearate.rb +48 -0
- data/test/test_integration.rb +76 -0
- data/test/test_snp_parsing.rb +121 -0
- data/test/test_wrong_selection.sh +5 -0
- metadata +356 -0
@@ -0,0 +1,183 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'bio'
|
3
|
+
require 'rubygems'
|
4
|
+
require 'pathname'
|
5
|
+
require 'bio-samtools-wrapper'
|
6
|
+
|
7
|
+
require 'set'
|
8
|
+
|
9
|
+
$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
|
10
|
+
$: << File.expand_path('.')
|
11
|
+
path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
|
12
|
+
require path
|
13
|
+
|
14
|
+
|
15
|
+
#@snp_map=Hash.new
|
16
|
+
|
17
|
+
# ExonContainer subclass used by this script to collect SNPs from a marker
# file and to emit one Primer3 record per SNP.
class HomokaryotContainer < Bio::PolyploidTools::ExonContainer
  # Parses every line of +filename+ into a SNP and registers it in @snp_map,
  # keyed by the SNP's gene.
  #
  # The parser is chosen from the number of command line arguments:
  # one argument means the lines carry their own sequence (SNPSequence),
  # two arguments mean the sequence comes from a reference (SNP).
  # The calling script rejects any other argument count while it first reads
  # the SNP file, so no third branch is handled here.
  #
  # filename      - path of the SNP list.
  # chromosome    - value assigned to each SNP's +chromosome+.
  # snp_in        - value assigned to each SNP's +snp_in+.
  # original_name - value assigned to each SNP's +original_name+.
  #
  # SNPs whose position is not greater than zero are skipped.
  # NOTE(review): @snp_map is not initialised in this class; presumably the
  # superclass sets it up - confirm.
  def add_snp_file(filename, chromosome, snp_in, original_name)
    flanking_size = 100
    File.open(filename) do |f|
      f.each_line do |line|
        if ARGV.size == 1 # List with sequence
          snp = Bio::PolyploidTools::SNPSequence.parse(line)
          snp.use_reference = false
        elsif ARGV.size == 2 # List and fasta file
          snp = Bio::PolyploidTools::SNP.parse(line)
          snp.use_reference = true
        end
        snp.flanking_size = flanking_size
        next unless snp.position > 0

        snp.container = self
        snp.chromosome = chromosome
        snp.snp_in = snp_in
        snp.original_name = original_name

        (@snp_map[snp.gene] ||= []) << snp
      end
    end
  end

  # Writes the Primer3 input record of every registered SNP to +file+.
  #
  # file              - an object responding to +puts+.
  # target_chromosome - accepted for interface compatibility; not used here,
  #                     each SNP's own chromosome is passed instead.
  # parental          - forwarded to SNP#primer_3_string.
  #
  # Records that come back empty are not written.
  def print_primer_3_exons(file, target_chromosome, parental)
    @snp_map.each_value do |snp_array|
      snp_array.each do |snp|
        string = snp.primer_3_string(snp.chromosome, parental)
        file.puts string if string.size > 0
      end
    end
  end
end
|
60
|
+
|
61
|
+
# Script-local overrides for SNP: no alignment is run here, so the "aligned"
# views are derived directly from the parental sequences.
class Bio::PolyploidTools::SNP
  # NOTE(review): this assigns an instance variable on the class object
  # itself, not on SNP instances - confirm whether a per-instance default
  # was intended.
  @aligned = false

  # The SNP position in the "aligned" sequences is simply the local position.
  def aligned_snp_position
    local_position
  end

  # Returns the parental sequences with the original base written into
  # entry "A" and the alternative base into entry "B" at the local position.
  # The result is also stored in @aligned_sequences.
  def aligned_sequences
    @aligned_sequences = parental_sequences
    @aligned_sequences["A"][local_position] = original
    @aligned_sequences["B"][local_position] = snp
    @aligned_sequences
  end
end
|
78
|
+
|
79
|
+
|
80
|
+
|
81
|
+
|
82
|
+
|
83
|
+
# ---- Script body --------------------------------------------------------
# Arguments: SNP_FILE [REFERENCE_FASTA]
#   one argument  - every line of SNP_FILE carries its own sequence
#   two arguments - sequences are fetched from the indexed REFERENCE_FASTA
# Fail early with a clear message instead of a TypeError from File.open(nil).
unless [1, 2].include?(ARGV.size)
  raise Bio::DB::Exonerate::ExonerateException.new "Wrong number of arguments. "
end

snp_file = ARGV[0]
reference_file = ARGV[1]

snp_in = "A"
original_name = "B"
snps = []

#0. Load the fasta index
fasta_reference_db = nil
if reference_file
  fasta_reference_db = Bio::DB::Fasta::FastaFile.new({:fasta=>reference_file})
  fasta_reference_db.load_fai_entries
  p "Fasta reference: #{reference_file}"
end

#1. Read all the SNP files
#All the SNPs should be on the same chromosome as the first SNP.
chromosome = nil
File.open(snp_file) do |f|
  f.each_line do |line|
    snp = nil
    if ARGV.size == 1 # List with sequence
      snp = Bio::PolyploidTools::SNPSequence.parse(line)
    else # List and fasta file
      snp = Bio::PolyploidTools::SNP.parse(line)
      region = fasta_reference_db.index.region_for_entry(snp.gene).get_full_region
      snp.template_sequence = fasta_reference_db.fetch_sequence(region)
    end
    raise Bio::DB::Exonerate::ExonerateException.new "No SNP for line '#{line}'" if snp.nil?

    snp.snp_in = snp_in
    snp.original_name = original_name
    snps << snp
    chromosome ||= snp.chromosome
    raise Bio::DB::Exonerate::ExonerateException.new "All the snps should come from the same chromosome" if chromosome != snp.chromosome
  end
end

# Every run writes into a fresh, timestamped folder next to the SNP file.
output_folder = "#{snp_file}_primer_design_#{Time.now.strftime('%Y%m%d-%H%M%S')}/"
Dir.mkdir(output_folder)
seqs_file = output_folder + "sequences.fa"
# Without an external reference, the sequences written below act as one.
reference_file ||= seqs_file

# Write each gene once; the block form closes the file even if to_fasta raises.
written_seqs = Set.new
File.open(seqs_file, "w") do |file|
  snps.each do |snp|
    file.puts snp.to_fasta if written_seqs.add?(snp.gene)
  end
end

container = HomokaryotContainer.new
container.add_parental({:name=>snp_in})
container.add_parental({:name=>original_name})
container.gene_models(reference_file)

primer_3_input = "#{output_folder}primer_3_input_temp"
primer_3_output = "#{output_folder}primer_3_output_temp"
# NOTE(review): the chromosome label "PST130" is hard-coded here and in the
# print_primer_3_exons call below - confirm it should not come from the input.
container.add_snp_file(snp_file, "PST130", snp_in, original_name)
primer_3_config = File.expand_path(File.dirname(__FILE__) + '/../conf/primer3_config')
output_primers = "#{output_folder}primers.csv"

# Primer3 global settings followed by one record per SNP.
File.open(primer_3_input, "w") do |file|
  file.puts("PRIMER_PRODUCT_SIZE_RANGE=50-150")
  file.puts("PRIMER_MAX_SIZE=25")
  file.puts("PRIMER_LIB_AMBIGUITY_CODES_CONSENSUS=1")
  file.puts("PRIMER_LIBERAL_BASE=1")
  file.puts("PRIMER_NUM_RETURN=5")
  file.puts("PRIMER_THERMODYNAMIC_PARAMETERS_PATH=#{primer_3_config}/")

  container.print_primer_3_exons(file, "PST130", snp_in)
end

Bio::DB::Primer3.run({:in=>primer_3_input, :out=>primer_3_output})

#2. Pick the best primer and make the primer3 output
kasp_container = Bio::DB::Primer3::KASPContainer.new
kasp_container.line_1 = original_name
kasp_container.line_2 = snp_in

snps.each { |snp| kasp_container.add_snp(snp) }

kasp_container.add_primers_file(primer_3_output)
header = "Marker,SNP,RegionSize,SNP_type,#{snp_in},#{original_name},common,primer_type,orientation,#{snp_in}_TM,#{original_name}_TM,common_TM,selected_from,product_size"
File.open(output_primers, 'w') { |f| f.write("#{header}\n#{kasp_container.print_primers}") }
|
data/bin/mafft_triads.rb
ADDED
@@ -0,0 +1,120 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'optparse'
|
3
|
+
require 'bio'
|
4
|
+
require 'csv'
|
5
|
+
require 'bio-blastxmlparser'
|
6
|
+
require 'fileutils'
|
7
|
+
require 'tmpdir'
|
8
|
+
|
9
|
+
|
10
|
+
# Default settings; the command line switches below override them.
options = {}
options[:identity] = 50
options[:min_bases] = 200
options[:split_token] = "-"
options[:tmp_folder] = Dir.mktmpdir
options[:program] = "blastn"
options[:random_sample] = 0

# Command line definition.
# "--cds" used to share the short flag "-s" with "--split_token"; the later
# registration wins, so "-s" always meant "--split_token" and "--cds" had no
# working short form. "--cds" now uses "-c"; "-s" keeps its meaning.
parser = OptionParser.new do |opts|
  opts.banner = "Usage: mafft_triads.rb [options]"

  opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
    options[:identity] = o.to_f
  end

  opts.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |o|
    options[:triads] = o
  end

  opts.on("-f", "--pep FILE" , "FASTA file containing all the possible peptide sequences. ") do |o|
    options[:pep] = o
  end

  opts.on("-c", "--cds FILE" , "FASTA file containing all the possible CDS sequences. ") do |o|
    options[:cds] = o
  end

  opts.on("-s", "--split_token CHAR", "Character used to split the sequence name. The name will be evarything before this token on the name of the sequences") do |o|
    options[:split_token] = o
  end
end
parser.parse!
|
43
|
+
|
44
|
+
|
45
|
+
# Aligns the given sequences with the "mafft" program through Bio::MAFFT
# (flags: --maxiterate 1000 --localpair --quiet) and returns the alignment
# of the resulting report.
def peptide_alignment(sequences_to_align)
  mafft_flags = ['--maxiterate', '1000', '--localpair', '--quiet']
  Bio::MAFFT.new("mafft", mafft_flags).query_align(sequences_to_align).alignment
end
|
51
|
+
|
52
|
+
|
53
|
+
# Gene name = the part of the FASTA id before the split token.
split_token = options[:split_token]

# Peptides: keep, per gene, the longest entry found in the file.
pep_seq = {}
pep_seq_count = 0
Bio::FlatFile.open(Bio::FastaFormat, options[:pep]) do |fasta_file|
  fasta_file.each do |entry|
    gene_name = entry.entry_id.split(split_token).first
    known = pep_seq[gene_name]
    pep_seq[gene_name] = entry if known.nil? || entry.length > known.length
    pep_seq_count += 1
  end
end
$stderr.puts "#Loaded #{pep_seq.length} genes from #{pep_seq_count} pep_seq"

# CDS: same rule, longest entry per gene.
cds_seq = {}
cds_seq_count = 0
Bio::FlatFile.open(Bio::FastaFormat, options[:cds]) do |fasta_file|
  fasta_file.each do |entry|
    gene_name = entry.entry_id.split(split_token).first
    known = cds_seq[gene_name]
    cds_seq[gene_name] = entry if known.nil? || entry.length > known.length
    cds_seq_count += 1
  end
end
$stderr.puts "#Loaded #{cds_seq.length} genes from #{cds_seq_count} cds_seq"

$stderr.puts "TMP dir: #{options[:tmp_folder]}"
|
81
|
+
|
82
|
+
# Writes +sequences+ to +filename+ in FASTA format, overwriting the file.
#
# sequences - any object responding to +each_pair+ that yields
#             (name, sequence); each pair becomes ">name" plus one
#             sequence line.
# filename  - path of the file to create.
#
# Returns nil. The block form of File.open closes the file even when
# writing raises.
def write_fasta_from_hash(sequences, filename)
  File.open(filename, "w") do |out|
    sequences.each_pair do |chromosome, exon_seq|
      out.puts ">#{chromosome}\n#{exon_seq}\n"
    end
  end
  nil
end
|
90
|
+
|
91
|
+
|
92
|
+
# For every triad row: align the three peptides with MAFFT and save both the
# peptide alignment and the raw CDS sequences under alignments/<group_id / 100>/.
CSV.foreach(options[:triads], headers: true) do |row|
  genes = [row['A'], row['B'], row['D']]
  triad = row['group_id']

  to_align = Bio::Alignment::SequenceHash.new
  genes.each { |gene| to_align[gene] = pep_seq[gene] }

  cds_seqs = Bio::Alignment::SequenceHash.new
  genes.each { |gene| cds_seqs[gene] = cds_seq[gene].to_biosequence }

  # Outputs are bucketed by hundreds of the group id.
  folder = "alignments/#{triad.to_i / 100}/"
  FileUtils.mkdir_p(folder)

  write_fasta_from_hash(peptide_alignment(to_align), "#{folder}/#{triad}.pep.fa")
  write_fasta_from_hash(cds_seqs, "#{folder}/#{triad}.cds.fa")
end
|
@@ -0,0 +1,403 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'optparse'
|
3
|
+
require 'bio'
|
4
|
+
require 'csv'
|
5
|
+
require 'bio-blastxmlparser'
|
6
|
+
require 'fileutils'
|
7
|
+
require 'tmpdir'
|
8
|
+
|
9
|
+
|
10
|
+
# Default settings; the command line switches below override them.
options = {}
options[:identity] = 50
options[:min_bases] = 200
options[:split_token] = "-"
options[:output_folder] = "."
options[:program] = "blastn"
options[:random_sample] = 0

# Command line definition. The banner previously named "filter_blat.rb",
# a leftover from another script.
parser = OptionParser.new do |opts|
  opts.banner = "Usage: mafft_triads_promoters.rb [options]"

  opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
    options[:identity] = o.to_f
  end
  opts.on("-c", "--min_bases int", "Minimum alignment length (default 200)") do |o|
    options[:min_bases] = o.to_i
  end

  opts.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |o|
    options[:triads] = o
  end

  opts.on("-f", "--sequences FILE" , "FASTA file containing all the possible sequences. ") do |o|
    options[:fasta] = o
  end

  opts.on("-s", "--split_token CHAR", "Character used to split the sequence name. The name will be evarything before this token on the name of the sequences") do |o|
    options[:split_token] = o
  end

  opts.on("-p", "--program blastn|blastp", "The program to use in the alignments. Currntly only supported blastn and blastp") do |o|
    options[:program] = o
  end

  opts.on("-o", "--output_folder DIR", "Folder to save the output") do |o|
    options[:output_folder] = o
  end
end
parser.parse!
|
51
|
+
|
52
|
+
# Scoring helpers added to BioRuby alignments: column iteration, cutting,
# sum-of-pairs and identity scores, and sliding-window summaries.
# Several results are memoised in instance variables (@len,
# @sum_of_all_pairs, @sum_of_identities), so they reflect the alignment as
# it was when first asked.
module Bio::Alignment::EnumerableExtension
  # Returns a new SequenceHash holding, for every sequence, the slice
  # str[start, length]. Missing (nil) sequences become "".
  # Properties of the receiver are copied to the result.
  def cut_alignment(start, length)
    a = Bio::Alignment::SequenceHash.new
    a.set_all_property(get_all_property)
    each_pair do |key, str|
      a.store(key, str.nil? ? "" : str[start, length])
    end
    a
  end

  # Finds the contiguous run of columns with the highest cumulative
  # sum-of-pairs score; a run is restarted whenever its running score drops
  # below zero.
  #
  # Returns [start, length, len - start - length, len - start].
  def best_block
    best_start = 0
    best_score = 0
    best_length = 0
    current_start = 0
    current_score = 0
    current_length = 0

    each_base_alignment_with_index do |bases, i|
      current_start = i if current_length == 0
      current_length += 1
      current_score += sum_of_pair(bases)
      if current_score > best_score
        best_score = current_score
        best_length = current_length
        best_start = current_start
      end

      if current_score < 0
        current_length = 0
        current_score = 0
      end
    end

    [best_start, best_length, len - best_start - best_length, len - best_start]
  end

  # Yields, for each column index below +len+, the array of characters found
  # at that index in every sequence (in key order) together with the index.
  def each_base_alignment_with_index
    names = keys
    len.times do |i|
      yield names.map { |chr| self[chr][i] }, i
    end
  end

  # Same as each_base_alignment_with_index, without the index.
  def each_base_alignment
    each_base_alignment_with_index { |bases, _i| yield bases }
  end

  # Sum of sum_of_pair over all columns (memoised).
  def sum_of_all_pairs
    return @sum_of_all_pairs if @sum_of_all_pairs

    @sum_of_all_pairs = 0
    each_base_alignment { |bases| @sum_of_all_pairs += sum_of_pair(bases) }
    @sum_of_all_pairs
  end

  # Sum of s_o_i over all columns (memoised).
  def sum_of_identities
    return @sum_of_identities if @sum_of_identities

    @sum_of_identities = 0
    each_base_alignment { |bases| @sum_of_identities += s_o_i(bases) }
    @sum_of_identities
  end

  # Length of the first sequence, or 0 when there is none (memoised).
  def len
    return @len if @len

    first = keys[0]
    @len = 0
    @len = self[first].length if first && !self[first].nil?
    @len
  end

  # Number of distinct sequence pairs: n * (n - 1) / 2.
  def pairwise_comparaisons
    n = keys.size
    n * (n - 1) / 2
  end

  # sum_of_identities divided by (len * number of pairs), as a Float.
  # NOTE(review): with an empty alignment the divisor is 0 and the Float
  # division does not raise - confirm callers tolerate a non-finite value.
  def identity
    max_score = len * pairwise_comparaisons
    sum_of_identities.to_f / max_score
  end

  # sum_of_all_pairs divided by (len * number of pairs), as a Float.
  def normalized_sum_of_all_pairs
    max_score = len * pairwise_comparaisons
    sum_of_all_pairs.to_f / max_score
  end

  # Scores one column by summing over every pair of entries in +bases+:
  #   both "-", both "N" or both "n" ->  0
  #   exactly one "-"                -> -2
  #   equal                          -> +1
  #   different                      -> -1
  def sum_of_pair(bases)
    (0...bases.length).to_a.combination(2).inject(0) do |total, (i, j)|
      x = bases[i]
      y = bases[j]
      score =
        if x == y && ["-", "N", "n"].include?(x)
          0
        elsif x == "-" || y == "-"
          -2
        elsif x == y
          1
        else
          -1
        end
      total + score
    end
  end

  # Counts the pairs of entries in +bases+ that are equal.
  def s_o_i(bases)
    (0...bases.length).to_a.combination(2).count { |i, j| bases[i] == bases[j] }
  end

  # Scores windows of +window_size+ columns, stepping by +offset+ and
  # anchored at the end of the alignment.
  #
  # Returns an array of rows:
  #   [i * offset, i * offset + window_size, sum_of_all_pairs,
  #    normalized_sum_of_all_pairs, sum_of_identities, identity]
  # where i counts windows starting from the alignment end.
  #
  # NOTE(review): for the windows nearest the alignment start,
  # "e - window_size" is negative, and a negative start passed to
  # String#[] counts from the END of the string (or yields nil when out of
  # range). Those rows therefore do not describe the intended region -
  # confirm the intended handling before relying on them.
  def window_identities(window_size=100, offset=25)
    steps = (0..len).step(offset).to_a.map { |a| a + len % offset }.reverse
    steps.each_with_index.map do |e, i|
      tmp_aln = cut_alignment(e - window_size, window_size)
      [
        i * offset,
        i * offset + window_size,
        tmp_aln.sum_of_all_pairs,
        tmp_aln.normalized_sum_of_all_pairs,
        tmp_aln.sum_of_identities,
        tmp_aln.identity
      ]
    end
  end
end
|
223
|
+
|
224
|
+
# Aligns the given sequences with MAFFT (--maxiterate 1000 --localpair
# --quiet) and returns the alignment of the report.
# When any sequence is nil the input is returned untouched and MAFFT is not
# run. The Bio::MAFFT wrapper is created once and reused across calls.
def promoter_alignment(sequences_to_align)
  missing = false
  sequences_to_align.each_value { |val| missing = true if val.nil? }
  return sequences_to_align if missing

  #options = ['--maxiterate', '1000', '--ep', '0', '--genafpair', '--quiet']
  @mafft ||= Bio::MAFFT.new("mafft", ['--maxiterate', '1000', '--localpair', '--quiet'])
  @mafft.query_align(sequences_to_align).alignment
end
|
234
|
+
|
235
|
+
# Writes +sequences+ to +filename+ in FASTA format, overwriting the file.
#
# sequences - any object responding to +each_pair+ that yields
#             (name, sequence); each pair becomes ">name" plus one
#             sequence line.
# filename  - path of the file to create.
#
# Returns nil. The block form of File.open closes the file even when
# writing raises.
def write_fasta_from_hash(sequences, filename)
  File.open(filename, "w") do |out|
    sequences.each_pair do |chromosome, exon_seq|
      out.puts ">#{chromosome}\n#{exon_seq}\n"
    end
  end
  nil
end
|
242
|
+
|
243
|
+
# Finds the longest stretch of the alignment in which every sequence has a
# base, tolerating runs of up to +max_gap+ consecutive columns where at
# least one sequence has "-".
#
# aln     - hash-like object mapping names to aligned sequences; the length
#           of the first sequence is taken as the alignment length.
# max_gap - longest run of gap columns allowed inside a stretch (default 10).
#
# Returns [start, length, len - start - length, len - start], where length
# counts the covered columns of the stretch plus the gap columns inside it.
# For an empty alignment start is -1 and length is 0.
#
# Gap columns are only counted once a stretch has begun. Previously gap
# columns at the very start of the alignment were added to the length of the
# first stretch, which could make it extend past the alignment end.
def get_longest_aln(aln, max_gap: 10)
  names = aln.keys
  first_seq = names[0] && aln[names[0]]
  len = first_seq ? first_seq.length : 0
  total_alignments = names.size

  longest_start = -1
  longest_length = 0
  longest_gaps = 0
  current_start = -1
  current_length = 0
  current_gap = 0 # length of the gap run ending at the current column
  gaps = 0        # gap columns seen since the current stretch began

  len.times do |i|
    covered = names.count { |chr| aln[chr][i] != "-" }
    if covered == total_alignments
      current_start = i if current_length == 0
      current_length += 1
      current_gap = 0
    else
      gaps += 1 if current_length > 0
      current_gap += 1
    end

    if current_length > longest_length
      longest_length = current_length
      longest_start = current_start
      # Exclude the trailing gap run: it is not inside the stretch.
      longest_gaps = gaps - current_gap
    end

    # A gap run longer than max_gap ends the current stretch.
    if current_gap > max_gap
      current_length = 0
      gaps = 0
    end
  end

  longest_length += longest_gaps
  [longest_start, longest_length, len - longest_start - longest_length, len - longest_start]
end
|
288
|
+
|
289
|
+
split_token = options[:split_token]
|
290
|
+
|
291
|
+
# Reads a FASTA file and keeps, per gene, the longest entry.
# The gene name is the part of the entry id before +split_token+.
#
# Returns [entries_by_gene, number_of_entries_read].
def read_alignments(fasta_path, split_token)
  by_gene = {}
  total = 0
  Bio::FlatFile.open(Bio::FastaFormat, fasta_path) do |fasta_file|
    fasta_file.each do |entry|
      gene_name = entry.entry_id.split(split_token).first
      known = by_gene[gene_name]
      by_gene[gene_name] = entry if known.nil? || entry.length > known.length
      total += 1
    end
  end
  [by_gene, total]
end
|
305
|
+
|
306
|
+
# ---- Script body --------------------------------------------------------
# For every triad: align (or reload) the three promoter sequences, score the
# full alignment, its longest gap-tolerant stretch and its best-scoring
# block, and write a per-triad summary plus a sliding-window table.
sequences, sequence_count = read_alignments(options[:fasta], split_token)

$stderr.puts "#Loaded #{sequences.length} genes from #{sequence_count} sequences"
output_folder = options[:output_folder]

FileUtils.mkdir_p output_folder
summary_file = "#{output_folder}/identities.txt"
long_table_file = "#{output_folder}/sliding_window_identities.txt"

# The nested arrays are flattened by Array#join when the header is written.
header = ["triad", "total_aln_length"]
header << ["longest_start", "longest_length", "longest_start_from_CDS","longest_end_from_CDS", "longest_sum_of_all_pairs","longest_norm_sum_of_all_pairs","longest_sum_of_identities", "longest_identity"]
header << ["best_start", "best_length" , "best_start_from_CDS","best_end_from_CDS", "best_sum_of_all_pairs","best_norm_sum_of_all_pairs","best_sum_of_identities", "best_identity"]

# Block form: both output files are closed even if a triad fails.
File.open(summary_file, "w") do |out|
  File.open(long_table_file, "w") do |long_table|
    out.puts header.join("\t")
    long_table.puts(["triad", "type", "start_from_CDS", "end_from_cds" , "sum_of_all_pairs","norm_sum_of_all_pairs","sum_of_identities", "identity"].join("\t"))

    CSV.foreach(options[:triads], headers: true) do |row|
      a = row['A']
      b = row['B']
      d = row['D']
      triad = row['group_id']

      # Outputs are bucketed by hundreds of the group id.
      cent_triad = triad.to_i / 100
      folder = "#{output_folder}/prom_aln/#{cent_triad}/"
      save_prom = "#{folder}/#{triad}.prom.fa"

      to_align = Bio::Alignment::SequenceHash.new
      to_align[a] = sequences[a]
      to_align[b] = sequences[b]
      to_align[d] = sequences[d]

      # Reuse a previously saved alignment when there is one.
      prom_aln =
        if File.file?(save_prom)
          ff, _seqs_cnt = read_alignments(save_prom, split_token)
          Bio::Alignment.new(ff)
        else
          promoter_alignment(to_align)
        end

      print_arr = [triad, prom_aln.len]

      # Longest stretch covered by all sequences.
      aln_stats = get_longest_aln(prom_aln)
      print_arr << aln_stats
      cut_seqs = prom_aln.cut_alignment(aln_stats[0], aln_stats[1])
      print_arr << cut_seqs.sum_of_all_pairs
      print_arr << cut_seqs.normalized_sum_of_all_pairs
      print_arr << cut_seqs.sum_of_identities
      print_arr << cut_seqs.identity

      # Best-scoring block.
      best_aln_stats = prom_aln.best_block
      best_aln_cut = prom_aln.cut_alignment(best_aln_stats[0], best_aln_stats[1])
      print_arr << best_aln_stats
      print_arr << best_aln_cut.sum_of_all_pairs
      print_arr << best_aln_cut.normalized_sum_of_all_pairs
      print_arr << best_aln_cut.sum_of_identities
      print_arr << best_aln_cut.identity

      # Sliding-window rows for the three views of the alignment.
      {
        "cut_longest_region" => cut_seqs,
        "cut_best_region" => best_aln_cut,
        "full_promoter" => prom_aln
      }.each do |type, aln|
        aln.window_identities.each do |e|
          long_table.puts [triad, type, e].flatten.join("\t")
        end
      end

      out.puts print_arr.join("\t")

      FileUtils.mkdir_p folder

      write_fasta_from_hash(prom_aln, save_prom) unless File.file?(save_prom)

      # The guard used to test save_prom, which always exists at this point
      # (it is written just above), so the cut alignment was never saved.
      save_prom_cut = "#{folder}/#{triad}.prom.cut.fa"
      write_fasta_from_hash(cut_seqs, save_prom_cut) unless File.file?(save_prom_cut)

      save_prom_cut_best = "#{folder}/#{triad}.prom.cut.best.fa"
      write_fasta_from_hash(best_aln_cut, save_prom_cut_best)
    end
  end
end
|