bio-polyploid-tools 0.7.3 → 0.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.travis.yml +17 -0
- data/Gemfile +10 -7
- data/README.md +44 -0
- data/Rakefile +14 -14
- data/VERSION +1 -1
- data/bin/bfr.rb +2 -2
- data/bin/blast_triads.rb +166 -0
- data/bin/blast_triads_promoters.rb +192 -0
- data/bin/find_homoeologue_variations.rb +385 -0
- data/bin/get_longest_hsp_blastx_triads.rb +66 -0
- data/bin/hexaploid_primers.rb +2 -2
- data/bin/homokaryot_primers.rb +2 -2
- data/bin/mafft_triads.rb +120 -0
- data/bin/mafft_triads_promoters.rb +403 -0
- data/bin/polymarker.rb +73 -17
- data/bin/polymarker_capillary.rb +416 -0
- data/bin/snp_position_to_polymarker.rb +5 -3
- data/bin/snps_between_bams.rb +0 -29
- data/bin/vcfLineToTable.rb +56 -0
- data/bio-polyploid-tools.gemspec +74 -32
- data/lib/bio/BFRTools.rb +1 -0
- data/lib/bio/PolyploidTools/ChromosomeArm.rb +2 -6
- data/lib/bio/PolyploidTools/ExonContainer.rb +31 -8
- data/lib/bio/PolyploidTools/NoSNPSequence.rb +286 -0
- data/lib/bio/PolyploidTools/PrimerRegion.rb +9 -1
- data/lib/bio/PolyploidTools/SNP.rb +58 -18
- data/lib/bio/PolyploidTools/SNPMutant.rb +5 -3
- data/lib/bio/db/blast.rb +112 -0
- data/lib/bio/db/exonerate.rb +4 -5
- data/lib/bio/db/primer3.rb +83 -14
- data/test/data/BS00068396_51_blast.tab +4 -0
- data/test/data/BS00068396_51_contigs.nhr +0 -0
- data/test/data/BS00068396_51_contigs.nin +0 -0
- data/test/data/BS00068396_51_contigs.nsq +0 -0
- data/test/data/BS00068396_51_for_polymarker.fa +1 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
- data/test/data/S22380157.vcf +67 -0
- data/test/data/S58861868/LIB1716.bam +0 -0
- data/test/data/S58861868/LIB1716.sam +651 -0
- data/test/data/S58861868/LIB1719.bam +0 -0
- data/test/data/S58861868/LIB1719.sam +805 -0
- data/test/data/S58861868/LIB1721.bam +0 -0
- data/test/data/S58861868/LIB1721.sam +1790 -0
- data/test/data/S58861868/LIB1722.bam +0 -0
- data/test/data/S58861868/LIB1722.sam +1271 -0
- data/test/data/S58861868/S58861868.fa +16 -0
- data/test/data/S58861868/S58861868.fa.fai +1 -0
- data/test/data/S58861868/S58861868.vcf +76 -0
- data/test/data/S58861868/header.txt +9 -0
- data/test/data/S58861868/merged.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam.bai +0 -0
- data/test/data/bfr_out_test.csv +5 -5
- data/test/data/headerMergeed.txt +9 -0
- data/test/data/headerS2238015 +1 -0
- data/test/data/mergedLibs.bam +0 -0
- data/test/data/mergedLibsReheader.bam +0 -0
- data/test/data/mergedLibsSorted.bam +0 -0
- data/test/data/mergedLibsSorted.bam.bai +0 -0
- data/test/test_bfr.rb +26 -34
- data/test/test_blast.rb +47 -0
- data/test/test_exonearate.rb +4 -9
- data/test/test_snp_parsing.rb +42 -22
- metadata +81 -20
- data/Gemfile.lock +0 -67
data/bin/hexaploid_primers.rb
CHANGED
@@ -63,9 +63,9 @@ File.open(test_file) do | f |
|
|
63
63
|
region = fasta_reference_db.index.region_for_entry(snp.gene).get_full_region
|
64
64
|
snp.template_sequence = fasta_reference_db.fetch_sequence(region)
|
65
65
|
else
|
66
|
-
|
66
|
+
raise Bio::DB::Exonerate::ExonerateException.new "Wrong number of arguments. "
|
67
67
|
end
|
68
|
-
|
68
|
+
raise Bio::DB::Exonerate::ExonerateException.new "No SNP for line '#{line}'" if snp == nil
|
69
69
|
snp.snp_in = snp_in
|
70
70
|
snp.original_name = original_name
|
71
71
|
snps << snp
|
data/bin/homokaryot_primers.rb
CHANGED
@@ -109,9 +109,9 @@ File.open(snp_file) do | f |
|
|
109
109
|
region = fasta_reference_db.index.region_for_entry(snp.gene).get_full_region
|
110
110
|
snp.template_sequence = fasta_reference_db.fetch_sequence(region)
|
111
111
|
else
|
112
|
-
|
112
|
+
raise Bio::DB::Exonerate::ExonerateException.new "Wrong number of arguments. "
|
113
113
|
end
|
114
|
-
|
114
|
+
raise Bio::DB::Exonerate::ExonerateException.new "No SNP for line '#{line}'" if snp == nil
|
115
115
|
snp.snp_in = snp_in
|
116
116
|
snp.original_name = original_name
|
117
117
|
snps << snp
|
data/bin/mafft_triads.rb
ADDED
@@ -0,0 +1,120 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'optparse'
|
3
|
+
require 'bio'
|
4
|
+
require 'csv'
|
5
|
+
require 'bio-blastxmlparser'
|
6
|
+
require 'fileutils'
|
7
|
+
require 'tmpdir'
|
8
|
+
|
9
|
+
|
10
|
+
# Parses the command line of mafft_triads.rb.
#
# argv - Array of command-line arguments. Recognised switches are removed
#        from it in place (OptionParser#parse!).
#
# Returns a Hash keyed by symbol, pre-filled with the script defaults and
# overridden by whatever switches were given.
def parse_options(argv)
  options = {
    identity: 50,
    min_bases: 200,
    split_token: "-",
    tmp_folder: Dir.mktmpdir,
    program: "blastn",
    random_sample: 0
  }

  OptionParser.new do |opts|
    opts.banner = "Usage: mafft_triads.rb [options]"

    opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
      options[:identity] = o.to_f
    end

    opts.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |o|
      options[:triads] = o
    end

    opts.on("-f", "--pep FILE" , "FASTA file containing all the possible peptide sequences. ") do |o|
      options[:pep] = o
    end

    # This switch used to be registered as "-s" as well. OptionParser keeps
    # only the last registration of a short flag, so "-s" always meant
    # --split_token and --cds had no usable short form. "-c" is free here and
    # "-s" keeps the meaning it effectively had.
    opts.on("-c", "--cds FILE" , "FASTA file containing all the possible CDS sequences. ") do |o|
      options[:cds] = o
    end

    opts.on("-s", "--split_token CHAR", "Character used to split the sequence name. The name will be evarything before this token on the name of the sequences") do |o|
      options[:split_token] = o
    end
  end.parse!(argv)

  options
end

options = parse_options(ARGV)
|
43
|
+
|
44
|
+
|
45
|
+
# Aligns the given sequences with the external "mafft" program through
# Bio::MAFFT, using the --localpair strategy with up to 1000 refinement
# iterations and no progress output.
#
# sequences_to_align - collection of sequences, handed unchanged to
#                      Bio::MAFFT#query_align.
#
# Returns the `alignment` of the report produced by MAFFT.
def peptide_alignment(sequences_to_align)
  mafft_args = %w[--maxiterate 1000 --localpair --quiet]
  aligner = Bio::MAFFT.new("mafft", mafft_args)
  aligner.query_align(sequences_to_align).alignment
end
|
51
|
+
|
52
|
+
|
53
|
+
split_token = options[:split_token]

# Fail with a readable message instead of the TypeError raised when a nil
# path reaches Bio::FlatFile.open.
[:pep, :cds].each do |required|
  abort "Missing required option --#{required}" unless options[required]
end

# Reads a FASTA file and keeps one entry per gene: the longest one.
#
# fasta_path  - path of the FASTA file to read.
# split_token - separator; the gene name is the part of the entry id that
#               precedes its first occurrence.
#
# Returns [Hash gene name => longest entry, number of entries read].
def load_longest_sequences(fasta_path, split_token)
  longest = {}
  total = 0
  Bio::FlatFile.open(Bio::FastaFormat, fasta_path) do |fasta_file|
    fasta_file.each do |entry|
      gene_name = entry.entry_id.split(split_token)[0]
      kept = longest[gene_name]
      longest[gene_name] = entry if !kept || entry.length > kept.length
      total += 1
    end
  end
  [longest, total]
end

pep_seq, pep_seq_count = load_longest_sequences(options[:pep], split_token)
$stderr.puts "#Loaded #{pep_seq.length} genes from #{pep_seq_count} pep_seq"

cds_seq, cds_seq_count = load_longest_sequences(options[:cds], split_token)
$stderr.puts "#Loaded #{cds_seq.length} genes from #{cds_seq_count} cds_seq"

$stderr.puts "TMP dir: #{options[:tmp_folder]}"
|
81
|
+
|
82
|
+
# Writes a set of named sequences to a FASTA file, one ">name" header line
# followed by one sequence line per record. An existing file is overwritten.
#
# sequences - any object whose each_pair yields (name, sequence); both are
#             written through string interpolation.
# filename  - path of the file to create.
#
# The block form of File.open closes (and flushes) the file even when
# iterating over the sequences raises.
#
# Returns nil.
def write_fasta_from_hash(sequences, filename)
  File.open(filename, "w") do |out|
    sequences.each_pair do |name, sequence|
      out.puts ">#{name}\n#{sequence}\n"
    end
  end
  nil
end
|
90
|
+
|
91
|
+
|
92
|
+
# For every triad listed in the CSV (columns 'A', 'B', 'D' and 'group_id'):
# align the three peptide sequences with MAFFT and save the alignment plus
# the matching, unaligned CDS sequences under alignments/<group_id / 100>/.
CSV.foreach(options[:triads], headers: true) do |row|
  triad = row['group_id']
  genes = [row['A'], row['B'], row['D']]

  # A gene that is absent from either FASTA file used to abort the whole run
  # with a NoMethodError on nil; report the triad and carry on with the rest.
  missing = genes.reject { |gene| pep_seq[gene] && cds_seq[gene] }
  unless missing.empty?
    $stderr.puts "Skipping triad #{triad}: no peptide/CDS sequence for #{missing.join(', ')}"
    next
  end

  to_align = Bio::Alignment::SequenceHash.new
  genes.each { |gene| to_align[gene] = pep_seq[gene] }

  cds_seqs = Bio::Alignment::SequenceHash.new
  genes.each { |gene| cds_seqs[gene] = cds_seq[gene].to_biosequence }

  # Triads are spread over sub-folders of 100 so no directory gets too big.
  cent_triad = triad.to_i / 100
  folder = "alignments/#{cent_triad}/"
  FileUtils.mkdir_p folder

  pep_aln = peptide_alignment(to_align)

  save_pep = "#{folder}/#{triad}.pep.fa"
  write_fasta_from_hash(pep_aln, save_pep)

  save_cds = "#{folder}/#{triad}.cds.fa"
  write_fasta_from_hash(cds_seqs, save_cds)
end
|
@@ -0,0 +1,403 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'optparse'
|
3
|
+
require 'bio'
|
4
|
+
require 'csv'
|
5
|
+
require 'bio-blastxmlparser'
|
6
|
+
require 'fileutils'
|
7
|
+
require 'tmpdir'
|
8
|
+
|
9
|
+
|
10
|
+
# Parses the command line of mafft_triads_promoters.rb.
#
# argv - Array of command-line arguments. Recognised switches are removed
#        from it in place (OptionParser#parse!).
#
# Returns a Hash keyed by symbol, pre-filled with the script defaults and
# overridden by whatever switches were given.
def parse_options(argv)
  options = {
    identity: 50,
    min_bases: 200,
    split_token: "-",
    output_folder: ".",
    program: "blastn",
    random_sample: 0
  }

  OptionParser.new do |opts|
    # The banner used to advertise "filter_blat.rb", a different script.
    opts.banner = "Usage: mafft_triads_promoters.rb [options]"

    opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
      options[:identity] = o.to_f
    end
    opts.on("-c", "--min_bases int", "Minimum alignment length (default 200)") do |o|
      options[:min_bases] = o.to_i
    end

    opts.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |o|
      options[:triads] = o
    end

    opts.on("-f", "--sequences FILE" , "FASTA file containing all the possible sequences. ") do |o|
      options[:fasta] = o
    end

    opts.on("-s", "--split_token CHAR", "Character used to split the sequence name. The name will be evarything before this token on the name of the sequences") do |o|
      options[:split_token] = o
    end

    opts.on("-p", "--program blastn|blastp", "The program to use in the alignments. Currntly only supported blastn and blastp") do |o|
      options[:program] = o
    end

    opts.on("-o", "--output_folder DIR", "Folder to save the output") do |o|
      options[:output_folder] = o
    end
  end.parse!(argv)

  options
end

options = parse_options(ARGV)
|
51
|
+
|
52
|
+
# Column-wise scoring helpers added to BioRuby's
# Bio::Alignment::EnumerableExtension.
#
# The methods rely on `keys`, `self[name]` and `each_pair`, so the receiver
# has to be hash-like; this script calls them on the alignment returned by
# MAFFT and on Bio::Alignment::SequenceHash objects.
#
# NOTE(review): len, sum_of_all_pairs and sum_of_identities cache their
# result in instance variables on first use and are not refreshed if the
# alignment is modified afterwards.
#
# The module is reopened with nested `module` statements (Bio and
# Bio::Alignment are modules in BioRuby).
module Bio
  module Alignment
    module EnumerableExtension
      # Returns a new Bio::Alignment::SequenceHash holding, for every
      # sequence, the slice str[start, length]. A nil sequence is stored as
      # "". The properties of this alignment are copied to the new one.
      def cut_alignment(start, length)
        a = Bio::Alignment::SequenceHash.new
        a.set_all_property(get_all_property)
        each_pair do |key, str|
          a.store(key, str.nil? ? "" : str[start, length])
        end
        a
      end

      # Finds the run of consecutive columns with the highest cumulative
      # sum_of_pair score. A run is abandoned as soon as its running score
      # drops below zero.
      #
      # Returns [start, length, len - start - length, len - start] for the
      # best run; start and length are 0 when no column scores above zero.
      def best_block
        best_start = 0
        best_score = 0
        best_length = 0
        current_start = 0
        current_score = 0
        current_length = 0

        each_base_alignment_with_index do |bases, i|
          current_start = i if current_length.zero?
          current_length += 1
          current_score += sum_of_pair(bases)

          if current_score > best_score
            best_score = current_score
            best_length = current_length
            best_start = current_start
          end

          if current_score < 0
            current_length = 0
            current_score = 0
          end
        end

        [best_start, best_length, len - best_start - best_length, len - best_start]
      end

      # Yields, for every column index below len, the array of characters
      # found at that index in each sequence (in `keys` order) and the index.
      def each_base_alignment_with_index
        names = keys
        (0...len).each do |i|
          yield names.map { |name| self[name][i] }, i
        end
      end

      # Same as each_base_alignment_with_index, without the index.
      # (An older, index-free copy of this method used to be defined earlier
      # in the module; it was always overridden by this one and is gone.)
      def each_base_alignment
        each_base_alignment_with_index { |bases, _index| yield bases }
      end

      # Sum of sum_of_pair over every column. Cached after the first call.
      def sum_of_all_pairs
        return @sum_of_all_pairs if @sum_of_all_pairs
        @sum_of_all_pairs = 0
        each_base_alignment { |bases| @sum_of_all_pairs += sum_of_pair(bases) }
        @sum_of_all_pairs
      end

      # Sum of s_o_i over every column. Cached after the first call.
      def sum_of_identities
        return @sum_of_identities if @sum_of_identities
        @sum_of_identities = 0
        each_base_alignment { |bases| @sum_of_identities += s_o_i(bases) }
        @sum_of_identities
      end

      # Length of the first sequence; 0 when there is no first key or its
      # sequence is nil. Cached after the first call.
      def len
        return @len if @len
        first_name = keys[0]
        @len = 0
        @len = self[first_name].length if first_name && !self[first_name].nil?
        @len
      end

      # Number of distinct pairs of sequences: n * (n - 1) / 2.
      def pairwise_comparaisons
        n = keys.size
        n * (n - 1) / 2
      end

      # sum_of_identities divided by len * pairwise_comparaisons.
      # The result is NaN when that product is zero.
      def identity
        max_score = len * pairwise_comparaisons
        sum_of_identities.to_f / max_score
      end

      # sum_of_all_pairs divided by len * pairwise_comparaisons.
      # The result is NaN when that product is zero.
      def normalized_sum_of_all_pairs
        max_score = len * pairwise_comparaisons
        sum_of_all_pairs.to_f / max_score
      end

      # Scores one column by adding up every pair of its characters:
      #   both "-", both "N" or both "n"  ->  0
      #   exactly one of them "-"          -> -2
      #   identical                        -> +1
      #   different                        -> -1
      def sum_of_pair(bases)
        last = bases.length - 1
        total = 0
        (0..last).each do |i|
          ((i + 1)..last).each do |j|
            a = bases[i]
            b = bases[j]
            if a == b
              # a gap against a gap, or N against N, carries no signal
              total += 1 unless ["-", "N", "n"].include?(a)
            elsif a == "-" || b == "-"
              total -= 2
            else
              total -= 1
            end
          end
        end
        total
      end

      # Number of pairs of identical characters in one column. Unlike
      # sum_of_pair, matching gaps and matching Ns are counted.
      def s_o_i(bases)
        last = bases.length - 1
        total = 0
        (0..last).each do |i|
          ((i + 1)..last).each { |j| total += 1 if bases[i] == bases[j] }
        end
        total
      end

      # Sliding-window statistics, walking from the end of the alignment
      # towards its start: window i ends i * offset columns before the end
      # and spans window_size columns.
      #
      # A window that would begin before the first column is truncated at
      # column 0. Previously the negative start was passed on to
      # cut_alignment, where it indexed from the END of the sequences, so
      # those rows silently repeated data from the other side of the
      # alignment.
      #
      # Returns an Array with one row per window:
      #   [i * offset, i * offset + window_size, sum_of_all_pairs,
      #    normalized_sum_of_all_pairs, sum_of_identities, identity]
      def window_identities(window_size=100, offset=25)
        window_ends = (0..len).step(offset).to_a.map { |pos| pos + len % offset }.reverse
        ret = []
        window_ends.each_with_index do |window_end, i|
          start = [window_end - window_size, 0].max
          tmp_aln = cut_alignment(start, window_end - start)
          ret << [
            i * offset,
            i * offset + window_size,
            tmp_aln.sum_of_all_pairs,
            tmp_aln.normalized_sum_of_all_pairs,
            tmp_aln.sum_of_identities,
            tmp_aln.identity
          ]
        end
        ret
      end
    end
  end
end
|
223
|
+
|
224
|
+
# Aligns the given sequences with MAFFT (--localpair, at most 1000
# refinement iterations, quiet).
#
# sequences_to_align - hash-like collection of name => sequence.
#
# When any of the sequences is nil nothing is aligned and the argument is
# returned as it came in. Otherwise returns the `alignment` of the MAFFT
# report. The Bio::MAFFT wrapper is created on first use and kept in @mafft
# for later calls.
def promoter_alignment(sequences_to_align)
  all_present = sequences_to_align.each_value.all? { |seq| seq != nil }
  return sequences_to_align unless all_present

  # Alternative that was tried before:
  # ['--maxiterate', '1000', '--ep', '0', '--genafpair', '--quiet']
  mafft_args = %w[--maxiterate 1000 --localpair --quiet]
  @mafft ||= Bio::MAFFT.new("mafft", mafft_args)
  @mafft.query_align(sequences_to_align).alignment
end
|
234
|
+
|
235
|
+
# Writes a set of named sequences to a FASTA file, one ">name" header line
# followed by one sequence line per record. An existing file is overwritten.
#
# sequences - any object whose each_pair yields (name, sequence); both are
#             written through string interpolation.
# filename  - path of the file to create.
#
# The block form of File.open closes (and flushes) the file even when
# iterating over the sequences raises.
#
# Returns nil.
def write_fasta_from_hash(sequences, filename)
  File.open(filename, "w") do |out|
    sequences.each_pair do |name, sequence|
      out.puts ">#{name}\n#{sequence}\n"
    end
  end
  nil
end
|
242
|
+
|
243
|
+
# Finds the longest block of the alignment in which every sequence is
# covered (has something other than "-"), tolerating interruptions of up to
# max_gap consecutive columns where at least one sequence has a gap.
#
# aln     - hash-like alignment (name => aligned sequence). The length of
#           the first sequence is used as the alignment length.
# max_gap - longest interruption, in columns, allowed inside a block.
#
# Returns [start, length, len - start - length, len - start], where length
# is the number of fully covered columns of the block plus the interrupting
# columns that lie between them. start is -1 and length 0 when no column is
# fully covered.
def get_longest_aln(aln, max_gap: 10)
  names = aln.keys
  len = 0
  len = aln[names[0]].length if names[0] && !aln[names[0]].nil?
  total_alignments = names.size

  longest_start = -1
  longest_length = 0
  longest_gaps = 0
  current_start = -1
  current_length = 0   # fully covered columns in the block being extended
  current_gap = 0      # length of the interruption we are currently inside
  gaps = 0             # interrupting columns seen since the block started

  len.times do |i|
    covered = names.count { |chr| aln[chr][i] != "-" }

    if covered == total_alignments
      if current_length.zero?
        current_start = i
        # A block starts here, so whatever preceded it is outside the block.
        # Without this reset, up to max_gap gap columns at the very start of
        # the alignment were added to the first block's length, which could
        # push the block past the end of the alignment.
        gaps = 0
      end
      current_length += 1
      current_gap = 0
    else
      gaps += 1
      current_gap += 1
    end

    if current_length > longest_length
      longest_length = current_length
      longest_start = current_start
      longest_gaps = gaps - current_gap
    end

    # The interruption became too long: the current block is over.
    if current_gap > max_gap
      current_length = 0
      gaps = 0
    end
  end

  longest_length += longest_gaps
  [longest_start, longest_length, len - longest_start - longest_length, len - longest_start]
end
|
288
|
+
|
289
|
+
split_token = options[:split_token]
|
290
|
+
|
291
|
+
# Reads a FASTA file and keeps a single entry per gene, the longest one.
#
# fasta_path  - path of the FASTA file.
# split_token - separator; the gene name is whatever precedes its first
#               occurrence in the entry id.
#
# Returns a two-element Array: the Hash of gene name => longest entry, and
# the total number of entries that were read.
def read_alignments(fasta_path, split_token)
  longest_by_gene = {}
  entries_seen = 0

  Bio::FlatFile.open(Bio::FastaFormat, fasta_path) do |fasta_file|
    fasta_file.each do |entry|
      gene_name = entry.entry_id.split(split_token).first
      kept = longest_by_gene[gene_name]
      longest_by_gene[gene_name] = entry if !kept || entry.length > kept.length
      entries_seen += 1
    end
  end

  [longest_by_gene, entries_seen]
end
|
305
|
+
|
306
|
+
# Main script: for every triad in the CSV, align (or reload) the three
# promoter sequences, locate the longest gap-tolerant block and the
# best-scoring block, and write
#   <output_folder>/identities.txt                  one summary row per triad
#   <output_folder>/sliding_window_identities.txt   window statistics
#   <output_folder>/prom_aln/<group_id / 100>/      the alignments as FASTA
sequences, sequence_count = read_alignments(options[:fasta], split_token)

$stderr.puts "#Loaded #{sequences.length} genes from #{sequence_count} sequences"
output_folder = options[:output_folder]

FileUtils.mkdir_p output_folder
summary_file = "#{output_folder}/identities.txt"
long_table_file = "#{output_folder}/sliding_window_identities.txt"

# Nested arrays are fine here: Array#join flattens them.
header = ["triad", "total_aln_length"]
header << ["longest_start", "longest_length", "longest_start_from_CDS","longest_end_from_CDS", "longest_sum_of_all_pairs","longest_norm_sum_of_all_pairs","longest_sum_of_identities", "longest_identity"]
header << ["best_start", "best_length" , "best_start_from_CDS","best_end_from_CDS", "best_sum_of_all_pairs","best_norm_sum_of_all_pairs","best_sum_of_identities", "best_identity"]

# Block form so both tables are closed even if a triad raises half-way.
File.open(summary_file, "w") do |out|
  File.open(long_table_file, "w") do |long_table|
    out.puts header.join("\t")
    long_table.puts(["triad", "type", "start_from_CDS", "end_from_cds" , "sum_of_all_pairs","norm_sum_of_all_pairs","sum_of_identities", "identity"].join("\t"))

    CSV.foreach(options[:triads], headers: true) do |row|
      a = row['A']
      b = row['B']
      d = row['D']
      triad = row['group_id']

      # Triads are spread over sub-folders of 100.
      cent_triad = triad.to_i / 100
      folder = "#{output_folder}/prom_aln/#{cent_triad}/"
      save_prom = "#{folder}/#{triad}.prom.fa"

      to_align = Bio::Alignment::SequenceHash.new
      to_align[a] = sequences[a]
      to_align[b] = sequences[b]
      to_align[d] = sequences[d]

      # Reuse an alignment saved by a previous run instead of calling MAFFT.
      if File.file?(save_prom)
        saved_entries, _saved_count = read_alignments(save_prom, split_token)
        prom_aln = Bio::Alignment.new(saved_entries)
      else
        prom_aln = promoter_alignment(to_align)
      end

      print_arr = [triad, prom_aln.len]

      aln_stats = get_longest_aln(prom_aln)
      cut_seqs = prom_aln.cut_alignment(aln_stats[0], aln_stats[1])
      print_arr << aln_stats
      print_arr << cut_seqs.sum_of_all_pairs
      print_arr << cut_seqs.normalized_sum_of_all_pairs
      print_arr << cut_seqs.sum_of_identities
      print_arr << cut_seqs.identity

      best_aln_stats = prom_aln.best_block
      best_aln_cut = prom_aln.cut_alignment(best_aln_stats[0], best_aln_stats[1])
      print_arr << best_aln_stats
      print_arr << best_aln_cut.sum_of_all_pairs
      print_arr << best_aln_cut.normalized_sum_of_all_pairs
      print_arr << best_aln_cut.sum_of_identities
      print_arr << best_aln_cut.identity

      [
        ["cut_longest_region", cut_seqs],
        ["cut_best_region", best_aln_cut],
        ["full_promoter", prom_aln]
      ].each do |type, region|
        region.window_identities.each do |e|
          long_table.puts([triad, type, e].flatten.join("\t"))
        end
      end

      out.puts print_arr.join("\t")

      FileUtils.mkdir_p folder

      write_fasta_from_hash(prom_aln, save_prom) unless File.file?(save_prom)

      # This guard used to test save_prom, which always exists by now (it is
      # written just above when missing), so the cut alignment was never
      # saved. Test the file that is actually being written.
      save_prom_cut = "#{folder}/#{triad}.prom.cut.fa"
      write_fasta_from_hash(cut_seqs, save_prom_cut) unless File.file?(save_prom_cut)

      save_prom_cut_best = "#{folder}/#{triad}.prom.cut.best.fa"
      write_fasta_from_hash(best_aln_cut, save_prom_cut_best)
    end
  end
end
|