bio-polyploid-tools 0.7.3 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.travis.yml +17 -0
- data/Gemfile +10 -7
- data/README.md +44 -0
- data/Rakefile +14 -14
- data/VERSION +1 -1
- data/bin/bfr.rb +2 -2
- data/bin/blast_triads.rb +166 -0
- data/bin/blast_triads_promoters.rb +192 -0
- data/bin/find_homoeologue_variations.rb +385 -0
- data/bin/get_longest_hsp_blastx_triads.rb +66 -0
- data/bin/hexaploid_primers.rb +2 -2
- data/bin/homokaryot_primers.rb +2 -2
- data/bin/mafft_triads.rb +120 -0
- data/bin/mafft_triads_promoters.rb +403 -0
- data/bin/polymarker.rb +73 -17
- data/bin/polymarker_capillary.rb +416 -0
- data/bin/snp_position_to_polymarker.rb +5 -3
- data/bin/snps_between_bams.rb +0 -29
- data/bin/vcfLineToTable.rb +56 -0
- data/bio-polyploid-tools.gemspec +74 -32
- data/lib/bio/BFRTools.rb +1 -0
- data/lib/bio/PolyploidTools/ChromosomeArm.rb +2 -6
- data/lib/bio/PolyploidTools/ExonContainer.rb +31 -8
- data/lib/bio/PolyploidTools/NoSNPSequence.rb +286 -0
- data/lib/bio/PolyploidTools/PrimerRegion.rb +9 -1
- data/lib/bio/PolyploidTools/SNP.rb +58 -18
- data/lib/bio/PolyploidTools/SNPMutant.rb +5 -3
- data/lib/bio/db/blast.rb +112 -0
- data/lib/bio/db/exonerate.rb +4 -5
- data/lib/bio/db/primer3.rb +83 -14
- data/test/data/BS00068396_51_blast.tab +4 -0
- data/test/data/BS00068396_51_contigs.nhr +0 -0
- data/test/data/BS00068396_51_contigs.nin +0 -0
- data/test/data/BS00068396_51_contigs.nsq +0 -0
- data/test/data/BS00068396_51_for_polymarker.fa +1 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
- data/test/data/S22380157.vcf +67 -0
- data/test/data/S58861868/LIB1716.bam +0 -0
- data/test/data/S58861868/LIB1716.sam +651 -0
- data/test/data/S58861868/LIB1719.bam +0 -0
- data/test/data/S58861868/LIB1719.sam +805 -0
- data/test/data/S58861868/LIB1721.bam +0 -0
- data/test/data/S58861868/LIB1721.sam +1790 -0
- data/test/data/S58861868/LIB1722.bam +0 -0
- data/test/data/S58861868/LIB1722.sam +1271 -0
- data/test/data/S58861868/S58861868.fa +16 -0
- data/test/data/S58861868/S58861868.fa.fai +1 -0
- data/test/data/S58861868/S58861868.vcf +76 -0
- data/test/data/S58861868/header.txt +9 -0
- data/test/data/S58861868/merged.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam.bai +0 -0
- data/test/data/bfr_out_test.csv +5 -5
- data/test/data/headerMergeed.txt +9 -0
- data/test/data/headerS2238015 +1 -0
- data/test/data/mergedLibs.bam +0 -0
- data/test/data/mergedLibsReheader.bam +0 -0
- data/test/data/mergedLibsSorted.bam +0 -0
- data/test/data/mergedLibsSorted.bam.bai +0 -0
- data/test/test_bfr.rb +26 -34
- data/test/test_blast.rb +47 -0
- data/test/test_exonearate.rb +4 -9
- data/test/test_snp_parsing.rb +42 -22
- metadata +81 -20
- data/Gemfile.lock +0 -67
data/bin/hexaploid_primers.rb
CHANGED
@@ -63,9 +63,9 @@ File.open(test_file) do | f |
|
|
63
63
|
region = fasta_reference_db.index.region_for_entry(snp.gene).get_full_region
|
64
64
|
snp.template_sequence = fasta_reference_db.fetch_sequence(region)
|
65
65
|
else
|
66
|
-
|
66
|
+
raise Bio::DB::Exonerate::ExonerateException.new "Wrong number of arguments. "
|
67
67
|
end
|
68
|
-
|
68
|
+
raise Bio::DB::Exonerate::ExonerateException.new "No SNP for line '#{line}'" if snp == nil
|
69
69
|
snp.snp_in = snp_in
|
70
70
|
snp.original_name = original_name
|
71
71
|
snps << snp
|
data/bin/homokaryot_primers.rb
CHANGED
@@ -109,9 +109,9 @@ File.open(snp_file) do | f |
|
|
109
109
|
region = fasta_reference_db.index.region_for_entry(snp.gene).get_full_region
|
110
110
|
snp.template_sequence = fasta_reference_db.fetch_sequence(region)
|
111
111
|
else
|
112
|
-
|
112
|
+
raise Bio::DB::Exonerate::ExonerateException.new "Wrong number of arguments. "
|
113
113
|
end
|
114
|
-
|
114
|
+
raise Bio::DB::Exonerate::ExonerateException.new "No SNP for line '#{line}'" if snp == nil
|
115
115
|
snp.snp_in = snp_in
|
116
116
|
snp.original_name = original_name
|
117
117
|
snps << snp
|
data/bin/mafft_triads.rb
ADDED
@@ -0,0 +1,120 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'optparse'
|
3
|
+
require 'bio'
|
4
|
+
require 'csv'
|
5
|
+
require 'bio-blastxmlparser'
|
6
|
+
require 'fileutils'
|
7
|
+
require 'tmpdir'
|
8
|
+
|
9
|
+
|
10
|
+
# Default settings; the command-line switches below override them.
options = {}
options[:identity] = 50
options[:min_bases] = 200
options[:split_token] = "-"
options[:tmp_folder] = Dir.mktmpdir
options[:program] = "blastn"
options[:random_sample] = 0

# The parser is kept in a local so the switches can be exercised again
# (e.g. from a test) after ARGV has been consumed.
parser = OptionParser.new do |opts|
  opts.banner = "Usage: mafft_triads.rb [options]"

  opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
    options[:identity] = o.to_f
  end

  opts.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |o|
    options[:triads] = o
  end

  opts.on("-f", "--pep FILE" , "FASTA file containing all the possible peptide sequences. ") do |o|
    options[:pep] = o
  end

  # Short flag is "-c": "-s" was previously registered for both --cds and
  # --split_token, which left --cds without a usable short form because the
  # later registration took "-s" over. "-s" keeps meaning --split_token.
  opts.on("-c", "--cds FILE" , "FASTA file containing all the possible CDS sequences. ") do |o|
    options[:cds] = o
  end

  opts.on("-s", "--split_token CHAR", "Character used to split the sequence name. The name will be evarything before this token on the name of the sequences") do |o|
    options[:split_token] = o
  end
end

# Consumes the recognised switches from ARGV.
parser.parse!
|
43
|
+
|
44
|
+
|
45
|
+
# Aligns the given sequences with the external "mafft" program through
# Bio::MAFFT and returns the alignment taken from the resulting report.
#
# sequences_to_align - the collection handed to Bio::MAFFT#query_align.
def peptide_alignment(sequences_to_align)
  mafft_args = ['--maxiterate', '1000', '--localpair', '--quiet']
  aligner = Bio::MAFFT.new("mafft", mafft_args)
  aligner.query_align(sequences_to_align).alignment
end
|
51
|
+
|
52
|
+
|
53
|
+
split_token = options[:split_token]

# Peptide entries, keyed by the part of the entry id that precedes
# split_token. When several entries share a key the longest one is kept
# (the first one seen wins ties).
pep_seq = {}
pep_seq_count = 0
Bio::FlatFile.open(Bio::FastaFormat, options[:pep]) do |pep_file|
  pep_file.each do |record|
    key = entry_key = record.entry_id.split(split_token)[0]
    kept = pep_seq[entry_key]
    pep_seq[key] = record if !kept || record.length > kept.length
    pep_seq_count += 1
  end
end
$stderr.puts "#Loaded #{pep_seq.length} genes from #{pep_seq_count} pep_seq"

# CDS entries, selected with the same "longest entry per key" rule.
cds_seq = {}
cds_seq_count = 0
Bio::FlatFile.open(Bio::FastaFormat, options[:cds]) do |cds_file|
  cds_file.each do |record|
    key = record.entry_id.split(split_token)[0]
    kept = cds_seq[key]
    cds_seq[key] = record if !kept || record.length > kept.length
    cds_seq_count += 1
  end
end
$stderr.puts "#Loaded #{cds_seq.length} genes from #{cds_seq_count} cds_seq"


$stderr.puts "TMP dir: #{options[:tmp_folder]}"
|
81
|
+
|
82
|
+
# Writes every name/sequence pair of +sequences+ to +filename+ as a FASTA
# record (">name" line followed by the sequence line), replacing any
# existing file.
#
# sequences - any object responding to each_pair that yields (name, sequence).
# filename  - path of the file to (over)write.
#
# Returns nil. The file is opened in block form so the handle is closed even
# when iterating or writing raises.
def write_fasta_from_hash(sequences, filename)
  File.open(filename, "w") do |out|
    sequences.each_pair do |name, seq|
      out.puts ">#{name}\n#{seq}\n"
    end
  end
  nil
end
|
90
|
+
|
91
|
+
|
92
|
+
# For every row of the triads CSV: align the peptides of the genes named in
# columns 'A', 'B' and 'D', then save the peptide alignment and the
# (unaligned) CDS sequences under alignments/<group_id / 100>/.
CSV.foreach(options[:triads], headers:true ) do |row|
  genes = [row['A'], row['B'], row['D']]
  triad = row['group_id']

  to_align = Bio::Alignment::SequenceHash.new
  genes.each { |gene| to_align[gene] = pep_seq[gene] }

  cds_seqs = Bio::Alignment::SequenceHash.new
  genes.each { |gene| cds_seqs[gene] = cds_seq[gene].to_biosequence }

  # Integer division groups the triads into one folder per hundred ids.
  folder = "alignments/#{triad.to_i / 100}/"
  FileUtils.mkdir_p folder

  pep_aln = peptide_alignment(to_align)

  write_fasta_from_hash(pep_aln, "#{folder}/#{triad}.pep.fa")
  write_fasta_from_hash(cds_seqs, "#{folder}/#{triad}.cds.fa")
end
|
@@ -0,0 +1,403 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'optparse'
|
3
|
+
require 'bio'
|
4
|
+
require 'csv'
|
5
|
+
require 'bio-blastxmlparser'
|
6
|
+
require 'fileutils'
|
7
|
+
require 'tmpdir'
|
8
|
+
|
9
|
+
|
10
|
+
# Default settings; the command-line switches below override them.
options = {}
options[:identity] = 50
options[:min_bases] = 200
options[:split_token] = "-"
options[:output_folder] = "."
options[:program] = "blastn"
options[:random_sample] = 0

# The parser is kept in a local so the switches can be exercised again
# (e.g. from a test) after ARGV has been consumed.
parser = OptionParser.new do |opts|
  # The banner previously advertised "filter_blat.rb", a different script.
  opts.banner = "Usage: mafft_triads_promoters.rb [options]"

  opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
    options[:identity] = o.to_f
  end
  opts.on("-c", "--min_bases int", "Minimum alignment length (default 200)") do |o|
    options[:min_bases] = o.to_i
  end

  opts.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |o|
    options[:triads] = o
  end

  opts.on("-f", "--sequences FILE" , "FASTA file containing all the possible sequences. ") do |o|
    options[:fasta] = o
  end

  opts.on("-s", "--split_token CHAR", "Character used to split the sequence name. The name will be evarything before this token on the name of the sequences") do |o|
    options[:split_token] = o
  end

  opts.on("-p", "--program blastn|blastp", "The program to use in the alignments. Currntly only supported blastn and blastp") do |o|
    options[:program] = o
  end

  opts.on("-o", "--output_folder DIR", "Folder to save the output") do |o|
    options[:output_folder] = o
  end
end

# Consumes the recognised switches from ARGV.
parser.parse!
|
51
|
+
|
52
|
+
# Reopens Bio::Alignment::EnumerableExtension to add column-wise scoring
# helpers (sum of pairs, identity, best-scoring block, sliding windows).
#
# The helpers read the receiver through +keys+, +[]+ and +each_pair+, and take
# the alignment length from the first sequence only.
#
# NOTE: +len+, +sum_of_all_pairs+ and +sum_of_identities+ are memoized in
# instance variables, so they go stale if sequences are added or changed
# after the first call.
module Bio::Alignment::EnumerableExtension

  # Returns a new Bio::Alignment::SequenceHash holding, for every sequence,
  # the slice str[start, length]. A nil sequence is stored as "". The
  # receiver's properties are copied over.
  def cut_alignment(start, length)
    a = Bio::Alignment::SequenceHash.new
    a.set_all_property(get_all_property)
    each_pair do |key, str|
      seq = ""
      seq = str[start, length] unless str.nil?
      a.store(key, seq)
    end
    a
  end

  # Finds the run of columns with the highest cumulative sum_of_pair score.
  # A run is abandoned as soon as its running score drops below zero.
  #
  # Returns [start, length, len - start - length, len - start] for the best
  # run; start and length are 0 when no column scores above zero.
  def best_block
    best_start = 0
    best_score = 0
    best_length = 0
    current_start = 0
    current_score = 0
    current_length = 0

    each_base_alignment_with_index do |bases, i|
      current_start = i if current_length == 0
      current_length += 1
      current_score += sum_of_pair(bases)
      if current_score > best_score
        best_score = current_score
        best_length = current_length
        best_start = current_start
      end

      if current_score < 0
        current_length = 0
        current_score = 0
      end
    end

    [best_start, best_length, len - best_start - best_length, len - best_start]
  end

  # Yields, for each column index in 0...len, the array of characters found at
  # that index in every sequence (in +keys+ order) together with the index.
  def each_base_alignment_with_index
    names = keys
    len.times do |i|
      yield names.map { |chr| self[chr][i] }, i
    end
  end

  # Same as each_base_alignment_with_index, without the index.
  def each_base_alignment
    each_base_alignment_with_index do |bases, _i|
      yield bases
    end
  end

  # Sum of sum_of_pair over every column (memoized).
  def sum_of_all_pairs
    return @sum_of_all_pairs if @sum_of_all_pairs
    @sum_of_all_pairs = 0
    each_base_alignment do |bases|
      @sum_of_all_pairs += sum_of_pair(bases)
    end
    @sum_of_all_pairs
  end

  # Sum of s_o_i over every column (memoized).
  def sum_of_identities
    return @sum_of_identities if @sum_of_identities
    @sum_of_identities = 0
    each_base_alignment do |bases|
      @sum_of_identities += s_o_i(bases)
    end
    @sum_of_identities
  end

  # Length of the first sequence, or 0 when there is no first key or its
  # sequence is nil (memoized).
  def len
    return @len if @len
    names = keys
    @len = 0
    @len = self[names[0]].length if names[0] && !self[names[0]].nil?
    @len
  end

  # Number of unordered pairs of sequences: n * (n - 1) / 2.
  def pairwise_comparaisons
    n = keys.size
    n * (n - 1) / 2
  end

  # sum_of_identities divided by len * pairwise_comparaisons, as a Float.
  # The result is NaN when that denominator is zero.
  def identity
    max_score = len * pairwise_comparaisons
    sum_of_identities.to_f / max_score
  end

  # sum_of_all_pairs divided by len * pairwise_comparaisons, as a Float.
  # The result is NaN when that denominator is zero.
  def normalized_sum_of_all_pairs
    max_score = len * pairwise_comparaisons
    sum_of_all_pairs.to_f / max_score
  end

  # Scores one alignment column by summing over every unordered pair:
  #    0  both "-", both "N" or both "n"
  #   -2  exactly one of the two is "-"
  #   +1  the two are equal
  #   -1  the two differ
  #
  # bases - indexable collection of the characters in the column.
  def sum_of_pair(bases)
    last = bases.length - 1
    total = 0
    (0..last).each do |i|
      ((i + 1)..last).each do |j|
        x = bases[i]
        y = bases[j]
        if (x == "-" && y == "-") || (x == "N" && y == "N") || (x == "n" && y == "n")
          total += 0
        elsif x == "-" || y == "-"
          total -= 2
        elsif x == y
          total += 1
        else
          total -= 1
        end
      end
    end
    total
  end

  # Number of unordered pairs in the column whose two characters are equal.
  def s_o_i(bases)
    last = bases.length - 1
    total = 0
    (0..last).each do |i|
      ((i + 1)..last).each do |j|
        total += 1 if bases[i] == bases[j]
      end
    end
    total
  end

  # Scores sliding windows anchored at the END of the alignment: window i ends
  # i * offset columns before the last column and is window_size columns wide.
  #
  # Returns one row per window:
  #   [i * offset, i * offset + window_size,
  #    sum_of_all_pairs, normalized_sum_of_all_pairs,
  #    sum_of_identities, identity]
  #
  # A window that would begin before column 0 is clamped to the columns that
  # actually exist. Passing the negative start straight to String#[] would
  # wrap around and re-score columns from the tail of the alignment.
  def window_identities(window_size=100, offset=25)
    steps = (0..len).step(offset).to_a.map { |a| a + len % offset }.reverse
    ret = []
    steps.each_with_index do |window_end, i|
      start = window_end - window_size
      length = window_size
      if start < 0
        length += start
        start = 0
      end
      tmp_aln = cut_alignment(start, length)
      ret << [
        i * offset,
        i * offset + window_size,
        tmp_aln.sum_of_all_pairs,
        tmp_aln.normalized_sum_of_all_pairs,
        tmp_aln.sum_of_identities,
        tmp_aln.identity]
    end
    ret
  end
end
|
223
|
+
|
224
|
+
# Aligns the given sequences with the external "mafft" program.
#
# When any value in sequences_to_align is nil the input is returned as is,
# without calling mafft. Otherwise the alignment from the mafft report is
# returned. The Bio::MAFFT instance is created on first use and kept in
# @mafft for later calls.
def promoter_alignment(sequences_to_align)
  complete = true
  sequences_to_align.each_value { |seq| complete = false if seq == nil }
  return sequences_to_align unless complete

  #options = ['--maxiterate', '1000', '--ep', '0', '--genafpair', '--quiet']
  @mafft ||= Bio::MAFFT.new("mafft", ['--maxiterate', '1000', '--localpair', '--quiet'])
  @mafft.query_align(sequences_to_align).alignment
end
|
234
|
+
|
235
|
+
# Writes every name/sequence pair of +sequences+ to +filename+ as a FASTA
# record (">name" line followed by the sequence line), replacing any
# existing file.
#
# sequences - any object responding to each_pair that yields (name, sequence).
# filename  - path of the file to (over)write.
#
# Returns nil. The file is opened in block form so the handle is closed even
# when iterating or writing raises.
def write_fasta_from_hash(sequences, filename)
  File.open(filename, "w") do |out|
    sequences.each_pair do |name, seq|
      out.puts ">#{name}\n#{seq}\n"
    end
  end
  nil
end
|
242
|
+
|
243
|
+
# Finds the longest stretch of the alignment in which every sequence has a
# non-gap character, tolerating gap runs of up to max_gap columns inside the
# stretch.
#
# aln     - hash-like alignment: responds to keys and [], values are indexable
#           sequences; the length is taken from the first one.
# max_gap - a run of more than this many consecutive gap columns ends the
#           current stretch.
#
# Returns [start, length, len - start - length, len - start], where length
# includes the gap columns between the first and last fully covered column
# of the stretch. start is -1 and length 0 when no column is fully covered.
def get_longest_aln(aln, max_gap: 10)
  names = aln.keys
  len = 0
  len = aln[names[0]].length if names[0] && !aln[names[0]].nil?

  longest_start = -1
  longest_length = 0
  longest_gaps = 0
  current_start = -1
  current_length = 0  # fully covered columns in the open stretch
  current_gap = 0     # length of the gap run we are currently in
  gaps = 0            # gap columns seen since the open stretch started

  len.times do |i|
    covered = names.all? { |chr| aln[chr][i] != "-" }

    if covered
      current_start = i if current_length == 0
      current_length += 1
      current_gap = 0
    elsif current_length > 0
      # Gaps are counted only once a stretch is open. Counting leading gaps
      # (or the tail of an over-long gap run) would inflate the length of the
      # NEXT stretch and push the reported region past its real end.
      gaps += 1
      current_gap += 1
    end

    if current_length > longest_length
      longest_length = current_length
      longest_start = current_start
      # Exclude the trailing gap run: it is not inside the stretch (yet).
      longest_gaps = gaps - current_gap
    end

    if current_gap > max_gap
      current_length = 0
      current_gap = 0
      gaps = 0
    end
  end

  longest_length += longest_gaps
  [longest_start, longest_length, len - longest_start - longest_length, len - longest_start]
end
|
288
|
+
|
289
|
+
split_token = options[:split_token]

# Reads a FASTA file and keeps one entry per name, where the name is the part
# of the entry id that precedes split_token. When several entries share a
# name the longest one is kept (the first one seen wins ties).
#
# fasta_path  - path of the FASTA file.
# split_token - separator passed to String#split on each entry id.
#
# Returns [entries_by_name, number_of_entries_read].
def read_alignments(fasta_path, split_token)
  by_name = {}
  total = 0
  Bio::FlatFile.open(Bio::FastaFormat, fasta_path) do |fasta_file|
    fasta_file.each do |record|
      name = record.entry_id.split(split_token)[0]
      kept = by_name[name]
      by_name[name] = record if !kept || record.length > kept.length
      total += 1
    end
  end
  [by_name, total]
end
|
305
|
+
|
306
|
+
# Main script: for every triad in the CSV, align (or reload) its promoter
# sequences, score the full alignment, its longest gap-tolerant stretch and
# its best-scoring block, and write:
#   <output_folder>/identities.txt                 one summary row per triad
#   <output_folder>/sliding_window_identities.txt  one row per window
#   <output_folder>/prom_aln/<group_id / 100>/     the alignments as FASTA
sequences, sequence_count = read_alignments(options[:fasta], split_token)

$stderr.puts "#Loaded #{sequences.length} genes from #{sequence_count} sequences"
output_folder = options[:output_folder]

FileUtils.mkdir_p output_folder
summary_file = "#{output_folder}/identities.txt"
long_table_file = "#{output_folder}/sliding_window_identities.txt"

header = ["triad", "total_aln_length"]
header.concat ["longest_start", "longest_length", "longest_start_from_CDS","longest_end_from_CDS", "longest_sum_of_all_pairs","longest_norm_sum_of_all_pairs","longest_sum_of_identities", "longest_identity"]
header.concat ["best_start", "best_length" , "best_start_from_CDS","best_end_from_CDS", "best_sum_of_all_pairs","best_norm_sum_of_all_pairs","best_sum_of_identities", "best_identity"]

# Block form closes both tables even when a triad fails half way through.
File.open(summary_file, "w") do |out|
  File.open(long_table_file, "w") do |long_table|
    out.puts header.join("\t")
    long_table.puts ["triad", "type", "start_from_CDS", "end_from_cds" , "sum_of_all_pairs","norm_sum_of_all_pairs","sum_of_identities", "identity"].join("\t")

    CSV.foreach( options[:triads], headers:true ) do |row|
      a = row['A']
      b = row['B']
      d = row['D']
      triad = row['group_id']

      # Integer division groups the triads into one folder per hundred ids.
      cent_triad = triad.to_i / 100
      folder = "#{output_folder}/prom_aln/#{cent_triad}/"
      save_prom = "#{folder}/#{triad}.prom.fa"

      to_align = Bio::Alignment::SequenceHash.new
      to_align[a] = sequences[a]
      to_align[b] = sequences[b]
      to_align[d] = sequences[d]

      # Reuse an alignment saved by a previous run instead of calling mafft.
      if File.file? save_prom
        cached, _cached_count = read_alignments save_prom, split_token
        prom_aln = Bio::Alignment.new(cached)
      else
        prom_aln = promoter_alignment to_align
      end

      print_arr = [triad, prom_aln.len]

      aln_stats = get_longest_aln prom_aln
      cut_seqs = prom_aln.cut_alignment aln_stats[0], aln_stats[1]
      print_arr.concat aln_stats
      print_arr << cut_seqs.sum_of_all_pairs
      print_arr << cut_seqs.normalized_sum_of_all_pairs
      print_arr << cut_seqs.sum_of_identities
      print_arr << cut_seqs.identity

      best_aln_stats = prom_aln.best_block
      best_aln_cut = prom_aln.cut_alignment best_aln_stats[0], best_aln_stats[1]
      print_arr.concat best_aln_stats
      print_arr << best_aln_cut.sum_of_all_pairs
      print_arr << best_aln_cut.normalized_sum_of_all_pairs
      print_arr << best_aln_cut.sum_of_identities
      print_arr << best_aln_cut.identity

      windowed = [
        ["cut_longest_region", cut_seqs],
        ["cut_best_region", best_aln_cut],
        ["full_promoter", prom_aln]
      ]
      windowed.each do |type, aln|
        aln.window_identities.each do |e|
          long_table.puts [triad, type, e].flatten.join("\t")
        end
      end

      out.puts print_arr.join("\t")

      FileUtils.mkdir_p folder

      write_fasta_from_hash(prom_aln, save_prom) unless File.file?(save_prom)

      # The guard tests the cut file itself. It used to test save_prom, which
      # always exists at this point, so the cut alignment was never written.
      save_prom_cut = "#{folder}/#{triad}.prom.cut.fa"
      write_fasta_from_hash(cut_seqs, save_prom_cut) unless File.file?(save_prom_cut)

      save_prom_cut_best = "#{folder}/#{triad}.prom.cut.best.fa"
      write_fasta_from_hash(best_aln_cut, save_prom_cut_best)
    end
  end
end
|