pasv_lib 0.4.0 → 0.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/pasv_lib/alignment.rb +28 -7
- data/lib/pasv_lib/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9c784b906c7e31bdcd730c366169b973d47a2d42684515bcb80bdc07a1885deb
|
4
|
+
data.tar.gz: 6e8d255293a94a02fcaf4cee373632c387c753db7f71b8be6515c75ea84e22f6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ed50ff3cf8856cf568bb20f38f0adcd045699244b468909e88452a5fbc930477e26618dc0e18b0a7c6e332a6f8de1a48803b44ba228812eac53b1b35e92a38a4
|
7
|
+
data.tar.gz: d73b1d2bcbc5b28a951017d6074b7ce5cdc7435684c86be4df4911c5d01924164606cce8c679b638206b2c951ac7506567ef97a29dee6e848d3f8f051b3f7c73
|
data/lib/pasv_lib/alignment.rb
CHANGED
@@ -3,6 +3,7 @@ require "set"
|
|
3
3
|
|
4
4
|
module PasvLib
|
5
5
|
module Alignment
|
6
|
+
# If you need to check if a residue is a gap, use this Set.
|
6
7
|
GAP_CHARS = Set.new %w[- .]
|
7
8
|
|
8
9
|
# If the overall min of the scoring matrix is < 0, then this scales it so that the overall min becomes zero.
|
@@ -44,7 +45,7 @@ module PasvLib
|
|
44
45
|
# klass.alignment_columns ["AA-", "A-A"] #=> [["A", "A"], ["A", "-"], ["-", "A"]]
|
45
46
|
def alignment_columns seqs
|
46
47
|
seqs_no_spaces = seqs.map { |seq| seq.tr " ", "" }
|
47
|
-
len
|
48
|
+
len = seqs_no_spaces.first.length
|
48
49
|
|
49
50
|
seqs_no_spaces.map do |seq|
|
50
51
|
unless seq.length == len
|
@@ -55,6 +56,10 @@ module PasvLib
|
|
55
56
|
end.transpose
|
56
57
|
end
|
57
58
|
|
59
|
+
def gap? residue
|
60
|
+
GAP_CHARS.include? residue
|
61
|
+
end
|
62
|
+
|
58
63
|
# Calculate the geometric index for an alignment.
|
59
64
|
#
|
60
65
|
# Basically, you change all residues to 0 and all gaps to 1. Then you take the permutations of the sequences and then the residues and XOR each of the permutations. Then you add it up and take averages and you'll get the score. That is a pretty bad explanation, so see http://merenlab.org/2016/11/08/pangenomics-v2/#geometric-homogeneity-index for more information.
|
@@ -72,8 +77,8 @@ module PasvLib
|
|
72
77
|
binary_aln = binary_aln.transpose
|
73
78
|
end
|
74
79
|
|
75
|
-
num_rows
|
76
|
-
max_differences_per_row
|
80
|
+
num_rows = binary_aln.length
|
81
|
+
max_differences_per_row = binary_aln.first.length
|
77
82
|
num_comparisions_per_row = num_rows - 1
|
78
83
|
|
79
84
|
diff_score = binary_aln.permutation(2).map do |(row1, row2)|
|
@@ -91,13 +96,21 @@ module PasvLib
|
|
91
96
|
#
|
92
97
|
# @return [Float] a score between 0 and 1, with 1 being very homogeneous and 0 being very heterogeneous.
|
93
98
|
def geometric_index aln
|
94
|
-
by_seq_score
|
99
|
+
by_seq_score = geometric_score aln, "sequence"
|
95
100
|
by_residue_score = geometric_score aln, "residue"
|
96
101
|
|
97
102
|
(by_seq_score + by_residue_score) / 2.0
|
98
103
|
end
|
99
104
|
|
100
|
-
#
|
105
|
+
# Returns the similarity score for the alignment.
|
106
|
+
#
|
107
|
+
# For each column, each pair of residues is scored using the similarity matrix to get a ratio of points accrued / max_points. Then all column scores are averaged.
|
108
|
+
#
|
109
|
+
# @raise PasvLib::Error if any residue is not present in the scoring matrix.
|
110
|
+
# @raise PasvLib::Error if seqs is empty
|
111
|
+
# @raise PasvLib::Error if max_points is zero. Could happen if one of the seqs has all gaps, or no comparisons could be made.
|
112
|
+
#
|
113
|
+
# @note Technically you could get a negative score if you don't have enough high scoring residue pairs to keep the total score above zero. If this is the case, your alignment probably isn't very good. Alternatively, you could use #adjust_scoring_matrix to avoid this issue....
|
101
114
|
def similarity_score seqs, scoring_matrix = Blosum::BLOSUM62
|
102
115
|
raise PasvLib::Error if seqs.empty?
|
103
116
|
return 1.0 if seqs.count == 1
|
@@ -109,7 +122,15 @@ module PasvLib
|
|
109
122
|
|
110
123
|
aln_cols.each do |residues|
|
111
124
|
residues.map(&:upcase).combination(2).each do |r1, r2|
|
112
|
-
unless
|
125
|
+
unless gap?(r1) || gap?(r2)
|
126
|
+
# Check that scoring matrix has the residues.
|
127
|
+
[r1, r2].each do |res|
|
128
|
+
unless scoring_matrix.has_key? res
|
129
|
+
raise PasvLib::Error, "Residue '#{res}' is missing from the " \
|
130
|
+
"scoring matrix."
|
131
|
+
end
|
132
|
+
end
|
133
|
+
|
113
134
|
r1_max_score = scoring_matrix[r1].values.max
|
114
135
|
r2_max_score = scoring_matrix[r2].values.max
|
115
136
|
pair_max = [r1_max_score, r2_max_score].max
|
@@ -139,7 +160,7 @@ module PasvLib
|
|
139
160
|
aln.map do |seq|
|
140
161
|
seq.chars.map do |char|
|
141
162
|
# Gaps turn to 1, residues turn to 0.
|
142
|
-
|
163
|
+
gap?(char) ? 1 : 0
|
143
164
|
end
|
144
165
|
end
|
145
166
|
end
|
data/lib/pasv_lib/version.rb
CHANGED