pasv_lib 0.3.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/pasv_lib.rb +2 -0
- data/lib/pasv_lib/alignment.rb +147 -0
- data/lib/pasv_lib/error.rb +3 -0
- data/lib/pasv_lib/version.rb +1 -1
- data/pasv_lib.gemspec +2 -0
- metadata +19 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 752ca72e197ff3f4249ba50b67cbe8ecc09a00aa8e4f4b050943437c303bbc42
|
4
|
+
data.tar.gz: 35c1ed983ba5643ea84e31fa3c7ca6c0ecb031597ea2d8058664725f443e336a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5436d6c890346eb6560d6f00d4801ca5f59f61e0d517404429553b23750f4516f117a86fb8236cc1cabd7f1a07251a2ce6f41cc2df8c3210620ac0204e686e5b
|
7
|
+
data.tar.gz: e92b04401c6b118b7906eca7e750f42dcfe720d4d42c3d533de33d6a8f46babb6601bdeeb2ed8e80bd4bce9407e084f8a8d60c0e6215a8e254aa96ab79442511
|
data/lib/pasv_lib.rb
CHANGED
@@ -0,0 +1,147 @@
|
|
1
|
+
require "blosum"
|
2
|
+
require "set"
|
3
|
+
|
4
|
+
module PasvLib
  # Scoring helpers for multiple sequence alignments.
  #
  # All methods are instance methods, so mix this module into a class (or
  # extend an object with it) to use them.
  module Alignment
    # Characters that count as gaps in an aligned sequence.
    GAP_CHARS = Set.new(%w[- .]).freeze

    # If the overall min of the scoring matrix is < 0, then this scales it so
    # that the overall min becomes zero.
    #
    # @param scoring_matrix [Hash] The scoring matrix to rescale: a hash of
    #   residue => { other_residue => score }. E.g., Blosum::BLOSUM62.
    #
    # @return [Hash] A new hash scaled to zero, or a deep copy of the old one
    #   if the overall min >= 0 (or if the matrix has no scores at all).
    def adjust_scoring_matrix scoring_matrix
      overall_min = scoring_matrix.values.flat_map(&:values).min

      # An empty matrix has no minimum; treat it like a non-negative one.
      scaling_value = (overall_min && overall_min < 0) ? overall_min.abs : 0

      # We want a deep copy to prevent things from getting weird while using
      # the new hash later, so both levels are rebuilt from scratch.
      scoring_matrix.each_with_object({}) do |(residue, scores), new_matrix|
        new_matrix[residue] = scores.each_with_object({}) do |(other_residue, score), new_row|
          new_row[other_residue] = score + scaling_value
        end
      end
    end

    # Get the columns of an aligned set of sequences.
    #
    # Any spaces in the alignment are ignored (e.g., like those spaces NCBI
    # puts in their alignments sometimes). Gap characters are kept as-is.
    #
    # @param seqs [Array<String>] an array of sequences, normally aligned
    #   sequences
    #
    # @return [Array<Array<String>>] an array of alignment columns (which are
    #   arrays of single residue strings)
    #
    # @raise [PasvLib::Error] if seqs.empty?
    # @raise [PasvLib::Error] if not all sequences are the same length (after
    #   spaces are removed)
    #
    # @example
    #   klass.alignment_columns ["AA-", "A-A"] #=> [["A", "A"], ["A", "-"], ["-", "A"]]
    def alignment_columns seqs
      # Without this guard, `first` would be nil and we would blow up with a
      # NoMethodError rather than the documented PasvLib::Error.
      if seqs.empty?
        raise PasvLib::Error, "Can't get alignment columns: no sequences given"
      end

      seqs_no_spaces = seqs.map { |seq| seq.delete " " }
      len = seqs_no_spaces.first.length

      rows = seqs_no_spaces.map do |seq|
        unless seq.length == len
          raise PasvLib::Error, "Aligned seqs must be the same length"
        end

        seq.chars
      end

      rows.transpose
    end

    # Calculate the geometric score for an alignment.
    #
    # Basically, you change all residues to 0 and all gaps to 1. Then every
    # ordered pair of rows is XORed position by position, the fraction of
    # differing positions is taken for each pair, and those fractions are
    # averaged. The score is one minus that average. See
    # http://merenlab.org/2016/11/08/pangenomics-v2/#geometric-homogeneity-index
    # for more information.
    #
    # @param aln [Array<String>] aligned sequences (all the same length)
    # @param by [String] either "sequence" or "residue". Controls whether to do
    #   the calculation by sequence, or by residue. "residue" compares
    #   alignment columns; any other value compares sequences.
    #
    # @return [Float] a score between 0 and 1, with 1 being very homogeneous
    #   and 0 being very heterogeneous. If there is only a single row to look
    #   at (so no comparisons can be made), the score is 1.0.
    #
    # @raise [PasvLib::Error] if the alignment has no sequences or the
    #   sequences have no characters
    #
    # @note The original Anvi'o code uses a clever bitshifting scheme to avoid
    #   storing array of arrays. It may also speed up the XOR part as you're
    #   doing fewer XORs, but I'm not positive about that.
    def geometric_score aln, by
      binary_aln = to_binary_matrix aln
      binary_aln = binary_aln.transpose if by == "residue"

      if binary_aln.empty? || binary_aln.first.empty?
        raise PasvLib::Error, "Can't calculate the geometric score of an empty alignment"
      end

      num_rows = binary_aln.length

      # A single row has nothing to be compared against. Without this guard
      # the averaging below would be 0 / 0.0, i.e., NaN.
      return 1.0 if num_rows == 1

      max_differences_per_row = binary_aln.first.length
      num_comparisons_per_row = num_rows - 1

      # Fraction of differing positions for every ordered pair of rows.
      pair_differences = binary_aln.permutation(2).map do |row1, row2|
        row1.zip(row2).sum { |elem1, elem2| elem1 ^ elem2 } / max_differences_per_row.to_f
      end

      diff_score = pair_differences.sum / num_comparisons_per_row.to_f / num_rows

      1 - diff_score
    end

    # A wrapper for #geometric_score that takes the average of the by sequence
    # and by residue scores for an alignment.
    #
    # @param aln [Array<String>] aligned sequences
    #
    # @return [Float] a score between 0 and 1, with 1 being very homogeneous
    #   and 0 being very heterogeneous.
    #
    # @raise [PasvLib::Error] if the alignment is empty (see #geometric_score)
    def geometric_index aln
      by_seq_score = geometric_score aln, "sequence"
      by_residue_score = geometric_score aln, "residue"

      (by_seq_score + by_residue_score) / 2.0
    end

    # Score an alignment by summing the scoring matrix value of every pair of
    # residues in every column, relative to the best that those pairs could
    # have scored.
    #
    # Residues are upcased before lookup. Any pair in which either member is a
    # gap is skipped. The best possible score of a pair is the larger of the
    # two residues' highest scores in the matrix.
    #
    # @param seqs [Array<String>] aligned sequences
    # @param scoring_matrix [Hash] a hash of
    #   residue => { other_residue => score }. Defaults to Blosum::BLOSUM62.
    #
    # @return [Float] actual points divided by max points; 1.0 if there is
    #   only a single sequence.
    #
    # @raise [PasvLib::Error] if seqs.empty?
    # @raise [PasvLib::Error] if not all sequences are the same length
    # @raise [PasvLib::Error] if a residue (or residue pair) is not in the
    #   scoring matrix
    # @raise [PasvLib::Error] if no comparisons can be made, e.g., all gaps
    #
    # @note Technically you could get a negative score if you don't have
    #   enough high scoring residue pairs to keep the total score above zero.
    #   If this is the case, your alignment probably isn't very good.
    def similarity_score seqs, scoring_matrix = Blosum::BLOSUM62
      raise PasvLib::Error, "Can't score an empty set of sequences" if seqs.empty?
      return 1.0 if seqs.count == 1

      aln_cols = alignment_columns seqs

      # The best score of a residue only depends on the matrix, so look it up
      # once per residue rather than once per pair. This is also where
      # residues missing from the matrix get caught.
      best_scores = Hash.new do |cache, residue|
        scores = scoring_matrix[residue]

        unless scores
          raise PasvLib::Error, "Residue #{residue.inspect} is not in the scoring matrix"
        end

        cache[residue] = scores.values.max
      end

      actual_points = 0
      max_points = 0

      aln_cols.each do |residues|
        residues.map(&:upcase).combination(2).each do |r1, r2|
          next if GAP_CHARS.include?(r1) || GAP_CHARS.include?(r2)

          pair_max = [best_scores[r1], best_scores[r2]].max
          pair_score = scoring_matrix[r1][r2]

          if pair_score.nil?
            raise PasvLib::Error,
                  "Residue pair #{r1.inspect}, #{r2.inspect} is not in the scoring matrix"
          end

          actual_points += pair_score
          max_points += pair_max
        end
      end

      if max_points.zero?
        raise PasvLib::Error, "Something went wrong and max_points was 0. " \
                              "Maybe one of your sequences is all gaps?"
      end

      actual_points / max_points.to_f
    end

    # Convert an alignment to a binary matrix where 1 represents gaps and 0
    # represents residues.
    #
    # Every character that is not in GAP_CHARS (spaces included) counts as a
    # residue.
    #
    # @param aln [Array<String>] an array of (aligned) sequences
    #
    # @return [Array<Array<Integer>>] an array of arrays. Each row is a
    #   sequence.
    def to_binary_matrix aln
      aln.map do |seq|
        # Gaps turn to 1, residues turn to 0.
        seq.chars.map { |char| GAP_CHARS.include?(char) ? 1 : 0 }
      end
    end
  end
end
|
data/lib/pasv_lib/version.rb
CHANGED
data/pasv_lib.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pasv_lib
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Moore
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-10-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -52,6 +52,20 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '3.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: blosum
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 0.1.0
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: 0.1.0
|
55
69
|
description:
|
56
70
|
email:
|
57
71
|
- moorer@udel.edu
|
@@ -69,6 +83,8 @@ files:
|
|
69
83
|
- bin/console
|
70
84
|
- bin/setup
|
71
85
|
- lib/pasv_lib.rb
|
86
|
+
- lib/pasv_lib/alignment.rb
|
87
|
+
- lib/pasv_lib/error.rb
|
72
88
|
- lib/pasv_lib/version.rb
|
73
89
|
- pasv_lib.gemspec
|
74
90
|
- vendor/systemu.rb
|
@@ -90,7 +106,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
90
106
|
- !ruby/object:Gem::Version
|
91
107
|
version: '0'
|
92
108
|
requirements: []
|
93
|
-
rubygems_version: 3.0.
|
109
|
+
rubygems_version: 3.0.6
|
94
110
|
signing_key:
|
95
111
|
specification_version: 4
|
96
112
|
summary: Library code for PASV
|