pasv_lib 0.1.2 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +3 -1
- data/lib/pasv_lib.rb +4 -0
- data/lib/pasv_lib/alignment.rb +168 -0
- data/lib/pasv_lib/error.rb +4 -0
- data/lib/pasv_lib/io.rb +50 -0
- data/lib/pasv_lib/version.rb +1 -1
- data/pasv_lib.gemspec +3 -0
- data/vendor/systemu.rb +22 -7
- metadata +34 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e6d2910a3379cacf4191d43ee3e9bafd9b1b4ffa5b2e4f1c6cabfc3716a98680
|
4
|
+
data.tar.gz: 544acd2bca1ecec7bd4ff1658e50cbbf4bd5301d8f51acf7e0d7af89a011cc63
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 371ce1e3a9001fa8deda62a90d9e01a33fdd759c8d061068f7ba4c7bded572c10d68b594f177ea609aed98865be01f7c179ab3f1cf398b1500ad96e0a2a802fa
|
7
|
+
data.tar.gz: 1c75ab7a7dd0534f4dd0935cb68af4b35e2c81b35c4abae4424e60899e2ec22b74c85365c961223598fc7ca8f8c9650f3d7dfdf74c414fe9a8e13741d50e7194
|
data/.gitignore
CHANGED
data/lib/pasv_lib.rb
CHANGED
@@ -0,0 +1,168 @@
|
|
1
|
+
require "blosum"
|
2
|
+
require "set"
|
3
|
+
|
4
|
+
module PasvLib
|
5
|
+
module Alignment
|
6
|
+
# If you need to check if a residue is a gap, use this Set.
|
7
|
+
GAP_CHARS = Set.new %w[- .]
|
8
|
+
|
9
|
+
# If the overall min of the scoring matrix is < 0, then this shifts every score up by the absolute value of that min so that the overall min becomes zero.
|
10
|
+
#
|
11
|
+
# @param scoring_matrix The scoring matrix to rescale. E.g., Blosum::BLOSUM62.
|
12
|
+
#
|
13
|
+
# @return A new hash scaled to zero, or a deep copy of the old one if the overall min >= 0.
|
14
|
+
def adjust_scoring_matrix scoring_matrix
|
15
|
+
overal_min = scoring_matrix.values.map(&:values).flatten.min
|
16
|
+
|
17
|
+
scaling_value = overal_min < 0 ? overal_min.abs : 0
|
18
|
+
|
19
|
+
# We want a deep copy to prevent things from getting weird while using the new hash later.
|
20
|
+
new_matrix = {}
|
21
|
+
scoring_matrix.each do |residue, scores|
|
22
|
+
new_matrix[residue] = {}
|
23
|
+
scores.each do |other_residue, score|
|
24
|
+
new_matrix[residue][other_residue] = score + scaling_value
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
new_matrix
|
29
|
+
end
|
30
|
+
|
31
|
+
# Get the columns of an aligned set of sequences.
|
32
|
+
#
|
33
|
+
# Any spaces in the alignment are ignored (e.g., like those spaces NCBI puts in their alignments sometimes). Gap characters are '.' or '-'
|
34
|
+
#
|
35
|
+
# @param seqs [Array<String>] an array of sequences, normally aligned sequences
|
36
|
+
#
|
37
|
+
# @return [Array<Array<String>>] an array of alignment columns (which are arrays of single residue strings)
|
38
|
+
#
|
39
|
+
# @raise PasvLib::Error if seqs.empty?
|
40
|
+
# @raise PasvLib::Error if no comparisons can be made, e.g., all gaps
|
41
|
+
# @raise PasvLib::Error if not all sequences are the same length
|
42
|
+
#
|
43
|
+
#
|
44
|
+
# @example
|
45
|
+
# klass.alignment_columns ["AA-", "A-A"] #=> [["A", "A"], ["A", "-"], ["-", "A"]]
|
46
|
+
def alignment_columns seqs
|
47
|
+
seqs_no_spaces = seqs.map { |seq| seq.tr " ", "" }
|
48
|
+
len = seqs_no_spaces.first.length
|
49
|
+
|
50
|
+
seqs_no_spaces.map do |seq|
|
51
|
+
unless seq.length == len
|
52
|
+
raise PasvLib::Error, "Aligned seqs must be the same length"
|
53
|
+
end
|
54
|
+
|
55
|
+
seq.chars
|
56
|
+
end.transpose
|
57
|
+
end
|
58
|
+
|
59
|
+
def gap? residue
|
60
|
+
GAP_CHARS.include? residue
|
61
|
+
end
|
62
|
+
|
63
|
+
# Calculate the geometric index for an alignment.
|
64
|
+
#
|
65
|
+
# Basically, you change all residues to 0 and all gaps to 1. Then you take the permutations of the sequences and then the residues and XOR each of the permutations. Then you add it up and take averages and you'll get the score. That is a pretty bad explanation, so see http://merenlab.org/2016/11/08/pangenomics-v2/#geometric-homogeneity-index for more information.
|
66
|
+
#
|
67
|
+
# @param aln [Array<String>] aligned sequences
|
68
|
+
# @param by [String] either "sequence" or "residue". Controls whether to do the calculation by sequence, or by residue.
|
69
|
+
#
|
70
|
+
# @return [Float] a score between 0 and 1, with 1 being very homogeneous and 0 being very heterogeneous.
|
71
|
+
#
|
72
|
+
# @note The original Anvi'o code uses a clever bitshifting scheme to avoid storing array of arrays. It may also speed up the XOR part as you're doing fewer XORs, but I'm not positive about that.
|
73
|
+
def geometric_score aln, by
|
74
|
+
binary_aln = to_binary_matrix aln
|
75
|
+
|
76
|
+
if by == "residue"
|
77
|
+
binary_aln = binary_aln.transpose
|
78
|
+
end
|
79
|
+
|
80
|
+
num_rows = binary_aln.length
|
81
|
+
max_differences_per_row = binary_aln.first.length
|
82
|
+
num_comparisions_per_row = num_rows - 1
|
83
|
+
|
84
|
+
diff_score = binary_aln.permutation(2).map do |(row1, row2)|
|
85
|
+
row1.zip(row2).map do |elem1, elem2|
|
86
|
+
elem1 ^ elem2
|
87
|
+
end.sum / max_differences_per_row.to_f
|
88
|
+
end.sum / num_comparisions_per_row.to_f / num_rows
|
89
|
+
|
90
|
+
1 - diff_score
|
91
|
+
end
|
92
|
+
|
93
|
+
# A wrapper for #geometric_score that takes the average of the by sequence and by residue scores for an alignment.
|
94
|
+
#
|
95
|
+
# @param aln [Array<String>] aligned sequences
|
96
|
+
#
|
97
|
+
# @return [Float] a score between 0 and 1, with 1 being very homogeneous and 0 being very heterogeneous.
|
98
|
+
def geometric_index aln
|
99
|
+
by_seq_score = geometric_score aln, "sequence"
|
100
|
+
by_residue_score = geometric_score aln, "residue"
|
101
|
+
|
102
|
+
(by_seq_score + by_residue_score) / 2.0
|
103
|
+
end
|
104
|
+
|
105
|
+
# Returns the similarity score for the alignment.
|
106
|
+
#
|
107
|
+
# For each column, each pair of non-gap residues is scored using the scoring matrix. The points accrued and the max_points are each summed over all columns, and the score is total points accrued / total max_points.
|
108
|
+
#
|
109
|
+
# @raise PasvLib::Error if any residue is not present in the scoring matrix.
|
110
|
+
# @raise PasvLib::Error if seqs is empty
|
111
|
+
# @raise PasvLib::Error if max_points is zero. Could happen if one of the seqs has all gaps, or no comparisons could be made.
|
112
|
+
#
|
113
|
+
# @note Technically you could get a negative score if you don't have enough high scoring residue pairs to keep the total score above zero. If this is the case, your alignment probably isn't very good. Alternatively, you could use #adjust_scoring_matrix to avoid this issue....
|
114
|
+
def similarity_score seqs, scoring_matrix = Blosum::BLOSUM62
|
115
|
+
raise PasvLib::Error if seqs.empty?
|
116
|
+
return 1.0 if seqs.count == 1
|
117
|
+
|
118
|
+
aln_cols = alignment_columns seqs
|
119
|
+
|
120
|
+
actual_points = 0
|
121
|
+
max_points = 0
|
122
|
+
|
123
|
+
aln_cols.each do |residues|
|
124
|
+
residues.map(&:upcase).combination(2).each do |r1, r2|
|
125
|
+
unless gap?(r1) || gap?(r2)
|
126
|
+
# Check that scoring matrix has the residues.
|
127
|
+
[r1, r2].each do |res|
|
128
|
+
unless scoring_matrix.has_key? res
|
129
|
+
raise PasvLib::Error, "Residue '#{res}' is missing from the " \
|
130
|
+
"scoring matrix."
|
131
|
+
end
|
132
|
+
end
|
133
|
+
|
134
|
+
r1_max_score = scoring_matrix[r1].values.max
|
135
|
+
r2_max_score = scoring_matrix[r2].values.max
|
136
|
+
pair_max = [r1_max_score, r2_max_score].max
|
137
|
+
|
138
|
+
# TODO check that residues exist in the scoring matrix.
|
139
|
+
actual_points += scoring_matrix[r1][r2]
|
140
|
+
max_points += pair_max
|
141
|
+
end
|
142
|
+
end
|
143
|
+
end
|
144
|
+
|
145
|
+
if max_points.zero?
|
146
|
+
raise PasvLib::Error, "Something went wrong and max_points was 0. " \
|
147
|
+
"Maybe one of your sequences is all gaps?"
|
148
|
+
|
149
|
+
end
|
150
|
+
|
151
|
+
actual_points / max_points.to_f
|
152
|
+
end
|
153
|
+
|
154
|
+
# Convert an alignment to a binary matrix where 1 represents gaps and 0 represents residues.
|
155
|
+
#
|
156
|
+
# @param aln [Array<String>] an array of (aligned) sequences
|
157
|
+
#
|
158
|
+
# @return [Array<Array<Integer>>] an array of arrays. Each row is a sequence.
|
159
|
+
def to_binary_matrix aln
|
160
|
+
aln.map do |seq|
|
161
|
+
seq.chars.map do |char|
|
162
|
+
# Gaps turn to 1, residues turn to 0.
|
163
|
+
gap?(char) ? 1 : 0
|
164
|
+
end
|
165
|
+
end
|
166
|
+
end
|
167
|
+
end
|
168
|
+
end
|
data/lib/pasv_lib/io.rb
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
require "parse_fasta"
|
2
|
+
|
3
|
+
module PasvLib
|
4
|
+
module Io
|
5
|
+
def read_refs fname
|
6
|
+
refs = {}
|
7
|
+
|
8
|
+
ParseFasta::SeqFile.open(fname).each_record do |rec|
|
9
|
+
if has_gaps? rec.seq
|
10
|
+
raise PasvLib::ParseError,
|
11
|
+
"Record '#{rec.header}' had gaps! Did you accidentally " \
|
12
|
+
"provide aligned sequences?"
|
13
|
+
end
|
14
|
+
|
15
|
+
if refs.count.zero?
|
16
|
+
head = "first_pasv_ref"
|
17
|
+
else
|
18
|
+
head = "pasv_ref___#{rec.id}"
|
19
|
+
end
|
20
|
+
|
21
|
+
refs[head] = rec.seq
|
22
|
+
end
|
23
|
+
|
24
|
+
refs
|
25
|
+
end
|
26
|
+
|
27
|
+
def read_queries fname
|
28
|
+
queries = {}
|
29
|
+
|
30
|
+
ParseFasta::SeqFile.open(fname).each_record do |rec|
|
31
|
+
if has_gaps? rec.seq
|
32
|
+
raise PasvLib::ParseError,
|
33
|
+
"Record '#{rec.header}' had gaps! Did you accidentally " \
|
34
|
+
"provide aligned sequences?"
|
35
|
+
end
|
36
|
+
|
37
|
+
header = "pasv_query___#{rec.header}"
|
38
|
+
queries[header] = rec.seq
|
39
|
+
end
|
40
|
+
|
41
|
+
queries
|
42
|
+
end
|
43
|
+
|
44
|
+
private
|
45
|
+
|
46
|
+
def has_gaps? seq
|
47
|
+
seq.include?("-") || seq.include?(".")
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
data/lib/pasv_lib/version.rb
CHANGED
data/pasv_lib.gemspec
CHANGED
@@ -23,4 +23,7 @@ Gem::Specification.new do |spec|
|
|
23
23
|
spec.add_development_dependency "bundler", "~> 1.15"
|
24
24
|
spec.add_development_dependency "rake", "~> 10.0"
|
25
25
|
spec.add_development_dependency "rspec", "~> 3.0"
|
26
|
+
|
27
|
+
spec.add_runtime_dependency "blosum", "~> 0.1.0"
|
28
|
+
spec.add_runtime_dependency "parse_fasta", "~> 2.5.2"
|
26
29
|
end
|
data/vendor/systemu.rb
CHANGED
@@ -25,7 +25,9 @@
|
|
25
25
|
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
26
26
|
#
|
27
27
|
|
28
|
-
# This is a modified version of systemu by me (Ryan Moore).
|
28
|
+
# This is a (slightly) modified version of systemu by me (Ryan Moore).
|
29
|
+
# I've only changed which errors are raised and removed the systemu
|
30
|
+
# method from the Object class.
|
29
31
|
|
30
32
|
require 'tmpdir'
|
31
33
|
require 'socket'
|
@@ -45,13 +47,17 @@ class SystemUniversal
|
|
45
47
|
#
|
46
48
|
# error class
|
47
49
|
#
|
50
|
+
# This is now set up so any error intentionally raised by systemu will inherit from SystemUniversal::Error.
|
48
51
|
class Error < RuntimeError
|
49
52
|
end
|
50
53
|
|
54
|
+
class EEXIST < Error
|
55
|
+
end
|
56
|
+
|
51
57
|
#
|
52
58
|
# constants
|
53
59
|
#
|
54
|
-
SystemUniversal::VERSION = '2.6.5' unless SystemUniversal.send(:const_defined?, :VERSION)
|
60
|
+
SystemUniversal::VERSION = '2.6.5.9999' unless SystemUniversal.send(:const_defined?, :VERSION)
|
55
61
|
def SystemUniversal.version() SystemUniversal::VERSION end
|
56
62
|
def version() SystemUniversal::VERSION end
|
57
63
|
def SystemUniversal.description
|
@@ -128,10 +134,19 @@ class SystemUniversal
|
|
128
134
|
buf = pipe.read
|
129
135
|
buf = "#{ line }#{ buf }"
|
130
136
|
e = Marshal.load buf
|
131
|
-
|
132
|
-
raise
|
137
|
+
|
138
|
+
# If Marshal load loaded a particular exception, raise it.
|
139
|
+
if Exception === e
|
140
|
+
raise e
|
141
|
+
else
|
142
|
+
raise
|
143
|
+
end
|
144
|
+
|
145
|
+
# regardless of the error raised, we want to raise SystemUniversal::Error so we have one thing to catch.
|
146
|
+
rescue e
|
147
|
+
raise Error, "systemu: Error - process interrupted (original error: #{e.inspect})!\n#{ buf }\n"
|
133
148
|
rescue
|
134
|
-
raise Error "systemu: Error - process interrupted!\n#{ buf }\n"
|
149
|
+
raise Error, "systemu: Error - process interrupted!\n#{ buf }\n"
|
135
150
|
end
|
136
151
|
end
|
137
152
|
thread = new_thread cid, @block if @block
|
@@ -280,8 +295,8 @@ class SystemUniversal
|
|
280
295
|
|
281
296
|
begin
|
282
297
|
Dir.mkdir tmp
|
283
|
-
rescue Errno::EEXIST
|
284
|
-
raise
|
298
|
+
rescue Errno::EEXIST => e
|
299
|
+
raise EEXIST, e.message if i >= max
|
285
300
|
next
|
286
301
|
end
|
287
302
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pasv_lib
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Moore
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-05-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -52,6 +52,34 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '3.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: blosum
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 0.1.0
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: 0.1.0
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: parse_fasta
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: 2.5.2
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: 2.5.2
|
55
83
|
description:
|
56
84
|
email:
|
57
85
|
- moorer@udel.edu
|
@@ -69,6 +97,9 @@ files:
|
|
69
97
|
- bin/console
|
70
98
|
- bin/setup
|
71
99
|
- lib/pasv_lib.rb
|
100
|
+
- lib/pasv_lib/alignment.rb
|
101
|
+
- lib/pasv_lib/error.rb
|
102
|
+
- lib/pasv_lib/io.rb
|
72
103
|
- lib/pasv_lib/version.rb
|
73
104
|
- pasv_lib.gemspec
|
74
105
|
- vendor/systemu.rb
|
@@ -90,7 +121,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
90
121
|
- !ruby/object:Gem::Version
|
91
122
|
version: '0'
|
92
123
|
requirements: []
|
93
|
-
rubygems_version: 3.0.
|
124
|
+
rubygems_version: 3.0.6
|
94
125
|
signing_key:
|
95
126
|
specification_version: 4
|
96
127
|
summary: Library code for PASV
|