pasv_lib 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ee12cdba5e8132195d6655631b12e29b09a7fa79121b8f7fcb7357bb66eacf88
4
- data.tar.gz: 0bf0fbbb1c805d2ccac3ffa5343886e8389cd41080fcb528c923c87d26589cdf
3
+ metadata.gz: 752ca72e197ff3f4249ba50b67cbe8ecc09a00aa8e4f4b050943437c303bbc42
4
+ data.tar.gz: 35c1ed983ba5643ea84e31fa3c7ca6c0ecb031597ea2d8058664725f443e336a
5
5
  SHA512:
6
- metadata.gz: 67dffebcaa35845a5bfc3c603978083565dcccf816e11cea312b1005fa9071a208428fdd34b3c7eae60daf40ed2359483ec29dfacece2d8d1d64db054a896ff8
7
- data.tar.gz: '03139a6ff7a005b2eed0b36579e7d9d54d31073435e49e0118da1360b3f880de680811b9e4da9f8f0a4077caea43bcfe0642f1391f1d836bd439362bcaa8f84d'
6
+ metadata.gz: 5436d6c890346eb6560d6f00d4801ca5f59f61e0d517404429553b23750f4516f117a86fb8236cc1cabd7f1a07251a2ce6f41cc2df8c3210620ac0204e686e5b
7
+ data.tar.gz: e92b04401c6b118b7906eca7e750f42dcfe720d4d42c3d533de33d6a8f46babb6601bdeeb2ed8e80bd4bce9407e084f8a8d60c0e6215a8e254aa96ab79442511
data/lib/pasv_lib.rb CHANGED
@@ -1,3 +1,5 @@
1
+ require "pasv_lib/error"
2
+ require "pasv_lib/alignment"
1
3
  require "pasv_lib/version"
2
4
  require File.join __dir__, "..", "vendor", "systemu"
3
5
 
@@ -0,0 +1,147 @@
1
+ require "blosum"
2
+ require "set"
3
+
4
+ module PasvLib
5
+ module Alignment
6
+ GAP_CHARS = Set.new %w[- .]
7
+
8
+ # If the overall min of the scoring matrix is < 0, then this scales it so that the overall min becomes zero.
9
+ #
10
+ # @param scoring_matrix The scoring matrix to rescale. E.g., Blosum::BLOSUM62.
11
+ #
12
+ # @return A new hash scaled to zero, or a deep copy of the old one if the overall min >= 0.
13
+ def adjust_scoring_matrix scoring_matrix
14
+ overal_min = scoring_matrix.values.map(&:values).flatten.min
15
+
16
+ scaling_value = overal_min < 0 ? overal_min.abs : 0
17
+
18
+ # We want a deep copy to prevent things from getting weird while using the new hash later.
19
+ new_matrix = {}
20
+ scoring_matrix.each do |residue, scores|
21
+ new_matrix[residue] = {}
22
+ scores.each do |other_residue, score|
23
+ new_matrix[residue][other_residue] = score + scaling_value
24
+ end
25
+ end
26
+
27
+ new_matrix
28
+ end
29
+
30
+ # Get the columns of an aligned set of sequences.
31
+ #
32
+ # Any spaces in the alignment are ignored (e.g., like those spaces NCBI puts in their alignments sometimes). Gap characters are '.' or '-'
33
+ #
34
+ # @param seqs [Array<String>] an array of sequences, normally aligned sequences
35
+ #
36
+ # @return [Array<Array<String>>] an array of alignment columns (which are arrays of single residue strings)
37
+ #
38
+ # @raise PasvLib::Error if seqs.empty?
39
+ # @raise PasvLib::Error if no comparisons can be made, e.g., all gaps
40
+ # @raise PasvLib::Error if not all sequences are the same length
41
+ #
42
+ #
43
+ # @example
44
+ # klass.alignment_columns ["AA-", "A-A"] #=> [["A", "A"], ["A", "-"], ["-", "A"]]
45
+ def alignment_columns seqs
46
+ seqs_no_spaces = seqs.map { |seq| seq.tr " ", "" }
47
+ len = seqs_no_spaces.first.length
48
+
49
+ seqs_no_spaces.map do |seq|
50
+ unless seq.length == len
51
+ raise PasvLib::Error, "Aligned seqs must be the same length"
52
+ end
53
+
54
+ seq.chars
55
+ end.transpose
56
+ end
57
+
58
+ # Calculate the geometric index for an alignment.
59
+ #
60
+ # Basically, you change all residues to 0 and all gaps to 1. Then you take the permutations of the sequences and then the residues and XOR each of the permutations. Then you add it up and take averages and you'll get the score. That is a pretty bad explanation, so see http://merenlab.org/2016/11/08/pangenomics-v2/#geometric-homogeneity-index for more information.
61
+ #
62
+ # @param aln [Array<String>] aligned sequences
63
+ # @param by [String] either "sequence" or "residue". Controls whether to do the calculation by sequence, or by residue.
64
+ #
65
+ # @return [Float] a score between 0 and 1, with 1 being very homogeneous and 0 being very heterogeneous.
66
+ #
67
+ # @note The original Anvi'o code uses a clever bitshifting scheme to avoid storing array of arrays. It may also speed up the XOR part as you're doing fewer XORs, but I'm not positive about that.
68
+ def geometric_score aln, by
69
+ binary_aln = to_binary_matrix aln
70
+
71
+ if by == "residue"
72
+ binary_aln = binary_aln.transpose
73
+ end
74
+
75
+ num_rows = binary_aln.length
76
+ max_differences_per_row = binary_aln.first.length
77
+ num_comparisions_per_row = num_rows - 1
78
+
79
+ diff_score = binary_aln.permutation(2).map do |(row1, row2)|
80
+ row1.zip(row2).map do |elem1, elem2|
81
+ elem1 ^ elem2
82
+ end.sum / max_differences_per_row.to_f
83
+ end.sum / num_comparisions_per_row.to_f / num_rows
84
+
85
+ 1 - diff_score
86
+ end
87
+
88
+ # A wrapper for #geometric_score that takes the average of the by sequence and by residue scores for an alignment.
89
+ #
90
+ # @param aln [Array<String>] aligned sequences
91
+ #
92
+ # @return [Float] a score between 0 and 1, with 1 being very homogeneous and 0 being very heterogeneous.
93
+ def geometric_index aln
94
+ by_seq_score = geometric_score aln, "sequence"
95
+ by_residue_score = geometric_score aln, "residue"
96
+
97
+ (by_seq_score + by_residue_score) / 2.0
98
+ end
99
+
100
+ # @note Technically you could get a negative score if you don't have enough high scoring residue pairs to keep the total score above zero. If this is the case, your alignment probably isn't very good.
101
+ def similarity_score seqs, scoring_matrix = Blosum::BLOSUM62
102
+ raise PasvLib::Error if seqs.empty?
103
+ return 1.0 if seqs.count == 1
104
+
105
+ aln_cols = alignment_columns seqs
106
+
107
+ actual_points = 0
108
+ max_points = 0
109
+
110
+ aln_cols.each do |residues|
111
+ residues.map(&:upcase).combination(2).each do |r1, r2|
112
+ unless GAP_CHARS.include?(r1) || GAP_CHARS.include?(r2)
113
+ r1_max_score = scoring_matrix[r1].values.max
114
+ r2_max_score = scoring_matrix[r2].values.max
115
+ pair_max = [r1_max_score, r2_max_score].max
116
+
117
+ # TODO check that residues exist in the scoring matrix.
118
+ actual_points += scoring_matrix[r1][r2]
119
+ max_points += pair_max
120
+ end
121
+ end
122
+ end
123
+
124
+ if max_points.zero?
125
+ raise PasvLib::Error, "Something went wrong and max_points was 0. " \
126
+ "Maybe one of your sequences is all gaps?"
127
+
128
+ end
129
+
130
+ actual_points / max_points.to_f
131
+ end
132
+
133
+ # Convert an alignment to a binary matrix where 1 represents gaps and 0 represents residues.
134
+ #
135
+ # @param aln [Array<String>] an array of (aligned) sequences
136
+ #
137
+ # @return [Array<Array<Integer>>] an array of arrays. Each row is a sequence.
138
+ def to_binary_matrix aln
139
+ aln.map do |seq|
140
+ seq.chars.map do |char|
141
+ # Gaps turn to 1, residues turn to 0.
142
+ GAP_CHARS.include?(char) ? 1 : 0
143
+ end
144
+ end
145
+ end
146
+ end
147
+ end
@@ -0,0 +1,3 @@
1
+ module PasvLib
2
+ class Error < StandardError; end
3
+ end
@@ -1,3 +1,3 @@
1
1
  module PasvLib
2
- VERSION = "0.3.0"
2
+ VERSION = "0.4.0"
3
3
  end
data/pasv_lib.gemspec CHANGED
@@ -23,4 +23,6 @@ Gem::Specification.new do |spec|
23
23
  spec.add_development_dependency "bundler", "~> 1.15"
24
24
  spec.add_development_dependency "rake", "~> 10.0"
25
25
  spec.add_development_dependency "rspec", "~> 3.0"
26
+
27
+ spec.add_runtime_dependency "blosum", "~> 0.1.0"
26
28
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pasv_lib
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryan Moore
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-02-21 00:00:00.000000000 Z
11
+ date: 2019-10-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -52,6 +52,20 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '3.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: blosum
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: 0.1.0
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: 0.1.0
55
69
  description:
56
70
  email:
57
71
  - moorer@udel.edu
@@ -69,6 +83,8 @@ files:
69
83
  - bin/console
70
84
  - bin/setup
71
85
  - lib/pasv_lib.rb
86
+ - lib/pasv_lib/alignment.rb
87
+ - lib/pasv_lib/error.rb
72
88
  - lib/pasv_lib/version.rb
73
89
  - pasv_lib.gemspec
74
90
  - vendor/systemu.rb
@@ -90,7 +106,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
90
106
  - !ruby/object:Gem::Version
91
107
  version: '0'
92
108
  requirements: []
93
- rubygems_version: 3.0.1
109
+ rubygems_version: 3.0.6
94
110
  signing_key:
95
111
  specification_version: 4
96
112
  summary: Library code for PASV