pasv_lib 0.1.2 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 50e4d7a31367670be250e437270f02fd37b8638fbd7a541fa91ab0ec7e35dccb
4
- data.tar.gz: 304eab6bd2becd3995b14c92bfdd1a39adda5d73687598eea0c64faeffd03651
3
+ metadata.gz: e6d2910a3379cacf4191d43ee3e9bafd9b1b4ffa5b2e4f1c6cabfc3716a98680
4
+ data.tar.gz: 544acd2bca1ecec7bd4ff1658e50cbbf4bd5301d8f51acf7e0d7af89a011cc63
5
5
  SHA512:
6
- metadata.gz: 2f550d96c0a73954f1a39ddce847580197ec75bcdf0e69733926ba5d794e986c4d13f76d47a9d4872d20156b095381b052b983ded6dd364d56e414aea21b08de
7
- data.tar.gz: d72ecf0b88674b51d0622878659d12f29d22b3bf815ec64b75eb6e120b00dc4f571744ebeb48f1c334bc91e94569ed2673b2c9e78a7bf82ad8fdd0ba04a01d5f
6
+ metadata.gz: 371ce1e3a9001fa8deda62a90d9e01a33fdd759c8d061068f7ba4c7bded572c10d68b594f177ea609aed98865be01f7c179ab3f1cf398b1500ad96e0a2a802fa
7
+ data.tar.gz: 1c75ab7a7dd0534f4dd0935cb68af4b35e2c81b35c4abae4424e60899e2ec22b74c85365c961223598fc7ca8f8c9650f3d7dfdf74c414fe9a8e13741d50e7194
data/.gitignore CHANGED
@@ -11,4 +11,6 @@
11
11
  # rspec failure tracking
12
12
  .rspec_status
13
13
 
14
- .idea
14
+ .idea
15
+
16
+ .ruby-*
@@ -1,4 +1,8 @@
1
+ require "pasv_lib/error"
2
+ require "pasv_lib/alignment"
3
+ require "pasv_lib/io"
1
4
  require "pasv_lib/version"
5
+
2
6
  require File.join __dir__, "..", "vendor", "systemu"
3
7
 
4
8
  module PasvLib
@@ -0,0 +1,168 @@
1
+ require "blosum"
2
+ require "set"
3
+
4
+ module PasvLib
5
+ module Alignment
6
+ # If you need to check if a residue is a gap, use this Set.
7
+ GAP_CHARS = Set.new %w[- .]
8
+
9
+ # If the overall min of the scoring matrix is < 0, then this scales it so that the overall min becomes zero.
10
+ #
11
+ # @param scoring_matrix The scoring matrix to rescale. E.g., Blosum::BLOSUM62.
12
+ #
13
+ # @return A new hash scaled to zero, or a deep copy of the old one if the overall min >= 0.
14
+ def adjust_scoring_matrix scoring_matrix
15
+ overal_min = scoring_matrix.values.map(&:values).flatten.min
16
+
17
+ scaling_value = overal_min < 0 ? overal_min.abs : 0
18
+
19
+ # We want a deep copy to prevent things from getting weird while using the new hash later.
20
+ new_matrix = {}
21
+ scoring_matrix.each do |residue, scores|
22
+ new_matrix[residue] = {}
23
+ scores.each do |other_residue, score|
24
+ new_matrix[residue][other_residue] = score + scaling_value
25
+ end
26
+ end
27
+
28
+ new_matrix
29
+ end
30
+
31
+ # Get the columns of an aligned set of sequences.
32
+ #
33
+ # Any spaces in the alignment are ignored (e.g., like those spaces NCBI puts in their alignments sometimes). Gap characters are '.' or '-'
34
+ #
35
+ # @param seqs [Array<String>] an array of sequences, normally aligned sequences
36
+ #
37
+ # @return [Array<Array<String>>] an array of alignment columns (which are arrays of single residue strings)
38
+ #
39
+ # @raise PasvLib::Error if seqs.empty?
40
+ # @raise PasvLib::Error if no comparisons can be made, e.g., all gaps
41
+ # @raise PasvLib::Error if not all sequences are the same length
42
+ #
43
+ #
44
+ # @example
45
+ # klass.alignment_columns ["AA-", "A-A"] #=> [["A", "A"], ["A", "-"], ["-", "A"]]
46
+ def alignment_columns seqs
47
+ seqs_no_spaces = seqs.map { |seq| seq.tr " ", "" }
48
+ len = seqs_no_spaces.first.length
49
+
50
+ seqs_no_spaces.map do |seq|
51
+ unless seq.length == len
52
+ raise PasvLib::Error, "Aligned seqs must be the same length"
53
+ end
54
+
55
+ seq.chars
56
+ end.transpose
57
+ end
58
+
59
+ def gap? residue
60
+ GAP_CHARS.include? residue
61
+ end
62
+
63
+ # Calculate the geometric index for an alignment.
64
+ #
65
+ # Basically, you change all residues to 0 and all gaps to 1. Then you take the permutations of the sequences and then the residues and XOR each of the permutations. Then you add it up and take averages and you'll get the score. That is a pretty bad explanation, so see http://merenlab.org/2016/11/08/pangenomics-v2/#geometric-homogeneity-index for more information.
66
+ #
67
+ # @param aln [Array<String>] aligned sequences
68
+ # @param by [String] either "sequence" or "residue". Controls whether to do the calculation by sequence, or by residue.
69
+ #
70
+ # @return [Float] a score between 0 and 1, with 1 being very homogeneous and 0 being very heterogeneous.
71
+ #
72
+ # @note The original Anvi'o code uses a clever bitshifting scheme to avoid storing array of arrays. It may also speed up the XOR part as you're doing fewer XORs, but I'm not positive about that.
73
+ def geometric_score aln, by
74
+ binary_aln = to_binary_matrix aln
75
+
76
+ if by == "residue"
77
+ binary_aln = binary_aln.transpose
78
+ end
79
+
80
+ num_rows = binary_aln.length
81
+ max_differences_per_row = binary_aln.first.length
82
+ num_comparisions_per_row = num_rows - 1
83
+
84
+ diff_score = binary_aln.permutation(2).map do |(row1, row2)|
85
+ row1.zip(row2).map do |elem1, elem2|
86
+ elem1 ^ elem2
87
+ end.sum / max_differences_per_row.to_f
88
+ end.sum / num_comparisions_per_row.to_f / num_rows
89
+
90
+ 1 - diff_score
91
+ end
92
+
93
+ # A wrapper for #geometric_score that takes the average of the by sequence and by residue scores for an alignment.
94
+ #
95
+ # @param aln [Array<String>] aligned sequences
96
+ #
97
+ # @return [Float] a score between 0 and 1, with 1 being very homogeneous and 0 being very heterogeneous.
98
+ def geometric_index aln
99
+ by_seq_score = geometric_score aln, "sequence"
100
+ by_residue_score = geometric_score aln, "residue"
101
+
102
+ (by_seq_score + by_residue_score) / 2.0
103
+ end
104
+
105
+ # Returns the similarity score for the alignment.
106
+ #
107
+ # For each column, each pair of non-gap residues is scored using the scoring matrix. The points accrued and the maximum possible points are summed over all columns, and the score is the ratio points accrued / max_points.
108
+ #
109
+ # @raise PasvLib::Error if any residue is not present in the scoring matrix.
110
+ # @raise PasvLib::Error if seqs is empty
111
+ # @raise PasvLib::Error if max_points is zero. Could happen if one of the seqs has all gaps, or no comparisons could be made.
112
+ #
113
+ # @note Technically you could get a negative score if you don't have enough high scoring residue pairs to keep the total score above zero. If this is the case, your alignment probably isn't very good. Alternatively, you could use #adjust_scoring_matrix to avoid this issue....
114
+ def similarity_score seqs, scoring_matrix = Blosum::BLOSUM62
115
+ raise PasvLib::Error if seqs.empty?
116
+ return 1.0 if seqs.count == 1
117
+
118
+ aln_cols = alignment_columns seqs
119
+
120
+ actual_points = 0
121
+ max_points = 0
122
+
123
+ aln_cols.each do |residues|
124
+ residues.map(&:upcase).combination(2).each do |r1, r2|
125
+ unless gap?(r1) || gap?(r2)
126
+ # Check that scoring matrix has the residues.
127
+ [r1, r2].each do |res|
128
+ unless scoring_matrix.has_key? res
129
+ raise PasvLib::Error, "Residue '#{res}' is missing from the " \
130
+ "scoring matrix."
131
+ end
132
+ end
133
+
134
+ r1_max_score = scoring_matrix[r1].values.max
135
+ r2_max_score = scoring_matrix[r2].values.max
136
+ pair_max = [r1_max_score, r2_max_score].max
137
+
138
+ # TODO check that residues exist in the scoring matrix.
139
+ actual_points += scoring_matrix[r1][r2]
140
+ max_points += pair_max
141
+ end
142
+ end
143
+ end
144
+
145
+ if max_points.zero?
146
+ raise PasvLib::Error, "Something went wrong and max_points was 0. " \
147
+ "Maybe one of your sequences is all gaps?"
148
+
149
+ end
150
+
151
+ actual_points / max_points.to_f
152
+ end
153
+
154
+ # Convert an alignment to a binary matrix where 1 represents gaps and 0 represents residues.
155
+ #
156
+ # @param aln [Array<String>] an array of (aligned) sequences
157
+ #
158
+ # @return [Array<Array<Integer>>] an array of arrays. Each row is a sequence.
159
+ def to_binary_matrix aln
160
+ aln.map do |seq|
161
+ seq.chars.map do |char|
162
+ # Gaps turn to 1, residues turn to 0.
163
+ gap?(char) ? 1 : 0
164
+ end
165
+ end
166
+ end
167
+ end
168
+ end
@@ -0,0 +1,4 @@
1
+ module PasvLib
2
+ class Error < StandardError; end
3
+ class ParseError < Error; end
4
+ end
@@ -0,0 +1,50 @@
1
+ require "parse_fasta"
2
+
3
+ module PasvLib
4
+ module Io
5
+ def read_refs fname
6
+ refs = {}
7
+
8
+ ParseFasta::SeqFile.open(fname).each_record do |rec|
9
+ if has_gaps? rec.seq
10
+ raise PasvLib::ParseError,
11
+ "Record '#{rec.header}' had gaps! Did you accidentally " \
12
+ "provide aligned sequences?"
13
+ end
14
+
15
+ if refs.count.zero?
16
+ head = "first_pasv_ref"
17
+ else
18
+ head = "pasv_ref___#{rec.id}"
19
+ end
20
+
21
+ refs[head] = rec.seq
22
+ end
23
+
24
+ refs
25
+ end
26
+
27
+ def read_queries fname
28
+ queries = {}
29
+
30
+ ParseFasta::SeqFile.open(fname).each_record do |rec|
31
+ if has_gaps? rec.seq
32
+ raise PasvLib::ParseError,
33
+ "Record '#{rec.header}' had gaps! Did you accidentally " \
34
+ "provide aligned sequences?"
35
+ end
36
+
37
+ header = "pasv_query___#{rec.header}"
38
+ queries[header] = rec.seq
39
+ end
40
+
41
+ queries
42
+ end
43
+
44
+ private
45
+
46
+ def has_gaps? seq
47
+ seq.include?("-") || seq.include?(".")
48
+ end
49
+ end
50
+ end
@@ -1,3 +1,3 @@
1
1
  module PasvLib
2
- VERSION = "0.1.2"
2
+ VERSION = "0.5.0"
3
3
  end
@@ -23,4 +23,7 @@ Gem::Specification.new do |spec|
23
23
  spec.add_development_dependency "bundler", "~> 1.15"
24
24
  spec.add_development_dependency "rake", "~> 10.0"
25
25
  spec.add_development_dependency "rspec", "~> 3.0"
26
+
27
+ spec.add_runtime_dependency "blosum", "~> 0.1.0"
28
+ spec.add_runtime_dependency "parse_fasta", "~> 2.5.2"
26
29
  end
@@ -25,7 +25,9 @@
25
25
  # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
26
  #
27
27
 
28
- # This is a modified version of systemu by me (Ryan Moore).
28
+ # This is a (slightly) modified version of systemu by me (Ryan Moore).
29
+ # I've only changed which errors are raised and removed the systemu
30
+ # method from the Object class.
29
31
 
30
32
  require 'tmpdir'
31
33
  require 'socket'
@@ -45,13 +47,17 @@ class SystemUniversal
45
47
  #
46
48
  # error class
47
49
  #
50
+ # This is now set up so any error intentionally raised by systemu will inherit from SystemUniversal::Error.
48
51
  class Error < RuntimeError
49
52
  end
50
53
 
54
+ class EEXIST < Error
55
+ end
56
+
51
57
  #
52
58
  # constants
53
59
  #
54
- SystemUniversal::VERSION = '2.6.5' unless SystemUniversal.send(:const_defined?, :VERSION)
60
+ SystemUniversal::VERSION = '2.6.5.9999' unless SystemUniversal.send(:const_defined?, :VERSION)
55
61
  def SystemUniversal.version() SystemUniversal::VERSION end
56
62
  def version() SystemUniversal::VERSION end
57
63
  def SystemUniversal.description
@@ -128,10 +134,19 @@ class SystemUniversal
128
134
  buf = pipe.read
129
135
  buf = "#{ line }#{ buf }"
130
136
  e = Marshal.load buf
131
- raise unless Exception === e
132
- raise e
137
+
138
+ # If Marshal load loaded a particular exception, raise it.
139
+ if Exception === e
140
+ raise e
141
+ else
142
+ raise
143
+ end
144
+
145
+ # regardless of the error raised, we want to raise SystemUniversal::Error so we have one thing to catch.
146
+ rescue e
147
+ raise Error, "systemu: Error - process interrupted (original error: #{e.inspect})!\n#{ buf }\n"
133
148
  rescue
134
- raise Error "systemu: Error - process interrupted!\n#{ buf }\n"
149
+ raise Error, "systemu: Error - process interrupted!\n#{ buf }\n"
135
150
  end
136
151
  end
137
152
  thread = new_thread cid, @block if @block
@@ -280,8 +295,8 @@ class SystemUniversal
280
295
 
281
296
  begin
282
297
  Dir.mkdir tmp
283
- rescue Errno::EEXIST
284
- raise Error if i >= max
298
+ rescue Errno::EEXIST => e
299
+ raise EEXIST, e.message if i >= max
285
300
  next
286
301
  end
287
302
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pasv_lib
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryan Moore
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-02-21 00:00:00.000000000 Z
11
+ date: 2020-05-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -52,6 +52,34 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '3.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: blosum
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: 0.1.0
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: 0.1.0
69
+ - !ruby/object:Gem::Dependency
70
+ name: parse_fasta
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: 2.5.2
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: 2.5.2
55
83
  description:
56
84
  email:
57
85
  - moorer@udel.edu
@@ -69,6 +97,9 @@ files:
69
97
  - bin/console
70
98
  - bin/setup
71
99
  - lib/pasv_lib.rb
100
+ - lib/pasv_lib/alignment.rb
101
+ - lib/pasv_lib/error.rb
102
+ - lib/pasv_lib/io.rb
72
103
  - lib/pasv_lib/version.rb
73
104
  - pasv_lib.gemspec
74
105
  - vendor/systemu.rb
@@ -90,7 +121,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
90
121
  - !ruby/object:Gem::Version
91
122
  version: '0'
92
123
  requirements: []
93
- rubygems_version: 3.0.1
124
+ rubygems_version: 3.0.6
94
125
  signing_key:
95
126
  specification_version: 4
96
127
  summary: Library code for PASV