ngs-ci 0.0.1.a → 0.0.2.b

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 85d1cc53730fb57307136cd2313e81c60e55aad2
4
- data.tar.gz: c6724259d4f7709b728bb7df07a3cbdcaa352173
3
+ metadata.gz: 1f537bf2115279943c5673a5dde124ae358972f2
4
+ data.tar.gz: fcb22914dfe57a1f9bdfddb71e8cfd453c9431dd
5
5
  SHA512:
6
- metadata.gz: 334662df0f16af26954bc29d99bfe6ecb59289f892a34de669d57be2125482a557ce4657086f167cf50afd31f00cbdc8ae8e41e2905c7269a25b3a41adce33a9
7
- data.tar.gz: 354bd86f5d1274c0cf26dc266b41f348d8ff26d1572101bd3196c72dc8aaad9f994d5703730777a91c8efa30d392bf8217d14b5a5c8a40f7998b2af2ff456ad3
6
+ metadata.gz: 123ab12bb812db0c6f2fafe6a6785a792b0e3596b154e45252be188ad7ab192755f4c715268b3b96f1ada8a7a6610afe67b4608352574bd033706868f410c202
7
+ data.tar.gz: 7f77bcb7e67204cb71f69869f8383233dcbc68d9182b7b62deb6cd186b404342d6aa65365f10f49d0eb7473f83fe562ac12348f74b63936f78d68ee3b9d1cf26
data/.gitignore CHANGED
@@ -12,4 +12,6 @@
12
12
  *.o
13
13
  *.a
14
14
  mkmf.log
15
- *~
15
+ *~
16
+ .ruby-gemset
17
+ .ruby-version
data/README.md CHANGED
@@ -1,12 +1,18 @@
1
- [![Build Status](https://travis-ci.org/MatthewRalston/SCI.png?branch=master)](https://travis-ci.org/MatthewRalston/SCI)
1
+ [![Build Status](https://travis-ci.org/MatthewRalston/ngs-ci.png?branch=master)](https://travis-ci.org/MatthewRalston/ngs-ci)
2
2
 
3
- [![Gem Version](https://badge.fury.io/rb/SCI.png)](http://badge.fury.io/rb/SCI)
3
+ [![Gem Version](https://badge.fury.io/rb/ngs-ci.png)](http://badge.fury.io/rb/ngs-ci)
4
4
 
5
- [![Coverage Status](https://coveralls.io/repos/MatthewRalston/SCI/badge.png)](https://coveralls.io/r/MatthewRalston/SCI)
5
+ [![Coverage Status](https://coveralls.io/repos/MatthewRalston/ngs-ci/badge.svg?branch=master&service=github)](https://coveralls.io/github/MatthewRalston/ngs-ci?branch=master)
6
6
 
7
7
 
8
+ # Todo
8
9
 
9
- # SCI
10
+ The inconsistency between the max summed dissimilarity and the denominator calculation is likely:
11
+ 1. Issue in the complexity index (when present is max,
12
+ a. ( present - missing ) / max = present/max_similarity - missing/max_dissim
13
+
14
+
15
+ # NGS Complexity Index
10
16
 
11
17
  NOTE: This is a project in progress.
12
18
  This gem will calculate a sequencing complexity index for BAM files.
@@ -16,7 +22,7 @@ This gem will calculate a sequencing complexity index for BAM files.
16
22
  Add this line to your application's Gemfile:
17
23
 
18
24
  ```ruby
19
- gem 'sci'
25
+ gem 'NGSCI'
20
26
  ```
21
27
 
22
28
  And then execute:
@@ -25,11 +31,18 @@ And then execute:
25
31
 
26
32
  Or install it yourself as:
27
33
 
28
- $ gem install sci --pre
34
+ $ gem install ngs-ci --pre
35
+
36
+ Or install manually:
37
+
38
+ $ git clone https://github.com/MatthewRalston/ngs-ci.git
39
+ $ cd ngs-ci
40
+ $ gem build ngs-ci.gemspec
41
+ $ gem install ngs-ci-[Version].gem
29
42
 
30
43
  ## Usage
31
44
 
32
- TODO: Write usage instructions here
45
+ * See ```--help``` for details. More to come.
33
46
 
34
47
  ## Contributing
35
48
 
data/TODO.org CHANGED
@@ -30,10 +30,18 @@
30
30
  **** 2850 (triangular number T(L-1) L=76 J=1
31
31
  **** f(76) = 2850
32
32
  * Notes
33
- ** U*O/L vs. 200*U*O/(L^2)
33
+ ** U = u/L
34
34
  ** U/L is the number of unique reads at that base, length normalized
35
- ** When U/L is 1 (maximum saturation)
36
- ** O = L/2
37
- ** Although, average overlap can be greater than L/2 with less reads
38
-
35
+ ** U*O/L vs. 200*U*O/(L^2)
36
+ ** the average summed overlap is O/(L)
37
+ ** average because different reads in the formation have different summed overlaps
38
+ ** the average average overlap is 2L/3 or O/L/(L-1) or O/(L^2-L)
39
+ **
40
+ ** D = d /
41
+ ** the average summed dissimilarity is D/(L)
42
+ ** average because different reads in the formation have different summed dissimilarities
43
+ ** the average average dissimilarity is L/3 or D/L/(L-1) or D/(L^2-L)
44
+ ** This matches the average similarity nicely...
45
+ **
46
+ ** u*d / L*L*(L-1)
39
47
  * Bugs
@@ -7,16 +7,21 @@ module NGSCI
7
7
 
8
8
  # A calculator calculates the sequencing complexity index.
9
9
  #
10
- # @!attribute [r] sci
10
+ # @author Matthew Ralston
11
+ # @abstract A class for calculating the complexity index on next generation sequencing reads
12
+ # @attr_reader [Integer] block_size The block size for parallelizing disk access
13
+ # @attr_reader [Hash<Symbol,Integer>] chroms A hash of chromosomes and their sizes
14
+ # @attr_reader [Integer] read_length The read length obtained from a bam file
15
+ # @attr_reader [Integer] denominator The denominator and normalization factors calculated from the read length
11
16
  class Calculator
12
- attr_reader :sci, :block_size, :buffer, :chroms
17
+ attr_reader :block_size, :chroms, :read_length, :denominator
13
18
 
14
19
  # A new calculator to compute the sequencing complexity index given
15
20
  # a loaded Bio::DB::Sam object and optional thread argument.
16
21
  #
17
- # @param bam [Bio::DB::Sam] Opened bam file with loaded reference.
18
- # @param threads [Int] The number of threads used to compute NGSCI.
19
- # @param strand [String] One of [FR RF F] or nil for strandedness.
22
+ # @param [Bio::DB::Sam] bam Opened bam file with loaded reference.
23
+ # @param [Int] threads The number of threads used to compute NGSCI.
24
+ # @param [String] strand One of [FR RF F] or nil for strandedness.
20
25
  def initialize(bam, reference, strand: nil, threads: 1)
21
26
  @block_size = 1600
22
27
  @results = nil
@@ -28,7 +33,8 @@ module NGSCI
28
33
  @bam.open
29
34
  @threads = threads
30
35
  @chroms = reference_sequences(reference)
31
- read_length
36
+ @read_length = NGSCI::Calculator.read_length_calc(@bam,@block_size)
37
+ @denominator = denominator_calc(@read_length)
32
38
  if strand
33
39
  unless %w(FR RF F).include?(strand)
34
40
  raise NGSCI::NGSCIError.new "Strand specific option #{opts.strand} is invalid." +
@@ -42,6 +48,7 @@ module NGSCI
42
48
 
43
49
  # Calculation of the sequencing complexity index
44
50
  #
51
+ # @param runtime [false] Print profiling information?
45
52
  def run(runtime: false)
46
53
  RubyProf.start if runtime
47
54
  # Convert each aligned read to Read class
@@ -83,16 +90,16 @@ module NGSCI
83
90
  #
84
91
  # @param chrom [String] The chromosome from the bam file
85
92
  # @param i [Integer] The number of blocks that have been read
86
- # @return localNGSCI [Hash<Symbol,Array>]
87
- # * :+ (Array[Integer]) The NGSCI for the + strand
88
- # * :- (Array[Integer]) The NGSCI for the - strand
93
+ # @return [Hash<Symbol,Array>]
94
+ # * :+ (Array[Array]) The NGSCI for the + strand
95
+ # * :- (Array[Array]) The NGSCI for the - strand
89
96
  def readblock(chrom,i)
90
97
  reads=[]
91
98
  results = @strand ? {"+" => [],"-" => []}: {nil => []}
92
- start = [0,(i * @block_size) - @buffer].max
99
+ start = [0,(i * @block_size) - @read_length].max
93
100
  stop = [(i + 1) * @block_size, self.chroms[chrom]].min
94
101
  @bam.fetch(chrom,start,stop) {|read| reads << convert(read)}
95
- start += @buffer unless start == 0
102
+ start += @read_length unless start == 0
96
103
  reads.compact!
97
104
  reads.sort_by!(&:start) unless reads.empty?
98
105
  x=0
@@ -109,93 +116,114 @@ module NGSCI
109
116
  return results
110
117
  end
111
118
 
112
-
113
119
  # Calculates sequencing complexity index for a single base
114
120
  #
115
121
  # @param reads [Array<NGSCI::Read>] A group of reads aligned to a single base.
116
- # @return sci [Float]
122
+ # @return [Array<Integer,Integer,Float,Float>]
117
123
  def sci(reads)
118
124
  numreads=reads.size
119
125
  # Groups reads by start site
120
126
  # selects the largest read length from the groups
121
- reads = reads.group_by(&:start).map{|k,v| v.max{|x,y| (x.stop-x.start).abs <=> (y.stop-y.start).abs}}
122
- o = summed_overlaps(reads)
127
+ reads = reads.group_by(&:start).map{|k,v| v.max{|x,y| x.length <=> y.length}}
128
+ d = summed_dissimilarity(reads)
123
129
  uniquereads = reads.size
124
- return [numreads,uniquereads,(@buffer*o.to_f/@denom).round(4),(300*uniquereads*o/(2*@denom)).round(4)]
130
+ return [numreads,uniquereads,(d.to_f/@read_length).round(4),(100*uniquereads*d/@denominator).round(4)]
125
131
  end
126
132
 
127
- # Calculates summed overlap between a group of reads
133
+
134
+ # Calculation of the dissimilarity between two reads
135
+ #
136
+ # @param read1 [NGSCI::Read] First read to be compared
137
+ # @param read2 [NGSCI::Read] Second read to be compared
138
+ # @return [Integer] Length of non-overlapping/unique bases
139
+ def dissimilarity(read1,read2)
140
+ if read1.start > read2.start
141
+ if read1.stop < read2.stop # Read 1 is inside read 2
142
+ (read1.start - read2.start) + (read2.stop - read1.stop)
143
+ else # Normal overlap
144
+ read1.start - read2.start
145
+ end
146
+ else
147
+ if read1.stop > read2.stop # Read 2 is inside read 1
148
+ (read2.start - read1.start) + (read1.stop - read2.stop)
149
+ else # Normal overlap
150
+ read2.start - read1.start
151
+ end
152
+ end
153
+ end
154
+
155
+ # Calculates summed dissimilarity between a group of reads
128
156
  #
129
157
  # @param reads [Array<NGSCI::Read>] Array of reads
130
- # @return avg_overlap [Integer] Summed overlap between reads
131
- def summed_overlaps(reads)
158
+ # @return [Integer] Sum of all dissimilarities between the group of reads
159
+ def summed_dissimilarity(reads)
132
160
  numreads = reads.size
133
161
  sum=0
134
- unless numreads == 1
162
+ unless numreads <= 1
135
163
  i = 0
136
164
  while i < numreads
137
165
  r1 = reads[i] # for each of n reads
138
166
  sum+=reads.
139
167
  reject{|r| r == r1}. # select the n-1 other reads
140
- map{|r| overlap(r,r1)}. # calculate their overlap to r1
168
+ map{|r| dissimilarity(r,r1)}. # calculate their overlap to r1
141
169
  reduce(:+)
142
170
  i+=1
143
171
  end
144
172
  end
145
173
  return sum
174
+ end
175
+
176
+ # Calculates the average summed dissimilarity (per read) of that read to all other reads
177
+ #
178
+ # @param [Integer] read_length The read length
179
+ # @return [Integer] avg_summed_dissimilarity
180
+ def max_summed_dissimilarity(read_length)
181
+ # For each unique read under maximum saturation, calculate the sum of dissimilarities for that read to all other reads
182
+ summed_dissimilarities = (1..read_length).to_a.map { |r|
183
+ (read_length ** 2) / 2 - read_length*r + read_length/2 + r**2 - r }.reduce(:+)
146
184
  end
147
185
 
148
- # Calculation of the overlap between two reads
149
- #
150
- # @param read1 [NGSCI::Read] First read to be compared
151
- # @param read2 [NGSCI::Read] First read to be compared
152
- # @return overlap_length [Integer] Length of overlap
153
- def overlap(read1,read2)
154
- if read1.start > read2.start
155
- if read1.stop < read2.stop # Read 1 is inside read 2
156
- read1.stop - read1.start
157
- else # Normal overlap
158
- read2.stop - read1.start
159
- end
160
- else
161
- if read1.stop > read2.stop # Read 2 is inside read 1
162
- read2.stop - read2.start
163
- else # Normal overlap
164
- read1.stop - read2.start
165
- end
166
- end
167
- end
186
+ # Calculates the denominator for the complexity index from the read length, assuming maximum saturation (i.e. number of unique reads == read_length)
187
+ # unique reads /read length * summed_dissimilarity / (max_summed_dissimilarity/(read length * read length)
188
+ # Denominator = read length * max_summed_dissimilarity / (read_length * read_length)
189
+ #
190
+ # @param [Integer] read_length The read length
191
+ # @return [Float] denominator The denominator including normalization factors for the complexity index 349184
192
+ def denominator_calc(read_length)
193
+ read_length*max_summed_dissimilarity(read_length)
194
+ end
195
+
168
196
 
169
- # Loads the read length from a bam file into the @buffer variable
197
+ # Calculates the read length of a bam file by sampling at least one full block of reads
170
198
  #
171
- def read_length
172
- buffer=0
173
- stats=@bam.index_stats.select {|k,v| k != "*" && v[:mapped_reads] > 0}
199
+ # @param [Bio::DB::Sam] bam A bam reader object
200
+ # @param [Integer] block_size The number of reads to read from a bam file
201
+ # @return [Integer] read_length The read length acquired from reading a block at a time until at least 100 reads are acquired
202
+ def self.read_length_calc(bam,block_size)
203
+ stats=bam.index_stats.select {|k,v| k != "*" && v[:mapped_reads] > 0}
174
204
  if stats.empty?
175
205
  raise NGSCIIOError.new "BAM file is empty! Check samtools idxstats."
176
- else
177
- i=0
178
- lengths=[]
179
- test = @block_size
180
- while i <= test
181
- @bam.view do |read|
182
- lengths << read.seq.size
183
- i +=1
184
- end
185
- if i == test && lengths.size < 100
186
- test += @block_size
187
- end
206
+ end
207
+ i=0
208
+ lengths=[]
209
+ test = block_size
210
+ while i <= test
211
+ bam.view do |read|
212
+ lengths << read.seq.size
213
+ i +=1
214
+ end
215
+ if i == test && lengths.size < 100
216
+ test += block_size
188
217
  end
189
- @buffer = lengths.max
190
- @denom = @buffer**2 * (@buffer - 1)**2
191
218
  end
219
+ lengths.max
192
220
  end
193
221
 
194
222
  # Converts strand specific BAM read into a sequence object format
195
223
  # Uses the @strand instance variable to determine the strand of conversion
196
224
  #
197
- # @param read [Bio::DB::Alignment] Read to be converted.
198
- # @return read [NGSCI::Read] Converted Read object
225
+ # @param [Bio::DB::Alignment] read Read to be converted.
226
+ # @return [NGSCI::Read] read Converted Read object
199
227
  def convert(read)
200
228
  unless read.query_unmapped
201
229
  if @strand
@@ -212,7 +240,7 @@ module NGSCI
212
240
  # Assumes paired-end strand-specific sequencing with "fr" chemistry
213
241
  #
214
242
  # @param read [Bio::DB::Alignment] Read to be converted.
215
- # @return read [NGSCI::Read] Converted Read object
243
+ # @return [NGSCI::Read] Converted Read object
216
244
  def fr(read)
217
245
  if read.first_in_pair
218
246
  read.query_strand ? newread(read,strand:"+") : newread(read,strand:"-")
@@ -226,7 +254,7 @@ module NGSCI
226
254
  # Assumes paired-end strand-specific sequencing with "rf" chemistry
227
255
  #
228
256
  # @param read [Bio::DB::Alignment] Read to be converted.
229
- # @return read [NGSCI::Read] Converted Read object
257
+ # @return [NGSCI::Read] Converted Read object
230
258
  def rf(read)
231
259
  if read.first_in_pair
232
260
  read.query_strand ? newread(read,strand:"-") : newread(read,strand:"+")
@@ -240,7 +268,7 @@ module NGSCI
240
268
  # Assumes single-end strand-specific sequencing with "f" chemistry
241
269
  #
242
270
  # @param read [Bio::DB::Alignment] Read to be converted.
243
- # @return read [NGSCI::Read] Converted Read object
271
+ # @return [NGSCI::Read] Converted Read object
244
272
  def f(read)
245
273
  read.query_strand ? newread(read,strand:"+") : newread(read,strand:"-")
246
274
  end
@@ -249,7 +277,7 @@ module NGSCI
249
277
  #
250
278
  # @param read [Bio::DB::Alignment] Aligned read to be converted
251
279
  # @param strand [String] Strand of read
252
- # @return read [NGSCI::Read] Converted Read object
280
+ # @return [NGSCI::Read] Converted Read object
253
281
  def newread(read,strand: nil)
254
282
  Read.new(read.pos,read.pos+read.seq.size,strand: strand)
255
283
  end
@@ -257,7 +285,7 @@ module NGSCI
257
285
  # Acquires names and sizes of reference sequences included in the bam file
258
286
  #
259
287
  # @param reference [String] Path to reference fasta file.
260
- # @return chromosomes [Hash<Symbol,Object>] A dictionary of chromosome sizes
288
+ # @return [Hash<Symbol,Integer>] A dictionary of chromosome sizes
261
289
  def reference_sequences(reference)
262
290
  chromosomes={}
263
291
  Bio::FastaFormat.open(@reference).each_entry do |f|
@@ -265,6 +293,7 @@ module NGSCI
265
293
  end
266
294
  chromosomes.select {|chrom| @bam.index_stats.keys.include?(chrom)}
267
295
  end
296
+
268
297
  # Exports the results to outfile
269
298
  #
270
299
  # @param outfile [String] Path to outfile
data/lib/NGSCI/read.rb CHANGED
@@ -4,9 +4,10 @@ module NGSCI
4
4
  #
5
5
  # @!attribute [r] start
6
6
  # @!attribute [r] stop
7
+ # @!attribute [r] length
7
8
  # @!attribute [r] strand
8
9
  class Read
9
- attr_reader :start, :stop, :strand
10
+ attr_reader :start, :stop, :length, :strand
10
11
  def initialize(start,stop,strand: nil)
11
12
  =begin DEPRECATED chromosome variable
12
13
  unless chr.is_a?(String)
@@ -24,8 +25,8 @@ module NGSCI
24
25
  end
25
26
  @start=start
26
27
  @stop=stop
28
+ @length=stop-start
27
29
  @strand=strand
28
30
  end
29
-
30
31
  end
31
32
  end
data/lib/NGSCI/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module NGSCI
2
- VERSION = "0.0.1.a"
2
+ VERSION = "0.0.2.b"
3
3
  end
data/lib/NGSCI.rb CHANGED
@@ -1,20 +1,21 @@
1
- require 'yell'
1
+
2
+
2
3
 
3
4
  # NGSCI stands for NGS (Next Generation Sequencing) Complexity Index
4
5
  # This program calculates a sequencing complexity index for each base and/or strand in a genome.
5
6
  # This program calculates this by averaging average overlaps of reads aligned to that base.
6
7
  module NGSCI
8
+ require 'yell'
7
9
  # For custom error handling in the future, unimplemented
8
10
  class NGSCIError < StandardError; end
9
11
  class NGSCIIOError < NGSCIError; end
10
- class NGSCIArgError < NGSCIError; end
11
-
12
+ class NGSCIArgError < NGSCIError; end
12
13
 
13
14
  # Create the universal logger and include it in Object
14
15
  # making the logger object available everywhere
15
16
  format = Yell::Formatter.new("[%5L] %d : %m", "%Y-%m-%d %H:%M:%S")
16
17
  # http://xkcd.com/1179/
17
- Yell.new(:format => format) do |l|
18
+ logger = Yell.new(:format => format) do |l|
18
19
  l.level = :info
19
20
  l.name = Object
20
21
  l.adapter STDOUT, level: [:debug, :info, :warn]
@@ -29,3 +30,4 @@ require 'NGSCI/cmd'
29
30
  require 'NGSCI/version'
30
31
  require 'NGSCI/calculator'
31
32
  require 'NGSCI/read'
33
+ #require 'yell'
data/ngs-ci.gemspec CHANGED
@@ -22,14 +22,15 @@ Gem::Specification.new do |spec|
22
22
  spec.add_dependency 'trollop','~> 2.1.2'
23
23
  spec.add_dependency 'bio-samtools', '= 2.3.2'
24
24
  spec.add_dependency 'parallel', '~> 1.4'
25
- spec.add_dependency 'yell'
25
+ spec.add_dependency 'yell', '~> 2'
26
26
  spec.add_dependency "ruby-prof", "~> 0.15"
27
- spec.has_rdoc = 'yard'
27
+ spec.has_rdoc = 'yard', '~> 0'
28
28
 
29
- spec.add_development_dependency "bundler"
29
+ spec.add_development_dependency "bundler", "~> 1"
30
30
  spec.add_development_dependency "rake", "~> 10.0"
31
31
  spec.add_development_dependency "rspec", "~> 3.1"
32
+ spec.add_development_dependency "pry", "~> 0"
32
33
  #spec.add_development_dependency "guard", "~> 2.12"
33
- spec.add_development_dependency "coveralls"
34
+ spec.add_development_dependency "coveralls", "~> 0"
34
35
  #spec.add_development_dependency "cucumber", "~> 1.3"
35
36
  end
@@ -3,11 +3,11 @@ require 'bio-samtools'
3
3
 
4
4
  testbam="spec/test_files/test.bam"
5
5
  emptybam="spec/test_files/empty.bam"
6
+ saturatedbam="spec/test_files/saturated.bam"
6
7
  testfasta="spec/test_files/test.fa"
7
8
  testout="spec/test_files/testfile.txt"
8
9
 
9
10
 
10
-
11
11
  describe "#run" do
12
12
  context "during a strand specific run" do
13
13
  before(:each) do
@@ -68,21 +68,21 @@ describe "#sci" do
68
68
  context "when passed an array of read objects" do
69
69
  before(:each) do
70
70
  @calc = NGSCI::Calculator.new(testbam,testfasta)
71
- @bam = Bio::DB::Sam.new(:bam=>testbam,:fasta=>testfasta)
71
+ @bam = Bio::DB::Sam.new(:bam=>saturatedbam,:fasta=>testfasta)
72
72
  @bam.open
73
73
  @reads = []
74
- @bam.fetch("NC_001988.2",75,75){|x| read = @calc.convert(x); @reads << read unless read.nil?}
75
- @reads = @reads.uniq{|r|r.start}
74
+ @bam.fetch("NC_001988.2",76,76){|x| read = @calc.convert(x); @reads << read unless read.nil?}
75
+ @reads = @reads.uniq{|x| x.start}
76
76
  end
77
77
  it "returns an array" do
78
78
  expect(@calc.sci(@reads)).to be_kind_of(Array)
79
79
  end
80
80
  it "returns the sequencing complexity index" do
81
- expect(@calc.sci(@reads)[-1]).to eq(0.0)
81
+ expect(@calc.sci(@reads)[-1]).to eq(100.0)
82
82
  end
83
83
  end
84
84
  context "when passed an empty array" do
85
- it "returns nil" do
85
+ it "returns zero" do
86
86
  @calc = NGSCI::Calculator.new(testbam,testfasta)
87
87
  empty_sci = @calc.sci([])[-1]
88
88
  expect(empty_sci).to be_zero
@@ -90,26 +90,36 @@ describe "#sci" do
90
90
  end
91
91
  end
92
92
 
93
- describe "#read_length" do
94
- it "calculates the read length" do
95
- @calc=NGSCI::Calculator.new(testbam,testfasta)
96
- expect(@calc.buffer).to eq(76)
93
+
94
+ describe "#dissimilarity" do
95
+ before(:each) do
96
+ @bam=Bio::DB::Sam.new(:bam => testbam, :fasta => testfasta)
97
+ @bam.open
98
+ @reads = []
99
+ @bam.fetch("NC_001988.2",0,200) {|x| @reads << x }
100
+ @calc = NGSCI::Calculator.new(testbam,testfasta)
101
+ @read1 = @calc.convert(@reads[2])
102
+ @read2 = @calc.convert(@reads[3])
97
103
  end
98
- it "fails on an empty bam file" do
99
- expect{NGSCI::Calculator.new(emptybam,testfasta)}.to raise_error(NGSCI::NGSCIIOError)
100
- `rm #{emptybam}.bai`
104
+
105
+ it "calculates the unique bases of the first read from the second" do
106
+ expect(@calc.dissimilarity(@read1,@read2)).to eq(62)
107
+ end
108
+
109
+ it "calculates the unique bases, regardless of the order" do
110
+ expect(@calc.dissimilarity(@read2,@read1)).to eq(62)
101
111
  end
102
112
  end
103
113
 
104
- describe "#summed_overlaps" do
114
+ describe "#summed_dissimilarity" do
105
115
  it "returns an int" do
106
116
  @bam=Bio::DB::Sam.new(:bam=>testbam,:fasta=>testfasta)
107
117
  @bam.open
108
118
  @reads = []
109
119
  @calc=NGSCI::Calculator.new(testbam,testfasta)
110
- @bam.fetch("NC_001988.2",8,75) {|x| read=@calc.convert(x); @reads << read if read}
120
+ @bam.fetch("NC_001988.2",8,75) {|x| @reads << @calc.convert(x) }
111
121
  @reads = @reads.uniq{|r| r.start}
112
- expect(@calc.summed_overlaps(@reads)).to be_an(Integer)
122
+ expect(@calc.summed_dissimilarity(@reads)).to be_an(Integer)
113
123
  end
114
124
  context "when passed an array of read objects" do
115
125
  before(:each) do
@@ -120,13 +130,15 @@ describe "#summed_overlaps" do
120
130
  @bam.fetch("NC_001988.2",8,75) {|x| read=@calc.convert(x); @reads << read if read}
121
131
  @reads = @reads.uniq{|r| r.start}
122
132
  end
123
- it "returns the #overlap of two reads" do
124
- summed_overlap = 2*@calc.overlap(@reads[0],@reads[1])
125
- expect(@calc.summed_overlaps(@reads[0..1])).to eq(summed_overlap)
133
+ context "when passed two reads" do
134
+ it "returns the sum of their dissimilarities" do
135
+ summed_dissimilarity = @calc.dissimilarity(@reads[0],@reads[1]) + @calc.dissimilarity(@reads[1],@reads[0])
136
+ expect(@calc.summed_dissimilarity(@reads[0..1])).to eq(summed_dissimilarity)
137
+ end
126
138
  end
127
139
 
128
- it "calculates the average overlap between a group of reads" do
129
- expect(@calc.summed_overlaps(@reads[0..7]).round(4)).to eq(380.0)
140
+ it "calculates the summed dissimlarity of a group of reads" do
141
+ expect(@calc.summed_dissimilarity(@reads[0..7]).round(4)).to eq(532.0)
130
142
  end
131
143
  end
132
144
  context "when passed an array with a single read object" do
@@ -136,33 +148,97 @@ describe "#summed_overlaps" do
136
148
  @reads=[]
137
149
  @calc=NGSCI::Calculator.new(testbam,testfasta)
138
150
  @bam.fetch("NC_001988.2",8,75) {|x| read=@calc.convert(x); @reads << read if read}
139
- expect(@calc.summed_overlaps([@reads[0]])).to be_zero
151
+ expect(@calc.summed_dissimilarity([@reads[0]])).to be_zero
140
152
  end
141
153
  end
142
154
  context "when passed an empty array" do
143
155
  it "returns zero" do
144
156
  @calc=NGSCI::Calculator.new(testbam,testfasta)
145
- expect(@calc.summed_overlaps([])).to be_zero
157
+ expect(@calc.summed_dissimilarity([])).to be_zero
146
158
  end
147
159
  end
148
160
  end
149
161
 
150
- describe "#overlap" do
151
- before(:each) do
152
- @bam=Bio::DB::Sam.new(:bam=>testbam,:fasta=>testfasta)
153
- @bam.open
154
- @reads=[]
155
- @bam.fetch("NC_001988.2",0,200) {|x| @reads << x}
156
- @calc=NGSCI::Calculator.new(testbam,testfasta)
157
- @read1=@calc.convert(@reads[2])
158
- @read2=@calc.convert(@reads[3])
162
+ describe "#max_summed_dissimilarity" do
163
+ context "when passed and integer read length" do
164
+ before(:each) do
165
+ @read_length = 76
166
+ @calc = NGSCI::Calculator.new(testbam,testfasta)
167
+ end
168
+ it "returns a float" do
169
+ expect(@calc.max_summed_dissimilarity(@read_length)).to be_kind_of Integer
170
+ end
159
171
  end
160
- it "calculates the overlap between two reads" do
161
- expect(@calc.overlap(@read1,@read2)).to eq(14)
172
+ context "when calculating the maximum summed dissimilarity" do
173
+ before(:each) do
174
+ @read_length = 76
175
+ @calc = NGSCI::Calculator.new(saturatedbam,testfasta)
176
+ @bam = Bio::DB::Sam.new(:bam=>saturatedbam,:fasta=>testfasta)
177
+ @bam.open
178
+ @reads = []
179
+ @bam.fetch("NC_001988.2",76,76){|x| read = @calc.convert(x); @reads << read unless read.nil?}
180
+ @reads = @reads.uniq{|x| x.start}
181
+ end
182
+ it "yields the triangular sum dissimilarity" do
183
+ # This test demonstrates that the simplified (more efficient) formula for maximum summed dissimilarity
184
+ # is equivalent to the triangular sum formula for the maximum summed dissimilarity within a group of reads
185
+ def tri(x,n=0)
186
+ return x == 0 ? n : tri(x-1,n+x)
187
+ end
188
+ triangular_sum = (1..@read_length).to_a.map{|x|
189
+ tri(@read_length - x) + tri(x - 1)
190
+ }.reduce(:+)
191
+ calculated_max_summed_dissimilarity = @calc.max_summed_dissimilarity(@read_length)
192
+ expect(calculated_max_summed_dissimilarity).to eq(triangular_sum)
193
+ end
194
+ it "is equal to the #summed_dissimilarity of saturated reads" do
195
+ # This test demonstrates that the formula for the theoretical maximum summed dissimilarity among reads
196
+ # is equivalent to the summed dissimilarity under maximum saturation (the saturated.bam test file)
197
+ theoretical_max_summed_dissimilarity = @calc.max_summed_dissimilarity(@read_length)
198
+ expect(theoretical_max_summed_dissimilarity).to eq(@calc.summed_dissimilarity(@reads))
199
+ end
162
200
  end
201
+ context "when averaging per read" do
202
+ it "is equal to 1/3 times (read_length - 1)" do
203
+ @calc = NGSCI::Calculator.new(testbam,testfasta)
204
+ (32..200).each do |read_length|
205
+ calculated_max_summed_dissimilarity = @calc.max_summed_dissimilarity(read_length)/(read_length*read_length)
206
+ expect(calculated_max_summed_dissimilarity).to eq((read_length-1)/3)
207
+ end
208
+ end
209
+ end
210
+ end
163
211
 
164
- it "calculates the overlap regardless of order" do
165
- expect(@calc.overlap(@read2,@read1)).to eq(14)
212
+
213
+ describe "#denominator_calc" do
214
+ context "when passed and integer read length" do
215
+ before(:each) do
216
+ @calc = NGSCI::Calculator.new(testbam,testfasta)
217
+ end
218
+ it "returns a float denominator" do
219
+ read_length = 76
220
+ expect(@calc.denominator_calc(read_length)).to be_kind_of Integer
221
+ end
222
+ end
223
+ it "is the max_summed_dissimilarity * read length" do
224
+ @calc = NGSCI::Calculator.new(testbam,testfasta)
225
+ (32..200).each do |read_length|
226
+ max_sum_dissim = @calc.max_summed_dissimilarity(read_length)
227
+ expect(@calc.denominator_calc(read_length)).to eq(read_length*max_sum_dissim)
228
+ end
229
+ end
230
+ end
231
+
232
+ describe "#read_length_calc" do
233
+ it "calculates the read length" do
234
+ @bam=Bio::DB::Sam.new(:bam => testbam,:fasta => testfasta)
235
+ test_block_size = 100
236
+ expect(NGSCI::Calculator.read_length_calc(@bam,100)).to eq(76)
237
+ end
238
+
239
+ it "fails on an empty bam file" do
240
+ @emptybam = Bio::DB::Sam.new(:bam => emptybam, :fasta => testfasta)
241
+ expect{NGSCI::Calculator.read_length_calc(@emptybam,100)}.to raise_error(NGSCI::NGSCIIOError)
166
242
  end
167
243
  end
168
244
 
@@ -1,35 +1,57 @@
1
1
  require 'spec_helper'
2
2
 
3
3
 
4
- describe "reads" do
5
-
6
- it "fails to instantiate on a string start site" do
7
- expect{NGSCI::Read.new("foo",3)}.to raise_error(NGSCI::NGSCIError)
8
- end
9
-
10
- it "fails to instantiate on a string stop site" do
11
- expect{NGSCI::Read.new(1,"foo")}.to raise_error(NGSCI::NGSCIError)
12
- end
13
-
14
- it "fails to instantiate when the stop site is greater than the start site" do
15
- expect{NGSCI::Read.new(3,1)}.to raise_error(NGSCI::NGSCIError)
16
- end
17
-
18
- it "fails to instantiate on an improper strand argument" do
19
- expect{NGSCI::Read.new(1,3,strand:"foo")}.to raise_error(NGSCI::NGSCIError)
20
- end
21
-
22
- it "fails to instantiate without the three necessary arguments" do
23
- expect{NGSCI::Read.new(1)}.to raise_error(ArgumentError)
4
+ describe NGSCI::Read do
5
+ context "before created" do
6
+ it "fails to instantiate on a string start site" do
7
+ expect{NGSCI::Read.new("foo",3)}.to raise_error(NGSCI::NGSCIError)
8
+ end
9
+
10
+ it "fails to instantiate on a string stop site" do
11
+ expect{NGSCI::Read.new(1,"foo")}.to raise_error(NGSCI::NGSCIError)
12
+ end
13
+
14
+ it "fails to instantiate when the stop site is greater than the start site" do
15
+ expect{NGSCI::Read.new(3,1)}.to raise_error(NGSCI::NGSCIError)
16
+ end
17
+
18
+ it "fails to instantiate on an improper strand argument" do
19
+ expect{NGSCI::Read.new(1,3,strand:"foo")}.to raise_error(NGSCI::NGSCIError)
20
+ end
21
+
22
+ it "fails to instantiate without the three necessary arguments" do
23
+ expect{NGSCI::Read.new(1)}.to raise_error(ArgumentError)
24
+ end
25
+
26
+ it "instantiates a new read with proper unstranded arguments" do
27
+ expect{NGSCI::Read.new(1,3)}.to_not raise_error
28
+ end
29
+
30
+ it "instantiates a new read with proper stranded arguments" do
31
+ expect{NGSCI::Read.new(1,3,strand:"+")}.to_not raise_error
32
+ end
24
33
  end
25
34
 
26
- it "instantiates a new read with proper unstranded arguments" do
27
- expect{NGSCI::Read.new(1,3)}.to_not raise_error
28
- end
35
+ context "after created" do
36
+ before(:each) do
37
+ @read = NGSCI::Read.new(1,3,strand:"+")
38
+ end
39
+
40
+ it "has a start attribute" do
41
+ expect(@read.methods).to include(:start)
42
+ end
43
+
44
+ it "has a stop attribute" do
45
+ expect(@read.methods).to include(:stop)
46
+ end
47
+
48
+ it "has a length attribute" do
49
+ expect(@read.methods).to include(:length)
50
+ end
51
+
52
+ it "has a strand attribute" do
53
+ expect(@read.methods).to include(:strand)
54
+ end
29
55
 
30
- it "instantiates a new read with proper stranded arguments" do
31
- expect{NGSCI::Read.new(1,3,strand:"+")}.to_not raise_error
32
56
  end
33
-
34
-
35
57
  end
Binary file
Binary file
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ngs-ci
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1.a
4
+ version: 0.0.2.b
5
5
  platform: ruby
6
6
  authors:
7
7
  - Matthew Ralston
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-10-28 00:00:00.000000000 Z
11
+ date: 2015-12-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: trollop
@@ -56,16 +56,16 @@ dependencies:
56
56
  name: yell
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - ">="
59
+ - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: '0'
61
+ version: '2'
62
62
  type: :runtime
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
- - - ">="
66
+ - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: '0'
68
+ version: '2'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: ruby-prof
71
71
  requirement: !ruby/object:Gem::Requirement
@@ -84,16 +84,16 @@ dependencies:
84
84
  name: bundler
85
85
  requirement: !ruby/object:Gem::Requirement
86
86
  requirements:
87
- - - ">="
87
+ - - "~>"
88
88
  - !ruby/object:Gem::Version
89
- version: '0'
89
+ version: '1'
90
90
  type: :development
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
93
93
  requirements:
94
- - - ">="
94
+ - - "~>"
95
95
  - !ruby/object:Gem::Version
96
- version: '0'
96
+ version: '1'
97
97
  - !ruby/object:Gem::Dependency
98
98
  name: rake
99
99
  requirement: !ruby/object:Gem::Requirement
@@ -122,18 +122,32 @@ dependencies:
122
122
  - - "~>"
123
123
  - !ruby/object:Gem::Version
124
124
  version: '3.1'
125
+ - !ruby/object:Gem::Dependency
126
+ name: pry
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - "~>"
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - "~>"
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
125
139
  - !ruby/object:Gem::Dependency
126
140
  name: coveralls
127
141
  requirement: !ruby/object:Gem::Requirement
128
142
  requirements:
129
- - - ">="
143
+ - - "~>"
130
144
  - !ruby/object:Gem::Version
131
145
  version: '0'
132
146
  type: :development
133
147
  prerelease: false
134
148
  version_requirements: !ruby/object:Gem::Requirement
135
149
  requirements:
136
- - - ">="
150
+ - - "~>"
137
151
  - !ruby/object:Gem::Version
138
152
  version: '0'
139
153
  description: Calculated a metric that estimates read complexity at each base for RNA-seq
@@ -169,6 +183,8 @@ files:
169
183
  - spec/lib/read_spec.rb
170
184
  - spec/spec_helper.rb
171
185
  - spec/test_files/empty.bam
186
+ - spec/test_files/saturated.bam
187
+ - spec/test_files/saturated.bam.bai
172
188
  - spec/test_files/test.bam
173
189
  - spec/test_files/test.bam.bai
174
190
  - spec/test_files/test.fa
@@ -204,6 +220,11 @@ test_files:
204
220
  - spec/lib/read_spec.rb
205
221
  - spec/spec_helper.rb
206
222
  - spec/test_files/empty.bam
223
+ - spec/test_files/saturated.bam
224
+ - spec/test_files/saturated.bam.bai
207
225
  - spec/test_files/test.bam
208
226
  - spec/test_files/test.bam.bai
209
227
  - spec/test_files/test.fa
228
+ has_rdoc:
229
+ - yard
230
+ - "~> 0"