ngs-ci 0.0.1.a → 0.0.2.b
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +3 -1
- data/README.md +20 -7
- data/TODO.org +13 -5
- data/lib/NGSCI/calculator.rb +95 -66
- data/lib/NGSCI/read.rb +3 -2
- data/lib/NGSCI/version.rb +1 -1
- data/lib/NGSCI.rb +6 -4
- data/ngs-ci.gemspec +5 -4
- data/spec/lib/calculator_spec.rb +112 -36
- data/spec/lib/read_spec.rb +49 -27
- data/spec/test_files/saturated.bam +0 -0
- data/spec/test_files/saturated.bam.bai +0 -0
- metadata +33 -12
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1f537bf2115279943c5673a5dde124ae358972f2
|
4
|
+
data.tar.gz: fcb22914dfe57a1f9bdfddb71e8cfd453c9431dd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 123ab12bb812db0c6f2fafe6a6785a792b0e3596b154e45252be188ad7ab192755f4c715268b3b96f1ada8a7a6610afe67b4608352574bd033706868f410c202
|
7
|
+
data.tar.gz: 7f77bcb7e67204cb71f69869f8383233dcbc68d9182b7b62deb6cd186b404342d6aa65365f10f49d0eb7473f83fe562ac12348f74b63936f78d68ee3b9d1cf26
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -1,12 +1,18 @@
|
|
1
|
-
[![Build Status](https://travis-ci.org/MatthewRalston/
|
1
|
+
[![Build Status](https://travis-ci.org/MatthewRalston/ngs-ci.png?branch=master)](https://travis-ci.org/MatthewRalston/ngs-ci)
|
2
2
|
|
3
|
-
[![Gem Version](https://badge.fury.io/rb/
|
3
|
+
[![Gem Version](https://badge.fury.io/rb/ngs-ci.png)](http://badge.fury.io/rb/ngs-ci)
|
4
4
|
|
5
|
-
[![Coverage Status](https://coveralls.io/repos/MatthewRalston/
|
5
|
+
[![Coverage Status](https://coveralls.io/repos/MatthewRalston/ngs-ci/badge.svg?branch=master&service=github)](https://coveralls.io/github/MatthewRalston/ngs-ci?branch=master)
|
6
6
|
|
7
7
|
|
8
|
+
# Todo
|
8
9
|
|
9
|
-
|
10
|
+
The inconsistency between the max summed dissimilarity and the denominator calculation is likely:
|
11
|
+
1. Issue in the complexity index (when present is max,
|
12
|
+
a. ( present - missing ) / max = present/max_similarity - missing/max_dissim
|
13
|
+
|
14
|
+
|
15
|
+
# NGS Complexity Index
|
10
16
|
|
11
17
|
NOTE: This is a project in progress.
|
12
18
|
This gem will calculate a sequencing complexity index for BAM files.
|
@@ -16,7 +22,7 @@ This gem will calculate a sequencing complexity index for BAM files.
|
|
16
22
|
Add this line to your application's Gemfile:
|
17
23
|
|
18
24
|
```ruby
|
19
|
-
gem '
|
25
|
+
gem 'NGSCI'
|
20
26
|
```
|
21
27
|
|
22
28
|
And then execute:
|
@@ -25,11 +31,18 @@ And then execute:
|
|
25
31
|
|
26
32
|
Or install it yourself as:
|
27
33
|
|
28
|
-
$ gem install
|
34
|
+
$ gem install ngs-ci --pre
|
35
|
+
|
36
|
+
Or install manually:
|
37
|
+
|
38
|
+
$ git clone https://github.com/MatthewRalston/ngs-ci.git
|
39
|
+
$ cd ngs-ci
|
40
|
+
$ gem build ngs-ci.gemspec
|
41
|
+
$ gem install ngs-ci-[Version].gem
|
29
42
|
|
30
43
|
## Usage
|
31
44
|
|
32
|
-
|
45
|
+
* See ```--help``` for details. More to come.
|
33
46
|
|
34
47
|
## Contributing
|
35
48
|
|
data/TODO.org
CHANGED
@@ -30,10 +30,18 @@
|
|
30
30
|
**** 2850 (triangular number T(L-1) L=76 J=1
|
31
31
|
**** f(76) = 2850
|
32
32
|
* Notes
|
33
|
-
** U
|
33
|
+
** U = u/L
|
34
34
|
** U/L is the number of unique reads at that base, length normalized
|
35
|
-
**
|
36
|
-
**
|
37
|
-
**
|
38
|
-
|
35
|
+
** U*O/L vs. 200*U*O/(L^2)
|
36
|
+
** the average summed overlap is O/(L)
|
37
|
+
** average because different reads in the formation have different summed overlaps
|
38
|
+
** the average average overlap is 2L/3 or O/L/(L-1) or O/(L^2-L)
|
39
|
+
**
|
40
|
+
** D = d /
|
41
|
+
** the average summed dissimilarity is D/(L)
|
42
|
+
** average because different reads in the formation have different summed dissimilarities
|
43
|
+
** the average average dissimilarity is L/3 or D/L/(L-1) or D/(L^2-L)
|
44
|
+
** This matches the average similarity nicely...
|
45
|
+
**
|
46
|
+
** u*d / L*L*(L-1)
|
39
47
|
* Bugs
|
data/lib/NGSCI/calculator.rb
CHANGED
@@ -7,16 +7,21 @@ module NGSCI
|
|
7
7
|
|
8
8
|
# A calculator calculates the sequencing complexity index.
|
9
9
|
#
|
10
|
-
#
|
10
|
+
# @author Matthew Ralston
|
11
|
+
# @abstract A class for calculating the complexity index on next generation sequencing reads
|
12
|
+
# @attr_reader [Integer] block_size The block size for parallelizing disk access
|
13
|
+
# @attr_reader [Hash<Symbol,Integer>] chroms A hash of chromosomes and their sizes
|
14
|
+
# @attr_reader [Integer] read_length The read length obtained from a bam file
|
15
|
+
# @attr_reader [Integer] denominator The denominator and normalization factors calculated from the read length
|
11
16
|
class Calculator
|
12
|
-
attr_reader :
|
17
|
+
attr_reader :block_size, :chroms, :read_length, :denominator
|
13
18
|
|
14
19
|
# A new calculator to compute the sequencing complexity index given
|
15
20
|
# a loaded Bio::DB::Sam object and optional thread argument.
|
16
21
|
#
|
17
|
-
# @param
|
18
|
-
# @param
|
19
|
-
# @param
|
22
|
+
# @param [Bio::DB::Sam] bam Opened bam file with loaded reference.
|
23
|
+
# @param [Integer] threads The number of threads used to compute NGSCI.
|
24
|
+
# @param [String] strand One of [FR RF F] or nil for strandedness.
|
20
25
|
def initialize(bam, reference, strand: nil, threads: 1)
|
21
26
|
@block_size = 1600
|
22
27
|
@results = nil
|
@@ -28,7 +33,8 @@ module NGSCI
|
|
28
33
|
@bam.open
|
29
34
|
@threads = threads
|
30
35
|
@chroms = reference_sequences(reference)
|
31
|
-
read_length
|
36
|
+
@read_length = NGSCI::Calculator.read_length_calc(@bam,@block_size)
|
37
|
+
@denominator = denominator_calc(@read_length)
|
32
38
|
if strand
|
33
39
|
unless %w(FR RF F).include?(strand)
|
34
40
|
raise NGSCI::NGSCIError.new "Strand specific option #{opts.strand} is invalid." +
|
@@ -42,6 +48,7 @@ module NGSCI
|
|
42
48
|
|
43
49
|
# Calculation of the sequencing complexity index
|
44
50
|
#
|
51
|
+
# @param runtime [false] Print profiling information?
|
45
52
|
def run(runtime: false)
|
46
53
|
RubyProf.start if runtime
|
47
54
|
# Convert each aligned read to Read class
|
@@ -83,16 +90,16 @@ module NGSCI
|
|
83
90
|
#
|
84
91
|
# @param chrom [String] The chromosome from the bam file
|
85
92
|
# @param i [Integer] The number of blocks that have been read
|
86
|
-
# @return
|
87
|
-
# * :+ (Array[
|
88
|
-
# * :- (Array[
|
93
|
+
# @return [Hash<Symbol,Array>]
|
94
|
+
# * :+ (Array[Array]) The NGSCI for the + strand along the
|
95
|
+
# * :- (Array[Array]) The NGSCI for the - strand
|
89
96
|
def readblock(chrom,i)
|
90
97
|
reads=[]
|
91
98
|
results = @strand ? {"+" => [],"-" => []}: {nil => []}
|
92
|
-
start = [0,(i * @block_size) - @
|
99
|
+
start = [0,(i * @block_size) - @read_length].max
|
93
100
|
stop = [(i + 1) * @block_size, self.chroms[chrom]].min
|
94
101
|
@bam.fetch(chrom,start,stop) {|read| reads << convert(read)}
|
95
|
-
start += @
|
102
|
+
start += @read_length unless start == 0
|
96
103
|
reads.compact!
|
97
104
|
reads.sort_by!(&:start) unless reads.empty?
|
98
105
|
x=0
|
@@ -109,93 +116,114 @@ module NGSCI
|
|
109
116
|
return results
|
110
117
|
end
|
111
118
|
|
112
|
-
|
113
119
|
# Calculates sequencing complexity index for a single base
|
114
120
|
#
|
115
121
|
# @param reads [Array<NGSCI::Read>] A group of reads aligned to a single base.
|
116
|
-
# @return
|
122
|
+
# @return [Array<Integer,Integer,Float,Float>]
|
117
123
|
def sci(reads)
|
118
124
|
numreads=reads.size
|
119
125
|
# Groups reads by start site
|
120
126
|
# selects the largest read length from the groups
|
121
|
-
reads = reads.group_by(&:start).map{|k,v| v.max{|x,y|
|
122
|
-
|
127
|
+
reads = reads.group_by(&:start).map{|k,v| v.max{|x,y| x.length <=> y.length}}
|
128
|
+
d = summed_dissimilarity(reads)
|
123
129
|
uniquereads = reads.size
|
124
|
-
return [numreads,uniquereads,(
|
130
|
+
return [numreads,uniquereads,(d.to_f/@read_length).round(4),(100*uniquereads*d/@denominator).round(4)]
|
125
131
|
end
|
126
132
|
|
127
|
-
|
133
|
+
|
134
|
+
# Calculation of the dissimilarity between two reads
|
135
|
+
#
|
136
|
+
# @param read1 [NGSCI::Read] First read to be compared
|
137
|
+
# @param read2 [NGSCI::Read] Second read to be compared
|
138
|
+
# @return [Integer] Length of non-overlapping/unique bases
|
139
|
+
def dissimilarity(read1,read2)
|
140
|
+
if read1.start > read2.start
|
141
|
+
if read1.stop < read2.stop # Read 1 is inside read 2
|
142
|
+
(read1.start - read2.start) + (read2.stop - read1.stop)
|
143
|
+
else # Normal overlap
|
144
|
+
read1.start - read2.start
|
145
|
+
end
|
146
|
+
else
|
147
|
+
if read1.stop > read2.stop # Read 2 is inside read 1
|
148
|
+
(read2.start - read1.start) + (read1.stop - read2.stop)
|
149
|
+
else # Normal overlap
|
150
|
+
read2.start - read1.start
|
151
|
+
end
|
152
|
+
end
|
153
|
+
end
|
154
|
+
|
155
|
+
# Calculates summed dissimilarity between a group of reads
|
128
156
|
#
|
129
157
|
# @param reads [Array<NGSCI::Read>] Array of reads
|
130
|
-
# @return
|
131
|
-
def
|
158
|
+
# @return [Integer] Sum of all dissimilarities between the group of reads
|
159
|
+
def summed_dissimilarity(reads)
|
132
160
|
numreads = reads.size
|
133
161
|
sum=0
|
134
|
-
unless numreads
|
162
|
+
unless numreads <= 1
|
135
163
|
i = 0
|
136
164
|
while i < numreads
|
137
165
|
r1 = reads[i] # for each of n reads
|
138
166
|
sum+=reads.
|
139
167
|
reject{|r| r == r1}. # select the n-1 other reads
|
140
|
-
map{|r|
|
168
|
+
map{|r| dissimilarity(r,r1)}. # calculate their overlap to r1
|
141
169
|
reduce(:+)
|
142
170
|
i+=1
|
143
171
|
end
|
144
172
|
end
|
145
173
|
return sum
|
174
|
+
end
|
175
|
+
|
176
|
+
# Calculates the maximum summed dissimilarity: for each unique read under maximum saturation, the sum of its dissimilarities to all other reads, totalled over all reads
|
177
|
+
#
|
178
|
+
# @param [Integer] read_length The read length
|
179
|
+
# @return [Integer] max_summed_dissimilarity
|
180
|
+
def max_summed_dissimilarity(read_length)
|
181
|
+
# For each unique read under maximum saturation, calculate the sum of dissimilarities for that read to all other reads
|
182
|
+
summed_dissimilarities = (1..read_length).to_a.map { |r|
|
183
|
+
(read_length ** 2) / 2 - read_length*r + read_length/2 + r**2 - r }.reduce(:+)
|
146
184
|
end
|
147
185
|
|
148
|
-
#
|
149
|
-
#
|
150
|
-
#
|
151
|
-
#
|
152
|
-
# @
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
read2.stop - read1.start
|
159
|
-
end
|
160
|
-
else
|
161
|
-
if read1.stop > read2.stop # Read 2 is inside read 1
|
162
|
-
read2.stop - read2.start
|
163
|
-
else # Normal overlap
|
164
|
-
read1.stop - read2.start
|
165
|
-
end
|
166
|
-
end
|
167
|
-
end
|
186
|
+
# Calculates the denominator for the complexity index from the read length, assuming maximum saturation (i.e. number of unique reads == read_length)
|
187
|
+
# unique reads /read length * summed_dissimilarity / (max_summed_dissimilarity/(read length * read length)
|
188
|
+
# Denominator = read length * max_summed_dissimilarity / (read_length * read_length)
|
189
|
+
#
|
190
|
+
# @param [Integer] read_length The read length
|
191
|
+
# @return [Float] denominator The denominator including normalization factors for the complexity index 349184
|
192
|
+
def denominator_calc(read_length)
|
193
|
+
read_length*max_summed_dissimilarity(read_length)
|
194
|
+
end
|
195
|
+
|
168
196
|
|
169
|
-
#
|
197
|
+
# Calculates the read length of a bam file by sampling at least one full block of reads
|
170
198
|
#
|
171
|
-
|
172
|
-
|
173
|
-
|
199
|
+
# @param [Bio::DB::Sam] bam A bam reader object
|
200
|
+
# @param [Integer] block_size The number of reads to read from a bam file
|
201
|
+
# @return [Integer] read_length The read length acquired from reading a block at a time until at least 100 reads are acquired
|
202
|
+
def self.read_length_calc(bam,block_size)
|
203
|
+
stats=bam.index_stats.select {|k,v| k != "*" && v[:mapped_reads] > 0}
|
174
204
|
if stats.empty?
|
175
205
|
raise NGSCIIOError.new "BAM file is empty! Check samtools idxstats."
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
end
|
206
|
+
end
|
207
|
+
i=0
|
208
|
+
lengths=[]
|
209
|
+
test = block_size
|
210
|
+
while i <= test
|
211
|
+
bam.view do |read|
|
212
|
+
lengths << read.seq.size
|
213
|
+
i +=1
|
214
|
+
end
|
215
|
+
if i == test && lengths.size < 100
|
216
|
+
test += block_size
|
188
217
|
end
|
189
|
-
@buffer = lengths.max
|
190
|
-
@denom = @buffer**2 * (@buffer - 1)**2
|
191
218
|
end
|
219
|
+
lengths.max
|
192
220
|
end
|
193
221
|
|
194
222
|
# Converts strand specific BAM read into a sequence object format
|
195
223
|
# Uses the @strand instance variable to determine the strand of conversion
|
196
224
|
#
|
197
|
-
# @param
|
198
|
-
# @return
|
225
|
+
# @param [Bio::DB::Alignment] read Read to be converted.
|
226
|
+
# @return [NGSCI::Read] read Converted Read object
|
199
227
|
def convert(read)
|
200
228
|
unless read.query_unmapped
|
201
229
|
if @strand
|
@@ -212,7 +240,7 @@ module NGSCI
|
|
212
240
|
# Assumes paired-end strand-specific sequencing with "fr" chemistry
|
213
241
|
#
|
214
242
|
# @param read [Bio::DB::Alignment] Read to be converted.
|
215
|
-
# @return
|
243
|
+
# @return [NGSCI::Read] Converted Read object
|
216
244
|
def fr(read)
|
217
245
|
if read.first_in_pair
|
218
246
|
read.query_strand ? newread(read,strand:"+") : newread(read,strand:"-")
|
@@ -226,7 +254,7 @@ module NGSCI
|
|
226
254
|
# Assumes paired-end strand-specific sequencing with "rf" chemistry
|
227
255
|
#
|
228
256
|
# @param read [Bio::DB::Alignment] Read to be converted.
|
229
|
-
# @return
|
257
|
+
# @return [NGSCI::Read] Converted Read object
|
230
258
|
def rf(read)
|
231
259
|
if read.first_in_pair
|
232
260
|
read.query_strand ? newread(read,strand:"-") : newread(read,strand:"+")
|
@@ -240,7 +268,7 @@ module NGSCI
|
|
240
268
|
# Assumes single-end strand-specific sequencing with "f" chemistry
|
241
269
|
#
|
242
270
|
# @param read [Bio::DB::Alignment] Read to be converted.
|
243
|
-
# @return
|
271
|
+
# @return [NGSCI::Read] Converted Read object
|
244
272
|
def f(read)
|
245
273
|
read.query_strand ? newread(read,strand:"+") : newread(read,strand:"-")
|
246
274
|
end
|
@@ -249,7 +277,7 @@ module NGSCI
|
|
249
277
|
#
|
250
278
|
# @param read [Bio::DB::Alignment] Aligned read to be converted
|
251
279
|
# @param strand [String] Strand of read
|
252
|
-
# @return
|
280
|
+
# @return [NGSCI::Read] Converted Read object
|
253
281
|
def newread(read,strand: nil)
|
254
282
|
Read.new(read.pos,read.pos+read.seq.size,strand: strand)
|
255
283
|
end
|
@@ -257,7 +285,7 @@ module NGSCI
|
|
257
285
|
# Acquires names and sizes of reference sequences included in the bam file
|
258
286
|
#
|
259
287
|
# @param reference [String] Path to reference fasta file.
|
260
|
-
# @return
|
288
|
+
# @return [Hash<Symbol,Integer>] A dictionary of chromosome sizes
|
261
289
|
def reference_sequences(reference)
|
262
290
|
chromosomes={}
|
263
291
|
Bio::FastaFormat.open(@reference).each_entry do |f|
|
@@ -265,6 +293,7 @@ module NGSCI
|
|
265
293
|
end
|
266
294
|
chromosomes.select {|chrom| @bam.index_stats.keys.include?(chrom)}
|
267
295
|
end
|
296
|
+
|
268
297
|
# Exports the results to outfile
|
269
298
|
#
|
270
299
|
# @param outfile [String] Path to outfile
|
data/lib/NGSCI/read.rb
CHANGED
@@ -4,9 +4,10 @@ module NGSCI
|
|
4
4
|
#
|
5
5
|
# @!attribute [r] start
|
6
6
|
# @!attribute [r] stop
|
7
|
+
# @!attribute [r] length
|
7
8
|
# @!attribute [r] strand
|
8
9
|
class Read
|
9
|
-
attr_reader :start, :stop, :strand
|
10
|
+
attr_reader :start, :stop, :length, :strand
|
10
11
|
def initialize(start,stop,strand: nil)
|
11
12
|
=begin DEPRECATED chromosome variable
|
12
13
|
unless chr.is_a?(String)
|
@@ -24,8 +25,8 @@ module NGSCI
|
|
24
25
|
end
|
25
26
|
@start=start
|
26
27
|
@stop=stop
|
28
|
+
@length=stop-start
|
27
29
|
@strand=strand
|
28
30
|
end
|
29
|
-
|
30
31
|
end
|
31
32
|
end
|
data/lib/NGSCI/version.rb
CHANGED
data/lib/NGSCI.rb
CHANGED
@@ -1,20 +1,21 @@
|
|
1
|
-
|
1
|
+
|
2
|
+
|
2
3
|
|
3
4
|
# NGSCI stands for NGS (Next Generation Sequencing) Complexity Index
|
4
5
|
# This program calculates a sequencing complexity index for each base and/or strand in a genome.
|
5
6
|
# This program calculates this by averaging average overlaps of reads aligned to that base.
|
6
7
|
module NGSCI
|
8
|
+
require 'yell'
|
7
9
|
# For custom error handling in the future, unimplemented
|
8
10
|
class NGSCIError < StandardError; end
|
9
11
|
class NGSCIIOError < NGSCIError; end
|
10
|
-
class NGSCIArgError < NGSCIError; end
|
11
|
-
|
12
|
+
class NGSCIArgError < NGSCIError; end
|
12
13
|
|
13
14
|
# Create the universal logger and include it in Object
|
14
15
|
# making the logger object available everywhere
|
15
16
|
format = Yell::Formatter.new("[%5L] %d : %m", "%Y-%m-%d %H:%M:%S")
|
16
17
|
# http://xkcd.com/1179/
|
17
|
-
Yell.new(:format => format) do |l|
|
18
|
+
logger = Yell.new(:format => format) do |l|
|
18
19
|
l.level = :info
|
19
20
|
l.name = Object
|
20
21
|
l.adapter STDOUT, level: [:debug, :info, :warn]
|
@@ -29,3 +30,4 @@ require 'NGSCI/cmd'
|
|
29
30
|
require 'NGSCI/version'
|
30
31
|
require 'NGSCI/calculator'
|
31
32
|
require 'NGSCI/read'
|
33
|
+
#require 'yell'
|
data/ngs-ci.gemspec
CHANGED
@@ -22,14 +22,15 @@ Gem::Specification.new do |spec|
|
|
22
22
|
spec.add_dependency 'trollop','~> 2.1.2'
|
23
23
|
spec.add_dependency 'bio-samtools', '= 2.3.2'
|
24
24
|
spec.add_dependency 'parallel', '~> 1.4'
|
25
|
-
spec.add_dependency 'yell'
|
25
|
+
spec.add_dependency 'yell', '~> 2'
|
26
26
|
spec.add_dependency "ruby-prof", "~> 0.15"
|
27
|
-
spec.has_rdoc = 'yard'
|
27
|
+
spec.has_rdoc = 'yard', '~> 0'
|
28
28
|
|
29
|
-
spec.add_development_dependency "bundler"
|
29
|
+
spec.add_development_dependency "bundler", "~> 1"
|
30
30
|
spec.add_development_dependency "rake", "~> 10.0"
|
31
31
|
spec.add_development_dependency "rspec", "~> 3.1"
|
32
|
+
spec.add_development_dependency "pry", "~> 0"
|
32
33
|
#spec.add_development_dependency "guard", "~> 2.12"
|
33
|
-
spec.add_development_dependency "coveralls"
|
34
|
+
spec.add_development_dependency "coveralls", "~> 0"
|
34
35
|
#spec.add_development_dependency "cucumber", "~> 1.3"
|
35
36
|
end
|
data/spec/lib/calculator_spec.rb
CHANGED
@@ -3,11 +3,11 @@ require 'bio-samtools'
|
|
3
3
|
|
4
4
|
testbam="spec/test_files/test.bam"
|
5
5
|
emptybam="spec/test_files/empty.bam"
|
6
|
+
saturatedbam="spec/test_files/saturated.bam"
|
6
7
|
testfasta="spec/test_files/test.fa"
|
7
8
|
testout="spec/test_files/testfile.txt"
|
8
9
|
|
9
10
|
|
10
|
-
|
11
11
|
describe "#run" do
|
12
12
|
context "during a strand specific run" do
|
13
13
|
before(:each) do
|
@@ -68,21 +68,21 @@ describe "#sci" do
|
|
68
68
|
context "when passed an array of read objects" do
|
69
69
|
before(:each) do
|
70
70
|
@calc = NGSCI::Calculator.new(testbam,testfasta)
|
71
|
-
@bam = Bio::DB::Sam.new(:bam=>
|
71
|
+
@bam = Bio::DB::Sam.new(:bam=>saturatedbam,:fasta=>testfasta)
|
72
72
|
@bam.open
|
73
73
|
@reads = []
|
74
|
-
@bam.fetch("NC_001988.2",
|
75
|
-
@reads = @reads.uniq{|
|
74
|
+
@bam.fetch("NC_001988.2",76,76){|x| read = @calc.convert(x); @reads << read unless read.nil?}
|
75
|
+
@reads = @reads.uniq{|x| x.start}
|
76
76
|
end
|
77
77
|
it "returns an array" do
|
78
78
|
expect(@calc.sci(@reads)).to be_kind_of(Array)
|
79
79
|
end
|
80
80
|
it "returns the sequencing complexity index" do
|
81
|
-
expect(@calc.sci(@reads)[-1]).to eq(
|
81
|
+
expect(@calc.sci(@reads)[-1]).to eq(100.0)
|
82
82
|
end
|
83
83
|
end
|
84
84
|
context "when passed an empty array" do
|
85
|
-
it "returns
|
85
|
+
it "returns zero" do
|
86
86
|
@calc = NGSCI::Calculator.new(testbam,testfasta)
|
87
87
|
empty_sci = @calc.sci([])[-1]
|
88
88
|
expect(empty_sci).to be_zero
|
@@ -90,26 +90,36 @@ describe "#sci" do
|
|
90
90
|
end
|
91
91
|
end
|
92
92
|
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
93
|
+
|
94
|
+
describe "#dissimilarity" do
|
95
|
+
before(:each) do
|
96
|
+
@bam=Bio::DB::Sam.new(:bam => testbam, :fasta => testfasta)
|
97
|
+
@bam.open
|
98
|
+
@reads = []
|
99
|
+
@bam.fetch("NC_001988.2",0,200) {|x| @reads << x }
|
100
|
+
@calc = NGSCI::Calculator.new(testbam,testfasta)
|
101
|
+
@read1 = @calc.convert(@reads[2])
|
102
|
+
@read2 = @calc.convert(@reads[3])
|
97
103
|
end
|
98
|
-
|
99
|
-
|
100
|
-
|
104
|
+
|
105
|
+
it "calculates the unique bases of the first read from the second" do
|
106
|
+
expect(@calc.dissimilarity(@read1,@read2)).to eq(62)
|
107
|
+
end
|
108
|
+
|
109
|
+
it "calculates the unique bases, regardless of the order" do
|
110
|
+
expect(@calc.dissimilarity(@read2,@read1)).to eq(62)
|
101
111
|
end
|
102
112
|
end
|
103
113
|
|
104
|
-
describe "#
|
114
|
+
describe "#summed_dissimilarity" do
|
105
115
|
it "returns an int" do
|
106
116
|
@bam=Bio::DB::Sam.new(:bam=>testbam,:fasta=>testfasta)
|
107
117
|
@bam.open
|
108
118
|
@reads = []
|
109
119
|
@calc=NGSCI::Calculator.new(testbam,testfasta)
|
110
|
-
@bam.fetch("NC_001988.2",8,75) {|x|
|
120
|
+
@bam.fetch("NC_001988.2",8,75) {|x| @reads << @calc.convert(x) }
|
111
121
|
@reads = @reads.uniq{|r| r.start}
|
112
|
-
expect(@calc.
|
122
|
+
expect(@calc.summed_dissimilarity(@reads)).to be_an(Integer)
|
113
123
|
end
|
114
124
|
context "when passed an array of read objects" do
|
115
125
|
before(:each) do
|
@@ -120,13 +130,15 @@ describe "#summed_overlaps" do
|
|
120
130
|
@bam.fetch("NC_001988.2",8,75) {|x| read=@calc.convert(x); @reads << read if read}
|
121
131
|
@reads = @reads.uniq{|r| r.start}
|
122
132
|
end
|
123
|
-
|
124
|
-
|
125
|
-
|
133
|
+
context "when passed two reads" do
|
134
|
+
it "returns the sum of their dissimilarities" do
|
135
|
+
summed_dissimilarity = @calc.dissimilarity(@reads[0],@reads[1]) + @calc.dissimilarity(@reads[1],@reads[0])
|
136
|
+
expect(@calc.summed_dissimilarity(@reads[0..1])).to eq(summed_dissimilarity)
|
137
|
+
end
|
126
138
|
end
|
127
139
|
|
128
|
-
it "calculates the
|
129
|
-
expect(@calc.
|
140
|
+
it "calculates the summed dissimlarity of a group of reads" do
|
141
|
+
expect(@calc.summed_dissimilarity(@reads[0..7]).round(4)).to eq(532.0)
|
130
142
|
end
|
131
143
|
end
|
132
144
|
context "when passed an array with a single read object" do
|
@@ -136,33 +148,97 @@ describe "#summed_overlaps" do
|
|
136
148
|
@reads=[]
|
137
149
|
@calc=NGSCI::Calculator.new(testbam,testfasta)
|
138
150
|
@bam.fetch("NC_001988.2",8,75) {|x| read=@calc.convert(x); @reads << read if read}
|
139
|
-
expect(@calc.
|
151
|
+
expect(@calc.summed_dissimilarity([@reads[0]])).to be_zero
|
140
152
|
end
|
141
153
|
end
|
142
154
|
context "when passed an empty array" do
|
143
155
|
it "returns zero" do
|
144
156
|
@calc=NGSCI::Calculator.new(testbam,testfasta)
|
145
|
-
expect(@calc.
|
157
|
+
expect(@calc.summed_dissimilarity([])).to be_zero
|
146
158
|
end
|
147
159
|
end
|
148
160
|
end
|
149
161
|
|
150
|
-
describe "#
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
162
|
+
describe "#max_summed_dissimilarity" do
|
163
|
+
context "when passed and integer read length" do
|
164
|
+
before(:each) do
|
165
|
+
@read_length = 76
|
166
|
+
@calc = NGSCI::Calculator.new(testbam,testfasta)
|
167
|
+
end
|
168
|
+
it "returns a float" do
|
169
|
+
expect(@calc.max_summed_dissimilarity(@read_length)).to be_kind_of Integer
|
170
|
+
end
|
159
171
|
end
|
160
|
-
|
161
|
-
|
172
|
+
context "when calculating the maximum summed dissimilarity" do
|
173
|
+
before(:each) do
|
174
|
+
@read_length = 76
|
175
|
+
@calc = NGSCI::Calculator.new(saturatedbam,testfasta)
|
176
|
+
@bam = Bio::DB::Sam.new(:bam=>saturatedbam,:fasta=>testfasta)
|
177
|
+
@bam.open
|
178
|
+
@reads = []
|
179
|
+
@bam.fetch("NC_001988.2",76,76){|x| read = @calc.convert(x); @reads << read unless read.nil?}
|
180
|
+
@reads = @reads.uniq{|x| x.start}
|
181
|
+
end
|
182
|
+
it "yields the triangular sum dissimilarity" do
|
183
|
+
# This test demonstrates that the simplified (more efficient) formula for maximum summed dissimilarity
|
184
|
+
# is equivalent to the triangular sum formula for the maximum summed dissimilarity within a group of reads
|
185
|
+
def tri(x,n=0)
|
186
|
+
return x == 0 ? n : tri(x-1,n+x)
|
187
|
+
end
|
188
|
+
triangular_sum = (1..@read_length).to_a.map{|x|
|
189
|
+
tri(@read_length - x) + tri(x - 1)
|
190
|
+
}.reduce(:+)
|
191
|
+
calculated_max_summed_dissimilarity = @calc.max_summed_dissimilarity(@read_length)
|
192
|
+
expect(calculated_max_summed_dissimilarity).to eq(triangular_sum)
|
193
|
+
end
|
194
|
+
it "is equal to the #summed_dissimilarity of saturated reads" do
|
195
|
+
# This test demonstrates that the formula for the theoretical maximum summed dissimilarity among reads
|
196
|
+
# is equivalent to the summed dissimilarity under maximum saturation (the saturated.bam test file)
|
197
|
+
theoretical_max_summed_dissimilarity = @calc.max_summed_dissimilarity(@read_length)
|
198
|
+
expect(theoretical_max_summed_dissimilarity).to eq(@calc.summed_dissimilarity(@reads))
|
199
|
+
end
|
162
200
|
end
|
201
|
+
context "when averaging per read" do
|
202
|
+
it "is equal to 1/3 times (read_length - 1)" do
|
203
|
+
@calc = NGSCI::Calculator.new(testbam,testfasta)
|
204
|
+
(32..200).each do |read_length|
|
205
|
+
calculated_max_summed_dissimilarity = @calc.max_summed_dissimilarity(read_length)/(read_length*read_length)
|
206
|
+
expect(calculated_max_summed_dissimilarity).to eq((read_length-1)/3)
|
207
|
+
end
|
208
|
+
end
|
209
|
+
end
|
210
|
+
end
|
163
211
|
|
164
|
-
|
165
|
-
|
212
|
+
|
213
|
+
describe "#denominator_calc" do
|
214
|
+
context "when passed and integer read length" do
|
215
|
+
before(:each) do
|
216
|
+
@calc = NGSCI::Calculator.new(testbam,testfasta)
|
217
|
+
end
|
218
|
+
it "returns a float denominator" do
|
219
|
+
read_length = 76
|
220
|
+
expect(@calc.denominator_calc(read_length)).to be_kind_of Integer
|
221
|
+
end
|
222
|
+
end
|
223
|
+
it "is the max_summed_dissimilarity * read length" do
|
224
|
+
@calc = NGSCI::Calculator.new(testbam,testfasta)
|
225
|
+
(32..200).each do |read_length|
|
226
|
+
max_sum_dissim = @calc.max_summed_dissimilarity(read_length)
|
227
|
+
expect(@calc.denominator_calc(read_length)).to eq(read_length*max_sum_dissim)
|
228
|
+
end
|
229
|
+
end
|
230
|
+
end
|
231
|
+
|
232
|
+
describe "#read_length_calc" do
|
233
|
+
it "calculates the read length" do
|
234
|
+
@bam=Bio::DB::Sam.new(:bam => testbam,:fasta => testfasta)
|
235
|
+
test_block_size = 100
|
236
|
+
expect(NGSCI::Calculator.read_length_calc(@bam,100)).to eq(76)
|
237
|
+
end
|
238
|
+
|
239
|
+
it "fails on an empty bam file" do
|
240
|
+
@emptybam = Bio::DB::Sam.new(:bam => emptybam, :fasta => testfasta)
|
241
|
+
expect{NGSCI::Calculator.read_length_calc(@emptybam,100)}.to raise_error(NGSCI::NGSCIIOError)
|
166
242
|
end
|
167
243
|
end
|
168
244
|
|
data/spec/lib/read_spec.rb
CHANGED
@@ -1,35 +1,57 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
|
4
|
-
describe
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
4
|
+
describe NGSCI::Read do
|
5
|
+
context "before created" do
|
6
|
+
it "fails to instantiate on a string start site" do
|
7
|
+
expect{NGSCI::Read.new("foo",3)}.to raise_error(NGSCI::NGSCIError)
|
8
|
+
end
|
9
|
+
|
10
|
+
it "fails to instantiate on a string stop site" do
|
11
|
+
expect{NGSCI::Read.new(1,"foo")}.to raise_error(NGSCI::NGSCIError)
|
12
|
+
end
|
13
|
+
|
14
|
+
it "fails to instantiate when the stop site is greater than the start site" do
|
15
|
+
expect{NGSCI::Read.new(3,1)}.to raise_error(NGSCI::NGSCIError)
|
16
|
+
end
|
17
|
+
|
18
|
+
it "fails to instantiate on an improper strand argument" do
|
19
|
+
expect{NGSCI::Read.new(1,3,strand:"foo")}.to raise_error(NGSCI::NGSCIError)
|
20
|
+
end
|
21
|
+
|
22
|
+
it "fails to instantiate without the three necessary arguments" do
|
23
|
+
expect{NGSCI::Read.new(1)}.to raise_error(ArgumentError)
|
24
|
+
end
|
25
|
+
|
26
|
+
it "instantiates a new read with proper unstranded arguments" do
|
27
|
+
expect{NGSCI::Read.new(1,3)}.to_not raise_error
|
28
|
+
end
|
29
|
+
|
30
|
+
it "instantiates a new read with proper stranded arguments" do
|
31
|
+
expect{NGSCI::Read.new(1,3,strand:"+")}.to_not raise_error
|
32
|
+
end
|
24
33
|
end
|
25
34
|
|
26
|
-
|
27
|
-
|
28
|
-
|
35
|
+
context "after created" do
|
36
|
+
before(:each) do
|
37
|
+
@read = NGSCI::Read.new(1,3,strand:"+")
|
38
|
+
end
|
39
|
+
|
40
|
+
it "has a start attribute" do
|
41
|
+
expect(@read.methods).to include(:start)
|
42
|
+
end
|
43
|
+
|
44
|
+
it "has a stop attribute" do
|
45
|
+
expect(@read.methods).to include(:stop)
|
46
|
+
end
|
47
|
+
|
48
|
+
it "has a length attribute" do
|
49
|
+
expect(@read.methods).to include(:length)
|
50
|
+
end
|
51
|
+
|
52
|
+
it "has a strand attribute" do
|
53
|
+
expect(@read.methods).to include(:strand)
|
54
|
+
end
|
29
55
|
|
30
|
-
it "instantiates a new read with proper stranded arguments" do
|
31
|
-
expect{NGSCI::Read.new(1,3,strand:"+")}.to_not raise_error
|
32
56
|
end
|
33
|
-
|
34
|
-
|
35
57
|
end
|
Binary file
|
Binary file
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ngs-ci
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2.b
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Matthew Ralston
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-12-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: trollop
|
@@ -56,16 +56,16 @@ dependencies:
|
|
56
56
|
name: yell
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
|
-
- - "
|
59
|
+
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
61
|
+
version: '2'
|
62
62
|
type: :runtime
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
|
-
- - "
|
66
|
+
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '
|
68
|
+
version: '2'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: ruby-prof
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -84,16 +84,16 @@ dependencies:
|
|
84
84
|
name: bundler
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
|
-
- - "
|
87
|
+
- - "~>"
|
88
88
|
- !ruby/object:Gem::Version
|
89
|
-
version: '
|
89
|
+
version: '1'
|
90
90
|
type: :development
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
93
93
|
requirements:
|
94
|
-
- - "
|
94
|
+
- - "~>"
|
95
95
|
- !ruby/object:Gem::Version
|
96
|
-
version: '
|
96
|
+
version: '1'
|
97
97
|
- !ruby/object:Gem::Dependency
|
98
98
|
name: rake
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
@@ -122,18 +122,32 @@ dependencies:
|
|
122
122
|
- - "~>"
|
123
123
|
- !ruby/object:Gem::Version
|
124
124
|
version: '3.1'
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: pry
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - "~>"
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '0'
|
132
|
+
type: :development
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - "~>"
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0'
|
125
139
|
- !ruby/object:Gem::Dependency
|
126
140
|
name: coveralls
|
127
141
|
requirement: !ruby/object:Gem::Requirement
|
128
142
|
requirements:
|
129
|
-
- - "
|
143
|
+
- - "~>"
|
130
144
|
- !ruby/object:Gem::Version
|
131
145
|
version: '0'
|
132
146
|
type: :development
|
133
147
|
prerelease: false
|
134
148
|
version_requirements: !ruby/object:Gem::Requirement
|
135
149
|
requirements:
|
136
|
-
- - "
|
150
|
+
- - "~>"
|
137
151
|
- !ruby/object:Gem::Version
|
138
152
|
version: '0'
|
139
153
|
description: Calculated a metric that estimates read complexity at each base for RNA-seq
|
@@ -169,6 +183,8 @@ files:
|
|
169
183
|
- spec/lib/read_spec.rb
|
170
184
|
- spec/spec_helper.rb
|
171
185
|
- spec/test_files/empty.bam
|
186
|
+
- spec/test_files/saturated.bam
|
187
|
+
- spec/test_files/saturated.bam.bai
|
172
188
|
- spec/test_files/test.bam
|
173
189
|
- spec/test_files/test.bam.bai
|
174
190
|
- spec/test_files/test.fa
|
@@ -204,6 +220,11 @@ test_files:
|
|
204
220
|
- spec/lib/read_spec.rb
|
205
221
|
- spec/spec_helper.rb
|
206
222
|
- spec/test_files/empty.bam
|
223
|
+
- spec/test_files/saturated.bam
|
224
|
+
- spec/test_files/saturated.bam.bai
|
207
225
|
- spec/test_files/test.bam
|
208
226
|
- spec/test_files/test.bam.bai
|
209
227
|
- spec/test_files/test.fa
|
228
|
+
has_rdoc:
|
229
|
+
- yard
|
230
|
+
- "~> 0"
|