ngs-ci 0.0.1.a → 0.0.2.b
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -1
- data/README.md +20 -7
- data/TODO.org +13 -5
- data/lib/NGSCI/calculator.rb +95 -66
- data/lib/NGSCI/read.rb +3 -2
- data/lib/NGSCI/version.rb +1 -1
- data/lib/NGSCI.rb +6 -4
- data/ngs-ci.gemspec +5 -4
- data/spec/lib/calculator_spec.rb +112 -36
- data/spec/lib/read_spec.rb +49 -27
- data/spec/test_files/saturated.bam +0 -0
- data/spec/test_files/saturated.bam.bai +0 -0
- metadata +33 -12
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1f537bf2115279943c5673a5dde124ae358972f2
|
4
|
+
data.tar.gz: fcb22914dfe57a1f9bdfddb71e8cfd453c9431dd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 123ab12bb812db0c6f2fafe6a6785a792b0e3596b154e45252be188ad7ab192755f4c715268b3b96f1ada8a7a6610afe67b4608352574bd033706868f410c202
|
7
|
+
data.tar.gz: 7f77bcb7e67204cb71f69869f8383233dcbc68d9182b7b62deb6cd186b404342d6aa65365f10f49d0eb7473f83fe562ac12348f74b63936f78d68ee3b9d1cf26
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -1,12 +1,18 @@
|
|
1
|
-
[](https://travis-ci.org/MatthewRalston/ngs-ci)
|
2
2
|
|
3
|
-
[](http://badge.fury.io/rb/ngs-ci)
|
4
4
|
|
5
|
-
[](https://coveralls.io/github/MatthewRalston/ngs-ci?branch=master)
|
6
6
|
|
7
7
|
|
8
|
+
# Todo
|
8
9
|
|
9
|
-
|
10
|
+
The inconsistency between the max summed dissimilarity and the denominator calculation is likely:
|
11
|
+
1. Issue in the complexity index (when present is max,
|
12
|
+
a. ( present - missing ) / max = present/max_similarity - missing/max_dissim
|
13
|
+
|
14
|
+
|
15
|
+
# NGS Complexity Index
|
10
16
|
|
11
17
|
NOTE: This is a project in progress.
|
12
18
|
This gem will calculate a sequencing complexity index for BAM files.
|
@@ -16,7 +22,7 @@ This gem will calculate a sequencing complexity index for BAM files.
|
|
16
22
|
Add this line to your application's Gemfile:
|
17
23
|
|
18
24
|
```ruby
|
19
|
-
gem '
|
25
|
+
gem 'NGSCI'
|
20
26
|
```
|
21
27
|
|
22
28
|
And then execute:
|
@@ -25,11 +31,18 @@ And then execute:
|
|
25
31
|
|
26
32
|
Or install it yourself as:
|
27
33
|
|
28
|
-
$ gem install
|
34
|
+
$ gem install ngs-ci --pre
|
35
|
+
|
36
|
+
Or install manually:
|
37
|
+
|
38
|
+
$ git clone https://github.com/MatthewRalston/ngs-ci.git
|
39
|
+
$ cd ngs-ci
|
40
|
+
$ gem build ngs-ci.gemspec
|
41
|
+
$ gem install ngs-ci-[Version].gem
|
29
42
|
|
30
43
|
## Usage
|
31
44
|
|
32
|
-
|
45
|
+
* See ```--help``` for details. More to come.
|
33
46
|
|
34
47
|
## Contributing
|
35
48
|
|
data/TODO.org
CHANGED
@@ -30,10 +30,18 @@
|
|
30
30
|
**** 2850 (triangular number T(L-1) L=76 J=1
|
31
31
|
**** f(76) = 2850
|
32
32
|
* Notes
|
33
|
-
** U
|
33
|
+
** U = u/L
|
34
34
|
** U/L is the number of unique reads at that base, length normalized
|
35
|
-
**
|
36
|
-
**
|
37
|
-
**
|
38
|
-
|
35
|
+
** U*O/L vs. 200*U*O/(L^2)
|
36
|
+
** the average summed overlap is O/(L)
|
37
|
+
** average because different reads in the formation have different summed overlaps
|
38
|
+
** the average average overlap is 2L/3 or O/L/(L-1) or O/(L^2-L)
|
39
|
+
**
|
40
|
+
** D = d /
|
41
|
+
** the average summed dissimilarity is D/(L)
|
42
|
+
** average because different reads in the formation have different summed dissimilarities
|
43
|
+
** the average average dissimilarity is L/3 or D/L/(L-1) or D/(L^2-L)
|
44
|
+
** This matches the average similarity nicely...
|
45
|
+
**
|
46
|
+
** u*d / L*L*(L-1)
|
39
47
|
* Bugs
|
data/lib/NGSCI/calculator.rb
CHANGED
@@ -7,16 +7,21 @@ module NGSCI
|
|
7
7
|
|
8
8
|
# A calculator calculates the sequencing complexity index.
|
9
9
|
#
|
10
|
-
#
|
10
|
+
# @author Matthew Ralston
|
11
|
+
# @abstract A class for calculating the complexity index on next generation sequencing reads
|
12
|
+
# @attr_reader [Integer] block_size The block size for parallelizing disk access
|
13
|
+
# @attr_reader [Hash<Symbol,Integer>] chroms A hash of chromosomes and their sizes
|
14
|
+
# @attr_reader [Integer] read_length The read length obtained from a bam file
|
15
|
+
# @attr_reader [Integer] denominator The denominator and normalization factors calculated from the read length
|
11
16
|
class Calculator
|
12
|
-
attr_reader :
|
17
|
+
attr_reader :block_size, :chroms, :read_length, :denominator
|
13
18
|
|
14
19
|
# A new calculator to compute the sequencing complexity index given
|
15
20
|
# a loaded Bio::DB::Sam object and optional thread argument.
|
16
21
|
#
|
17
|
-
# @param
|
18
|
-
# @param
|
19
|
-
# @param
|
22
|
+
# @param [Bio::DB::Sam] bam Opened bam file with loaded reference.
|
23
|
+
# @param [Int] threads The number of threads used to compute NGSCI.
|
24
|
+
# @param [String] strand One of [FR RF F] or nil for strandedness.
|
20
25
|
def initialize(bam, reference, strand: nil, threads: 1)
|
21
26
|
@block_size = 1600
|
22
27
|
@results = nil
|
@@ -28,7 +33,8 @@ module NGSCI
|
|
28
33
|
@bam.open
|
29
34
|
@threads = threads
|
30
35
|
@chroms = reference_sequences(reference)
|
31
|
-
read_length
|
36
|
+
@read_length = NGSCI::Calculator.read_length_calc(@bam,@block_size)
|
37
|
+
@denominator = denominator_calc(@read_length)
|
32
38
|
if strand
|
33
39
|
unless %w(FR RF F).include?(strand)
|
34
40
|
raise NGSCI::NGSCIError.new "Strand specific option #{opts.strand} is invalid." +
|
@@ -42,6 +48,7 @@ module NGSCI
|
|
42
48
|
|
43
49
|
# Calculation of the sequencing complexity index
|
44
50
|
#
|
51
|
+
# @param runtime [false] Print profiling information?
|
45
52
|
def run(runtime: false)
|
46
53
|
RubyProf.start if runtime
|
47
54
|
# Convert each aligned read to Read class
|
@@ -83,16 +90,16 @@ module NGSCI
|
|
83
90
|
#
|
84
91
|
# @param chrom [String] The chromosome from the bam file
|
85
92
|
# @param i [Integer] The number of blocks that have been read
|
86
|
-
# @return
|
87
|
-
# * :+ (Array[
|
88
|
-
# * :- (Array[
|
93
|
+
# @return [Hash<Symbol,Array>]
|
94
|
+
# * :+ (Array[Array]) The NGSCI for the + strand along the
|
95
|
+
# * :- (Array[Array]) The NGSCI for the - strand
|
89
96
|
def readblock(chrom,i)
|
90
97
|
reads=[]
|
91
98
|
results = @strand ? {"+" => [],"-" => []}: {nil => []}
|
92
|
-
start = [0,(i * @block_size) - @
|
99
|
+
start = [0,(i * @block_size) - @read_length].max
|
93
100
|
stop = [(i + 1) * @block_size, self.chroms[chrom]].min
|
94
101
|
@bam.fetch(chrom,start,stop) {|read| reads << convert(read)}
|
95
|
-
start += @
|
102
|
+
start += @read_length unless start == 0
|
96
103
|
reads.compact!
|
97
104
|
reads.sort_by!(&:start) unless reads.empty?
|
98
105
|
x=0
|
@@ -109,93 +116,114 @@ module NGSCI
|
|
109
116
|
return results
|
110
117
|
end
|
111
118
|
|
112
|
-
|
113
119
|
# Calculates sequencing complexity index for a single base
|
114
120
|
#
|
115
121
|
# @param reads [Array<NGSCI::Read>] A group of reads aligned to a single base.
|
116
|
-
# @return
|
122
|
+
# @return [Array<Integer,Integer,Float,Float>]
|
117
123
|
def sci(reads)
|
118
124
|
numreads=reads.size
|
119
125
|
# Groups reads by start site
|
120
126
|
# selects the largest read length from the groups
|
121
|
-
reads = reads.group_by(&:start).map{|k,v| v.max{|x,y|
|
122
|
-
|
127
|
+
reads = reads.group_by(&:start).map{|k,v| v.max{|x,y| x.length <=> y.length}}
|
128
|
+
d = summed_dissimilarity(reads)
|
123
129
|
uniquereads = reads.size
|
124
|
-
return [numreads,uniquereads,(
|
130
|
+
return [numreads,uniquereads,(d.to_f/@read_length).round(4),(100*uniquereads*d/@denominator).round(4)]
|
125
131
|
end
|
126
132
|
|
127
|
-
|
133
|
+
|
134
|
+
# Calculation of the dissimilarity between two reads
|
135
|
+
#
|
136
|
+
# @param read1 [NGSCI::Read] First read to be compared
|
137
|
+
# @param read2 [NGSCI::Read] Second read to be compared
|
138
|
+
# @return [Integer] Length of non-overlapping/unique bases
|
139
|
+
def dissimilarity(read1,read2)
|
140
|
+
if read1.start > read2.start
|
141
|
+
if read1.stop < read2.stop # Read 1 is inside read 2
|
142
|
+
(read1.start - read2.start) + (read2.stop - read1.stop)
|
143
|
+
else # Normal overlap
|
144
|
+
read1.start - read2.start
|
145
|
+
end
|
146
|
+
else
|
147
|
+
if read1.stop > read2.stop # Read 2 is inside read 1
|
148
|
+
(read2.start - read1.start) + (read1.stop - read2.stop)
|
149
|
+
else # Normal overlap
|
150
|
+
read2.start - read1.start
|
151
|
+
end
|
152
|
+
end
|
153
|
+
end
|
154
|
+
|
155
|
+
# Calculates summed dissimilarity between a group of reads
|
128
156
|
#
|
129
157
|
# @param reads [Array<NGSCI::Read>] Array of reads
|
130
|
-
# @return
|
131
|
-
def
|
158
|
+
# @return [Integer] Sum of all dissimilarities between the group of reads
|
159
|
+
def summed_dissimilarity(reads)
|
132
160
|
numreads = reads.size
|
133
161
|
sum=0
|
134
|
-
unless numreads
|
162
|
+
unless numreads <= 1
|
135
163
|
i = 0
|
136
164
|
while i < numreads
|
137
165
|
r1 = reads[i] # for each of n reads
|
138
166
|
sum+=reads.
|
139
167
|
reject{|r| r == r1}. # select the n-1 other reads
|
140
|
-
map{|r|
|
168
|
+
map{|r| dissimilarity(r,r1)}. # calculate their overlap to r1
|
141
169
|
reduce(:+)
|
142
170
|
i+=1
|
143
171
|
end
|
144
172
|
end
|
145
173
|
return sum
|
174
|
+
end
|
175
|
+
|
176
|
+
# Calculates the average summed dissimilarity (per read) of that read to all other reads
|
177
|
+
#
|
178
|
+
# @param [Integer] read_length The read length
|
179
|
+
# @return [Integer] avg_summed_dissimilarity
|
180
|
+
def max_summed_dissimilarity(read_length)
|
181
|
+
# For each unique read under maximum saturation, calculate the sum of dissimilarities for that read to all other reads
|
182
|
+
summed_dissimilarities = (1..read_length).to_a.map { |r|
|
183
|
+
(read_length ** 2) / 2 - read_length*r + read_length/2 + r**2 - r }.reduce(:+)
|
146
184
|
end
|
147
185
|
|
148
|
-
#
|
149
|
-
#
|
150
|
-
#
|
151
|
-
#
|
152
|
-
# @
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
read2.stop - read1.start
|
159
|
-
end
|
160
|
-
else
|
161
|
-
if read1.stop > read2.stop # Read 2 is inside read 1
|
162
|
-
read2.stop - read2.start
|
163
|
-
else # Normal overlap
|
164
|
-
read1.stop - read2.start
|
165
|
-
end
|
166
|
-
end
|
167
|
-
end
|
186
|
+
# Calculates the denominator for the complexity index from the read length, assuming maximum saturation (i.e. number of unique reads == read_length)
|
187
|
+
# unique reads /read length * summed_dissimilarity / (max_summed_dissimilarity/(read length * read length)
|
188
|
+
# Denominator = read length * max_summed_dissimilarity / (read_length * read_length)
|
189
|
+
#
|
190
|
+
# @param [Integer] read_length The read length
|
191
|
+
# @return [Integer] denominator The denominator including normalization factors for the complexity index 349184
|
192
|
+
def denominator_calc(read_length)
|
193
|
+
read_length*max_summed_dissimilarity(read_length)
|
194
|
+
end
|
195
|
+
|
168
196
|
|
169
|
-
#
|
197
|
+
# Calculates the read length of a bam file by sampling at least one full block of reads
|
170
198
|
#
|
171
|
-
|
172
|
-
|
173
|
-
|
199
|
+
# @param [Bio::DB::Sam] bam A bam reader object
|
200
|
+
# @param [Integer] block_size The number of reads to read from a bam file
|
201
|
+
# @return [Integer] read_length The read length acquired from reading a block at a time until at least 100 reads are acquired
|
202
|
+
def self.read_length_calc(bam,block_size)
|
203
|
+
stats=bam.index_stats.select {|k,v| k != "*" && v[:mapped_reads] > 0}
|
174
204
|
if stats.empty?
|
175
205
|
raise NGSCIIOError.new "BAM file is empty! Check samtools idxstats."
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
end
|
206
|
+
end
|
207
|
+
i=0
|
208
|
+
lengths=[]
|
209
|
+
test = block_size
|
210
|
+
while i <= test
|
211
|
+
bam.view do |read|
|
212
|
+
lengths << read.seq.size
|
213
|
+
i +=1
|
214
|
+
end
|
215
|
+
if i == test && lengths.size < 100
|
216
|
+
test += block_size
|
188
217
|
end
|
189
|
-
@buffer = lengths.max
|
190
|
-
@denom = @buffer**2 * (@buffer - 1)**2
|
191
218
|
end
|
219
|
+
lengths.max
|
192
220
|
end
|
193
221
|
|
194
222
|
# Converts strand specific BAM read into a sequence object format
|
195
223
|
# Uses the @strand instance variable to determine the strand of conversion
|
196
224
|
#
|
197
|
-
# @param
|
198
|
-
# @return
|
225
|
+
# @param [Bio::DB::Alignment] read Read to be converted.
|
226
|
+
# @return [NGSCI::Read] read Converted Read object
|
199
227
|
def convert(read)
|
200
228
|
unless read.query_unmapped
|
201
229
|
if @strand
|
@@ -212,7 +240,7 @@ module NGSCI
|
|
212
240
|
# Assumes paired-end strand-specific sequencing with "fr" chemistry
|
213
241
|
#
|
214
242
|
# @param read [Bio::DB::Alignment] Read to be converted.
|
215
|
-
# @return
|
243
|
+
# @return [NGSCI::Read] Converted Read object
|
216
244
|
def fr(read)
|
217
245
|
if read.first_in_pair
|
218
246
|
read.query_strand ? newread(read,strand:"+") : newread(read,strand:"-")
|
@@ -226,7 +254,7 @@ module NGSCI
|
|
226
254
|
# Assumes paired-end strand-specific sequencing with "rf" chemistry
|
227
255
|
#
|
228
256
|
# @param read [Bio::DB::Alignment] Read to be converted.
|
229
|
-
# @return
|
257
|
+
# @return [NGSCI::Read] Converted Read object
|
230
258
|
def rf(read)
|
231
259
|
if read.first_in_pair
|
232
260
|
read.query_strand ? newread(read,strand:"-") : newread(read,strand:"+")
|
@@ -240,7 +268,7 @@ module NGSCI
|
|
240
268
|
# Assumes single-end strand-specific sequencing with "f" chemistry
|
241
269
|
#
|
242
270
|
# @param read [Bio::DB::Alignment] Read to be converted.
|
243
|
-
# @return
|
271
|
+
# @return [NGSCI::Read] Converted Read object
|
244
272
|
def f(read)
|
245
273
|
read.query_strand ? newread(read,strand:"+") : newread(read,strand:"-")
|
246
274
|
end
|
@@ -249,7 +277,7 @@ module NGSCI
|
|
249
277
|
#
|
250
278
|
# @param read [Bio::DB::Alignment] Aligned read to be converted
|
251
279
|
# @param strand [String] Strand of read
|
252
|
-
# @return
|
280
|
+
# @return [NGSCI::Read] Converted Read object
|
253
281
|
def newread(read,strand: nil)
|
254
282
|
Read.new(read.pos,read.pos+read.seq.size,strand: strand)
|
255
283
|
end
|
@@ -257,7 +285,7 @@ module NGSCI
|
|
257
285
|
# Acquires names and sizes of reference sequences included in the bam file
|
258
286
|
#
|
259
287
|
# @param reference [String] Path to reference fasta file.
|
260
|
-
# @return
|
288
|
+
# @return [Hash<Symbol,Integer>] A dictionary of chromosome sizes
|
261
289
|
def reference_sequences(reference)
|
262
290
|
chromosomes={}
|
263
291
|
Bio::FastaFormat.open(@reference).each_entry do |f|
|
@@ -265,6 +293,7 @@ module NGSCI
|
|
265
293
|
end
|
266
294
|
chromosomes.select {|chrom| @bam.index_stats.keys.include?(chrom)}
|
267
295
|
end
|
296
|
+
|
268
297
|
# Exports the results to outfile
|
269
298
|
#
|
270
299
|
# @param outfile [String] Path to outfile
|
data/lib/NGSCI/read.rb
CHANGED
@@ -4,9 +4,10 @@ module NGSCI
|
|
4
4
|
#
|
5
5
|
# @!attribute [r] start
|
6
6
|
# @!attribute [r] stop
|
7
|
+
# @!attribute [r] length
|
7
8
|
# @!attribute [r] strand
|
8
9
|
class Read
|
9
|
-
attr_reader :start, :stop, :strand
|
10
|
+
attr_reader :start, :stop, :length, :strand
|
10
11
|
def initialize(start,stop,strand: nil)
|
11
12
|
=begin DEPRECATED chromosome variable
|
12
13
|
unless chr.is_a?(String)
|
@@ -24,8 +25,8 @@ module NGSCI
|
|
24
25
|
end
|
25
26
|
@start=start
|
26
27
|
@stop=stop
|
28
|
+
@length=stop-start
|
27
29
|
@strand=strand
|
28
30
|
end
|
29
|
-
|
30
31
|
end
|
31
32
|
end
|
data/lib/NGSCI/version.rb
CHANGED
data/lib/NGSCI.rb
CHANGED
@@ -1,20 +1,21 @@
|
|
1
|
-
|
1
|
+
|
2
|
+
|
2
3
|
|
3
4
|
# NGSCI stands for Next-Generation Sequencing (NGS) Complexity Index
|
4
5
|
# This program calculates a sequencing complexity index for each base and/or strand in a genome.
|
5
6
|
# This program calculates this by averaging average overlaps of reads aligned to that base.
|
6
7
|
module NGSCI
|
8
|
+
require 'yell'
|
7
9
|
# For custom error handling in the future, unimplemented
|
8
10
|
class NGSCIError < StandardError; end
|
9
11
|
class NGSCIIOError < NGSCIError; end
|
10
|
-
class NGSCIArgError < NGSCIError; end
|
11
|
-
|
12
|
+
class NGSCIArgError < NGSCIError; end
|
12
13
|
|
13
14
|
# Create the universal logger and include it in Object
|
14
15
|
# making the logger object available everywhere
|
15
16
|
format = Yell::Formatter.new("[%5L] %d : %m", "%Y-%m-%d %H:%M:%S")
|
16
17
|
# http://xkcd.com/1179/
|
17
|
-
Yell.new(:format => format) do |l|
|
18
|
+
logger = Yell.new(:format => format) do |l|
|
18
19
|
l.level = :info
|
19
20
|
l.name = Object
|
20
21
|
l.adapter STDOUT, level: [:debug, :info, :warn]
|
@@ -29,3 +30,4 @@ require 'NGSCI/cmd'
|
|
29
30
|
require 'NGSCI/version'
|
30
31
|
require 'NGSCI/calculator'
|
31
32
|
require 'NGSCI/read'
|
33
|
+
#require 'yell'
|
data/ngs-ci.gemspec
CHANGED
@@ -22,14 +22,15 @@ Gem::Specification.new do |spec|
|
|
22
22
|
spec.add_dependency 'trollop','~> 2.1.2'
|
23
23
|
spec.add_dependency 'bio-samtools', '= 2.3.2'
|
24
24
|
spec.add_dependency 'parallel', '~> 1.4'
|
25
|
-
spec.add_dependency 'yell'
|
25
|
+
spec.add_dependency 'yell', '~> 2'
|
26
26
|
spec.add_dependency "ruby-prof", "~> 0.15"
|
27
|
-
spec.has_rdoc = 'yard'
|
27
|
+
spec.has_rdoc = 'yard', '~> 0'
|
28
28
|
|
29
|
-
spec.add_development_dependency "bundler"
|
29
|
+
spec.add_development_dependency "bundler", "~> 1"
|
30
30
|
spec.add_development_dependency "rake", "~> 10.0"
|
31
31
|
spec.add_development_dependency "rspec", "~> 3.1"
|
32
|
+
spec.add_development_dependency "pry", "~> 0"
|
32
33
|
#spec.add_development_dependency "guard", "~> 2.12"
|
33
|
-
spec.add_development_dependency "coveralls"
|
34
|
+
spec.add_development_dependency "coveralls", "~> 0"
|
34
35
|
#spec.add_development_dependency "cucumber", "~> 1.3"
|
35
36
|
end
|
data/spec/lib/calculator_spec.rb
CHANGED
@@ -3,11 +3,11 @@ require 'bio-samtools'
|
|
3
3
|
|
4
4
|
testbam="spec/test_files/test.bam"
|
5
5
|
emptybam="spec/test_files/empty.bam"
|
6
|
+
saturatedbam="spec/test_files/saturated.bam"
|
6
7
|
testfasta="spec/test_files/test.fa"
|
7
8
|
testout="spec/test_files/testfile.txt"
|
8
9
|
|
9
10
|
|
10
|
-
|
11
11
|
describe "#run" do
|
12
12
|
context "during a strand specific run" do
|
13
13
|
before(:each) do
|
@@ -68,21 +68,21 @@ describe "#sci" do
|
|
68
68
|
context "when passed an array of read objects" do
|
69
69
|
before(:each) do
|
70
70
|
@calc = NGSCI::Calculator.new(testbam,testfasta)
|
71
|
-
@bam = Bio::DB::Sam.new(:bam=>
|
71
|
+
@bam = Bio::DB::Sam.new(:bam=>saturatedbam,:fasta=>testfasta)
|
72
72
|
@bam.open
|
73
73
|
@reads = []
|
74
|
-
@bam.fetch("NC_001988.2",
|
75
|
-
@reads = @reads.uniq{|
|
74
|
+
@bam.fetch("NC_001988.2",76,76){|x| read = @calc.convert(x); @reads << read unless read.nil?}
|
75
|
+
@reads = @reads.uniq{|x| x.start}
|
76
76
|
end
|
77
77
|
it "returns an array" do
|
78
78
|
expect(@calc.sci(@reads)).to be_kind_of(Array)
|
79
79
|
end
|
80
80
|
it "returns the sequencing complexity index" do
|
81
|
-
expect(@calc.sci(@reads)[-1]).to eq(
|
81
|
+
expect(@calc.sci(@reads)[-1]).to eq(100.0)
|
82
82
|
end
|
83
83
|
end
|
84
84
|
context "when passed an empty array" do
|
85
|
-
it "returns
|
85
|
+
it "returns zero" do
|
86
86
|
@calc = NGSCI::Calculator.new(testbam,testfasta)
|
87
87
|
empty_sci = @calc.sci([])[-1]
|
88
88
|
expect(empty_sci).to be_zero
|
@@ -90,26 +90,36 @@ describe "#sci" do
|
|
90
90
|
end
|
91
91
|
end
|
92
92
|
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
93
|
+
|
94
|
+
describe "#dissimilarity" do
|
95
|
+
before(:each) do
|
96
|
+
@bam=Bio::DB::Sam.new(:bam => testbam, :fasta => testfasta)
|
97
|
+
@bam.open
|
98
|
+
@reads = []
|
99
|
+
@bam.fetch("NC_001988.2",0,200) {|x| @reads << x }
|
100
|
+
@calc = NGSCI::Calculator.new(testbam,testfasta)
|
101
|
+
@read1 = @calc.convert(@reads[2])
|
102
|
+
@read2 = @calc.convert(@reads[3])
|
97
103
|
end
|
98
|
-
|
99
|
-
|
100
|
-
|
104
|
+
|
105
|
+
it "calculates the unique bases of the first read from the second" do
|
106
|
+
expect(@calc.dissimilarity(@read1,@read2)).to eq(62)
|
107
|
+
end
|
108
|
+
|
109
|
+
it "calculates the unique bases, regardless of the order" do
|
110
|
+
expect(@calc.dissimilarity(@read2,@read1)).to eq(62)
|
101
111
|
end
|
102
112
|
end
|
103
113
|
|
104
|
-
describe "#
|
114
|
+
describe "#summed_dissimilarity" do
|
105
115
|
it "returns an int" do
|
106
116
|
@bam=Bio::DB::Sam.new(:bam=>testbam,:fasta=>testfasta)
|
107
117
|
@bam.open
|
108
118
|
@reads = []
|
109
119
|
@calc=NGSCI::Calculator.new(testbam,testfasta)
|
110
|
-
@bam.fetch("NC_001988.2",8,75) {|x|
|
120
|
+
@bam.fetch("NC_001988.2",8,75) {|x| @reads << @calc.convert(x) }
|
111
121
|
@reads = @reads.uniq{|r| r.start}
|
112
|
-
expect(@calc.
|
122
|
+
expect(@calc.summed_dissimilarity(@reads)).to be_an(Integer)
|
113
123
|
end
|
114
124
|
context "when passed an array of read objects" do
|
115
125
|
before(:each) do
|
@@ -120,13 +130,15 @@ describe "#summed_overlaps" do
|
|
120
130
|
@bam.fetch("NC_001988.2",8,75) {|x| read=@calc.convert(x); @reads << read if read}
|
121
131
|
@reads = @reads.uniq{|r| r.start}
|
122
132
|
end
|
123
|
-
|
124
|
-
|
125
|
-
|
133
|
+
context "when passed two reads" do
|
134
|
+
it "returns the sum of their dissimilarities" do
|
135
|
+
summed_dissimilarity = @calc.dissimilarity(@reads[0],@reads[1]) + @calc.dissimilarity(@reads[1],@reads[0])
|
136
|
+
expect(@calc.summed_dissimilarity(@reads[0..1])).to eq(summed_dissimilarity)
|
137
|
+
end
|
126
138
|
end
|
127
139
|
|
128
|
-
it "calculates the
|
129
|
-
expect(@calc.
|
140
|
+
it "calculates the summed dissimlarity of a group of reads" do
|
141
|
+
expect(@calc.summed_dissimilarity(@reads[0..7]).round(4)).to eq(532.0)
|
130
142
|
end
|
131
143
|
end
|
132
144
|
context "when passed an array with a single read object" do
|
@@ -136,33 +148,97 @@ describe "#summed_overlaps" do
|
|
136
148
|
@reads=[]
|
137
149
|
@calc=NGSCI::Calculator.new(testbam,testfasta)
|
138
150
|
@bam.fetch("NC_001988.2",8,75) {|x| read=@calc.convert(x); @reads << read if read}
|
139
|
-
expect(@calc.
|
151
|
+
expect(@calc.summed_dissimilarity([@reads[0]])).to be_zero
|
140
152
|
end
|
141
153
|
end
|
142
154
|
context "when passed an empty array" do
|
143
155
|
it "returns zero" do
|
144
156
|
@calc=NGSCI::Calculator.new(testbam,testfasta)
|
145
|
-
expect(@calc.
|
157
|
+
expect(@calc.summed_dissimilarity([])).to be_zero
|
146
158
|
end
|
147
159
|
end
|
148
160
|
end
|
149
161
|
|
150
|
-
describe "#
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
162
|
+
describe "#max_summed_dissimilarity" do
|
163
|
+
context "when passed and integer read length" do
|
164
|
+
before(:each) do
|
165
|
+
@read_length = 76
|
166
|
+
@calc = NGSCI::Calculator.new(testbam,testfasta)
|
167
|
+
end
|
168
|
+
it "returns a float" do
|
169
|
+
expect(@calc.max_summed_dissimilarity(@read_length)).to be_kind_of Integer
|
170
|
+
end
|
159
171
|
end
|
160
|
-
|
161
|
-
|
172
|
+
context "when calculating the maximum summed dissimilarity" do
|
173
|
+
before(:each) do
|
174
|
+
@read_length = 76
|
175
|
+
@calc = NGSCI::Calculator.new(saturatedbam,testfasta)
|
176
|
+
@bam = Bio::DB::Sam.new(:bam=>saturatedbam,:fasta=>testfasta)
|
177
|
+
@bam.open
|
178
|
+
@reads = []
|
179
|
+
@bam.fetch("NC_001988.2",76,76){|x| read = @calc.convert(x); @reads << read unless read.nil?}
|
180
|
+
@reads = @reads.uniq{|x| x.start}
|
181
|
+
end
|
182
|
+
it "yields the triangular sum dissimilarity" do
|
183
|
+
# This test demonstrates that the simplified (more efficient) formula for maximum summed dissimilarity
|
184
|
+
# is equivalent to the triangular sum formula for the maximum summed dissimilarity within a group of reads
|
185
|
+
def tri(x,n=0)
|
186
|
+
return x == 0 ? n : tri(x-1,n+x)
|
187
|
+
end
|
188
|
+
triangular_sum = (1..@read_length).to_a.map{|x|
|
189
|
+
tri(@read_length - x) + tri(x - 1)
|
190
|
+
}.reduce(:+)
|
191
|
+
calculated_max_summed_dissimilarity = @calc.max_summed_dissimilarity(@read_length)
|
192
|
+
expect(calculated_max_summed_dissimilarity).to eq(triangular_sum)
|
193
|
+
end
|
194
|
+
it "is equal to the #summed_dissimilarity of saturated reads" do
|
195
|
+
# This test demonstrates that the formula for the theoretical maximum summed dissimilarity among reads
|
196
|
+
# is equivalent to the summed dissimilarity under maximum saturation (the saturated.bam test file)
|
197
|
+
theoretical_max_summed_dissimilarity = @calc.max_summed_dissimilarity(@read_length)
|
198
|
+
expect(theoretical_max_summed_dissimilarity).to eq(@calc.summed_dissimilarity(@reads))
|
199
|
+
end
|
162
200
|
end
|
201
|
+
context "when averaging per read" do
|
202
|
+
it "is equal to 1/3 times (read_length - 1)" do
|
203
|
+
@calc = NGSCI::Calculator.new(testbam,testfasta)
|
204
|
+
(32..200).each do |read_length|
|
205
|
+
calculated_max_summed_dissimilarity = @calc.max_summed_dissimilarity(read_length)/(read_length*read_length)
|
206
|
+
expect(calculated_max_summed_dissimilarity).to eq((read_length-1)/3)
|
207
|
+
end
|
208
|
+
end
|
209
|
+
end
|
210
|
+
end
|
163
211
|
|
164
|
-
|
165
|
-
|
212
|
+
|
213
|
+
describe "#denominator_calc" do
|
214
|
+
context "when passed and integer read length" do
|
215
|
+
before(:each) do
|
216
|
+
@calc = NGSCI::Calculator.new(testbam,testfasta)
|
217
|
+
end
|
218
|
+
it "returns a float denominator" do
|
219
|
+
read_length = 76
|
220
|
+
expect(@calc.denominator_calc(read_length)).to be_kind_of Integer
|
221
|
+
end
|
222
|
+
end
|
223
|
+
it "is the max_summed_dissimilarity * read length" do
|
224
|
+
@calc = NGSCI::Calculator.new(testbam,testfasta)
|
225
|
+
(32..200).each do |read_length|
|
226
|
+
max_sum_dissim = @calc.max_summed_dissimilarity(read_length)
|
227
|
+
expect(@calc.denominator_calc(read_length)).to eq(read_length*max_sum_dissim)
|
228
|
+
end
|
229
|
+
end
|
230
|
+
end
|
231
|
+
|
232
|
+
describe "#read_length_calc" do
|
233
|
+
it "calculates the read length" do
|
234
|
+
@bam=Bio::DB::Sam.new(:bam => testbam,:fasta => testfasta)
|
235
|
+
test_block_size = 100
|
236
|
+
expect(NGSCI::Calculator.read_length_calc(@bam,100)).to eq(76)
|
237
|
+
end
|
238
|
+
|
239
|
+
it "fails on an empty bam file" do
|
240
|
+
@emptybam = Bio::DB::Sam.new(:bam => emptybam, :fasta => testfasta)
|
241
|
+
expect{NGSCI::Calculator.read_length_calc(@emptybam,100)}.to raise_error(NGSCI::NGSCIIOError)
|
166
242
|
end
|
167
243
|
end
|
168
244
|
|
data/spec/lib/read_spec.rb
CHANGED
@@ -1,35 +1,57 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
|
4
|
-
describe
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
4
|
+
describe NGSCI::Read do
|
5
|
+
context "before created" do
|
6
|
+
it "fails to instantiate on a string start site" do
|
7
|
+
expect{NGSCI::Read.new("foo",3)}.to raise_error(NGSCI::NGSCIError)
|
8
|
+
end
|
9
|
+
|
10
|
+
it "fails to instantiate on a string stop site" do
|
11
|
+
expect{NGSCI::Read.new(1,"foo")}.to raise_error(NGSCI::NGSCIError)
|
12
|
+
end
|
13
|
+
|
14
|
+
it "fails to instantiate when the stop site is greater than the start site" do
|
15
|
+
expect{NGSCI::Read.new(3,1)}.to raise_error(NGSCI::NGSCIError)
|
16
|
+
end
|
17
|
+
|
18
|
+
it "fails to instantiate on an improper strand argument" do
|
19
|
+
expect{NGSCI::Read.new(1,3,strand:"foo")}.to raise_error(NGSCI::NGSCIError)
|
20
|
+
end
|
21
|
+
|
22
|
+
it "fails to instantiate without the three necessary arguments" do
|
23
|
+
expect{NGSCI::Read.new(1)}.to raise_error(ArgumentError)
|
24
|
+
end
|
25
|
+
|
26
|
+
it "instantiates a new read with proper unstranded arguments" do
|
27
|
+
expect{NGSCI::Read.new(1,3)}.to_not raise_error
|
28
|
+
end
|
29
|
+
|
30
|
+
it "instantiates a new read with proper stranded arguments" do
|
31
|
+
expect{NGSCI::Read.new(1,3,strand:"+")}.to_not raise_error
|
32
|
+
end
|
24
33
|
end
|
25
34
|
|
26
|
-
|
27
|
-
|
28
|
-
|
35
|
+
context "after created" do
|
36
|
+
before(:each) do
|
37
|
+
@read = NGSCI::Read.new(1,3,strand:"+")
|
38
|
+
end
|
39
|
+
|
40
|
+
it "has a start attribute" do
|
41
|
+
expect(@read.methods).to include(:start)
|
42
|
+
end
|
43
|
+
|
44
|
+
it "has a stop attribute" do
|
45
|
+
expect(@read.methods).to include(:stop)
|
46
|
+
end
|
47
|
+
|
48
|
+
it "has a length attribute" do
|
49
|
+
expect(@read.methods).to include(:length)
|
50
|
+
end
|
51
|
+
|
52
|
+
it "has a strand attribute" do
|
53
|
+
expect(@read.methods).to include(:strand)
|
54
|
+
end
|
29
55
|
|
30
|
-
it "instantiates a new read with proper stranded arguments" do
|
31
|
-
expect{NGSCI::Read.new(1,3,strand:"+")}.to_not raise_error
|
32
56
|
end
|
33
|
-
|
34
|
-
|
35
57
|
end
|
Binary file
|
Binary file
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ngs-ci
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2.b
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Matthew Ralston
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-12-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: trollop
|
@@ -56,16 +56,16 @@ dependencies:
|
|
56
56
|
name: yell
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
|
-
- - "
|
59
|
+
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
61
|
+
version: '2'
|
62
62
|
type: :runtime
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
|
-
- - "
|
66
|
+
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '
|
68
|
+
version: '2'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: ruby-prof
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -84,16 +84,16 @@ dependencies:
|
|
84
84
|
name: bundler
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
|
-
- - "
|
87
|
+
- - "~>"
|
88
88
|
- !ruby/object:Gem::Version
|
89
|
-
version: '
|
89
|
+
version: '1'
|
90
90
|
type: :development
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
93
93
|
requirements:
|
94
|
-
- - "
|
94
|
+
- - "~>"
|
95
95
|
- !ruby/object:Gem::Version
|
96
|
-
version: '
|
96
|
+
version: '1'
|
97
97
|
- !ruby/object:Gem::Dependency
|
98
98
|
name: rake
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
@@ -122,18 +122,32 @@ dependencies:
|
|
122
122
|
- - "~>"
|
123
123
|
- !ruby/object:Gem::Version
|
124
124
|
version: '3.1'
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: pry
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - "~>"
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '0'
|
132
|
+
type: :development
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - "~>"
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0'
|
125
139
|
- !ruby/object:Gem::Dependency
|
126
140
|
name: coveralls
|
127
141
|
requirement: !ruby/object:Gem::Requirement
|
128
142
|
requirements:
|
129
|
-
- - "
|
143
|
+
- - "~>"
|
130
144
|
- !ruby/object:Gem::Version
|
131
145
|
version: '0'
|
132
146
|
type: :development
|
133
147
|
prerelease: false
|
134
148
|
version_requirements: !ruby/object:Gem::Requirement
|
135
149
|
requirements:
|
136
|
-
- - "
|
150
|
+
- - "~>"
|
137
151
|
- !ruby/object:Gem::Version
|
138
152
|
version: '0'
|
139
153
|
description: Calculated a metric that estimates read complexity at each base for RNA-seq
|
@@ -169,6 +183,8 @@ files:
|
|
169
183
|
- spec/lib/read_spec.rb
|
170
184
|
- spec/spec_helper.rb
|
171
185
|
- spec/test_files/empty.bam
|
186
|
+
- spec/test_files/saturated.bam
|
187
|
+
- spec/test_files/saturated.bam.bai
|
172
188
|
- spec/test_files/test.bam
|
173
189
|
- spec/test_files/test.bam.bai
|
174
190
|
- spec/test_files/test.fa
|
@@ -204,6 +220,11 @@ test_files:
|
|
204
220
|
- spec/lib/read_spec.rb
|
205
221
|
- spec/spec_helper.rb
|
206
222
|
- spec/test_files/empty.bam
|
223
|
+
- spec/test_files/saturated.bam
|
224
|
+
- spec/test_files/saturated.bam.bai
|
207
225
|
- spec/test_files/test.bam
|
208
226
|
- spec/test_files/test.bam.bai
|
209
227
|
- spec/test_files/test.fa
|
228
|
+
has_rdoc:
|
229
|
+
- yard
|
230
|
+
- "~> 0"
|