ngs-ci 0.0.1.a
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.coveralls.yml +1 -0
- data/.gitignore +15 -0
- data/.rspec +1 -0
- data/.travis.yml +19 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +674 -0
- data/README.md +43 -0
- data/Rakefile +6 -0
- data/TODO.md +31 -0
- data/TODO.org +39 -0
- data/bin/ngs-ci +125 -0
- data/lib/NGSCI/calculator.rb +289 -0
- data/lib/NGSCI/cmd.rb +23 -0
- data/lib/NGSCI/read.rb +31 -0
- data/lib/NGSCI/version.rb +3 -0
- data/lib/NGSCI.rb +31 -0
- data/ngs-ci.gemspec +35 -0
- data/spec/lib/NGSCI_spec.rb +10 -0
- data/spec/lib/bin_spec.rb +51 -0
- data/spec/lib/calculator_spec.rb +316 -0
- data/spec/lib/cmd_spec.rb +17 -0
- data/spec/lib/read_spec.rb +35 -0
- data/spec/spec_helper.rb +11 -0
- data/spec/test_files/empty.bam +0 -0
- data/spec/test_files/test.bam +0 -0
- data/spec/test_files/test.bam.bai +0 -0
- data/spec/test_files/test.fa +2 -0
- metadata +209 -0
data/README.md
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
[![Build Status](https://travis-ci.org/MatthewRalston/SCI.png?branch=master)](https://travis-ci.org/MatthewRalston/SCI)
|
2
|
+
|
3
|
+
[![Gem Version](https://badge.fury.io/rb/SCI.png)](http://badge.fury.io/rb/SCI)
|
4
|
+
|
5
|
+
[![Coverage Status](https://coveralls.io/repos/MatthewRalston/SCI/badge.png)](https://coveralls.io/r/MatthewRalston/SCI)
|
6
|
+
|
7
|
+
|
8
|
+
|
9
|
+
# SCI
|
10
|
+
|
11
|
+
NOTE: This is a project in progress.
|
12
|
+
This gem will calculate a sequencing complexity index for BAM files.
|
13
|
+
|
14
|
+
## Installation
|
15
|
+
|
16
|
+
Add this line to your application's Gemfile:
|
17
|
+
|
18
|
+
```ruby
|
19
|
+
gem 'ngs-ci'
|
20
|
+
```
|
21
|
+
|
22
|
+
And then execute:
|
23
|
+
|
24
|
+
$ bundle
|
25
|
+
|
26
|
+
Or install it yourself as:
|
27
|
+
|
28
|
+
$ gem install ngs-ci --pre
|
29
|
+
|
30
|
+
## Usage
|
31
|
+
|
32
|
+
TODO: Write usage instructions here
|
33
|
+
|
34
|
+
## Contributing
|
35
|
+
|
36
|
+
1. Fork it ( https://github.com/MatthewRalston/SCI/fork )
|
37
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
38
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
39
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
40
|
+
5. Create a new Pull Request
|
41
|
+
|
42
|
+
## License
|
43
|
+
GPL v3. See LICENSE.txt for details.
|
data/Rakefile
ADDED
data/TODO.md
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
Library Complexity Index
|
2
|
+
|
3
|
+
* Summary
|
4
|
+
This is a Ruby gem that calculates an alternative to the "pileup" format. The calculation is an average of average overlaps among reads at a particular base in the genome. Offers both stranded and unstranded calculations.
|
5
|
+
|
6
|
+
* To-do list
|
7
|
+
1. Create gem environment
|
8
|
+
2. Create master class with options a la transrate
|
9
|
+
* including FR, RF, ??, or F strand specific options
|
10
|
+
3. Create bam processing class that
|
11
|
+
* increments through bases (x)
|
12
|
+
* calls stranded or unstranded method
|
13
|
+
* adds results to list
|
14
|
+
* stranded method (strand chemistry)
|
15
|
+
* calls methods by strand chemistry
|
16
|
+
* returns calculation class results
|
17
|
+
* strand chemistry methods
|
18
|
+
* each has specific SAM flags used to acquire reads
|
19
|
+
* e.g. for FR chemistry
|
20
|
+
* F reads are acquired according to strand
|
21
|
+
* R reads are acquired and assigned according to strand of mate
|
22
|
+
* unstranded method
|
23
|
+
* calls samtools to acquire reads from base "x"
|
24
|
+
* converts to bed and sorts
|
25
|
+
* returns calculation class results
|
26
|
+
4. Create calculation class that
|
27
|
+
* Increments through reads (i)
|
28
|
+
* Acquires reads overlapping read "i"
|
29
|
+
* Calculates average overlap and adds to list
|
30
|
+
* Averages all overlaps and returns
|
31
|
+
6. Prints either a Nx2 matrix or 1xN matrix to file.
|
data/TODO.org
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
* TODO
|
2
|
+
** Square the U/L term?? Max of 100?
|
3
|
+
** Rarefaction curves
|
4
|
+
*** bash + samtools + resampling (100 times?)
|
5
|
+
*** How many divisions??
|
6
|
+
** Triangular numbers
|
7
|
+
*** Triangular number defined as
|
8
|
+
*** T(n) = n(n+1)/2
|
9
|
+
*** Maximum overlap determined by function of read length L
|
10
|
+
*** The read with most 'even' overlaps is directly in the middle
|
11
|
+
*** EXAMPLES:
|
12
|
+
**** Left most read (a)
|
13
|
+
***** T(L-1)
|
14
|
+
**** Next left-most read (b)
|
15
|
+
***** (L-1) + T(L-1) - 1
|
16
|
+
**** (c)
|
17
|
+
***** (L-2) + (L-1) + T(L-1) - 2 - 1
|
18
|
+
*** For each of j reads (aligned in best case scenario)
|
19
|
+
**** Sum overlaps with all other reads:
|
20
|
+
**** f(L) = 2*T(L-1) - T(J-1) - T(L - J)
|
21
|
+
**** f(L) = (L-1)(L-1+1) - (J-1)(J-1+1)/2 - (L-J)(L-J+1)/2
|
22
|
+
**** f(L) = L(L-1) - J*(J-1)/2 - (L-J)(L-J+1)/2
|
23
|
+
**** f(L) = L^2 - L + (-J*J + J)/2 - (L-J)(L-J+1)/2
|
24
|
+
**** 2f(L) = 2*(L^2) - 2L - J^2 + J - (L-J)(L-J+1)
|
25
|
+
**** 2f(L) = 2*(L^2) - 2L - J^2 + J - (L^2 - 2LJ + L + J^2 - J)
|
26
|
+
**** 2f(L) = 2*(L^2) - 2L - J^2 + J - L^2 + 2LJ - L - J^2 + J
|
27
|
+
**** 2f(L) = 2*(L^2) - L^2 - J^2 - J^2 + 2LJ - 2L - L + J + J
|
28
|
+
**** 2f(L) = L^2 - 2*(J^2) + 2LJ - 3L + 2J
|
29
|
+
**** f(L) = -J^2 + (L^2)/2 + LJ - 3L/2 + J
|
30
|
+
**** 2850 (triangular number T(L-1), L=76, J=1)
|
31
|
+
**** f(76) = 2850
|
32
|
+
* Notes
|
33
|
+
** U*O/L vs. 200*U*O/(L^2)
|
34
|
+
** U/L is the number of unique reads at that base, length normalized
|
35
|
+
** When U/L is 1 (maximum saturation)
|
36
|
+
** O = L/2
|
37
|
+
** However, the average overlap can be greater than L/2 with fewer reads
|
38
|
+
|
39
|
+
* Bugs
|
data/bin/ngs-ci
ADDED
@@ -0,0 +1,125 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'trollop'
|
4
|
+
require 'bio'
|
5
|
+
require 'NGSCI'
|
6
|
+
require 'yell'
|
7
|
+
|
8
|
+
include NGSCI
|
9
|
+
|
10
|
+
# Performance settings
|
11
|
+
RUBY_GC_HEAP_GROWTH_FACTOR=2
|
12
|
+
RUBY_GC_MALLOC_LIMIT=40000000
|
13
|
+
RUBY_GC_MALLOC_LIMIT_MAX=75000000
|
14
|
+
|
15
|
+
|
16
|
+
# Show the help message without arguments
|
17
|
+
|
18
|
+
ARGV[0] = "--help" if ARGV.length == 0
|
19
|
+
|
20
|
+
# Route NGSCI errors through the logger rather than printing a backtrace:
# the message is logged at error level, the backtrace only at debug level
# (visible when the user asks for it with --loglevel debug), and the process
# exits with status 1. Any other exception is raised as usual.
module Kernel
  alias _raise raise

  def raise(*args)
    _raise(*args)
  rescue NGSCIError => error
    logger.error error.message
    trace = error.backtrace
    logger.debug trace unless trace.nil?
    exit 1
  end
end
|
36
|
+
|
37
|
+
|
38
|
+
opts = Trollop::options do
|
39
|
+
version NGSCI::VERSION
|
40
|
+
banner <<-EOS
|
41
|
+
NGSCI v#{NGSCI::VERSION}
|
42
|
+
by Matt Ralston <mrals@udel.edu>
|
43
|
+
DESCRIPTION:
|
44
|
+
Calculates a complexity metric for each base in the genome,
|
45
|
+
an alternative to pileup format.
|
46
|
+
The complexity metric is calculated for each base in the genome as the product of a read-length normalized average read overlap and a read-length normalized number of unique reads.
|
47
|
+
|
48
|
+
Bug reports and feature requests at:
|
49
|
+
http://github.com/MatthewRalston/sci
|
50
|
+
USAGE:
|
51
|
+
ngs-ci --reference REFERENCE_FASTA --bam SORTED_BAM <options>
|
52
|
+
|
53
|
+
EXAMPLES:
|
54
|
+
# compute sci for a set of reads
|
55
|
+
ngs-ci --reference genome.fa --bam aligned_reads.bam
|
56
|
+
|
57
|
+
|
58
|
+
OPTIONS:
|
59
|
+
EOS
|
60
|
+
opt :reference, "Reference genome in fasta format.",
|
61
|
+
:type => String,
|
62
|
+
:required => true
|
63
|
+
opt :bam, "Sorted bam file.",
|
64
|
+
:type => String,
|
65
|
+
:required => true
|
66
|
+
opt :strand, "Strand specific option. One of [FR, RF, F].",
|
67
|
+
:type => String
|
68
|
+
opt :threads, "Number of threads to use",
|
69
|
+
:default => 1,
|
70
|
+
:type => Integer
|
71
|
+
opt :outfile, "Prefix filename to use for CSV output",
|
72
|
+
:default => "sci.csv"
|
73
|
+
opt :loglevel, "The amount of information to print. " +
|
74
|
+
"One of [error, info, warn, debug]",
|
75
|
+
:default => 'info'
|
76
|
+
end
|
77
|
+
|
78
|
+
####################
|
79
|
+
# Handle commands
|
80
|
+
####################
|
81
|
+
# Logging: reject unknown levels before configuring the logger
loglevel = opts.loglevel
unless %w[error info warn debug].include?(loglevel)
  raise NGSCIError.new("Loglevel #{loglevel} is not valid. " \
                       "It must be one of: error, info, warn, debug.")
end

logger.level = Yell::Level.new(loglevel.to_sym)

# Strand specific option: optional, but restricted to the known chemistries
if opts.strand && !%w[FR RF F].include?(opts.strand)
  raise NGSCIError.new("Strand specific option #{opts.strand} is invalid." \
                       " It must be one of: [FR, RF, F]")
end

# Bam and fasta files: both must be given and both must exist
unless opts.bam && opts.reference
  raise NGSCIIOError.new("A sorted BAM file and a fasta file are required.")
end
raise NGSCIIOError.new("BAM file #{opts.bam} does not exist.") unless File.exist?(opts.bam)
raise NGSCIIOError.new("Fasta file #{opts.reference} does not exist.") unless File.exist?(opts.reference)
|
107
|
+
|
108
|
+
####################
|
109
|
+
# Run calculation
|
110
|
+
####################
|
111
|
+
logger.info "Opening BAM and reference files for calculation."
calculator = Calculator.new(opts.bam,
                            opts.reference,
                            strand: opts.strand,
                            threads: opts.threads)

# Profiling output is only requested at the debug log level
calculator.run(runtime: opts.loglevel == "debug")

outfile = opts.outfile
logger.info "Writing sequencing complexity index to #{outfile}"
calculator.export(outfile)

logger.info "Calculation complete."
|
@@ -0,0 +1,289 @@
|
|
1
|
+
require 'bio'
|
2
|
+
require 'parallel'
|
3
|
+
require 'bio-samtools'
|
4
|
+
require 'ruby-prof'
|
5
|
+
|
6
|
+
module NGSCI
|
7
|
+
|
8
|
+
# A calculator calculates the sequencing complexity index.
|
9
|
+
#
|
10
|
+
# @!attribute [r] sci
|
11
|
+
class Calculator
|
12
|
+
attr_reader :sci, :block_size, :buffer, :chroms
|
13
|
+
|
14
|
+
# A new calculator to compute the sequencing complexity index from a sorted
# BAM file and its reference. The BAM file is indexed if it is not already,
# opened, and the maximum read length is loaded via #read_length.
#
# @param bam [String] Path to the sorted BAM file.
# @param reference [String] Path to the reference fasta file.
# @param strand [String, nil] One of FR, RF or F for strandedness, or nil.
# @param threads [Integer] The number of worker processes used to compute NGSCI.
# @raise [NGSCI::NGSCIError] if strand is given and is not one of FR, RF, F.
def initialize(bam, reference, strand: nil, threads: 1)
  @block_size = 1600
  @results = nil
  @reference = reference
  @bam = Bio::DB::Sam.new(:bam => bam, :fasta => reference)
  @bam.index unless @bam.indexed?
  @bam.open
  @threads = threads
  @chroms = reference_sequences(reference)
  read_length
  if strand && !%w(FR RF F).include?(strand)
    # Interpolate the keyword argument itself: `opts` only exists in the
    # command-line script, not inside this class.
    raise NGSCI::NGSCIError.new("Strand specific option #{strand} is invalid." +
                                " It must be one of: [FR, RF, F]")
  end
  # Stored lower-cased because it is later used as the name of the
  # conversion method (fr, rf or f).
  @strand = strand ? strand.downcase : nil
end
|
42
|
+
|
43
|
+
# Calculation of the sequencing complexity index for every reference sequence.
#
# Each chromosome is split into blocks of @block_size bases. The block indices
# are mapped through Parallel (see #readblock) and the rows returned for each
# block are concatenated, in block order, under each strand key.
#
# @param runtime [Boolean] When true, profile the run with RubyProf and print
#   a flat profile to STDOUT.
# @return [Hash] chromosome => { strand key => rows }; also stored in @results.
def run(runtime: false)
  RubyProf.start if runtime

  per_chrom = {}
  @chroms.each do |name, length|
    strand_keys = @strand ? ["+", "-"] : [nil]
    block_count = (length / @block_size.to_f).ceil

    blocks = Parallel.map((0...block_count).to_a, :in_processes => @threads) do |index|
      readblock(name, index)
    end

    per_chrom[name] = {}
    strand_keys.each do |key|
      per_chrom[name][key] = blocks.map { |block| block[key] }.flatten(1)
    end
  end

  # Printing runtime information for optimization
  if runtime
    profile = RubyProf.stop
    RubyProf::FlatPrinter.new(profile).print(STDOUT)
  end
  @results = per_chrom
end
|
81
|
+
|
82
|
+
# Reads a single block of a chromosome from the BAM file and calculates the
# per-base statistics for every base of that block.
#
# Reads are fetched starting @buffer bases upstream of the block so that reads
# beginning before the block but extending into it are taken into account.
#
# @param chrom [String] The chromosome from the bam file
# @param i [Integer] Zero-based index of the block to read
# @return [Hash] rows of [base, *sci(reads)] keyed by strand
#   ("+" and "-" when a strand chemistry is set, nil otherwise)
def readblock(chrom, i)
  results = @strand ? { "+" => [], "-" => [] } : { nil => [] }
  block_start = i * @block_size
  stop = [block_start + @block_size, chroms[chrom]].min
  fetch_start = [0, block_start - @buffer].max

  reads = []
  @bam.fetch(chrom, fetch_start, stop) { |read| reads << convert(read) }
  reads.compact! # convert returns nil for unmapped reads
  reads.sort_by!(&:start)

  # Report the block's own bases only. The range is derived from the block
  # index rather than from the clamped fetch start, so it stays correct when
  # the look-behind buffer reaches back to (or past) position 0.
  (block_start...stop).each do |base|
    aligned = reads.select { |r| r.start <= base && r.stop - 1 >= base }.group_by(&:strand)
    results.each_key do |key|
      results[key] << [base, *sci(aligned[key] || [])]
    end
  end
  results
end
|
111
|
+
|
112
|
+
|
113
|
+
# Calculates the sequencing complexity statistics for a single base.
#
# Reads sharing a start site are collapsed to the longest read of the group,
# which gives the set of unique reads; the overlap is summed over that set.
#
# @param reads [Array<NGSCI::Read>] A group of reads aligned to a single base.
# @return [Array] [depth, unique reads, normalized overlap, NGS-CI]; the last
#   two values are rounded to 4 decimals
def sci(reads)
  depth = reads.size
  unique = reads.group_by(&:start).map do |_start, group|
    group.max_by { |r| (r.stop - r.start).abs }
  end
  o = summed_overlaps(unique).to_f
  unique_count = unique.size
  overlap_score = (@buffer * o / @denom).round(4)
  # Float arithmetic on purpose: with Integer operands the division would
  # truncate the index (to 0 for any value below 1) before it is rounded.
  index = (300 * unique_count * o / (2 * @denom)).round(4)
  [depth, unique_count, overlap_score, index]
end
|
126
|
+
|
127
|
+
# Calculates the summed overlap between a group of reads: for every read, its
# overlap with each of the other reads is added up, so each pair of reads
# contributes twice.
#
# @param reads [Array<NGSCI::Read>] Array of reads
# @return [Integer] Summed overlap between reads (0 for a single read)
def summed_overlaps(reads)
  total = 0
  return total if reads.size == 1

  reads.each do |current|
    others = reads.reject { |r| r == current }
    total += others.map { |r| overlap(r, current) }.reduce(:+)
  end
  total
end
|
147
|
+
|
148
|
+
# Calculation of the overlap between two reads: the distance from the
# right-most start to the left-most stop. When one read lies inside the
# other this is the length of the inner read.
#
# @param read1 [NGSCI::Read] First read to be compared
# @param read2 [NGSCI::Read] Second read to be compared
# @return [Integer] Length of overlap
def overlap(read1, read2)
  rightmost_start = [read1.start, read2.start].max
  leftmost_stop = [read1.stop, read2.stop].min
  leftmost_stop - rightmost_start
end
|
168
|
+
|
169
|
+
# Loads the longest read length found in the BAM file into the @buffer
# variable and derives the normalization constant @denom from it.
#
# @raise [NGSCIIOError] if the index reports no mapped reads, or if no
#   alignment is yielded by the BAM file
# @return [Integer] the value stored in @denom
def read_length
  mapped = @bam.index_stats.select { |name, stat| name != "*" && stat[:mapped_reads] > 0 }
  raise NGSCIIOError.new "BAM file is empty! Check samtools idxstats." if mapped.empty?

  # A single pass over the alignments is sufficient: re-scanning the file
  # cannot change the maximum, and a scan that yields no read must stop
  # with an error instead of being retried.
  longest = nil
  @bam.view do |read|
    size = read.seq.size
    longest = size if longest.nil? || size > longest
  end
  raise NGSCIIOError.new "BAM file is empty! Check samtools idxstats." if longest.nil?

  @buffer = longest
  @denom = @buffer**2 * (@buffer - 1)**2
end
|
193
|
+
|
194
|
+
# Converts a BAM read into a Read object, or nil when the read is unmapped.
# When @strand is set, the conversion is delegated to the method of that
# name (fr, rf or f); otherwise an unstranded Read is built.
#
# @param read [Bio::DB::Alignment] Read to be converted.
# @return [NGSCI::Read, nil] Converted Read object
def convert(read)
  return nil if read.query_unmapped
  return newread(read) unless @strand

  send(@strand.to_sym, read)
end
|
209
|
+
|
210
|
+
|
211
|
+
# Converts a BAM read into a Read object for paired-end strand-specific
# sequencing with "fr" chemistry: the first mate gets "+" when its
# query_strand flag is set and "-" otherwise; the second mate gets the
# opposite assignment.
#
# @param read [Bio::DB::Alignment] Read to be converted.
# @return [NGSCI::Read] Converted Read object
def fr(read)
  plus = read.query_strand
  plus = !plus unless read.first_in_pair
  newread(read, strand: plus ? "+" : "-")
end
|
223
|
+
|
224
|
+
|
225
|
+
# Converts a BAM read into a Read object for paired-end strand-specific
# sequencing with "rf" chemistry: the first mate gets "-" when its
# query_strand flag is set and "+" otherwise; the second mate gets the
# opposite assignment.
#
# @param read [Bio::DB::Alignment] Read to be converted.
# @return [NGSCI::Read] Converted Read object
def rf(read)
  minus = read.query_strand
  minus = !minus unless read.first_in_pair
  newread(read, strand: minus ? "-" : "+")
end
|
237
|
+
|
238
|
+
|
239
|
+
# Converts a BAM read into a Read object for single-end strand-specific
# sequencing with "f" chemistry: "+" when the read's query_strand flag is
# set, "-" otherwise.
#
# @param read [Bio::DB::Alignment] Read to be converted.
# @return [NGSCI::Read] Converted Read object
def f(read)
  strand = read.query_strand ? "+" : "-"
  newread(read, strand: strand)
end
|
247
|
+
|
248
|
+
# Creates a new Read spanning from the alignment position to that position
# plus the length of the read sequence, with an optional strand.
#
# @param read [Bio::DB::Alignment] Aligned read to be converted
# @param strand [String, nil] Strand of read
# @return [NGSCI::Read] Converted Read object
def newread(read, strand: nil)
  first = read.pos
  last = first + read.seq.size
  Read.new(first, last, strand: strand)
end
|
256
|
+
|
257
|
+
# Acquires names and sizes of the reference sequences that are present both
# in the reference fasta file and in the BAM index statistics.
#
# @param reference [String] Path to reference fasta file.
# @return [Hash<String,Integer>] A dictionary of chromosome sizes
def reference_sequences(reference)
  sizes = {}
  Bio::FastaFormat.open(reference).each_entry do |entry|
    sizes[entry.entry_id] = entry.seq.size
  end
  # Query the index statistics once instead of once per chromosome.
  indexed = @bam.index_stats
  sizes.select { |name, _size| indexed.key?(name) }
end
|
268
|
+
# Exports the results to outfile as CSV: a header line followed by one line
# per chromosome, strand and base. Nothing is written before #run has
# stored results.
#
# @param outfile [String] Path to outfile
# @return [String, nil] outfile, or nil when there are no results to export
def export(outfile)
  return nil unless @results

  header = "Chrom,Base,Strand,Depth,Unique_Reads,Overlap,NGS-CI"
  File.open(outfile, 'w') do |file|
    file.puts(header)
    @results.each do |chrom, by_strand|
      by_strand.each do |strand, rows|
        rows.each do |row|
          base, *stats = row
          file.puts([chrom, base, strand, *stats].join(","))
        end
      end
    end
  end
  outfile
end
|
288
|
+
end # End calculator class
|
289
|
+
end
|
data/lib/NGSCI/cmd.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'open3'
|
2
|
+
|
3
|
+
module NGSCI
|
4
|
+
|
5
|
+
# A shell command whose standard output, standard error and exit status are
# captured with Open3 when it is run.
class Cmd
  attr_accessor :cmd, :stdout, :stderr, :status

  # @param cmd [String] The command line to execute
  def initialize(cmd)
    @cmd = cmd
  end

  # Executes the command and stores its output streams and status.
  #
  # @return [Array] stdout, stderr and the process status
  def run
    captured = Open3.capture3(@cmd)
    @stdout, @stderr, @status = captured
  end

  # @return [String] The command line
  def to_s
    @cmd
  end
end
|
22
|
+
|
23
|
+
end
|
data/lib/NGSCI/read.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
module NGSCI
|
2
|
+
|
3
|
+
# A simple read class: validated start/stop coordinates and an optional strand.
#
# @!attribute [r] start
# @!attribute [r] stop
# @!attribute [r] strand
class Read
  attr_reader :start, :stop, :strand

  # @param start [Integer] Start coordinate of the read
  # @param stop [Integer] Stop coordinate of the read; must be greater than start
  # @param strand [String, nil] "+" or "-", or nil for an unstranded read
  # @raise [NGSCIError] if the coordinates or the strand are invalid
  def initialize(start, stop, strand: nil)
    # The detail line is concatenated into the message explicitly; as a
    # separate statement after `raise` it would never be part of the error.
    details = "start:#{start}\tstop:#{stop}\tstrand:#{strand}"
    unless start.is_a?(Integer) && stop.is_a?(Integer) && stop > start
      raise NGSCIError.new("Invalid coordinate arguments:\n" + details)
    end
    if strand && !%w(+ -).include?(strand)
      raise NGSCIError.new("Invalid strand argument:\n" + details)
    end
    @start = start
    @stop = stop
    @strand = strand
  end
end
|
31
|
+
end
|
data/lib/NGSCI.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'yell'
|
2
|
+
|
3
|
+
# NGSCI stands for Next-Generation Sequencing Complexity Index.
# This program calculates a sequencing complexity index for each base and/or
# strand in a genome, by averaging average overlaps of the reads aligned to
# that base.
module NGSCI
  # Error hierarchy of the gem (custom handling is not implemented yet):
  # NGSCIError is the base class, the other two derive from it.
  class NGSCIError < StandardError; end
  class NGSCIIOError < NGSCIError; end
  class NGSCIArgError < NGSCIError; end

  # Create the universal logger and include Yell::Loggable in Object,
  # making the logger object available everywhere.
  # Timestamps follow http://xkcd.com/1179/
  log_format = Yell::Formatter.new("[%5L] %d : %m", "%Y-%m-%d %H:%M:%S")
  Yell.new(:format => log_format) do |log|
    log.level = :info
    log.name = Object
    log.adapter STDOUT, level: [:debug, :info, :warn]
    log.adapter STDERR, level: [:error, :fatal]
  end
  Object.send :include, Yell::Loggable
end # NGSCI
|
26
|
+
|
27
|
+
# Integrate modules
|
28
|
+
require 'NGSCI/cmd'
|
29
|
+
require 'NGSCI/version'
|
30
|
+
require 'NGSCI/calculator'
|
31
|
+
require 'NGSCI/read'
|
data/ngs-ci.gemspec
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
|
2
|
+
# coding: utf-8
|
3
|
+
lib = File.expand_path('../lib', __FILE__)
|
4
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
5
|
+
require 'NGSCI/version'
|
6
|
+
|
7
|
+
Gem::Specification.new do |spec|
|
8
|
+
spec.name = "ngs-ci"
|
9
|
+
spec.version = NGSCI::VERSION
|
10
|
+
spec.authors = ["Matthew Ralston"]
|
11
|
+
spec.email = ["mrals89@gmail.com"]
|
12
|
+
spec.summary = %q{Next Generation Sequencing Complexity Index.}
|
13
|
+
spec.description = %q{Calculates a metric that estimates read complexity at each base for RNA-seq BAM files. Alternative to pileup format.}
|
14
|
+
spec.homepage = ""
|
15
|
+
spec.license = "GPL v3"
|
16
|
+
|
17
|
+
spec.files = `git ls-files -z`.split("\x0")
|
18
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
19
|
+
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
20
|
+
spec.require_paths = ["lib"]
|
21
|
+
|
22
|
+
spec.add_dependency 'trollop','~> 2.1.2'
|
23
|
+
spec.add_dependency 'bio-samtools', '= 2.3.2'
|
24
|
+
spec.add_dependency 'parallel', '~> 1.4'
|
25
|
+
spec.add_dependency 'yell'
|
26
|
+
spec.add_dependency "ruby-prof", "~> 0.15"
|
27
|
+
spec.has_rdoc = 'yard'
|
28
|
+
|
29
|
+
spec.add_development_dependency "bundler"
|
30
|
+
spec.add_development_dependency "rake", "~> 10.0"
|
31
|
+
spec.add_development_dependency "rspec", "~> 3.1"
|
32
|
+
#spec.add_development_dependency "guard", "~> 2.12"
|
33
|
+
spec.add_development_dependency "coveralls"
|
34
|
+
#spec.add_development_dependency "cucumber", "~> 1.3"
|
35
|
+
end
|