ngs-ci 0.0.1.a
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.coveralls.yml +1 -0
- data/.gitignore +15 -0
- data/.rspec +1 -0
- data/.travis.yml +19 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +674 -0
- data/README.md +43 -0
- data/Rakefile +6 -0
- data/TODO.md +31 -0
- data/TODO.org +39 -0
- data/bin/ngs-ci +125 -0
- data/lib/NGSCI/calculator.rb +289 -0
- data/lib/NGSCI/cmd.rb +23 -0
- data/lib/NGSCI/read.rb +31 -0
- data/lib/NGSCI/version.rb +3 -0
- data/lib/NGSCI.rb +31 -0
- data/ngs-ci.gemspec +35 -0
- data/spec/lib/NGSCI_spec.rb +10 -0
- data/spec/lib/bin_spec.rb +51 -0
- data/spec/lib/calculator_spec.rb +316 -0
- data/spec/lib/cmd_spec.rb +17 -0
- data/spec/lib/read_spec.rb +35 -0
- data/spec/spec_helper.rb +11 -0
- data/spec/test_files/empty.bam +0 -0
- data/spec/test_files/test.bam +0 -0
- data/spec/test_files/test.bam.bai +0 -0
- data/spec/test_files/test.fa +2 -0
- metadata +209 -0
data/README.md
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
[](https://travis-ci.org/MatthewRalston/SCI)
|
2
|
+
|
3
|
+
[](http://badge.fury.io/rb/SCI)
|
4
|
+
|
5
|
+
[](https://coveralls.io/r/MatthewRalston/SCI)
|
6
|
+
|
7
|
+
|
8
|
+
|
9
|
+
# SCI
|
10
|
+
|
11
|
+
NOTE: This is a project in progress.
|
12
|
+
This gem will calculate a sequencing complexity index for BAM files.
|
13
|
+
|
14
|
+
## Installation
|
15
|
+
|
16
|
+
Add this line to your application's Gemfile:
|
17
|
+
|
18
|
+
```ruby
|
19
|
+
gem 'sci'
|
20
|
+
```
|
21
|
+
|
22
|
+
And then execute:
|
23
|
+
|
24
|
+
$ bundle
|
25
|
+
|
26
|
+
Or install it yourself as:
|
27
|
+
|
28
|
+
$ gem install sci --pre
|
29
|
+
|
30
|
+
## Usage
|
31
|
+
|
32
|
+
TODO: Write usage instructions here
|
33
|
+
|
34
|
+
## Contributing
|
35
|
+
|
36
|
+
1. Fork it ( https://github.com/MatthewRalston/SCI/fork )
|
37
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
38
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
39
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
40
|
+
5. Create a new Pull Request
|
41
|
+
|
42
|
+
## License
|
43
|
+
GPL v3. See LICENSE.txt for details.
|
data/Rakefile
ADDED
data/TODO.md
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
Library Complexity Index
|
2
|
+
|
3
|
+
* Summary
|
4
|
+
This is a Ruby gem that calculates an alternative to the "pileup" format. The calculation is an average of average overlaps among reads at a particular base in the genome. Offers both stranded and unstranded calculations.
|
5
|
+
|
6
|
+
* To-do list
|
7
|
+
1. Create gem environment
|
8
|
+
2. Create master class with options a la transrate
|
9
|
+
* including FR, RF, ??, or F strand specific options
|
10
|
+
3. Create bam processing class that
|
11
|
+
* increments through bases (x)
|
12
|
+
* calls stranded or unstranded method
|
13
|
+
* adds results to list
|
14
|
+
* stranded method (strand chemistry)
|
15
|
+
* calls methods by strand chemistry
|
16
|
+
* returns calculation class results
|
17
|
+
* strand chemistry methods
|
18
|
+
* each has specific SAM flags used to acquire reads
|
19
|
+
* e.g. for FR chemistry
|
20
|
+
* F reads are acquired according to strand
|
21
|
+
* R reads are acquired and assigned according to strand of mate
|
22
|
+
* unstranded method
|
23
|
+
* calls samtools to acquire reads from base "x"
|
24
|
+
* converts to bed and sorts
|
25
|
+
* returns calculation class results
|
26
|
+
4. Create calculation class that
|
27
|
+
* Increments through reads (i)
|
28
|
+
* Acquires reads overlapping read "i"
|
29
|
+
* Calculates average overlap and adds to list
|
30
|
+
* Averages all overlaps and returns
|
31
|
+
6. Prints either a Nx2 matrix or 1xN matrix to file.
|
data/TODO.org
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
* TODO
|
2
|
+
** Square the U/L term?? Max of 100?
|
3
|
+
** Rarefaction curves
|
4
|
+
*** bash + samtools + resampling (100 times?)
|
5
|
+
*** How many divisions??
|
6
|
+
** Triangular numbers
|
7
|
+
*** Triangular number defined as
|
8
|
+
*** T(n) = n(n+1)/2
|
9
|
+
*** Maximum overlap determined by function of read length L
|
10
|
+
*** The read with most 'even' overlaps is directly in the middle
|
11
|
+
*** EXAMPLES:
|
12
|
+
**** Left most read (a)
|
13
|
+
***** T(L-1)
|
14
|
+
**** Next left-most read (b)
|
15
|
+
***** (L-1) + T(L-1) - 1
|
16
|
+
**** (c)
|
17
|
+
***** (L-2) + (L-1) + T(L-1) - 2 - 1
|
18
|
+
*** For each of j reads (aligned in best case scenario)
|
19
|
+
**** Sum overlaps with all other reads:
|
20
|
+
**** f(L) = 2*T(L-1) - T(J-1) - T(L - J)
|
21
|
+
**** f(L) = (L-1)(L-1+1) - (J-1)(J-1+1)/2 - (L-J)(L-J+1)/2
|
22
|
+
**** f(L) = L(L-1) - J*(J-1)/2 - (L-J)(L-J+1)/2
|
23
|
+
**** f(L) = L^2 - L + (-J*J + J)/2 - (L-J)(L-J+1)/2
|
24
|
+
**** 2f(L) = 2*(L^2) - 2L - J^2 + J - (L-J)(L-J+1)
|
25
|
+
**** 2f(L) = 2*(L^2) - 2L - J^2 + J - (L^2 - 2LJ + L + J^2 - J)
|
26
|
+
**** 2f(L) = 2*(L^2) - 2L - J^2 + J - L^2 + 2LJ - L - J^2 + J
|
27
|
+
**** 2f(L) = 2*(L^2) - L^2 - J^2 - J^2 + 2LJ - 2L - L + J + J
|
28
|
+
**** 2f(L) = L^2 - 2*(J^2) + 2LJ - 3L + 2J
|
29
|
+
**** f(L) = -J^2 + (L^2)/2 + LJ - 3L/2 + J
|
30
|
+
**** 2850 (triangular number T(L-1)), L=76, J=1
|
31
|
+
**** f(76) = 2850
|
32
|
+
* Notes
|
33
|
+
** U*O/L vs. 200*U*O/(L^2)
|
34
|
+
** U/L is the number of unique reads at that base, length normalized
|
35
|
+
** When U/L is 1 (maximum saturation)
|
36
|
+
** O = L/2
|
37
|
+
** Although, average overlap can be greater than L/2 with fewer reads
|
38
|
+
|
39
|
+
* Bugs
|
data/bin/ngs-ci
ADDED
@@ -0,0 +1,125 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'trollop'
|
4
|
+
require 'bio'
|
5
|
+
require 'NGSCI'
|
6
|
+
require 'yell'
|
7
|
+
|
8
|
+
include NGSCI
|
9
|
+
|
10
|
+
# Performance settings
|
11
|
+
RUBY_GC_HEAP_GROWTH_FACTOR=2
|
12
|
+
RUBY_GC_MALLOC_LIMIT=40000000
|
13
|
+
RUBY_GC_MALLOC_LIMIT_MAX=75000000
|
14
|
+
|
15
|
+
|
16
|
+
# Show the help message without arguments
|
17
|
+
|
18
|
+
ARGV[0] = "--help" if ARGV.length == 0
|
19
|
+
|
20
|
+
# We want clean error messages through the logger, no ugly backtraces
|
21
|
+
# because the user doesn't care about them, unless they specifically ask for
|
22
|
+
# them with --loglevel debug
|
23
|
+
module Kernel
|
24
|
+
alias _raise raise
|
25
|
+
|
26
|
+
def raise(*a)
|
27
|
+
begin
|
28
|
+
_raise(*a)
|
29
|
+
rescue NGSCIError => e
|
30
|
+
logger.error e.message
|
31
|
+
logger.debug e.backtrace unless e.backtrace.nil?
|
32
|
+
exit 1
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
|
38
|
+
opts = Trollop::options do
|
39
|
+
version NGSCI::VERSION
|
40
|
+
banner <<-EOS
|
41
|
+
NGSCI v#{NGSCI::VERSION}
|
42
|
+
by Matt Ralston <mrals@udel.edu>
|
43
|
+
DESCRIPTION:
|
44
|
+
Calculates a complexity metric for each base in the genome,
|
45
|
+
an alternative to pileup format.
|
46
|
+
The complexity metric is calculated for each base in the genome as the product of a read-length normalized average read overlap and a read-length normalized number of unique reads.
|
47
|
+
|
48
|
+
Bug reports and feature requests at:
|
49
|
+
http://github.com/MatthewRalston/sci
|
50
|
+
USAGE:
|
51
|
+
ngs-ci --reference REFERENCE_FASTA --bam SORTED_BAM <options>
|
52
|
+
|
53
|
+
EXAMPLES:
|
54
|
+
# compute sci for a set of reeads
|
55
|
+
ngs-ci --reference genome.fa --bam aligned_reads.bam
|
56
|
+
|
57
|
+
|
58
|
+
OPTIONS:
|
59
|
+
EOS
|
60
|
+
opt :reference, "Reference genome in fasta format.",
|
61
|
+
:type => String,
|
62
|
+
:required => true
|
63
|
+
opt :bam, "Sorted bam file.",
|
64
|
+
:type => String,
|
65
|
+
:required => true
|
66
|
+
opt :strand, "Strand specific option. One of [FR, RF, F].",
|
67
|
+
:type => String
|
68
|
+
opt :threads, "Number of threads to use",
|
69
|
+
:default => 1,
|
70
|
+
:type => Integer
|
71
|
+
opt :outfile, "Prefix filename to use for CSV output",
|
72
|
+
:default => "sci.csv"
|
73
|
+
opt :loglevel, "The amount of information to print. " +
|
74
|
+
"One of [error, info, warn, debug]",
|
75
|
+
:default => 'info'
|
76
|
+
end
|
77
|
+
|
78
|
+
####################
|
79
|
+
# Handle commands
|
80
|
+
####################
|
81
|
+
# Logging
|
82
|
+
unless %w[error info warn debug].include? opts.loglevel
|
83
|
+
raise NGSCIError.new "Loglevel #{opts.loglevel} is not valid. " +
|
84
|
+
"It must be one of: error, info, warn, debug."
|
85
|
+
end
|
86
|
+
|
87
|
+
logger.level = Yell::Level.new opts.loglevel.to_sym
|
88
|
+
|
89
|
+
# Strand specific option
|
90
|
+
if opts.strand
|
91
|
+
unless %w[FR RF F].include? opts.strand
|
92
|
+
raise NGSCIError.new "Strand specific option #{opts.strand} is invalid." +
|
93
|
+
" It must be one of: [FR, RF, F]"
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
# Bam and fasta files exist
|
98
|
+
if opts.bam && opts.reference
|
99
|
+
if !File.exist?(opts.bam)
|
100
|
+
raise NGSCIIOError.new "BAM file #{opts.bam} does not exist."
|
101
|
+
elsif !File.exist?(opts.reference)
|
102
|
+
raise NGSCIIOError.new "Fasta file #{opts.reference} does not exist."
|
103
|
+
end
|
104
|
+
else
|
105
|
+
raise NGSCIIOError.new "A sorted BAM file and a fasta file are required."
|
106
|
+
end
|
107
|
+
|
108
|
+
####################
|
109
|
+
# Run calculation
|
110
|
+
####################
|
111
|
+
logger.info "Opening BAM and reference files for calculation."
|
112
|
+
calculator = Calculator.new(opts.bam,opts.reference,strand: opts.strand,threads: opts.threads)
|
113
|
+
|
114
|
+
if opts.loglevel == "debug"
|
115
|
+
calculator.run(runtime: true)
|
116
|
+
else
|
117
|
+
calculator.run
|
118
|
+
end
|
119
|
+
|
120
|
+
|
121
|
+
outfile = opts.outfile
|
122
|
+
logger.info "Writing sequencing complexity index to #{outfile}"
|
123
|
+
calculator.export(outfile)
|
124
|
+
|
125
|
+
logger.info "Calculation complete."
|
@@ -0,0 +1,289 @@
|
|
1
|
+
require 'bio'
|
2
|
+
require 'parallel'
|
3
|
+
require 'bio-samtools'
|
4
|
+
require 'ruby-prof'
|
5
|
+
|
6
|
+
module NGSCI
|
7
|
+
|
8
|
+
# A calculator calculates the sequencing complexity index.
|
9
|
+
#
|
10
|
+
# @!attribute [r] sci
|
11
|
+
class Calculator
|
12
|
+
attr_reader :sci, :block_size, :buffer, :chroms
|
13
|
+
|
14
|
+
# Builds a calculator for the sequencing complexity index.
#
# Opens the BAM file against the reference (indexing it first when no index
# exists), records the reference sequences present in the BAM index and
# measures the read length used as the block buffer.
#
# @param bam [String] Path to a sorted BAM file.
# @param reference [String] Path to the reference fasta file.
# @param strand [String, nil] One of FR, RF, F, or nil for unstranded data.
# @param threads [Integer] The number of processes used to compute NGSCI.
# @raise [NGSCI::NGSCIError] If strand is given but is not FR, RF or F.
def initialize(bam, reference, strand: nil, threads: 1)
  # Validate before touching the BAM so a bad option fails fast and does not
  # leave a freshly written index behind.
  if strand && !%w(FR RF F).include?(strand)
    raise NGSCI::NGSCIError, "Strand specific option #{strand} is invalid." +
                             " It must be one of: [FR, RF, F]"
  end
  # Stored lower-case because it doubles as the converter method name.
  @strand = strand ? strand.downcase : nil
  @block_size = 1600
  @results = nil
  @reference = reference
  @bam = Bio::DB::Sam.new(:bam => bam, :fasta => reference)
  @bam.index unless @bam.indexed?
  @bam.open
  @threads = threads
  @chroms = reference_sequences(reference)
  read_length
end
|
42
|
+
|
43
|
+
# Computes the sequencing complexity index for every reference sequence.
#
# Each sequence is cut into blocks of @block_size bases; the blocks are
# handed to readblock in parallel and the per-block rows are concatenated
# per strand key. The outcome is stored in @results.
#
# @param runtime [Boolean] When true, profile the run with RubyProf and
#   print a flat profile to STDOUT.
# @return [Hash] Results keyed by chromosome, then by strand key.
def run(runtime: false)
  RubyProf.start if runtime
  chroms = {}
  @chroms.each do |chrom, size|
    per_strand = @strand ? {"+"=>[], "-"=>[]} : {nil=>[]}
    chroms[chrom] = per_strand
    block_count = (size / @block_size.to_f).ceil
    data = Parallel.map((0...block_count).to_a, :in_processes => @threads) { |i| readblock(chrom, i) }
    per_strand.keys.each do |key|
      per_strand[key] = data.flat_map { |block_rows| block_rows[key] }
    end
  end

  # Printing runtime information for optimization
  if runtime
    profile = RubyProf.stop
    RubyProf::FlatPrinter.new(profile).print(STDOUT)
  end
  @results = chroms
end
|
81
|
+
|
82
|
+
# Reads a single block from the BAM file and calculates the NGSCI for every
# base in it.
#
# Reads are fetched from up to @buffer bases before the block so that reads
# starting upstream but covering the block are included; rows are emitted
# only for the block's own bases.
#
# @param chrom [String] The chromosome from the bam file
# @param i [Integer] Zero-based index of the block to process
# @return [Hash<String,Array>, Hash<nil,Array>] Rows of
#   [base, *sci(reads)] keyed by "+" and "-" when stranded, or by nil when
#   unstranded.
def readblock(chrom, i)
  results = @strand ? {"+" => [], "-" => []} : {nil => []}
  block_start = i * @block_size
  stop = [(i + 1) * @block_size, chroms[chrom]].min
  # The fetch window and the reported window are tracked separately: clamping
  # the fetch start to 0 must not move the first reported base of the block.
  fetch_start = [0, block_start - @buffer].max
  reads = []
  @bam.fetch(chrom, fetch_start, stop) { |read| reads << convert(read) }
  reads.compact! # convert returns nil for unmapped reads
  reads.sort_by!(&:start)
  (block_start...stop).each do |base|
    aligned = reads.select { |r| r.start <= base && r.stop - 1 >= base }.group_by(&:strand)
    results.each_key do |key|
      results[key] << [base, *sci(aligned[key] || [])]
    end
  end
  results
end
|
111
|
+
|
112
|
+
|
113
|
+
# Calculates sequencing complexity index for a single base
#
# Reads sharing a start site are collapsed to the longest read of the group
# before the overlaps are summed.
#
# @param reads [Array<NGSCI::Read>] A group of reads aligned to a single base.
# @return [Array] [depth, unique reads, normalized overlap, NGS-CI], the
#   last two rounded to 4 decimal places.
def sci(reads)
  numreads = reads.size
  longest_per_start = reads.group_by(&:start).map do |_start, group|
    group.max { |x, y| (x.stop - x.start).abs <=> (y.stop - y.start).abs }
  end
  # Convert once so both ratios use float division; integer division would
  # truncate the index before rounding.
  o = summed_overlaps(longest_per_start).to_f
  uniquereads = longest_per_start.size
  overlap_score = (@buffer * o / @denom).round(4)
  complexity = (300 * uniquereads * o / (2 * @denom)).round(4)
  [numreads, uniquereads, overlap_score, complexity]
end
|
126
|
+
|
127
|
+
# Calculates summed overlap between a group of reads
#
# Every read is compared with every other read, so each pair contributes
# twice to the total. Reads that compare equal (==) to the current read are
# skipped. An empty or single-read group sums to 0.
#
# @param reads [Array<NGSCI::Read>] Array of reads
# @return [Integer] Summed overlap between reads
def summed_overlaps(reads)
  reads.reduce(0) do |total, current|
    reads.reduce(total) do |acc, other|
      other == current ? acc : acc + overlap(other, current)
    end
  end
end
|
147
|
+
|
148
|
+
# Calculation of the overlap between two reads
#
# The overlap is the smaller stop minus the larger start, which covers both
# partial overlap and one read lying inside the other. The result is
# negative when the reads do not overlap.
#
# @param read1 [NGSCI::Read] First read to be compared
# @param read2 [NGSCI::Read] Second read to be compared
# @return [Integer] Length of overlap
def overlap(read1, read2)
  [read1.stop, read2.stop].min - [read1.start, read2.start].max
end
|
168
|
+
|
169
|
+
# Loads the read length from a bam file into the @buffer variable
#
# Scans the reads yielded by @bam.view once and keeps the longest sequence
# length as @buffer; @denom is derived from it as
# @buffer**2 * (@buffer - 1)**2.
#
# @raise [NGSCIIOError] If the BAM index reports no mapped reads, or if no
#   reads are yielded.
# @return [Integer] The value assigned to @denom.
def read_length
  mapped = @bam.index_stats.select { |name, stat| name != "*" && stat[:mapped_reads] > 0 }
  if mapped.empty?
    raise NGSCIIOError.new "BAM file is empty! Check samtools idxstats."
  end

  longest = nil
  @bam.view do |read|
    size = read.seq.size
    longest = size if longest.nil? || size > longest
  end
  # Without this guard an index claiming mapped reads over a file that yields
  # none would leave the read length undefined.
  if longest.nil?
    raise NGSCIIOError.new "BAM file yielded no reads; cannot determine read length."
  end

  @buffer = longest
  @denom = @buffer**2 * (@buffer - 1)**2
end
|
193
|
+
|
194
|
+
# Turns a BAM alignment into a Read, or nil when it is unmapped.
# With @strand set, the conversion is delegated to the method of that name
# (@strand is sent as a symbol); otherwise an unstranded Read is built.
#
# @param read [Bio::DB::Alignment] Read to be converted.
# @return [NGSCI::Read, nil] Converted Read object, nil for unmapped reads
def convert(read)
  return nil if read.query_unmapped
  return newread(read) unless @strand
  send(@strand.to_sym, read)
end
|
209
|
+
|
210
|
+
|
211
|
+
# Strand assignment for paired-end strand-specific sequencing with "fr"
# chemistry: the first mate is "+" when query_strand is truthy and "-"
# otherwise; the second mate gets the opposite assignment.
#
# @param read [Bio::DB::Alignment] Read to be converted.
# @return [NGSCI::Read] Converted Read object
def fr(read)
  first_mate = read.first_in_pair
  plus = read.query_strand ? first_mate : !first_mate
  newread(read, strand: plus ? "+" : "-")
end
|
223
|
+
|
224
|
+
|
225
|
+
# Strand assignment for paired-end strand-specific sequencing with "rf"
# chemistry: the first mate is "-" when query_strand is truthy and "+"
# otherwise; the second mate gets the opposite assignment.
#
# @param read [Bio::DB::Alignment] Read to be converted.
# @return [NGSCI::Read] Converted Read object
def rf(read)
  first_mate = read.first_in_pair
  minus = read.query_strand ? first_mate : !first_mate
  newread(read, strand: minus ? "-" : "+")
end
|
237
|
+
|
238
|
+
|
239
|
+
# Strand assignment for single-end strand-specific sequencing with "f"
# chemistry: "+" when query_strand is truthy, "-" otherwise.
#
# @param read [Bio::DB::Alignment] Read to be converted.
# @return [NGSCI::Read] Converted Read object
def f(read)
  strand = read.query_strand ? "+" : "-"
  newread(read, strand: strand)
end
|
247
|
+
|
248
|
+
# Builds a Read spanning from the alignment position to the position plus
# the length of the read sequence, tagged with the optional strand.
#
# @param read [Bio::DB::Alignment] Aligned read to be converted
# @param strand [String] Strand of read
# @return [NGSCI::Read] Converted Read object
def newread(read, strand: nil)
  first_base = read.pos
  Read.new(first_base, first_base + read.seq.size, strand: strand)
end
|
256
|
+
|
257
|
+
# Acquires names and sizes of reference sequences included in the bam file
#
# Every entry of the fasta file is measured; only entries whose id also
# appears in the BAM index statistics are returned.
#
# @param reference [String] Path to reference fasta file.
# @return [Hash] Sequence sizes keyed by fasta entry id
def reference_sequences(reference)
  # Looked up once: the index statistics do not change while filtering.
  indexed = @bam.index_stats
  sizes = {}
  Bio::FastaFormat.open(reference).each_entry do |entry|
    sizes[entry.entry_id] = entry.seq.size
  end
  sizes.select { |name, _size| indexed.key?(name) }
end
|
268
|
+
# Writes the stored results to a CSV file: a header line followed by one
# line per base and strand key.
#
# @param outfile [String] Path to outfile
# @return [String, nil] The outfile path, or nil when there are no results
def export(outfile)
  return nil unless @results

  File.open(outfile, 'w') do |file|
    file.puts("Chrom,Base,Strand,Depth,Unique_Reads,Overlap,NGS-CI")
    @results.each do |chrom, by_strand|
      by_strand.each do |strand, rows|
        rows.each do |row|
          base, *metrics = row
          file.puts([chrom, base, strand, *metrics].join(","))
        end
      end
    end
  end
  outfile
end
|
288
|
+
end # End calculator class
|
289
|
+
end
|
data/lib/NGSCI/cmd.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'open3'
|
2
|
+
|
3
|
+
module NGSCI
|
4
|
+
|
5
|
+
# Wraps a shell command string and captures its output when run.
class Cmd
  attr_accessor :cmd, :stdout, :stderr, :status

  # @param cmd [String] The command line to execute
  def initialize(cmd)
    @cmd = cmd
  end

  # Executes the command and stores its stdout, stderr and exit status.
  #
  # @return [Array] The [stdout, stderr, status] triple from Open3.capture3
  def run
    captured = Open3.capture3(@cmd)
    @stdout, @stderr, @status = captured
  end

  # @return [String] The wrapped command
  def to_s
    @cmd
  end
end
|
22
|
+
|
23
|
+
end
|
data/lib/NGSCI/read.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
module NGSCI
|
2
|
+
|
3
|
+
# A simple read class holding validated coordinates and an optional strand.
#
# @!attribute [r] start
#   @return [Integer] Start coordinate
# @!attribute [r] stop
#   @return [Integer] Stop coordinate, always greater than start
# @!attribute [r] strand
#   @return [String, nil] "+", "-" or nil
class Read
  attr_reader :start, :stop, :strand

  # The chromosome argument was deprecated and is no longer accepted.
  #
  # @param start [Integer] Start coordinate
  # @param stop [Integer] Stop coordinate, must be greater than start
  # @param strand [String, nil] "+" or "-", or nil when unstranded
  # @raise [NGSCIError] If the coordinates or the strand are invalid
  def initialize(start, stop, strand: nil)
    unless start.is_a?(Integer) && stop.is_a?(Integer) && stop > start
      raise NGSCIError.new "Invalid coordinate arguments:\n" +
                           "start:#{start}\tstop:#{stop}\tstrand:#{strand}"
    end
    if strand && !%w(+ -).include?(strand)
      raise NGSCIError.new "Invalid strand argument:\n" +
                           "start:#{start}\tstop:#{stop}\tstrand:#{strand}"
    end
    @start = start
    @stop = stop
    @strand = strand
  end
end
|
31
|
+
end
|
data/lib/NGSCI.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'yell'
|
2
|
+
|
3
|
+
# NGSCI stands for Next Generation Sequencing Complexity Index
|
4
|
+
# This program calculates a sequencing complexity index for each base and/or strand in a genome.
|
5
|
+
# This program calculates this by averaging average overlaps of reads aligned to that base.
|
6
|
+
module NGSCI
|
7
|
+
# For custom error handling in the future, unimplemented
|
8
|
+
class NGSCIError < StandardError; end
|
9
|
+
class NGSCIIOError < NGSCIError; end
|
10
|
+
class NGSCIArgError < NGSCIError; end
|
11
|
+
|
12
|
+
|
13
|
+
# Create the universal logger and include it in Object
|
14
|
+
# making the logger object available everywhere
|
15
|
+
format = Yell::Formatter.new("[%5L] %d : %m", "%Y-%m-%d %H:%M:%S")
|
16
|
+
# http://xkcd.com/1179/
|
17
|
+
Yell.new(:format => format) do |l|
|
18
|
+
l.level = :info
|
19
|
+
l.name = Object
|
20
|
+
l.adapter STDOUT, level: [:debug, :info, :warn]
|
21
|
+
l.adapter STDERR, level: [:error, :fatal]
|
22
|
+
end
|
23
|
+
Object.send :include, Yell::Loggable
|
24
|
+
|
25
|
+
end # NGSCI
|
26
|
+
|
27
|
+
# Integrate modules
|
28
|
+
require 'NGSCI/cmd'
|
29
|
+
require 'NGSCI/version'
|
30
|
+
require 'NGSCI/calculator'
|
31
|
+
require 'NGSCI/read'
|
data/ngs-ci.gemspec
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
|
2
|
+
# coding: utf-8
|
3
|
+
lib = File.expand_path('../lib', __FILE__)
|
4
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
5
|
+
require 'NGSCI/version'
|
6
|
+
|
7
|
+
Gem::Specification.new do |spec|
|
8
|
+
spec.name = "ngs-ci"
|
9
|
+
spec.version = NGSCI::VERSION
|
10
|
+
spec.authors = ["Matthew Ralston"]
|
11
|
+
spec.email = ["mrals89@gmail.com"]
|
12
|
+
spec.summary = %q{Next Generation Sequencing Complexity Index.}
|
13
|
+
spec.description = %q{Calculated a metric that estimates read complexity at each base for RNA-seq BAM files. Alternative to pileup format.}
|
14
|
+
spec.homepage = ""
|
15
|
+
spec.license = "GPL v3"
|
16
|
+
|
17
|
+
spec.files = `git ls-files -z`.split("\x0")
|
18
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
19
|
+
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
20
|
+
spec.require_paths = ["lib"]
|
21
|
+
|
22
|
+
spec.add_dependency 'trollop','~> 2.1.2'
|
23
|
+
spec.add_dependency 'bio-samtools', '= 2.3.2'
|
24
|
+
spec.add_dependency 'parallel', '~> 1.4'
|
25
|
+
spec.add_dependency 'yell'
|
26
|
+
spec.add_dependency "ruby-prof", "~> 0.15"
|
27
|
+
spec.has_rdoc = 'yard'
|
28
|
+
|
29
|
+
spec.add_development_dependency "bundler"
|
30
|
+
spec.add_development_dependency "rake", "~> 10.0"
|
31
|
+
spec.add_development_dependency "rspec", "~> 3.1"
|
32
|
+
#spec.add_development_dependency "guard", "~> 2.12"
|
33
|
+
spec.add_development_dependency "coveralls"
|
34
|
+
#spec.add_development_dependency "cucumber", "~> 1.3"
|
35
|
+
end
|