dirseq 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +3 -4
- data/README.md +2 -2
- data/VERSION +1 -1
- data/bin/dirseq +112 -13
- data/spec/data/eg.bam.bai +0 -0
- data/spec/script_spec.rb +24 -0
- metadata +6 -47
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 57eb18eb4e971726d26849e01d3a4dd32597584d
|
4
|
+
data.tar.gz: f38d044727e8163873617fad111d3c244507fe71
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8842180ae34fa546362dbcd63a0d07e9269b9903cdec1b078c10f5031b848d59b8f4db6d1e9708942beb84b92c639365049e8fd563c59723e861402b92771797
|
7
|
+
data.tar.gz: c2f035c3dba77ee68fd2861d30c4764eb61095dac654e402a47a7213ddfec651bfb401caf2b5fa015c60efa783c7a50f09a7f4573bcfb544c621bbd3e8d6b0e3
|
data/Gemfile
CHANGED
@@ -9,11 +9,10 @@ gem "bio", "~>1.4", ">=1.4.2"
|
|
9
9
|
# Add dependencies to develop your gem here.
|
10
10
|
# Include everything needed to run rake, tests, features, etc.
|
11
11
|
group :development do
|
12
|
-
gem "shoulda", "~> 3.5"
|
13
|
-
gem "rdoc", "~> 3.12"
|
14
|
-
gem "simplecov", "~> 0.8"
|
12
|
+
#gem "shoulda", "~> 3.5"
|
13
|
+
#gem "simplecov", "~> 0.8"
|
15
14
|
gem "jeweler", "~> 2.0"
|
16
15
|
gem "bundler", "~> 1.6"
|
17
|
-
gem "rspec", "~>
|
16
|
+
gem "rspec", "~> 3.0"
|
18
17
|
gem 'pry', '~>0.10'
|
19
18
|
end
|
data/README.md
CHANGED
@@ -13,8 +13,8 @@ Won't work just yet:
|
|
13
13
|
gem install dirseq
|
14
14
|
```
|
15
15
|
Requires:
|
16
|
-
* samtools (tested with 0.1.19)
|
17
|
-
* bedtools (tested with 2.
|
16
|
+
* samtools (tested with 0.1.19 and 1.0+)
|
17
|
+
* bedtools (tested with 2.24.0) - old versions won't work.
|
18
18
|
* Ruby (tested with 2.1.1)
|
19
19
|
|
20
20
|
## Usage
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.0
|
1
|
+
0.2.0
|
data/bin/dirseq
CHANGED
@@ -9,11 +9,21 @@ require 'tempfile'
|
|
9
9
|
|
10
10
|
SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')
|
11
11
|
|
12
|
+
COVERAGE_COUNT_TYPE = 'coverage'
|
13
|
+
COUNT_COUNT_TYPE = 'count'
|
14
|
+
COUNT_TYPES = [
|
15
|
+
COUNT_COUNT_TYPE,
|
16
|
+
COVERAGE_COUNT_TYPE
|
17
|
+
]
|
18
|
+
|
12
19
|
# Parse command line options into the options hash
|
13
20
|
options = {
|
14
21
|
:ignore_directions => false,
|
15
22
|
:logger => 'stderr',
|
16
23
|
:log_level => 'info',
|
24
|
+
:count_type => COVERAGE_COUNT_TYPE,
|
25
|
+
:forward_read_only => false,
|
26
|
+
:accepted_feature_types => ['CDS'],
|
17
27
|
}
|
18
28
|
o = OptionParser.new do |opts|
|
19
29
|
opts.banner = "
|
@@ -28,9 +38,20 @@ o = OptionParser.new do |opts|
|
|
28
38
|
options[:gff] = arg
|
29
39
|
end
|
30
40
|
opts.separator "\nOptional parameters:\n\n"
|
41
|
+
opts.on("--forward-read-only", "consider only forward reads (i.e. read1) and ignore reverse reads. [default #{options[:forward_read_only]}]") do
|
42
|
+
options[:forward_ready_only] = true
|
43
|
+
end
|
31
44
|
opts.on("--ignore-directions", "ignore directionality, give overall coverage [default: false i.e. differentiate between directions]") do |arg|
|
32
45
|
options[:ignore_directions] = true
|
33
46
|
end
|
47
|
+
opts.on("--measure-type TYPE", "what to count for each gene [options: #{COUNT_TYPES.join(', ')}][default: #{options[:count_type]}]") do |arg|
|
48
|
+
raise "Unexpected count type detected" if not COUNT_TYPES.include?(arg)
|
49
|
+
options[:count_type] = arg
|
50
|
+
end
|
51
|
+
opts.on("--accepted-feature-types TYPE", Array,
|
52
|
+
"Print only features of these type(s) [default #{options[:accepted_feature_types].join(',')}]") do |arg|
|
53
|
+
options[:accepted_feature_types] = set(arg)
|
54
|
+
end
|
34
55
|
|
35
56
|
# logger options
|
36
57
|
opts.separator "\nVerbosity:\n\n"
|
@@ -47,6 +68,11 @@ Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level])
|
|
47
68
|
|
48
69
|
gff_file = options[:gff]
|
49
70
|
bam_file = options[:bam]
|
71
|
+
accepted_feature_types = options[:accepted_feature_types]
|
72
|
+
|
73
|
+
if options[:count_type] != COVERAGE_COUNT_TYPE and options[:ignore_directions]
|
74
|
+
raise "ignore_directions + count_type != coverage is currently unsupported"
|
75
|
+
end
|
50
76
|
|
51
77
|
|
52
78
|
calculate_cov = lambda do |covs, num_covs|
|
@@ -76,14 +102,26 @@ get_covs = lambda do |cov_lines|
|
|
76
102
|
#96 #coverage
|
77
103
|
#0.0208333
|
78
104
|
feat = splits[0..8]
|
105
|
+
feature_type = feat[2]
|
106
|
+
if not accepted_feature_types.include?(feature_type)
|
107
|
+
log.debug "Skipping feature as it is of type #{feature_type}"
|
108
|
+
next
|
109
|
+
end
|
79
110
|
if feat != previous_feature
|
80
111
|
feature_to_covs[previous_feature] = calculate_cov.call(covs, num_covs) unless previous_feature.nil?
|
81
112
|
covs = []
|
82
113
|
num_covs = 0
|
83
114
|
end
|
84
|
-
|
85
|
-
|
86
|
-
|
115
|
+
if splits.length == 13 # -hist
|
116
|
+
num = splits[10].to_i
|
117
|
+
covs.push num*splits[9].to_i
|
118
|
+
num_covs += num
|
119
|
+
elsif splits.length == 10 # -count
|
120
|
+
covs.push splits[9].to_i
|
121
|
+
num_covs += 1
|
122
|
+
else
|
123
|
+
raise "Unexpected bedtools output line: #{line}"
|
124
|
+
end
|
87
125
|
previous_feature = feat
|
88
126
|
end
|
89
127
|
feature_to_covs[previous_feature] = calculate_cov.call(covs, num_covs)
|
@@ -93,10 +131,54 @@ end
|
|
93
131
|
|
94
132
|
# Remove the ##FASTA and afterwards from the GFF file as this makes bedtools <2.25 fail
|
95
133
|
# https://github.com/arq5x/bedtools2/issues/235#issuecomment-103776618
|
96
|
-
no_fasta_gff = Tempfile.new('dirseq')
|
134
|
+
no_fasta_gff = Tempfile.new(['dirseq','.gff3'])
|
97
135
|
Bio::Commandeer.run "sed '/^##FASTA$/,$d' #{gff_file.inspect} > #{no_fasta_gff.path}", :log => log
|
98
136
|
gff_file = no_fasta_gff.path
|
99
137
|
|
138
|
+
|
139
|
+
|
140
|
+
|
141
|
+
|
142
|
+
# Find featureless contigs. Need to so that bedtools coverage -sorted does not complain
|
143
|
+
if not File.exists?("#{bam_file}.bai")
|
144
|
+
raise "Input bam file must be indexed, but the index file does not exist"
|
145
|
+
end
|
146
|
+
|
147
|
+
chromosome_file = Tempfile.new('bam_contigs')
|
148
|
+
log.info "Listing contigs in sorted order .."
|
149
|
+
cmd = "samtools idxstats #{bam_file.inspect} |cut -f1,2 >#{chromosome_file.path.inspect}"
|
150
|
+
Bio::Commandeer.run(cmd, :log => log)
|
151
|
+
|
152
|
+
log.info "Finding featureless contigs"
|
153
|
+
cmd = "grep -v '^#' #{gff_file.inspect} |cut -f1 |sort |uniq |grep -vFw -f /dev/stdin #{chromosome_file.path.inspect} |cut -f1"
|
154
|
+
featureless_contigs = Bio::Commandeer.run(cmd, :log => log).lines.map(&:chomp).reject{ |ref| ref=='*' }
|
155
|
+
log.info "Found #{featureless_contigs.length} featureless contigs"
|
156
|
+
|
157
|
+
# Sort the GFF
|
158
|
+
dummy_features = featureless_contigs.collect do |ref|
|
159
|
+
[ref,
|
160
|
+
'dirseq',
|
161
|
+
'misc_RNA',
|
162
|
+
'1',
|
163
|
+
'2',
|
164
|
+
'.',
|
165
|
+
'+',
|
166
|
+
'0',
|
167
|
+
"ID=#{ref}_dummy_feature"].join("\t")
|
168
|
+
end
|
169
|
+
sorted_gff_file_f = Tempfile.new(['sorted_gff','.gff3'])
|
170
|
+
sorted_gff_file = sorted_gff_file_f.path
|
171
|
+
Tempfile.open(["extra_features",'.gff']) do |ef|
|
172
|
+
ef.puts dummy_features.join("\n")
|
173
|
+
ef.close
|
174
|
+
|
175
|
+
cmd = "cat #{ef.path} #{gff_file.inspect} |bedtools sort -i /dev/stdin -faidx #{chromosome_file.path.inspect} >#{sorted_gff_file.inspect}"
|
176
|
+
log.info "Running bedtools sort"
|
177
|
+
Bio::Commandeer.run(cmd, :log => log)
|
178
|
+
end
|
179
|
+
|
180
|
+
|
181
|
+
|
100
182
|
covs_fwd = nil
|
101
183
|
if options[:ignore_directions]
|
102
184
|
cmd1 = "bedtools coverage -b #{bam_file.inspect} -a #{gff_file.inspect} -hist"
|
@@ -108,10 +190,14 @@ else
|
|
108
190
|
# fwd read 1
|
109
191
|
read1_flag = '-F128' #account for read1 in pair, as well as single reads mapping
|
110
192
|
read2_flag = '-f128'
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
193
|
+
bedtools_type_flag = '-hist'
|
194
|
+
if options[:count_type] == COUNT_COUNT_TYPE
|
195
|
+
bedtools_type_flag = '-counts'
|
196
|
+
end
|
197
|
+
cmdf1 = "samtools view -u #{read1_flag} #{bam_file.inspect} |bedtools coverage -sorted -g #{chromosome_file.path.inspect} -b /dev/stdin -a #{sorted_gff_file.inspect} -s #{bedtools_type_flag}"
|
198
|
+
cmdf2 = "samtools view -u #{read2_flag} #{bam_file.inspect} |bedtools coverage -sorted -g #{chromosome_file.path.inspect} -b /dev/stdin -a #{sorted_gff_file.inspect} -s #{bedtools_type_flag}"
|
199
|
+
cmdr1 = "samtools view -u #{read1_flag} #{bam_file.inspect} |bedtools coverage -sorted -g #{chromosome_file.path.inspect} -b /dev/stdin -a #{sorted_gff_file.inspect} -S #{bedtools_type_flag}"
|
200
|
+
cmdr2 = "samtools view -u #{read2_flag} #{bam_file.inspect} |bedtools coverage -sorted -g #{chromosome_file.path.inspect} -b /dev/stdin -a #{sorted_gff_file.inspect} -S #{bedtools_type_flag}"
|
115
201
|
|
116
202
|
command_to_parsed = lambda do |cmds, name|
|
117
203
|
covs_lines_initial = cmds.collect do |cmd|
|
@@ -122,15 +208,23 @@ else
|
|
122
208
|
get_covs.call(lines)
|
123
209
|
end
|
124
210
|
covs = covs_initial[0]
|
125
|
-
covs_initial
|
126
|
-
|
211
|
+
if covs_initial.length > 1
|
212
|
+
covs_initial[1].each do |cov_key, cov|
|
213
|
+
covs[cov_key] += cov
|
214
|
+
end
|
127
215
|
end
|
128
216
|
covs #'return' from lambda
|
129
217
|
end
|
130
218
|
|
131
219
|
# Agreeing reads (those whose template are fwd along the reference sequence) are either first and fwd, or second and rev
|
132
|
-
|
133
|
-
|
220
|
+
commands_fwd = [cmdf1,cmdr2]
|
221
|
+
commands_rev = [cmdf2,cmdr1]
|
222
|
+
if options[:forward_ready_only]
|
223
|
+
commands_fwd = [cmdf1]
|
224
|
+
commands_rev = [cmdr1]
|
225
|
+
end
|
226
|
+
covs_fwd = command_to_parsed.call(commands_fwd, 'reads with same direction as their reference')
|
227
|
+
covs_rev = command_to_parsed.call(commands_rev, 'reads with opposing direction as their reference')
|
134
228
|
end
|
135
229
|
|
136
230
|
headers = [
|
@@ -142,9 +236,14 @@ headers = [
|
|
142
236
|
]
|
143
237
|
if options[:ignore_directions]
|
144
238
|
headers.push 'average_coverage'
|
145
|
-
|
239
|
+
elsif options[:count_type] == COVERAGE_COUNT_TYPE
|
146
240
|
headers.push 'forward_average_coverage'
|
147
241
|
headers.push 'reverse_average_coverage'
|
242
|
+
elsif options[:count_type] == COUNT_COUNT_TYPE
|
243
|
+
headers.push 'forward_read_count'
|
244
|
+
headers.push 'reverse_read_count'
|
245
|
+
else
|
246
|
+
raise
|
148
247
|
end
|
149
248
|
headers.push 'annotation'
|
150
249
|
puts headers.join("\t")
|
Binary file
|
data/spec/script_spec.rb
CHANGED
@@ -51,4 +51,28 @@ describe 'script' do
|
|
51
51
|
|
52
52
|
found.should == answer
|
53
53
|
end
|
54
|
+
|
55
|
+
it 'should print counts correctly' do
|
56
|
+
answer = %w(
|
57
|
+
contig type start end strand forward_read_count reverse_read_count annotation
|
58
|
+
).join("\t")+"\n"+%w(
|
59
|
+
contig_100 CDS 2 127 + 0.0 2.0 putative
|
60
|
+
).join("\t")+" methyltransferase YcgJ\n"
|
61
|
+
|
62
|
+
found = Bio::Commandeer.run "#{path_to_script} --bam #{data_dir}/eg.bam --gff #{data_dir}/realer.gff -q --measure-type count"
|
63
|
+
|
64
|
+
found.should == answer
|
65
|
+
end
|
66
|
+
|
67
|
+
it 'should count only the forward read when asked' do
|
68
|
+
answer = %w(
|
69
|
+
contig type start end strand forward_read_count reverse_read_count annotation
|
70
|
+
).join("\t")+"\n"+%w(
|
71
|
+
contig_100 CDS 2 127 + 0.0 1.0 putative
|
72
|
+
).join("\t")+" methyltransferase YcgJ\n"
|
73
|
+
|
74
|
+
found = Bio::Commandeer.run "#{path_to_script} --bam #{data_dir}/eg.bam --gff #{data_dir}/realer.gff -q --measure-type count --forward-read-only"
|
75
|
+
|
76
|
+
found.should == answer
|
77
|
+
end
|
54
78
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dirseq
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben J. Woodcroft
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-02-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bio-commandeer
|
@@ -58,48 +58,6 @@ dependencies:
|
|
58
58
|
- - "~>"
|
59
59
|
- !ruby/object:Gem::Version
|
60
60
|
version: '1.4'
|
61
|
-
- !ruby/object:Gem::Dependency
|
62
|
-
name: shoulda
|
63
|
-
requirement: !ruby/object:Gem::Requirement
|
64
|
-
requirements:
|
65
|
-
- - "~>"
|
66
|
-
- !ruby/object:Gem::Version
|
67
|
-
version: '3.5'
|
68
|
-
type: :development
|
69
|
-
prerelease: false
|
70
|
-
version_requirements: !ruby/object:Gem::Requirement
|
71
|
-
requirements:
|
72
|
-
- - "~>"
|
73
|
-
- !ruby/object:Gem::Version
|
74
|
-
version: '3.5'
|
75
|
-
- !ruby/object:Gem::Dependency
|
76
|
-
name: rdoc
|
77
|
-
requirement: !ruby/object:Gem::Requirement
|
78
|
-
requirements:
|
79
|
-
- - "~>"
|
80
|
-
- !ruby/object:Gem::Version
|
81
|
-
version: '3.12'
|
82
|
-
type: :development
|
83
|
-
prerelease: false
|
84
|
-
version_requirements: !ruby/object:Gem::Requirement
|
85
|
-
requirements:
|
86
|
-
- - "~>"
|
87
|
-
- !ruby/object:Gem::Version
|
88
|
-
version: '3.12'
|
89
|
-
- !ruby/object:Gem::Dependency
|
90
|
-
name: simplecov
|
91
|
-
requirement: !ruby/object:Gem::Requirement
|
92
|
-
requirements:
|
93
|
-
- - "~>"
|
94
|
-
- !ruby/object:Gem::Version
|
95
|
-
version: '0.8'
|
96
|
-
type: :development
|
97
|
-
prerelease: false
|
98
|
-
version_requirements: !ruby/object:Gem::Requirement
|
99
|
-
requirements:
|
100
|
-
- - "~>"
|
101
|
-
- !ruby/object:Gem::Version
|
102
|
-
version: '0.8'
|
103
61
|
- !ruby/object:Gem::Dependency
|
104
62
|
name: jeweler
|
105
63
|
requirement: !ruby/object:Gem::Requirement
|
@@ -134,14 +92,14 @@ dependencies:
|
|
134
92
|
requirements:
|
135
93
|
- - "~>"
|
136
94
|
- !ruby/object:Gem::Version
|
137
|
-
version: '
|
95
|
+
version: '3.0'
|
138
96
|
type: :development
|
139
97
|
prerelease: false
|
140
98
|
version_requirements: !ruby/object:Gem::Requirement
|
141
99
|
requirements:
|
142
100
|
- - "~>"
|
143
101
|
- !ruby/object:Gem::Version
|
144
|
-
version: '
|
102
|
+
version: '3.0'
|
145
103
|
- !ruby/object:Gem::Dependency
|
146
104
|
name: pry
|
147
105
|
requirement: !ruby/object:Gem::Requirement
|
@@ -176,6 +134,7 @@ files:
|
|
176
134
|
- lib/bio-rnaseq_transcription_directionality.rb
|
177
135
|
- lib/bio-rnaseq_transcription_directionality/rnaseq_transcription_directionality.rb
|
178
136
|
- spec/data/eg.bam
|
137
|
+
- spec/data/eg.bam.bai
|
179
138
|
- spec/data/eg.gff
|
180
139
|
- spec/data/eg_with_fasta.gff
|
181
140
|
- spec/data/realer.gff
|
@@ -201,7 +160,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
201
160
|
version: '0'
|
202
161
|
requirements: []
|
203
162
|
rubyforge_project:
|
204
|
-
rubygems_version: 2.
|
163
|
+
rubygems_version: 2.6.13
|
205
164
|
signing_key:
|
206
165
|
specification_version: 4
|
207
166
|
summary: FPKG calculator for metatranscriptomics
|