dirseq 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +3 -4
- data/README.md +2 -2
- data/VERSION +1 -1
- data/bin/dirseq +112 -13
- data/spec/data/eg.bam.bai +0 -0
- data/spec/script_spec.rb +24 -0
- metadata +6 -47
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 57eb18eb4e971726d26849e01d3a4dd32597584d
|
4
|
+
data.tar.gz: f38d044727e8163873617fad111d3c244507fe71
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8842180ae34fa546362dbcd63a0d07e9269b9903cdec1b078c10f5031b848d59b8f4db6d1e9708942beb84b92c639365049e8fd563c59723e861402b92771797
|
7
|
+
data.tar.gz: c2f035c3dba77ee68fd2861d30c4764eb61095dac654e402a47a7213ddfec651bfb401caf2b5fa015c60efa783c7a50f09a7f4573bcfb544c621bbd3e8d6b0e3
|
data/Gemfile
CHANGED
@@ -9,11 +9,10 @@ gem "bio", "~>1.4", ">=1.4.2"
|
|
9
9
|
# Add dependencies to develop your gem here.
|
10
10
|
# Include everything needed to run rake, tests, features, etc.
|
11
11
|
group :development do
|
12
|
-
gem "shoulda", "~> 3.5"
|
13
|
-
gem "rdoc", "~> 3.12"
|
14
|
-
gem "simplecov", "~> 0.8"
|
12
|
+
#gem "shoulda", "~> 3.5"
|
13
|
+
#gem "simplecov", "~> 0.8"
|
15
14
|
gem "jeweler", "~> 2.0"
|
16
15
|
gem "bundler", "~> 1.6"
|
17
|
-
gem "rspec", "~>
|
16
|
+
gem "rspec", "~> 3.0"
|
18
17
|
gem 'pry', '~>0.10'
|
19
18
|
end
|
data/README.md
CHANGED
@@ -13,8 +13,8 @@ Won't work just yet:
|
|
13
13
|
gem install dirseq
|
14
14
|
```
|
15
15
|
Requires:
|
16
|
-
* samtools (tested with 0.1.19)
|
17
|
-
* bedtools (tested with 2.
|
16
|
+
* samtools (tested with 0.1.19 and 1.0+)
|
17
|
+
* bedtools (tested with 2.24.0) - old versions won't work.
|
18
18
|
* Ruby (tested with 2.1.1)
|
19
19
|
|
20
20
|
## Usage
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.0
|
1
|
+
0.2.0
|
data/bin/dirseq
CHANGED
@@ -9,11 +9,21 @@ require 'tempfile'
|
|
9
9
|
|
10
10
|
SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')
|
11
11
|
|
12
|
+
COVERAGE_COUNT_TYPE = 'coverage'
|
13
|
+
COUNT_COUNT_TYPE = 'count'
|
14
|
+
COUNT_TYPES = [
|
15
|
+
COUNT_COUNT_TYPE,
|
16
|
+
COVERAGE_COUNT_TYPE
|
17
|
+
]
|
18
|
+
|
12
19
|
# Parse command line options into the options hash
|
13
20
|
options = {
|
14
21
|
:ignore_directions => false,
|
15
22
|
:logger => 'stderr',
|
16
23
|
:log_level => 'info',
|
24
|
+
:count_type => COVERAGE_COUNT_TYPE,
|
25
|
+
:forward_read_only => false,
|
26
|
+
:accepted_feature_types => ['CDS'],
|
17
27
|
}
|
18
28
|
o = OptionParser.new do |opts|
|
19
29
|
opts.banner = "
|
@@ -28,9 +38,20 @@ o = OptionParser.new do |opts|
|
|
28
38
|
options[:gff] = arg
|
29
39
|
end
|
30
40
|
opts.separator "\nOptional parameters:\n\n"
|
41
|
+
opts.on("--forward-read-only", "consider only forward reads (i.e. read1) and ignore reverse reads. [default #{options[:forward_read_only]}]") do
|
42
|
+
options[:forward_ready_only] = true
|
43
|
+
end
|
31
44
|
opts.on("--ignore-directions", "ignore directionality, give overall coverage [default: false i.e. differentiate between directions]") do |arg|
|
32
45
|
options[:ignore_directions] = true
|
33
46
|
end
|
47
|
+
opts.on("--measure-type TYPE", "what to count for each gene [options: #{COUNT_TYPES.join(', ')}][default: #{options[:count_type]}]") do |arg|
|
48
|
+
raise "Unexpected count type detected" if not COUNT_TYPES.include?(arg)
|
49
|
+
options[:count_type] = arg
|
50
|
+
end
|
51
|
+
opts.on("--accepted-feature-types TYPE", Array,
|
52
|
+
"Print only features of these type(s) [default #{options[:accepted_feature_types].join(',')}]") do |arg|
|
53
|
+
options[:accepted_feature_types] = set(arg)
|
54
|
+
end
|
34
55
|
|
35
56
|
# logger options
|
36
57
|
opts.separator "\nVerbosity:\n\n"
|
@@ -47,6 +68,11 @@ Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level])
|
|
47
68
|
|
48
69
|
gff_file = options[:gff]
|
49
70
|
bam_file = options[:bam]
|
71
|
+
accepted_feature_types = options[:accepted_feature_types]
|
72
|
+
|
73
|
+
if options[:count_type] != COVERAGE_COUNT_TYPE and options[:ignore_directions]
|
74
|
+
raise "ignore_directions + count_type != coverage is currently unsupported"
|
75
|
+
end
|
50
76
|
|
51
77
|
|
52
78
|
calculate_cov = lambda do |covs, num_covs|
|
@@ -76,14 +102,26 @@ get_covs = lambda do |cov_lines|
|
|
76
102
|
#96 #coverage
|
77
103
|
#0.0208333
|
78
104
|
feat = splits[0..8]
|
105
|
+
feature_type = feat[2]
|
106
|
+
if not accepted_feature_types.include?(feature_type)
|
107
|
+
log.debug "Skipping feature as it is of type #{feature_type}"
|
108
|
+
next
|
109
|
+
end
|
79
110
|
if feat != previous_feature
|
80
111
|
feature_to_covs[previous_feature] = calculate_cov.call(covs, num_covs) unless previous_feature.nil?
|
81
112
|
covs = []
|
82
113
|
num_covs = 0
|
83
114
|
end
|
84
|
-
|
85
|
-
|
86
|
-
|
115
|
+
if splits.length == 13 # -hist
|
116
|
+
num = splits[10].to_i
|
117
|
+
covs.push num*splits[9].to_i
|
118
|
+
num_covs += num
|
119
|
+
elsif splits.length == 10 # -count
|
120
|
+
covs.push splits[9].to_i
|
121
|
+
num_covs += 1
|
122
|
+
else
|
123
|
+
raise "Unexpected bedtools output line: #{line}"
|
124
|
+
end
|
87
125
|
previous_feature = feat
|
88
126
|
end
|
89
127
|
feature_to_covs[previous_feature] = calculate_cov.call(covs, num_covs)
|
@@ -93,10 +131,54 @@ end
|
|
93
131
|
|
94
132
|
# Remove the ##FASTA and afterwards from the GFF file as this makes bedtools <2.25 fail
|
95
133
|
# https://github.com/arq5x/bedtools2/issues/235#issuecomment-103776618
|
96
|
-
no_fasta_gff = Tempfile.new('dirseq')
|
134
|
+
no_fasta_gff = Tempfile.new(['dirseq','.gff3'])
|
97
135
|
Bio::Commandeer.run "sed '/^##FASTA$/,$d' #{gff_file.inspect} > #{no_fasta_gff.path}", :log => log
|
98
136
|
gff_file = no_fasta_gff.path
|
99
137
|
|
138
|
+
|
139
|
+
|
140
|
+
|
141
|
+
|
142
|
+
# Find featureless contigs. This is needed so that bedtools coverage -sorted does not complain
|
143
|
+
if not File.exists?("#{bam_file}.bai")
|
144
|
+
raise "Input bam file must be indexed, but the index file does not exist"
|
145
|
+
end
|
146
|
+
|
147
|
+
chromosome_file = Tempfile.new('bam_contigs')
|
148
|
+
log.info "Listing contigs in sorted order .."
|
149
|
+
cmd = "samtools idxstats #{bam_file.inspect} |cut -f1,2 >#{chromosome_file.path.inspect}"
|
150
|
+
Bio::Commandeer.run(cmd, :log => log)
|
151
|
+
|
152
|
+
log.info "Finding featureless contigs"
|
153
|
+
cmd = "grep -v '^#' #{gff_file.inspect} |cut -f1 |sort |uniq |grep -vFw -f /dev/stdin #{chromosome_file.path.inspect} |cut -f1"
|
154
|
+
featureless_contigs = Bio::Commandeer.run(cmd, :log => log).lines.map(&:chomp).reject{ |ref| ref=='*' }
|
155
|
+
log.info "Found #{featureless_contigs.length} featureless contigs"
|
156
|
+
|
157
|
+
# Sort the GFF
|
158
|
+
dummy_features = featureless_contigs.collect do |ref|
|
159
|
+
[ref,
|
160
|
+
'dirseq',
|
161
|
+
'misc_RNA',
|
162
|
+
'1',
|
163
|
+
'2',
|
164
|
+
'.',
|
165
|
+
'+',
|
166
|
+
'0',
|
167
|
+
"ID=#{ref}_dummy_feature"].join("\t")
|
168
|
+
end
|
169
|
+
sorted_gff_file_f = Tempfile.new(['sorted_gff','.gff3'])
|
170
|
+
sorted_gff_file = sorted_gff_file_f.path
|
171
|
+
Tempfile.open(["extra_features",'.gff']) do |ef|
|
172
|
+
ef.puts dummy_features.join("\n")
|
173
|
+
ef.close
|
174
|
+
|
175
|
+
cmd = "cat #{ef.path} #{gff_file.inspect} |bedtools sort -i /dev/stdin -faidx #{chromosome_file.path.inspect} >#{sorted_gff_file.inspect}"
|
176
|
+
log.info "Running bedtools sort"
|
177
|
+
Bio::Commandeer.run(cmd, :log => log)
|
178
|
+
end
|
179
|
+
|
180
|
+
|
181
|
+
|
100
182
|
covs_fwd = nil
|
101
183
|
if options[:ignore_directions]
|
102
184
|
cmd1 = "bedtools coverage -b #{bam_file.inspect} -a #{gff_file.inspect} -hist"
|
@@ -108,10 +190,14 @@ else
|
|
108
190
|
# fwd read 1
|
109
191
|
read1_flag = '-F128' #account for read1 in pair, as well as single reads mapping
|
110
192
|
read2_flag = '-f128'
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
193
|
+
bedtools_type_flag = '-hist'
|
194
|
+
if options[:count_type] == COUNT_COUNT_TYPE
|
195
|
+
bedtools_type_flag = '-counts'
|
196
|
+
end
|
197
|
+
cmdf1 = "samtools view -u #{read1_flag} #{bam_file.inspect} |bedtools coverage -sorted -g #{chromosome_file.path.inspect} -b /dev/stdin -a #{sorted_gff_file.inspect} -s #{bedtools_type_flag}"
|
198
|
+
cmdf2 = "samtools view -u #{read2_flag} #{bam_file.inspect} |bedtools coverage -sorted -g #{chromosome_file.path.inspect} -b /dev/stdin -a #{sorted_gff_file.inspect} -s #{bedtools_type_flag}"
|
199
|
+
cmdr1 = "samtools view -u #{read1_flag} #{bam_file.inspect} |bedtools coverage -sorted -g #{chromosome_file.path.inspect} -b /dev/stdin -a #{sorted_gff_file.inspect} -S #{bedtools_type_flag}"
|
200
|
+
cmdr2 = "samtools view -u #{read2_flag} #{bam_file.inspect} |bedtools coverage -sorted -g #{chromosome_file.path.inspect} -b /dev/stdin -a #{sorted_gff_file.inspect} -S #{bedtools_type_flag}"
|
115
201
|
|
116
202
|
command_to_parsed = lambda do |cmds, name|
|
117
203
|
covs_lines_initial = cmds.collect do |cmd|
|
@@ -122,15 +208,23 @@ else
|
|
122
208
|
get_covs.call(lines)
|
123
209
|
end
|
124
210
|
covs = covs_initial[0]
|
125
|
-
covs_initial
|
126
|
-
|
211
|
+
if covs_initial.length > 1
|
212
|
+
covs_initial[1].each do |cov_key, cov|
|
213
|
+
covs[cov_key] += cov
|
214
|
+
end
|
127
215
|
end
|
128
216
|
covs #'return' from lambda
|
129
217
|
end
|
130
218
|
|
131
219
|
# Agreeing reads (those whose template are fwd along the reference sequence) are either first and fwd, or second and rev
|
132
|
-
|
133
|
-
|
220
|
+
commands_fwd = [cmdf1,cmdr2]
|
221
|
+
commands_rev = [cmdf2,cmdr1]
|
222
|
+
if options[:forward_ready_only]
|
223
|
+
commands_fwd = [cmdf1]
|
224
|
+
commands_rev = [cmdr1]
|
225
|
+
end
|
226
|
+
covs_fwd = command_to_parsed.call(commands_fwd, 'reads with same direction as their reference')
|
227
|
+
covs_rev = command_to_parsed.call(commands_rev, 'reads with opposing direction as their reference')
|
134
228
|
end
|
135
229
|
|
136
230
|
headers = [
|
@@ -142,9 +236,14 @@ headers = [
|
|
142
236
|
]
|
143
237
|
if options[:ignore_directions]
|
144
238
|
headers.push 'average_coverage'
|
145
|
-
|
239
|
+
elsif options[:count_type] == COVERAGE_COUNT_TYPE
|
146
240
|
headers.push 'forward_average_coverage'
|
147
241
|
headers.push 'reverse_average_coverage'
|
242
|
+
elsif options[:count_type] == COUNT_COUNT_TYPE
|
243
|
+
headers.push 'forward_read_count'
|
244
|
+
headers.push 'reverse_read_count'
|
245
|
+
else
|
246
|
+
raise
|
148
247
|
end
|
149
248
|
headers.push 'annotation'
|
150
249
|
puts headers.join("\t")
|
Binary file
|
data/spec/script_spec.rb
CHANGED
@@ -51,4 +51,28 @@ describe 'script' do
|
|
51
51
|
|
52
52
|
found.should == answer
|
53
53
|
end
|
54
|
+
|
55
|
+
it 'should print counts correctly' do
|
56
|
+
answer = %w(
|
57
|
+
contig type start end strand forward_read_count reverse_read_count annotation
|
58
|
+
).join("\t")+"\n"+%w(
|
59
|
+
contig_100 CDS 2 127 + 0.0 2.0 putative
|
60
|
+
).join("\t")+" methyltransferase YcgJ\n"
|
61
|
+
|
62
|
+
found = Bio::Commandeer.run "#{path_to_script} --bam #{data_dir}/eg.bam --gff #{data_dir}/realer.gff -q --measure-type count"
|
63
|
+
|
64
|
+
found.should == answer
|
65
|
+
end
|
66
|
+
|
67
|
+
it 'should count only the forward read when asked' do
|
68
|
+
answer = %w(
|
69
|
+
contig type start end strand forward_read_count reverse_read_count annotation
|
70
|
+
).join("\t")+"\n"+%w(
|
71
|
+
contig_100 CDS 2 127 + 0.0 1.0 putative
|
72
|
+
).join("\t")+" methyltransferase YcgJ\n"
|
73
|
+
|
74
|
+
found = Bio::Commandeer.run "#{path_to_script} --bam #{data_dir}/eg.bam --gff #{data_dir}/realer.gff -q --measure-type count --forward-read-only"
|
75
|
+
|
76
|
+
found.should == answer
|
77
|
+
end
|
54
78
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dirseq
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben J. Woodcroft
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-02-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bio-commandeer
|
@@ -58,48 +58,6 @@ dependencies:
|
|
58
58
|
- - "~>"
|
59
59
|
- !ruby/object:Gem::Version
|
60
60
|
version: '1.4'
|
61
|
-
- !ruby/object:Gem::Dependency
|
62
|
-
name: shoulda
|
63
|
-
requirement: !ruby/object:Gem::Requirement
|
64
|
-
requirements:
|
65
|
-
- - "~>"
|
66
|
-
- !ruby/object:Gem::Version
|
67
|
-
version: '3.5'
|
68
|
-
type: :development
|
69
|
-
prerelease: false
|
70
|
-
version_requirements: !ruby/object:Gem::Requirement
|
71
|
-
requirements:
|
72
|
-
- - "~>"
|
73
|
-
- !ruby/object:Gem::Version
|
74
|
-
version: '3.5'
|
75
|
-
- !ruby/object:Gem::Dependency
|
76
|
-
name: rdoc
|
77
|
-
requirement: !ruby/object:Gem::Requirement
|
78
|
-
requirements:
|
79
|
-
- - "~>"
|
80
|
-
- !ruby/object:Gem::Version
|
81
|
-
version: '3.12'
|
82
|
-
type: :development
|
83
|
-
prerelease: false
|
84
|
-
version_requirements: !ruby/object:Gem::Requirement
|
85
|
-
requirements:
|
86
|
-
- - "~>"
|
87
|
-
- !ruby/object:Gem::Version
|
88
|
-
version: '3.12'
|
89
|
-
- !ruby/object:Gem::Dependency
|
90
|
-
name: simplecov
|
91
|
-
requirement: !ruby/object:Gem::Requirement
|
92
|
-
requirements:
|
93
|
-
- - "~>"
|
94
|
-
- !ruby/object:Gem::Version
|
95
|
-
version: '0.8'
|
96
|
-
type: :development
|
97
|
-
prerelease: false
|
98
|
-
version_requirements: !ruby/object:Gem::Requirement
|
99
|
-
requirements:
|
100
|
-
- - "~>"
|
101
|
-
- !ruby/object:Gem::Version
|
102
|
-
version: '0.8'
|
103
61
|
- !ruby/object:Gem::Dependency
|
104
62
|
name: jeweler
|
105
63
|
requirement: !ruby/object:Gem::Requirement
|
@@ -134,14 +92,14 @@ dependencies:
|
|
134
92
|
requirements:
|
135
93
|
- - "~>"
|
136
94
|
- !ruby/object:Gem::Version
|
137
|
-
version: '
|
95
|
+
version: '3.0'
|
138
96
|
type: :development
|
139
97
|
prerelease: false
|
140
98
|
version_requirements: !ruby/object:Gem::Requirement
|
141
99
|
requirements:
|
142
100
|
- - "~>"
|
143
101
|
- !ruby/object:Gem::Version
|
144
|
-
version: '
|
102
|
+
version: '3.0'
|
145
103
|
- !ruby/object:Gem::Dependency
|
146
104
|
name: pry
|
147
105
|
requirement: !ruby/object:Gem::Requirement
|
@@ -176,6 +134,7 @@ files:
|
|
176
134
|
- lib/bio-rnaseq_transcription_directionality.rb
|
177
135
|
- lib/bio-rnaseq_transcription_directionality/rnaseq_transcription_directionality.rb
|
178
136
|
- spec/data/eg.bam
|
137
|
+
- spec/data/eg.bam.bai
|
179
138
|
- spec/data/eg.gff
|
180
139
|
- spec/data/eg_with_fasta.gff
|
181
140
|
- spec/data/realer.gff
|
@@ -201,7 +160,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
201
160
|
version: '0'
|
202
161
|
requirements: []
|
203
162
|
rubyforge_project:
|
204
|
-
rubygems_version: 2.
|
163
|
+
rubygems_version: 2.6.13
|
205
164
|
signing_key:
|
206
165
|
specification_version: 4
|
207
166
|
summary: FPKG calculator for metatranscriptomics
|