dirseq 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7b2362c43bff2a2cd43f0dc727bf8d713e4ab3c4
4
- data.tar.gz: 7a56c16b7bfe38543c8d217d9307dc4f18ea5d21
3
+ metadata.gz: 57eb18eb4e971726d26849e01d3a4dd32597584d
4
+ data.tar.gz: f38d044727e8163873617fad111d3c244507fe71
5
5
  SHA512:
6
- metadata.gz: 526abeb2d9b53cc0b0d26b3688d9b01ed44dea5d9b010d575b64922c8b3ae3d7097134db5fe23cff29855aa05f46449563ce7fcc575601a30e901782d90aaf7d
7
- data.tar.gz: 8702f8ba681c9e161f97f76cfd763593ad266a77acf813a194ecbc7390ef08d46d6c0a68d6b60bebe46f2b9d86bc6bb0a1dbf51c0f1ae3861d33b27e775b1e59
6
+ metadata.gz: 8842180ae34fa546362dbcd63a0d07e9269b9903cdec1b078c10f5031b848d59b8f4db6d1e9708942beb84b92c639365049e8fd563c59723e861402b92771797
7
+ data.tar.gz: c2f035c3dba77ee68fd2861d30c4764eb61095dac654e402a47a7213ddfec651bfb401caf2b5fa015c60efa783c7a50f09a7f4573bcfb544c621bbd3e8d6b0e3
data/Gemfile CHANGED
@@ -9,11 +9,10 @@ gem "bio", "~>1.4", ">=1.4.2"
9
9
  # Add dependencies to develop your gem here.
10
10
  # Include everything needed to run rake, tests, features, etc.
11
11
  group :development do
12
- gem "shoulda", "~> 3.5"
13
- gem "rdoc", "~> 3.12"
14
- gem "simplecov", "~> 0.8"
12
+ #gem "shoulda", "~> 3.5"
13
+ #gem "simplecov", "~> 0.8"
15
14
  gem "jeweler", "~> 2.0"
16
15
  gem "bundler", "~> 1.6"
17
- gem "rspec", "~> 2.99"
16
+ gem "rspec", "~> 3.0"
18
17
  gem 'pry', '~>0.10'
19
18
  end
data/README.md CHANGED
@@ -13,8 +13,8 @@ Won't work just yet:
13
13
  gem install dirseq
14
14
  ```
15
15
  Requires:
16
- * samtools (tested with 0.1.19)
17
- * bedtools (tested with 2.20.1)
16
+ * samtools (tested with 0.1.19 and 1.0+)
17
+ * bedtools (tested with 2.24.0) - old versions won't work.
18
18
  * Ruby (tested with 2.1.1)
19
19
 
20
20
  ## Usage
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.0
1
+ 0.2.0
data/bin/dirseq CHANGED
@@ -9,11 +9,21 @@ require 'tempfile'
9
9
 
10
10
  SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')
11
11
 
12
+ COVERAGE_COUNT_TYPE = 'coverage'
13
+ COUNT_COUNT_TYPE = 'count'
14
+ COUNT_TYPES = [
15
+ COUNT_COUNT_TYPE,
16
+ COVERAGE_COUNT_TYPE
17
+ ]
18
+
12
19
  # Parse command line options into the options hash
13
20
  options = {
14
21
  :ignore_directions => false,
15
22
  :logger => 'stderr',
16
23
  :log_level => 'info',
24
+ :count_type => COVERAGE_COUNT_TYPE,
25
+ :forward_read_only => false,
26
+ :accepted_feature_types => ['CDS'],
17
27
  }
18
28
  o = OptionParser.new do |opts|
19
29
  opts.banner = "
@@ -28,9 +38,20 @@ o = OptionParser.new do |opts|
28
38
  options[:gff] = arg
29
39
  end
30
40
  opts.separator "\nOptional parameters:\n\n"
41
+ opts.on("--forward-read-only", "consider only forward reads (i.e. read1) and ignore reverse reads. [default #{options[:forward_read_only]}]") do
42
+ options[:forward_ready_only] = true
43
+ end
31
44
  opts.on("--ignore-directions", "ignore directionality, give overall coverage [default: false i.e. differentiate between directions]") do |arg|
32
45
  options[:ignore_directions] = true
33
46
  end
47
+ opts.on("--measure-type TYPE", "what to count for each gene [options: #{COUNT_TYPES.join(', ')}][default: #{options[:count_type]}]") do |arg|
48
+ raise "Unexpected count type detected" if not COUNT_TYPES.include?(arg)
49
+ options[:count_type] = arg
50
+ end
51
+ opts.on("--accepted-feature-types TYPE", Array,
52
+ "Print only features of these type(s) [default #{options[:accepted_feature_types].join(',')}]") do |arg|
53
+ options[:accepted_feature_types] = set(arg)
54
+ end
34
55
 
35
56
  # logger options
36
57
  opts.separator "\nVerbosity:\n\n"
@@ -47,6 +68,11 @@ Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level])
47
68
 
48
69
  gff_file = options[:gff]
49
70
  bam_file = options[:bam]
71
+ accepted_feature_types = options[:accepted_feature_types]
72
+
73
+ if options[:count_type] != COVERAGE_COUNT_TYPE and options[:ignore_directions]
74
+ raise "ignore_directions + count_type != coverage is currently unsupported"
75
+ end
50
76
 
51
77
 
52
78
  calculate_cov = lambda do |covs, num_covs|
@@ -76,14 +102,26 @@ get_covs = lambda do |cov_lines|
76
102
  #96 #coverage
77
103
  #0.0208333
78
104
  feat = splits[0..8]
105
+ feature_type = feat[2]
106
+ if not accepted_feature_types.include?(feature_type)
107
+ log.debug "Skipping feature as it is of type #{feature_type}"
108
+ next
109
+ end
79
110
  if feat != previous_feature
80
111
  feature_to_covs[previous_feature] = calculate_cov.call(covs, num_covs) unless previous_feature.nil?
81
112
  covs = []
82
113
  num_covs = 0
83
114
  end
84
- num = splits[10].to_i
85
- covs.push num*splits[9].to_i
86
- num_covs += num
115
+ if splits.length == 13 # -hist
116
+ num = splits[10].to_i
117
+ covs.push num*splits[9].to_i
118
+ num_covs += num
119
+ elsif splits.length == 10 # -count
120
+ covs.push splits[9].to_i
121
+ num_covs += 1
122
+ else
123
+ raise "Unexpected bedtools output line: #{line}"
124
+ end
87
125
  previous_feature = feat
88
126
  end
89
127
  feature_to_covs[previous_feature] = calculate_cov.call(covs, num_covs)
@@ -93,10 +131,54 @@ end
93
131
 
94
132
  # Remove the ##FASTA and afterwards from the GFF file as this makes bedtools <2.25 fail
95
133
  # https://github.com/arq5x/bedtools2/issues/235#issuecomment-103776618
96
- no_fasta_gff = Tempfile.new('dirseq')
134
+ no_fasta_gff = Tempfile.new(['dirseq','.gff3'])
97
135
  Bio::Commandeer.run "sed '/^##FASTA$/,$d' #{gff_file.inspect} > #{no_fasta_gff.path}", :log => log
98
136
  gff_file = no_fasta_gff.path
99
137
 
138
+
139
+
140
+
141
+
142
+ # Find featureless contigs. Need to so that bedtools coverage -sorted does not complain
143
+ if not File.exists?("#{bam_file}.bai")
144
+ raise "Input bam file must be indexed, but the index file does not exist"
145
+ end
146
+
147
+ chromosome_file = Tempfile.new('bam_contigs')
148
+ log.info "Listing contigs in sorted order .."
149
+ cmd = "samtools idxstats #{bam_file.inspect} |cut -f1,2 >#{chromosome_file.path.inspect}"
150
+ Bio::Commandeer.run(cmd, :log => log)
151
+
152
+ log.info "Finding featureless contigs"
153
+ cmd = "grep -v '^#' #{gff_file.inspect} |cut -f1 |sort |uniq |grep -vFw -f /dev/stdin #{chromosome_file.path.inspect} |cut -f1"
154
+ featureless_contigs = Bio::Commandeer.run(cmd, :log => log).lines.map(&:chomp).reject{ |ref| ref=='*' }
155
+ log.info "Found #{featureless_contigs.length} featureless contigs"
156
+
157
+ # Sort the GFF
158
+ dummy_features = featureless_contigs.collect do |ref|
159
+ [ref,
160
+ 'dirseq',
161
+ 'misc_RNA',
162
+ '1',
163
+ '2',
164
+ '.',
165
+ '+',
166
+ '0',
167
+ "ID=#{ref}_dummy_feature"].join("\t")
168
+ end
169
+ sorted_gff_file_f = Tempfile.new(['sorted_gff','.gff3'])
170
+ sorted_gff_file = sorted_gff_file_f.path
171
+ Tempfile.open(["extra_features",'.gff']) do |ef|
172
+ ef.puts dummy_features.join("\n")
173
+ ef.close
174
+
175
+ cmd = "cat #{ef.path} #{gff_file.inspect} |bedtools sort -i /dev/stdin -faidx #{chromosome_file.path.inspect} >#{sorted_gff_file.inspect}"
176
+ log.info "Running bedtools sort"
177
+ Bio::Commandeer.run(cmd, :log => log)
178
+ end
179
+
180
+
181
+
100
182
  covs_fwd = nil
101
183
  if options[:ignore_directions]
102
184
  cmd1 = "bedtools coverage -b #{bam_file.inspect} -a #{gff_file.inspect} -hist"
@@ -108,10 +190,14 @@ else
108
190
  # fwd read 1
109
191
  read1_flag = '-F128' #account for read1 in pair, as well as single reads mapping
110
192
  read2_flag = '-f128'
111
- cmdf1 = "samtools view -u #{read1_flag} #{bam_file.inspect} |bedtools coverage -b /dev/stdin -a #{gff_file.inspect} -hist -s"
112
- cmdf2 = "samtools view -u #{read2_flag} #{bam_file.inspect} |bedtools coverage -b /dev/stdin -a #{gff_file.inspect} -hist -s"
113
- cmdr1 = "samtools view -u #{read1_flag} #{bam_file.inspect} |bedtools coverage -b /dev/stdin -a #{gff_file.inspect} -hist -S"
114
- cmdr2 = "samtools view -u #{read2_flag} #{bam_file.inspect} |bedtools coverage -b /dev/stdin -a #{gff_file.inspect} -hist -S"
193
+ bedtools_type_flag = '-hist'
194
+ if options[:count_type] == COUNT_COUNT_TYPE
195
+ bedtools_type_flag = '-counts'
196
+ end
197
+ cmdf1 = "samtools view -u #{read1_flag} #{bam_file.inspect} |bedtools coverage -sorted -g #{chromosome_file.path.inspect} -b /dev/stdin -a #{sorted_gff_file.inspect} -s #{bedtools_type_flag}"
198
+ cmdf2 = "samtools view -u #{read2_flag} #{bam_file.inspect} |bedtools coverage -sorted -g #{chromosome_file.path.inspect} -b /dev/stdin -a #{sorted_gff_file.inspect} -s #{bedtools_type_flag}"
199
+ cmdr1 = "samtools view -u #{read1_flag} #{bam_file.inspect} |bedtools coverage -sorted -g #{chromosome_file.path.inspect} -b /dev/stdin -a #{sorted_gff_file.inspect} -S #{bedtools_type_flag}"
200
+ cmdr2 = "samtools view -u #{read2_flag} #{bam_file.inspect} |bedtools coverage -sorted -g #{chromosome_file.path.inspect} -b /dev/stdin -a #{sorted_gff_file.inspect} -S #{bedtools_type_flag}"
115
201
 
116
202
  command_to_parsed = lambda do |cmds, name|
117
203
  covs_lines_initial = cmds.collect do |cmd|
@@ -122,15 +208,23 @@ else
122
208
  get_covs.call(lines)
123
209
  end
124
210
  covs = covs_initial[0]
125
- covs_initial[1].each do |cov_key, cov|
126
- covs[cov_key] += cov
211
+ if covs_initial.length > 1
212
+ covs_initial[1].each do |cov_key, cov|
213
+ covs[cov_key] += cov
214
+ end
127
215
  end
128
216
  covs #'return' from lambda
129
217
  end
130
218
 
131
219
  # Agreeing reads (those whose template are fwd along the reference sequence) are either first and fwd, or second and rev
132
- covs_fwd = command_to_parsed.call([cmdf1,cmdr2], 'reads with same direction as their reference')
133
- covs_rev = command_to_parsed.call([cmdf2,cmdr1], 'reads with opposing direction as their reference')
220
+ commands_fwd = [cmdf1,cmdr2]
221
+ commands_rev = [cmdf2,cmdr1]
222
+ if options[:forward_ready_only]
223
+ commands_fwd = [cmdf1]
224
+ commands_rev = [cmdr1]
225
+ end
226
+ covs_fwd = command_to_parsed.call(commands_fwd, 'reads with same direction as their reference')
227
+ covs_rev = command_to_parsed.call(commands_rev, 'reads with opposing direction as their reference')
134
228
  end
135
229
 
136
230
  headers = [
@@ -142,9 +236,14 @@ headers = [
142
236
  ]
143
237
  if options[:ignore_directions]
144
238
  headers.push 'average_coverage'
145
- else
239
+ elsif options[:count_type] == COVERAGE_COUNT_TYPE
146
240
  headers.push 'forward_average_coverage'
147
241
  headers.push 'reverse_average_coverage'
242
+ elsif options[:count_type] == COUNT_COUNT_TYPE
243
+ headers.push 'forward_read_count'
244
+ headers.push 'reverse_read_count'
245
+ else
246
+ raise
148
247
  end
149
248
  headers.push 'annotation'
150
249
  puts headers.join("\t")
Binary file
data/spec/script_spec.rb CHANGED
@@ -51,4 +51,28 @@ describe 'script' do
51
51
 
52
52
  found.should == answer
53
53
  end
54
+
55
+ it 'should print counts correctly' do
56
+ answer = %w(
57
+ contig type start end strand forward_read_count reverse_read_count annotation
58
+ ).join("\t")+"\n"+%w(
59
+ contig_100 CDS 2 127 + 0.0 2.0 putative
60
+ ).join("\t")+" methyltransferase YcgJ\n"
61
+
62
+ found = Bio::Commandeer.run "#{path_to_script} --bam #{data_dir}/eg.bam --gff #{data_dir}/realer.gff -q --measure-type count"
63
+
64
+ found.should == answer
65
+ end
66
+
67
+ it 'should count only the forward read when asked' do
68
+ answer = %w(
69
+ contig type start end strand forward_read_count reverse_read_count annotation
70
+ ).join("\t")+"\n"+%w(
71
+ contig_100 CDS 2 127 + 0.0 1.0 putative
72
+ ).join("\t")+" methyltransferase YcgJ\n"
73
+
74
+ found = Bio::Commandeer.run "#{path_to_script} --bam #{data_dir}/eg.bam --gff #{data_dir}/realer.gff -q --measure-type count --forward-read-only"
75
+
76
+ found.should == answer
77
+ end
54
78
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dirseq
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben J. Woodcroft
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-01-08 00:00:00.000000000 Z
11
+ date: 2018-02-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bio-commandeer
@@ -58,48 +58,6 @@ dependencies:
58
58
  - - "~>"
59
59
  - !ruby/object:Gem::Version
60
60
  version: '1.4'
61
- - !ruby/object:Gem::Dependency
62
- name: shoulda
63
- requirement: !ruby/object:Gem::Requirement
64
- requirements:
65
- - - "~>"
66
- - !ruby/object:Gem::Version
67
- version: '3.5'
68
- type: :development
69
- prerelease: false
70
- version_requirements: !ruby/object:Gem::Requirement
71
- requirements:
72
- - - "~>"
73
- - !ruby/object:Gem::Version
74
- version: '3.5'
75
- - !ruby/object:Gem::Dependency
76
- name: rdoc
77
- requirement: !ruby/object:Gem::Requirement
78
- requirements:
79
- - - "~>"
80
- - !ruby/object:Gem::Version
81
- version: '3.12'
82
- type: :development
83
- prerelease: false
84
- version_requirements: !ruby/object:Gem::Requirement
85
- requirements:
86
- - - "~>"
87
- - !ruby/object:Gem::Version
88
- version: '3.12'
89
- - !ruby/object:Gem::Dependency
90
- name: simplecov
91
- requirement: !ruby/object:Gem::Requirement
92
- requirements:
93
- - - "~>"
94
- - !ruby/object:Gem::Version
95
- version: '0.8'
96
- type: :development
97
- prerelease: false
98
- version_requirements: !ruby/object:Gem::Requirement
99
- requirements:
100
- - - "~>"
101
- - !ruby/object:Gem::Version
102
- version: '0.8'
103
61
  - !ruby/object:Gem::Dependency
104
62
  name: jeweler
105
63
  requirement: !ruby/object:Gem::Requirement
@@ -134,14 +92,14 @@ dependencies:
134
92
  requirements:
135
93
  - - "~>"
136
94
  - !ruby/object:Gem::Version
137
- version: '2.99'
95
+ version: '3.0'
138
96
  type: :development
139
97
  prerelease: false
140
98
  version_requirements: !ruby/object:Gem::Requirement
141
99
  requirements:
142
100
  - - "~>"
143
101
  - !ruby/object:Gem::Version
144
- version: '2.99'
102
+ version: '3.0'
145
103
  - !ruby/object:Gem::Dependency
146
104
  name: pry
147
105
  requirement: !ruby/object:Gem::Requirement
@@ -176,6 +134,7 @@ files:
176
134
  - lib/bio-rnaseq_transcription_directionality.rb
177
135
  - lib/bio-rnaseq_transcription_directionality/rnaseq_transcription_directionality.rb
178
136
  - spec/data/eg.bam
137
+ - spec/data/eg.bam.bai
179
138
  - spec/data/eg.gff
180
139
  - spec/data/eg_with_fasta.gff
181
140
  - spec/data/realer.gff
@@ -201,7 +160,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
201
160
  version: '0'
202
161
  requirements: []
203
162
  rubyforge_project:
204
- rubygems_version: 2.4.5.1
163
+ rubygems_version: 2.6.13
205
164
  signing_key:
206
165
  specification_version: 4
207
166
  summary: FPKG calculator for metatranscriptomics