dirseq 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7b2362c43bff2a2cd43f0dc727bf8d713e4ab3c4
4
- data.tar.gz: 7a56c16b7bfe38543c8d217d9307dc4f18ea5d21
3
+ metadata.gz: 57eb18eb4e971726d26849e01d3a4dd32597584d
4
+ data.tar.gz: f38d044727e8163873617fad111d3c244507fe71
5
5
  SHA512:
6
- metadata.gz: 526abeb2d9b53cc0b0d26b3688d9b01ed44dea5d9b010d575b64922c8b3ae3d7097134db5fe23cff29855aa05f46449563ce7fcc575601a30e901782d90aaf7d
7
- data.tar.gz: 8702f8ba681c9e161f97f76cfd763593ad266a77acf813a194ecbc7390ef08d46d6c0a68d6b60bebe46f2b9d86bc6bb0a1dbf51c0f1ae3861d33b27e775b1e59
6
+ metadata.gz: 8842180ae34fa546362dbcd63a0d07e9269b9903cdec1b078c10f5031b848d59b8f4db6d1e9708942beb84b92c639365049e8fd563c59723e861402b92771797
7
+ data.tar.gz: c2f035c3dba77ee68fd2861d30c4764eb61095dac654e402a47a7213ddfec651bfb401caf2b5fa015c60efa783c7a50f09a7f4573bcfb544c621bbd3e8d6b0e3
data/Gemfile CHANGED
@@ -9,11 +9,10 @@ gem "bio", "~>1.4", ">=1.4.2"
9
9
  # Add dependencies to develop your gem here.
10
10
  # Include everything needed to run rake, tests, features, etc.
11
11
  group :development do
12
- gem "shoulda", "~> 3.5"
13
- gem "rdoc", "~> 3.12"
14
- gem "simplecov", "~> 0.8"
12
+ #gem "shoulda", "~> 3.5"
13
+ #gem "simplecov", "~> 0.8"
15
14
  gem "jeweler", "~> 2.0"
16
15
  gem "bundler", "~> 1.6"
17
- gem "rspec", "~> 2.99"
16
+ gem "rspec", "~> 3.0"
18
17
  gem 'pry', '~>0.10'
19
18
  end
data/README.md CHANGED
@@ -13,8 +13,8 @@ Won't work just yet:
13
13
  gem install dirseq
14
14
  ```
15
15
  Requires:
16
- * samtools (tested with 0.1.19)
17
- * bedtools (tested with 2.20.1)
16
+ * samtools (tested with 0.1.19 and 1.0+)
17
+ * bedtools (tested with 2.24.0) - old versions won't work.
18
18
  * Ruby (tested with 2.1.1)
19
19
 
20
20
  ## Usage
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.0
1
+ 0.2.0
data/bin/dirseq CHANGED
@@ -9,11 +9,21 @@ require 'tempfile'
9
9
 
10
10
  SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')
11
11
 
12
+ COVERAGE_COUNT_TYPE = 'coverage'
13
+ COUNT_COUNT_TYPE = 'count'
14
+ COUNT_TYPES = [
15
+ COUNT_COUNT_TYPE,
16
+ COVERAGE_COUNT_TYPE
17
+ ]
18
+
12
19
  # Parse command line options into the options hash
13
20
  options = {
14
21
  :ignore_directions => false,
15
22
  :logger => 'stderr',
16
23
  :log_level => 'info',
24
+ :count_type => COVERAGE_COUNT_TYPE,
25
+ :forward_read_only => false,
26
+ :accepted_feature_types => ['CDS'],
17
27
  }
18
28
  o = OptionParser.new do |opts|
19
29
  opts.banner = "
@@ -28,9 +38,20 @@ o = OptionParser.new do |opts|
28
38
  options[:gff] = arg
29
39
  end
30
40
  opts.separator "\nOptional parameters:\n\n"
41
+ opts.on("--forward-read-only", "consider only forward reads (i.e. read1) and ignore reverse reads. [default #{options[:forward_read_only]}]") do
42
+ options[:forward_ready_only] = true
43
+ end
31
44
  opts.on("--ignore-directions", "ignore directionality, give overall coverage [default: false i.e. differentiate between directions]") do |arg|
32
45
  options[:ignore_directions] = true
33
46
  end
47
+ opts.on("--measure-type TYPE", "what to count for each gene [options: #{COUNT_TYPES.join(', ')}][default: #{options[:count_type]}]") do |arg|
48
+ raise "Unexpected count type detected" if not COUNT_TYPES.include?(arg)
49
+ options[:count_type] = arg
50
+ end
51
+ opts.on("--accepted-feature-types TYPE", Array,
52
+ "Print only features of these type(s) [default #{options[:accepted_feature_types].join(',')}]") do |arg|
53
+ options[:accepted_feature_types] = set(arg)
54
+ end
34
55
 
35
56
  # logger options
36
57
  opts.separator "\nVerbosity:\n\n"
@@ -47,6 +68,11 @@ Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level])
47
68
 
48
69
  gff_file = options[:gff]
49
70
  bam_file = options[:bam]
71
+ accepted_feature_types = options[:accepted_feature_types]
72
+
73
+ if options[:count_type] != COVERAGE_COUNT_TYPE and options[:ignore_directions]
74
+ raise "ignore_directions + count_type != coverage is currently unsupported"
75
+ end
50
76
 
51
77
 
52
78
  calculate_cov = lambda do |covs, num_covs|
@@ -76,14 +102,26 @@ get_covs = lambda do |cov_lines|
76
102
  #96 #coverage
77
103
  #0.0208333
78
104
  feat = splits[0..8]
105
+ feature_type = feat[2]
106
+ if not accepted_feature_types.include?(feature_type)
107
+ log.debug "Skipping feature as it is of type #{feature_type}"
108
+ next
109
+ end
79
110
  if feat != previous_feature
80
111
  feature_to_covs[previous_feature] = calculate_cov.call(covs, num_covs) unless previous_feature.nil?
81
112
  covs = []
82
113
  num_covs = 0
83
114
  end
84
- num = splits[10].to_i
85
- covs.push num*splits[9].to_i
86
- num_covs += num
115
+ if splits.length == 13 # -hist
116
+ num = splits[10].to_i
117
+ covs.push num*splits[9].to_i
118
+ num_covs += num
119
+ elsif splits.length == 10 # -count
120
+ covs.push splits[9].to_i
121
+ num_covs += 1
122
+ else
123
+ raise "Unexpected bedtools output line: #{line}"
124
+ end
87
125
  previous_feature = feat
88
126
  end
89
127
  feature_to_covs[previous_feature] = calculate_cov.call(covs, num_covs)
@@ -93,10 +131,54 @@ end
93
131
 
94
132
  # Remove the ##FASTA and afterwards from the GFF file as this makes bedtools <2.25 fail
95
133
  # https://github.com/arq5x/bedtools2/issues/235#issuecomment-103776618
96
- no_fasta_gff = Tempfile.new('dirseq')
134
+ no_fasta_gff = Tempfile.new(['dirseq','.gff3'])
97
135
  Bio::Commandeer.run "sed '/^##FASTA$/,$d' #{gff_file.inspect} > #{no_fasta_gff.path}", :log => log
98
136
  gff_file = no_fasta_gff.path
99
137
 
138
+
139
+
140
+
141
+
142
+ # Find featureless contigs. This is needed so that bedtools coverage -sorted does not complain
143
+ if not File.exists?("#{bam_file}.bai")
144
+ raise "Input bam file must be indexed, but the index file does not exist"
145
+ end
146
+
147
+ chromosome_file = Tempfile.new('bam_contigs')
148
+ log.info "Listing contigs in sorted order .."
149
+ cmd = "samtools idxstats #{bam_file.inspect} |cut -f1,2 >#{chromosome_file.path.inspect}"
150
+ Bio::Commandeer.run(cmd, :log => log)
151
+
152
+ log.info "Finding featureless contigs"
153
+ cmd = "grep -v '^#' #{gff_file.inspect} |cut -f1 |sort |uniq |grep -vFw -f /dev/stdin #{chromosome_file.path.inspect} |cut -f1"
154
+ featureless_contigs = Bio::Commandeer.run(cmd, :log => log).lines.map(&:chomp).reject{ |ref| ref=='*' }
155
+ log.info "Found #{featureless_contigs.length} featureless contigs"
156
+
157
+ # Sort the GFF, after adding a dummy feature for each featureless contig
158
+ dummy_features = featureless_contigs.collect do |ref|
159
+ [ref,
160
+ 'dirseq',
161
+ 'misc_RNA',
162
+ '1',
163
+ '2',
164
+ '.',
165
+ '+',
166
+ '0',
167
+ "ID=#{ref}_dummy_feature"].join("\t")
168
+ end
169
+ sorted_gff_file_f = Tempfile.new(['sorted_gff','.gff3'])
170
+ sorted_gff_file = sorted_gff_file_f.path
171
+ Tempfile.open(["extra_features",'.gff']) do |ef|
172
+ ef.puts dummy_features.join("\n")
173
+ ef.close
174
+
175
+ cmd = "cat #{ef.path} #{gff_file.inspect} |bedtools sort -i /dev/stdin -faidx #{chromosome_file.path.inspect} >#{sorted_gff_file.inspect}"
176
+ log.info "Running bedtools sort"
177
+ Bio::Commandeer.run(cmd, :log => log)
178
+ end
179
+
180
+
181
+
100
182
  covs_fwd = nil
101
183
  if options[:ignore_directions]
102
184
  cmd1 = "bedtools coverage -b #{bam_file.inspect} -a #{gff_file.inspect} -hist"
@@ -108,10 +190,14 @@ else
108
190
  # fwd read 1
109
191
  read1_flag = '-F128' #account for read1 in pair, as well as single reads mapping
110
192
  read2_flag = '-f128'
111
- cmdf1 = "samtools view -u #{read1_flag} #{bam_file.inspect} |bedtools coverage -b /dev/stdin -a #{gff_file.inspect} -hist -s"
112
- cmdf2 = "samtools view -u #{read2_flag} #{bam_file.inspect} |bedtools coverage -b /dev/stdin -a #{gff_file.inspect} -hist -s"
113
- cmdr1 = "samtools view -u #{read1_flag} #{bam_file.inspect} |bedtools coverage -b /dev/stdin -a #{gff_file.inspect} -hist -S"
114
- cmdr2 = "samtools view -u #{read2_flag} #{bam_file.inspect} |bedtools coverage -b /dev/stdin -a #{gff_file.inspect} -hist -S"
193
+ bedtools_type_flag = '-hist'
194
+ if options[:count_type] == COUNT_COUNT_TYPE
195
+ bedtools_type_flag = '-counts'
196
+ end
197
+ cmdf1 = "samtools view -u #{read1_flag} #{bam_file.inspect} |bedtools coverage -sorted -g #{chromosome_file.path.inspect} -b /dev/stdin -a #{sorted_gff_file.inspect} -s #{bedtools_type_flag}"
198
+ cmdf2 = "samtools view -u #{read2_flag} #{bam_file.inspect} |bedtools coverage -sorted -g #{chromosome_file.path.inspect} -b /dev/stdin -a #{sorted_gff_file.inspect} -s #{bedtools_type_flag}"
199
+ cmdr1 = "samtools view -u #{read1_flag} #{bam_file.inspect} |bedtools coverage -sorted -g #{chromosome_file.path.inspect} -b /dev/stdin -a #{sorted_gff_file.inspect} -S #{bedtools_type_flag}"
200
+ cmdr2 = "samtools view -u #{read2_flag} #{bam_file.inspect} |bedtools coverage -sorted -g #{chromosome_file.path.inspect} -b /dev/stdin -a #{sorted_gff_file.inspect} -S #{bedtools_type_flag}"
115
201
 
116
202
  command_to_parsed = lambda do |cmds, name|
117
203
  covs_lines_initial = cmds.collect do |cmd|
@@ -122,15 +208,23 @@ else
122
208
  get_covs.call(lines)
123
209
  end
124
210
  covs = covs_initial[0]
125
- covs_initial[1].each do |cov_key, cov|
126
- covs[cov_key] += cov
211
+ if covs_initial.length > 1
212
+ covs_initial[1].each do |cov_key, cov|
213
+ covs[cov_key] += cov
214
+ end
127
215
  end
128
216
  covs #'return' from lambda
129
217
  end
130
218
 
131
219
  # Agreeing reads (those whose templates are fwd along the reference sequence) are either first and fwd, or second and rev
132
- covs_fwd = command_to_parsed.call([cmdf1,cmdr2], 'reads with same direction as their reference')
133
- covs_rev = command_to_parsed.call([cmdf2,cmdr1], 'reads with opposing direction as their reference')
220
+ commands_fwd = [cmdf1,cmdr2]
221
+ commands_rev = [cmdf2,cmdr1]
222
+ if options[:forward_ready_only]
223
+ commands_fwd = [cmdf1]
224
+ commands_rev = [cmdr1]
225
+ end
226
+ covs_fwd = command_to_parsed.call(commands_fwd, 'reads with same direction as their reference')
227
+ covs_rev = command_to_parsed.call(commands_rev, 'reads with opposing direction as their reference')
134
228
  end
135
229
 
136
230
  headers = [
@@ -142,9 +236,14 @@ headers = [
142
236
  ]
143
237
  if options[:ignore_directions]
144
238
  headers.push 'average_coverage'
145
- else
239
+ elsif options[:count_type] == COVERAGE_COUNT_TYPE
146
240
  headers.push 'forward_average_coverage'
147
241
  headers.push 'reverse_average_coverage'
242
+ elsif options[:count_type] == COUNT_COUNT_TYPE
243
+ headers.push 'forward_read_count'
244
+ headers.push 'reverse_read_count'
245
+ else
246
+ raise
148
247
  end
149
248
  headers.push 'annotation'
150
249
  puts headers.join("\t")
Binary file
data/spec/script_spec.rb CHANGED
@@ -51,4 +51,28 @@ describe 'script' do
51
51
 
52
52
  found.should == answer
53
53
  end
54
+
55
+ it 'should print counts correctly' do
56
+ answer = %w(
57
+ contig type start end strand forward_read_count reverse_read_count annotation
58
+ ).join("\t")+"\n"+%w(
59
+ contig_100 CDS 2 127 + 0.0 2.0 putative
60
+ ).join("\t")+" methyltransferase YcgJ\n"
61
+
62
+ found = Bio::Commandeer.run "#{path_to_script} --bam #{data_dir}/eg.bam --gff #{data_dir}/realer.gff -q --measure-type count"
63
+
64
+ found.should == answer
65
+ end
66
+
67
+ it 'should count only the forward read when asked' do
68
+ answer = %w(
69
+ contig type start end strand forward_read_count reverse_read_count annotation
70
+ ).join("\t")+"\n"+%w(
71
+ contig_100 CDS 2 127 + 0.0 1.0 putative
72
+ ).join("\t")+" methyltransferase YcgJ\n"
73
+
74
+ found = Bio::Commandeer.run "#{path_to_script} --bam #{data_dir}/eg.bam --gff #{data_dir}/realer.gff -q --measure-type count --forward-read-only"
75
+
76
+ found.should == answer
77
+ end
54
78
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dirseq
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben J. Woodcroft
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-01-08 00:00:00.000000000 Z
11
+ date: 2018-02-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bio-commandeer
@@ -58,48 +58,6 @@ dependencies:
58
58
  - - "~>"
59
59
  - !ruby/object:Gem::Version
60
60
  version: '1.4'
61
- - !ruby/object:Gem::Dependency
62
- name: shoulda
63
- requirement: !ruby/object:Gem::Requirement
64
- requirements:
65
- - - "~>"
66
- - !ruby/object:Gem::Version
67
- version: '3.5'
68
- type: :development
69
- prerelease: false
70
- version_requirements: !ruby/object:Gem::Requirement
71
- requirements:
72
- - - "~>"
73
- - !ruby/object:Gem::Version
74
- version: '3.5'
75
- - !ruby/object:Gem::Dependency
76
- name: rdoc
77
- requirement: !ruby/object:Gem::Requirement
78
- requirements:
79
- - - "~>"
80
- - !ruby/object:Gem::Version
81
- version: '3.12'
82
- type: :development
83
- prerelease: false
84
- version_requirements: !ruby/object:Gem::Requirement
85
- requirements:
86
- - - "~>"
87
- - !ruby/object:Gem::Version
88
- version: '3.12'
89
- - !ruby/object:Gem::Dependency
90
- name: simplecov
91
- requirement: !ruby/object:Gem::Requirement
92
- requirements:
93
- - - "~>"
94
- - !ruby/object:Gem::Version
95
- version: '0.8'
96
- type: :development
97
- prerelease: false
98
- version_requirements: !ruby/object:Gem::Requirement
99
- requirements:
100
- - - "~>"
101
- - !ruby/object:Gem::Version
102
- version: '0.8'
103
61
  - !ruby/object:Gem::Dependency
104
62
  name: jeweler
105
63
  requirement: !ruby/object:Gem::Requirement
@@ -134,14 +92,14 @@ dependencies:
134
92
  requirements:
135
93
  - - "~>"
136
94
  - !ruby/object:Gem::Version
137
- version: '2.99'
95
+ version: '3.0'
138
96
  type: :development
139
97
  prerelease: false
140
98
  version_requirements: !ruby/object:Gem::Requirement
141
99
  requirements:
142
100
  - - "~>"
143
101
  - !ruby/object:Gem::Version
144
- version: '2.99'
102
+ version: '3.0'
145
103
  - !ruby/object:Gem::Dependency
146
104
  name: pry
147
105
  requirement: !ruby/object:Gem::Requirement
@@ -176,6 +134,7 @@ files:
176
134
  - lib/bio-rnaseq_transcription_directionality.rb
177
135
  - lib/bio-rnaseq_transcription_directionality/rnaseq_transcription_directionality.rb
178
136
  - spec/data/eg.bam
137
+ - spec/data/eg.bam.bai
179
138
  - spec/data/eg.gff
180
139
  - spec/data/eg_with_fasta.gff
181
140
  - spec/data/realer.gff
@@ -201,7 +160,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
201
160
  version: '0'
202
161
  requirements: []
203
162
  rubyforge_project:
204
- rubygems_version: 2.4.5.1
163
+ rubygems_version: 2.6.13
205
164
  signing_key:
206
165
  specification_version: 4
207
166
  summary: FPKG calculator for metatranscriptomics