transrate 0.3.1 → 1.0.0.alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: be3eaa6170b268789a0ff6d6cb68448a7af6e037
4
- data.tar.gz: 5e857b0392f9791e7111069ee1ed61ee19d3853c
3
+ metadata.gz: 329f398d7dc832c5e56dfe90b6483fb00321bfce
4
+ data.tar.gz: 52acb0232a8f42cb3604c2981899f0d931a1195b
5
5
  SHA512:
6
- metadata.gz: f87811816aa975bfc85cfd025a90029c310bfdad19587cb5ea6a4909a78151f1904db0d69e5593b56f0f6b0044f5072e1e966580d7b6a45f78522ede8eda52cd
7
- data.tar.gz: e9dba5e8a9910716250c3bd82bea190264d4ac32caf67acc44009b080e5eab553cf2a7317d1153d851f797c0eb45d07e1531896743b6bb1b3df902608d696d15
6
+ metadata.gz: 35f079c3474e02aa98a896cf6bbe4cd3d87ef7b7e95c91d13abea585eeb54b598ffe9539f7fc7b8c87a40ec78df92a8e1ba9fd23ac46e4da9c8f616919ab004e
7
+ data.tar.gz: fca938cc86122088ecb0308273d8552e1ebb26dd47e52e8154015aa27e68814016cedda45553f4bf76fcc9b164bcaaaf1cad1787c4f8296f316e6833b235e6bc
data/.gitignore CHANGED
@@ -18,6 +18,7 @@ tmp
18
18
  .#*
19
19
  \#*
20
20
  *so
21
+ dryrun
21
22
 
22
23
  # YARD artifacts
23
24
  .yardoc
@@ -39,3 +40,9 @@ dryrun
39
40
  *.bam
40
41
  *.csv
41
42
  *.coverage
43
+
44
+ # c extension build artefacts
45
+ Makefile
46
+ transrate.bundle
47
+ transrate.o
48
+ .RUBYARCHDIR.time
data/README.md CHANGED
@@ -4,11 +4,12 @@
4
4
 
5
5
  ## Development status
6
6
 
7
- [![Gem Version](https://badge.fury.io/rb/transrate.png)][gem]
8
- [![Build Status](https://secure.travis-ci.org/Blahah/transrate.png?branch=master)][travis]
9
- [![Dependency Status](https://gemnasium.com/Blahah/transrate.png?travis)][gemnasium]
10
- [![Code Climate](https://codeclimate.com/github/Blahah/transrate.png)][codeclimate]
11
- [![Coverage Status](https://coveralls.io/repos/Blahah/transrate/badge.png?branch=master)][coveralls]
7
+ [![Gem Version](http://img.shields.io/gem/v/transrate.svg)][gem]
8
+ ![Downloads](http://img.shields.io/gem/dtv/transrate.svg)
9
+ [![Build Status](http://img.shields.io/travis/Blahah/transrate/master.svg)][travis]
10
+ [![Dependency Status](http://img.shields.io/gemnasium/Blahah/transrate.svg)][gemnasium]
11
+ [![Code Climate](http://img.shields.io/codeclimate/github/Blahah/transrate.svg)][codeclimate]
12
+ [![Coverage Status](http://img.shields.io/coveralls/Blahah/transrate.svg)][coveralls]
12
13
 
13
14
  [gem]: https://badge.fury.io/rb/transrate
14
15
  [travis]: https://travis-ci.org/Blahah/transrate
@@ -20,7 +21,7 @@ This software is being actively developed. Please be aware that there may be bug
20
21
 
21
22
  ## Citation
22
23
 
23
- Transrate is pre-publication academic software. If you use it, please cite the github repository and the DOI: [![DOI](https://zenodo.org/badge/3687/Blahah/transrate.png)](http://dx.doi.org/10.5281/zenodo.11037).
24
+ Transrate is pre-publication academic software. If you use it, please cite the github repository and the DOI: [![DOI](https://zenodo.org/badge/3687/Blahah/transrate.png)](http://dx.doi.org/10.5281/zenodo.11039).
24
25
 
25
26
  ## Documentation
26
27
 
@@ -46,12 +46,6 @@ opts = Trollop::options do
46
46
  :type => String
47
47
  opt :right, "right reads file in FASTQ format",
48
48
  :type => String
49
- opt :insertsize, "mean insert size",
50
- :default => 200,
51
- :type => Integer
52
- opt :insertsd, "insert size standard deviation",
53
- :default => 50,
54
- :type => Integer
55
49
  opt :threads, "number of threads to use",
56
50
  :default => 8,
57
51
  :type => Integer
@@ -98,6 +92,21 @@ if opts.reference && !File.exist?(opts.reference)
98
92
  raise IOError.new "Reference fasta file does not exist: #{opts.reference}"
99
93
  end
100
94
 
95
+ if opts.left and opts.right
96
+ if opts.left.split(",").length != opts.right.split(",").length
97
+ msg = "Please provide the same number of left reads as right reads"
98
+ raise ArgumentError.new(msg)
99
+ end
100
+ opts.left.split(",").zip(opts.right.split(",")).each do |left,right|
101
+ if !File.exist?(left)
102
+ raise IOError.new "Left read fastq file does not exist: #{left}"
103
+ end
104
+ if !File.exist?(right)
105
+ raise IOError.new "Right read fastq file does not exist: #{right}"
106
+ end
107
+ end
108
+ end
109
+
101
110
  if opts.profile
102
111
  logger.info "Starting profiler"
103
112
  RubyProf.start
@@ -201,13 +210,16 @@ opts.assembly.split(',').each do |assembly|
201
210
  logger.info "Comparative metrics done in #{Time.now - t0} seconds"
202
211
 
203
212
  logger.info "-" * report_width
213
+ else
214
+ logger.info "No reference provided, skipping comparative diagnostics"
215
+ end
216
+
217
+ if (opts.left && opts.right)
204
218
  score = transrater.assembly_score
205
219
  unless score.nil?
206
- logger.info "OVERALL SCORE: #{score.to_f.round(2) * 100}%"
220
+ logger.info "TRANSRATE ASSEMBLY SCORE: #{score.round(2)}"
207
221
  logger.info "-" * report_width
208
222
  end
209
- else
210
- logger.info "No reference provided, skipping comparative diagnostics"
211
223
  end
212
224
 
213
225
  # write contig metrics to file for each contig
@@ -25,21 +25,56 @@ blastplus:
25
25
  64bit:
26
26
  macosx: ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.2.29/ncbi-blast-2.2.29+-universal-macosx.tar.gz
27
27
  linux: ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.2.29/ncbi-blast-2.2.29+-x64-linux.tar.gz
28
- bowtie2:
28
+ snap:
29
29
  binaries:
30
- - bowtie2
31
- - bowtie2-align-l
32
- - bowtie2-align-s
33
- - bowtie2-build
34
- - bowtie2-build-l
35
- - bowtie2-build-s
36
- - bowtie2-inspect
37
- - bowtie2-inspect-l
38
- - bowtie2-inspect-s
30
+ - snap
39
31
  version:
40
- number: '2.2.3'
41
- command: 'bowtie2 --version'
32
+ number: '1.0dev.50'
33
+ command: 'snap'
42
34
  url:
43
35
  64bit:
44
- linux: http://downloads.sourceforge.net/project/bowtie-bio/bowtie2/2.2.3/bowtie2-2.2.3-linux-x86_64.zip
45
- macosx: http://downloads.sourceforge.net/project/bowtie-bio/bowtie2/2.2.3/bowtie2-2.2.3-macos-x86_64.zip
36
+ linux: https://github.com/cboursnell/snap/raw/dev/bin/linux/snap.tar.gz
37
+ macosx: https://github.com/HibberdLab/snap/raw/dev/bin/macosx/snap.tar.gz
38
+ samtools:
39
+ binaries:
40
+ - samtools
41
+ - bcftools
42
+ version:
43
+ number: '0.1.19'
44
+ command: 'samtools'
45
+ url:
46
+ 64bit:
47
+ linux: https://github.com/cboursnell/samtools/raw/master/build/linux64.tar.gz
48
+ macosx: https://github.com/cboursnell/samtools/raw/master/build/osx64.tar.gz
49
+ bam-read:
50
+ binaries:
51
+ - bam-read
52
+ version:
53
+ number: '0.3\.1'
54
+ command: 'bam-read'
55
+ url:
56
+ 64bit:
57
+ linux: https://github.com/cboursnell/transrate-bam-read/raw/master/bin/linux/bam-read
58
+ macosx: https://github.com/Blahah/transrate-bam-read/raw/master/bin/macosx/bam-read
59
+ unpack: false
60
+ bam-split:
61
+ binaries:
62
+ - bam-split
63
+ version:
64
+ number: '0.1'
65
+ command: 'bam-split'
66
+ url:
67
+ 64bit:
68
+ linux: https://github.com/Blahah/transrate-bam-read/raw/master/bin/linux/bam-split
69
+ macosx: https://github.com/Blahah/transrate-bam-read/raw/master/bin/macosx/bam-split
70
+ unpack: false
71
+ express:
72
+ binaries:
73
+ - express
74
+ version:
75
+ number: '1.5.1'
76
+ command: 'express --version'
77
+ url:
78
+ 64bit:
79
+ linux: http://bio.math.berkeley.edu/eXpress/downloads/express-1.5.1/express-1.5.1-linux_x86_64.tgz
80
+ macosx: http://bio.math.berkeley.edu/eXpress/downloads/express-1.5.1/express-1.5.1-macosx_x86_64.tgz
@@ -1,223 +1,257 @@
1
1
  #include "ruby.h"
2
2
  #include <stdlib.h>
3
+ #include <math.h>
3
4
 
4
5
  // Defining a space for information and references about the module to be
5
6
  // stored internally
6
7
  VALUE Contig = Qnil;
8
+ VALUE ReadMetrics = Qnil;
7
9
  VALUE Transrate = Qnil;
8
10
 
9
11
  // Prototype for the initialization method - Ruby calls this, not you
10
12
  void Init_transrate();
11
13
 
12
14
  // methods are prefixed by 'method_' here
13
- //VALUE TestInit(VALUE, VALUE, VALUE, VALUE, VALUE);
15
+ // contig
14
16
  VALUE method_composition(VALUE, VALUE);
15
17
  VALUE method_base_count(VALUE,VALUE);
16
18
  VALUE method_dibase_count(VALUE,VALUE);
17
19
  VALUE method_kmer_count(VALUE,VALUE,VALUE);
18
20
  VALUE method_longest_orf(VALUE, VALUE);
21
+ // read_metrics
19
22
 
20
23
  int * base_counts;
21
24
  int * dibase_counts;
22
25
 
23
26
  // The initialization method for this module
24
27
  void Init_transrate() {
25
- Transrate = rb_define_module("Transrate");
26
- // VALUE rb_define_class_under(VALUE outer, const char *name, VALUE super)
27
- Contig = rb_define_class_under(Transrate, "Contig", rb_cObject);
28
- // rb_define_method(Contig, "initialize", TestInit, 2);
29
- rb_define_method(Contig, "composition", method_composition, 1);
30
- rb_define_method(Contig, "base_count", method_base_count, 1);
31
- rb_define_method(Contig, "dibase_count", method_dibase_count, 1);
32
- rb_define_method(Contig, "kmer_count", method_kmer_count, 2);
33
- rb_define_method(Contig, "longest_orf", method_longest_orf, 1);
28
+ Transrate = rb_define_module("Transrate");
29
+ Contig = rb_define_class_under(Transrate, "Contig", rb_cObject);
30
+ ReadMetrics = rb_define_class_under(Transrate, "ReadMetrics", rb_cObject);
31
+ // contig
32
+ rb_define_method(Contig, "composition", method_composition, 1);
33
+ rb_define_method(Contig, "base_count", method_base_count, 1);
34
+ rb_define_method(Contig, "dibase_count", method_dibase_count, 1);
35
+ rb_define_method(Contig, "kmer_count", method_kmer_count, 2);
36
+ rb_define_method(Contig, "longest_orf", method_longest_orf, 1);
37
+ // ReadMetrics
34
38
  }
35
39
 
36
40
  VALUE method_composition(VALUE self, VALUE _seq) {
37
- int i,len, idx;
38
- char * seq;
39
- char base;
40
- char prevbase;
41
- seq = StringValueCStr(_seq);
42
- len = RSTRING_LEN(_seq);
43
- base_counts = malloc(5 * sizeof(int));
44
- dibase_counts = malloc(25 * sizeof(int));
41
+ int i, len, idx;
42
+ char * seq;
43
+ char base;
44
+ char prevbase;
45
+ seq = StringValueCStr(_seq);
46
+ len = RSTRING_LEN(_seq);
47
+ base_counts = malloc(5 * sizeof(int));
48
+ dibase_counts = malloc(25 * sizeof(int));
45
49
 
46
- for (i=0; i < 5; i++) {
47
- base_counts[i]=0;
50
+ for (i=0; i < 5; i++) {
51
+ base_counts[i]=0;
52
+ }
53
+ for (i=0; i < 25; i++) {
54
+ dibase_counts[i]=0;
55
+ }
56
+ for (i=0; i < len; i++) {
57
+ base = seq[i];
58
+ switch (base) {
59
+ case 'A': {
60
+ idx=0;
61
+ break;
62
+ }
63
+ case 'C': {
64
+ idx=1;
65
+ break;
66
+ }
67
+ case 'G': {
68
+ idx=2;
69
+ break;
70
+ }
71
+ case 'T': {
72
+ idx=3;
73
+ break;
74
+ }
75
+ default: {
76
+ idx=4;
77
+ break;
78
+ }
48
79
  }
49
- for (i=0; i < 25; i++) {
50
- dibase_counts[i]=0;
51
- }
52
- for (i=0; i < len; i++) {
53
- base = seq[i];
54
- switch (base) {
55
- case 'A': {
56
- idx=0;
57
- break;
58
- }
59
- case 'C': {
60
- idx=1;
61
- break;
62
- }
63
- case 'G': {
64
- idx=2;
65
- break;
66
- }
67
- case 'T': {
68
- idx=3;
69
- break;
70
- }
71
- default: {
72
- idx=4;
73
- break;
74
- }
75
- }
76
- base_counts[idx]++;
80
+ base_counts[idx]++;
77
81
 
78
- if (i > 0) {
79
- prevbase = seq[i-1];
80
- switch (prevbase) {
81
- case 'A': {
82
- idx=idx;
83
- break;
84
- }
85
- case 'C': {
86
- idx=idx+5;
87
- break;
88
- }
89
- case 'G': {
90
- idx=idx+10;
91
- break;
92
- }
93
- case 'T': {
94
- idx=idx+15;
95
- break;
96
- }
97
- default: {
98
- idx=idx+20;
99
- break;
100
- }
101
- }
102
- dibase_counts[idx]++;
82
+ if (i > 0) {
83
+ prevbase = seq[i-1];
84
+ switch (prevbase) {
85
+ case 'A': {
86
+ idx=idx;
87
+ break;
88
+ }
89
+ case 'C': {
90
+ idx=idx+5;
91
+ break;
92
+ }
93
+ case 'G': {
94
+ idx=idx+10;
95
+ break;
96
+ }
97
+ case 'T': {
98
+ idx=idx+15;
99
+ break;
103
100
  }
101
+ default: {
102
+ idx=idx+20;
103
+ break;
104
+ }
105
+ }
106
+ dibase_counts[idx]++;
104
107
  }
105
- return INT2NUM(0);
108
+ }
109
+ return INT2NUM(0);
106
110
  }
107
111
 
108
112
  VALUE method_dibase_count(VALUE self, VALUE idx) {
109
- return INT2NUM(dibase_counts[NUM2INT(idx)]);
113
+ return INT2NUM(dibase_counts[NUM2INT(idx)]);
110
114
  }
111
115
 
112
116
  VALUE method_base_count(VALUE self, VALUE idx) {
113
- return INT2NUM(base_counts[NUM2INT(idx)]);
117
+ return INT2NUM(base_counts[NUM2INT(idx)]);
114
118
  }
115
119
 
116
120
  VALUE method_kmer_count(VALUE self, VALUE _k, VALUE _s) {
117
- int n, i, start, k, len, h, size = 0;
118
- char * c_str;
119
- char base;
120
- len = RSTRING_LEN(_s);
121
- c_str = StringValueCStr(_s);
122
- k = NUM2INT(_k);
123
- size = 1;
124
- for(h=0;h<k;h++) {
125
- size *= 4;
126
- }
127
- short set[size];
128
- for(start=0;start<size;start++) {
129
- set[start]=0;
130
- }
131
- for(start=0; start<len-k+1; start++) {
132
- i = 0;
133
- h = 0;
134
- n = 0;
135
- for(i = start; i < start+k; i++) {
136
- base = c_str[i];
137
- switch (base) {
138
- case 'A': {
139
- h = h << 2;
140
- h += 0;
141
- break;
142
- }
143
- case 'C': {
144
- h = h << 2;
145
- h += 1;
146
- break;
147
- }
148
- case 'G': {
149
- h = h << 2;
150
- h += 2;
151
- break;
152
- }
153
- case 'T': {
154
- h = h << 2;
155
- h += 3;
156
- break;
157
- }
158
- default: {
159
- n++;
160
- break;
161
- }
162
- }
121
+ int n, i, start, k, len, h, size = 0;
122
+ char * c_str;
123
+ char base;
124
+ len = RSTRING_LEN(_s);
125
+ c_str = StringValueCStr(_s);
126
+ k = NUM2INT(_k);
127
+ size = 1;
128
+ for(h=0;h<k;h++) {
129
+ size *= 4;
130
+ }
131
+ short set[size];
132
+ for(start=0;start<size;start++) {
133
+ set[start]=0;
134
+ }
135
+ for(start=0; start<len-k+1; start++) {
136
+ i = 0;
137
+ h = 0;
138
+ n = 0;
139
+ for(i = start; i < start+k; i++) {
140
+ base = c_str[i];
141
+ switch (base) {
142
+ case 'A': {
143
+ h = h << 2;
144
+ h += 0;
145
+ break;
163
146
  }
164
- if (n==0) {
165
- set[h] += 1;
147
+ case 'C': {
148
+ h = h << 2;
149
+ h += 1;
150
+ break;
166
151
  }
167
- }
168
- i = 0; // count how many in array are set //
169
- for(start = 0; start < size; start++) {
170
- if (set[start]>0) {
171
- i++;
152
+ case 'G': {
153
+ h = h << 2;
154
+ h += 2;
155
+ break;
156
+ }
157
+ case 'T': {
158
+ h = h << 2;
159
+ h += 3;
160
+ break;
172
161
  }
162
+ default: {
163
+ n++;
164
+ break;
165
+ }
166
+ }
167
+ }
168
+ if (n==0) {
169
+ set[h] += 1;
170
+ }
171
+ }
172
+ i = 0; // count how many in array are set //
173
+ for(start = 0; start < size; start++) {
174
+ if (set[start]>0) {
175
+ i++;
173
176
  }
174
- return INT2NUM(i);
177
+ }
178
+ return INT2NUM(i);
175
179
  }
176
180
 
177
181
  // takes in a string and calculates the longest open reading frame
178
182
  // in any of the 6 frames
179
183
  // an open reading frame is defined as the number of bases between
180
- // either the start of the sequence or a stop codon and either the
184
+ // either the start of the sequence or a start codon and either the
181
185
  // end of the sequence or a stop codon
182
- VALUE method_longest_orf(VALUE self, VALUE _s) {
183
- int i,sl,longest=0;
184
- int len[6];
185
- char * c_str;
186
186
 
187
- sl = RSTRING_LEN(_s);
188
- c_str = StringValueCStr(_s);
189
- for (i=0;i<6;i++) {
190
- len[i]=0;
191
- }
192
- for (i=0;i<sl-2;i++) {
193
- if (c_str[i]=='T' &&
194
- ((c_str[i+1]=='A' && c_str[i+2]=='G') ||
195
- (c_str[i+1]=='A' && c_str[i+2]=='A') ||
196
- (c_str[i+1]=='G' && c_str[i+2]=='A'))) {
197
- if (len[i%3] > longest) {
198
- longest = len[i%3];
199
- }
200
- len[i%3]=0;
201
- } else {
202
- len[i%3]++;
187
+ VALUE method_longest_orf(VALUE self, VALUE _str) {
188
+ int i,sl,longest=0;
189
+ int len[3];
190
+ char * str;
191
+ sl = RSTRING_LEN(_str);
192
+ str = StringValueCStr(_str);
193
+ for (i=0;i<3;i++) {
194
+ len[i]=0;
195
+ }
196
+ for(i=0;i<sl-2;i++) {
197
+ if (str[i]=='A' && str[i+1]=='T' && str[i+2]=='G') { //Methionine
198
+ if (len[i%3]>=0) {
199
+ len[i%3]++;
200
+ } else {
201
+ len[i%3]=1;
202
+ }
203
+ } else {
204
+ if (str[i]=='T' &&
205
+ ((str[i+1]=='A' && str[i+2]=='G') || //amber
206
+ (str[i+1]=='A' && str[i+2]=='A') || //ochre stops
207
+ (str[i+1]=='G' && str[i+2]=='A'))) { //umber
208
+ if (len[i%3]>longest) {
209
+ longest = len[i%3];
203
210
  }
204
- if (c_str[i+2]=='A' &&
205
- ((c_str[i]=='C' && c_str[i+1]=='T') ||
206
- (c_str[i]=='T' && c_str[i+1]=='T') ||
207
- (c_str[i]=='T' && c_str[i+1]=='C'))) {
208
- if (len[3+i%3] > longest) {
209
- longest = len[3+i%3];
210
- }
211
- len[3+i%3]=0;
212
- } else {
213
- len[3+i%3]++;
211
+ len[i%3]=-1;
212
+ } else { // any other codon
213
+ if (len[i%3]>=0) {
214
+ len[i%3]++;
214
215
  }
216
+ }
215
217
  }
218
+ }
219
+ for(i=0;i<3;i++) {
216
220
  if (len[i%3] > longest) {
217
- longest = len[i%3];
221
+ longest = len[i%3];
222
+ }
223
+ }
224
+ for (i=0;i<3;i++) {
225
+ len[i]=0;
226
+ }
227
+ for(i=sl-1;i>=2;i--) {
228
+ if (str[i]=='T' && str[i-1]=='A' && str[i-2]=='C') { //Methionine
229
+ if (len[i%3]>=0) {
230
+ len[i%3]++;
231
+ } else {
232
+ len[i%3]=1;
233
+ }
234
+ } else {
235
+ if (str[i]=='A' &&
236
+ ((str[i-1]=='T' && str[i-2]=='C') || //amber
237
+ (str[i-1]=='T' && str[i-2]=='T') || //ochre stops
238
+ (str[i-1]=='C' && str[i-2]=='T'))) { //umber
239
+ if (len[i%3]>longest) {
240
+ longest = len[i%3];
241
+ }
242
+ len[i%3]=-1;
243
+ } else { // any other codon
244
+ if (len[i%3]>=0) {
245
+ len[i%3]++;
246
+ }
247
+ }
218
248
  }
219
- if (len[3+i%3] > longest) {
220
- longest = len[3+i%3];
249
+ }
250
+ for(i=0;i<3;i++) {
251
+ if (len[i%3] > longest) {
252
+ longest = len[i%3];
221
253
  }
222
- return INT2NUM(longest);
223
- }
254
+ }
255
+ return INT2NUM(longest);
256
+ }
257
+