scbi_cominer 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt ADDED
@@ -0,0 +1,8 @@
1
+ === 0.0.2 2012-07-25
2
+
3
+ Added SNPs and single_points
4
+
5
+ === 0.0.1 2012-07-23
6
+
7
+ * 1 major enhancement:
8
+ * Initial release
data/Manifest.txt ADDED
@@ -0,0 +1,16 @@
1
+ History.txt
2
+ lib/scbi_cominer/classes/base_function.rb
3
+ lib/scbi_cominer/classes/entropy_function.rb
4
+ lib/scbi_cominer/classes/frequency_table.rb
5
+ lib/scbi_cominer/classes/low_pass_filter.rb
6
+ lib/scbi_cominer/cominer.rb
7
+ lib/scbi_cominer.rb
8
+ Manifest.txt
9
+ PostInstall.txt
10
+ Rakefile
11
+ README.rdoc
12
+ script/console
13
+ script/destroy
14
+ script/generate
15
+ test/test_helper.rb
16
+ test/test_scbi_cominer.rb
data/PostInstall.txt ADDED
@@ -0,0 +1,7 @@
1
+
2
+ For more information on scbi_cominer, see http://scbi_cominer.rubyforge.org
3
+
4
+ NOTE: Change this information in PostInstall.txt
5
+ You can also delete it if you don't want it.
6
+
7
+
data/README.rdoc ADDED
@@ -0,0 +1,92 @@
1
+ = scbi_cominer
2
+
3
+ * http://www.scbi.uma.es/downloads
4
+
5
+ == DESCRIPTION:
6
+
7
+ scbi_cominer is a ruby gem to calculate some interesting regions and statistics from contigs
8
+
9
+ == FEATURES/PROBLEMS:
10
+
11
+ * Find low-coverage regions in contigs
12
+ * Find putative snps in contigs
13
+ * Calculates a frequency and position table for each nucleotide
14
+
15
+ == SYNOPSIS:
16
+
17
+
18
+ require 'scbi_ace'
19
+ require 'scbi_cominer'
20
+
21
+ filename=File.join(File.dirname(__FILE__),'test.ace')
22
+
23
+ # open ace file with parser
24
+ ace=AceParser.new(filename)
25
+
26
+ # iterate over all contigs in ace file
27
+ ace.each_contig do |contig|
28
+
29
+ # puts contig name
30
+ puts contig.name
31
+
32
+ # calculate cominer stats with this contig
33
+ cominer_stats=Cominer.new(contig)
34
+
35
+
36
+ # get all reads with orientation and align clips
37
+ contig.reads.each do |name,read|
38
+ puts ">#{read.name} #{read.orientation} #{read.align_clip_start} #{read.align_clip_end}"
39
+ puts read.fasta
40
+ end
41
+
42
+ # get position_table
43
+ puts cominer_stats.position_table.to_json
44
+
45
+ # get regions
46
+ puts cominer_stats.regions.to_json
47
+
48
+ # get snps
49
+ puts cominer_stats.snps.to_json
50
+
51
+ puts cominer_stats.single_points.to_json
52
+
53
+ end
54
+
55
+ ace.close
56
+
57
+
58
+ == REQUIREMENTS:
59
+
60
+ * Needs fftw3 library installed on your operating system.
61
+ * Uses gems scbi_ace, narray, ruby-fftw3, gnuplot. They are installed automatically.
62
+
63
+ == INSTALL:
64
+
65
+ NOTE: You may need to install fftw3 library for your operating system prior to installing scbi_cominer.
66
+
67
+ * gem install scbi_cominer
68
+
69
+ == LICENSE:
70
+
71
+ (The MIT License)
72
+
73
+ Copyright (c) 2010 Dario Guerrero
74
+
75
+ Permission is hereby granted, free of charge, to any person obtaining
76
+ a copy of this software and associated documentation files (the
77
+ 'Software'), to deal in the Software without restriction, including
78
+ without limitation the rights to use, copy, modify, merge, publish,
79
+ distribute, sublicense, and/or sell copies of the Software, and to
80
+ permit persons to whom the Software is furnished to do so, subject to
81
+ the following conditions:
82
+
83
+ The above copyright notice and this permission notice shall be
84
+ included in all copies or substantial portions of the Software.
85
+
86
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
87
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
88
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
89
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
90
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
91
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
92
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1,31 @@
1
require 'rubygems'
gem 'hoe', '>= 2.1.0'
require 'hoe'
require 'fileutils'
require './lib/scbi_cominer'

Hoe.plugin :newgem
# Hoe.plugin :website
# Hoe.plugin :cucumberfeatures

# Generate all the Rake tasks
# Run 'rake -T' to see list of generated tasks (from gem root directory)
$hoe = Hoe.spec 'scbi_cominer' do
  self.developer 'Dario Guerrero', 'dariogf@gmail.com'

  # Ship the *contents* of PostInstall.txt as the post-install message.
  # Assigning the bare file name made the built gem print the literal text
  # "PostInstall.txt" after installation.
  # TODO remove if post-install message not required
  self.post_install_message = File.read(File.expand_path('PostInstall.txt', File.dirname(__FILE__)))

  self.rubyforge_name = self.name # TODO this is default value

  # Runtime dependencies, as [gem name, version requirement] pairs.
  self.extra_deps = [
    ['scbi_ace', '>= 0.0.5'],
    ['narray', '>= 0.5.9.8'],
    ['ruby-fftw3', '>= 0.4'],
    ['gnuplot', '>= 2.3.4']
  ]
end

require 'newgem/tasks'
Dir['tasks/**/*.rake'].each { |t| load t }

# TODO - want other tests/tasks run by default? Add them to the list
# remove_task :default
# task :default => [:spec, :features]
@@ -0,0 +1,521 @@
1
+ require 'json'
2
+ require 'narray'
3
+
4
+ require "numru/fftw3"
5
+ include NumRu
6
+ require "gnuplot"
7
+
8
+ require 'low_pass_filter'
9
+
10
+
11
# Base class for per-position scoring functions over a frequency table.
#
# A subclass supplies #evaluate_pos. On construction the function is evaluated
# at every position, the resulting signal is passed through a LowPassFilter to
# obtain two limits, and the positions whose original value exceeds the upper
# limit are turned into:
#
# * +regions+       - groups of nearby outstanding points,
# * +single_points+ - outstanding points that do not overlap any region,
# * +snps+          - one-position points accepted by the table's +valid_snp+.
#
# Regions and points are hashes with the string keys 'start', 'end' (both
# inclusive, 0-based) and 'score' (mean value over the span).
class BaseFunction

  # Two points are put in the same group when the next one starts less than
  # this many positions after the previous one ends.
  MAX_SEPARATION = 15

  attr_accessor :regions, :single_points, :freq_table, :values, :fft, :snps

  # freq_table:: object answering +max_length+ and +valid_snp(pos)+ (plus
  #              whatever the subclass' #evaluate_pos needs).
  def initialize(freq_table)
    @freq_table = freq_table

    @values = []
    @fft = nil
    @lim1 = 0
    @lim2 = 0

    calculate

    over_lim2 = lambda { |v| v > @lim2 }

    candidates = filter_regions(@fft.original_data, over_lim2, true)

    @regions = group_regions(candidates)
    @single_points = purge_regions(candidates, @regions)

    # SNPs are taken from all candidates, so a SNP lying inside a region is
    # reported both as a SNP and as part of that region.
    @snps = purge_snps(candidates)
  end

  # Evaluates the function at every position of the frequency table, filters
  # the resulting signal and stores the lower/upper limits.
  def calculate
    @values = Array.new(@freq_table.max_length) { |i| evaluate_pos(i) }

    @fft = LowPassFilter.new(@values)

    @lim1, @lim2 = @fft.limits
  end

  # Value of the function at position +i+. Must be overridden by subclasses.
  #
  # Raises RuntimeError when called on the base class.
  def evaluate_pos(i)
    raise "You must create a child class to override this method"
  end

  # Returns the elements of +regions1+ that do not overlap (inclusive bounds)
  # any element of +regions2+.
  def purge_regions(regions1, regions2)
    regions1.reject do |r1|
      regions2.any? do |r2|
        (r1['start'] <= r2['end']) && (r2['start'] <= r1['end'])
      end
    end
  end

  # Returns the one-position elements of +regions+ whose position is accepted
  # by the frequency table's +valid_snp+.
  def purge_snps(regions)
    regions.select do |r|
      r['start'] == r['end'] && @freq_table.valid_snp(r['start'])
    end
  end

  # Decides whether a freshly closed +region+ must be kept.
  #
  # * With +only_single_points+, or without +mandatory_data+, any region whose
  #   end is not before its start is valid.
  # * Otherwise the region must span more than one position and the same span
  #   of +mandatory_data+ must contain more than one run satisfying +comp+.
  def valid_region(region, comp, only_single_points, mandatory_data)
    region_start = region['start']
    width = region['end'] - region_start

    return width >= 0 if only_single_points || mandatory_data.nil?

    # the region must have more than one base
    return false unless width > 0

    # negate the following check to ignore wide regions without SNPs inside:
    # look for inner runs in this range of the mandatory data
    inner = filter_regions(mandatory_data[region_start, width + 1], comp, nil)

    inner.count > 1
  end

  # Merges consecutive points of +data+ (ordered by position) into groups
  # while each point starts less than MAX_SEPARATION positions after the end
  # of the previous one.
  #
  # Returns the groups spanning more than one position; the score of a group
  # is the mean of the scores of its points. The last group is stored too, and
  # the first group starts at the first point (it used to start at 0).
  def group_regions(data)
    regions = []
    group = nil

    data.each do |r|
      if group && r['start'] < group[:end] + MAX_SEPARATION
        # extend the current group
        group[:score] += r['score']
        group[:end] = r['end']
        group[:size] += 1
      else
        # close the previous group (if any) and start a new one
        store_group(regions, group)
        group = { :start => r['start'], :end => r['end'], :score => r['score'], :size => 1 }
      end
    end

    # the last open group must be closed as well
    store_group(regions, group)

    regions
  end

  # Scans +data+ and returns the maximal runs of consecutive values for which
  # +comp.call(value)+ is truthy, as region hashes with the mean value of the
  # run as score. Each run is kept only if #valid_region accepts it.
  #
  # data::               enumerable answering +each+ (Array or NArray).
  # comp::               callable receiving one value.
  # only_single_points:: passed through to #valid_region.
  # mandatory_data::     passed through to #valid_region.
  def filter_regions(data, comp, only_single_points = false, mandatory_data = nil)
    regions = []
    current = nil
    pos = 0

    data.each do |v|
      if comp.call(v)
        if current
          current['score'] += v
        else
          current = { 'start' => pos, 'end' => 0, 'score' => v }
        end
      elsif current
        # the run actually finished at the previous position
        close_region(regions, current, pos - 1, comp, only_single_points, mandatory_data)
        current = nil
      end

      pos += 1
    end

    # A run reaching the end of the data finishes at the last element
    # (pos - 1); using pos pointed one past the data and lowered the score.
    close_region(regions, current, pos - 1, comp, only_single_points, mandatory_data) if current

    regions
  end

  # Turns the accumulated score of +r+ into a mean over its width and appends
  # it to +regions+. Regions of non-positive width are ignored.
  def add_region(regions, r)
    width = (r['end'] - r['start']) + 1

    return unless width > 0

    r['score'] = r['score'].to_f / width.to_f
    regions.push r
  end

  # Draws, through gnuplot, the regions and single points (lower plot) and the
  # original data, filtered data and both limits (upper plot). When
  # +file_name+ is given the PNG terminal and that output file are selected.
  def graph(file_name = nil)
    last_index = @fft.original_data.length - 1

    Gnuplot.open do |gp|
      Gnuplot::Plot.new(gp) do |plot|
        unless file_name.nil?
          plot.terminal "png size #{@fft.filtered_data.length},600"
          plot.output "#{file_name}"
        end

        plot.set "multiplot layout 2,1 upwards"

        plot.xrange("[0:#{last_index}]")

        plot.set "tmargin 0.0"

        plot.title ""
        plot.ylabel "Region"
        plot.xlabel "Nucleotide"

        unless @regions.empty?
          x, y = regions_to_graph_data(@regions, last_index)
          add_series(plot, x, y, "lines lt rgb \"red\" ti \"Regions #{x.length}\"")
        end

        unless @single_points.empty?
          x, y = regions_to_graph_data(@single_points, last_index)
          add_series(plot, x, y, "lines lt rgb \"blue\" ti \"Points #{x.length}\"")
        end
      end

      Gnuplot::Plot.new(gp) do |plot|
        plot.title "Filter Base: #{@fft.filter_base} , skip: #{@fft.skip}"

        plot.set "bmargin 0.0"
        plot.set "tmargin 2"

        plot.xrange("[0:#{last_index}]")

        plot.ylabel "f"
        plot.xlabel ''
        plot.noxtics

        # Range#collect without a block returns an Enumerator, so the x axis
        # is materialised with to_a.
        add_series(plot, (0..last_index).to_a, @fft.original_data.to_a,
                   "lines lt rgb \"green\" ti \"Original data\"")

        filtered_last = @fft.filtered_data.length - 1
        add_series(plot, (0..filtered_last).to_a, @fft.filtered_data.to_a,
                   "lines lt rgb \"blue\" ti \"Filtered data\"")

        # both limits are horizontal lines across the whole plot
        add_series(plot, [0, filtered_last], [@lim1, @lim1],
                   "lines lt rgb \"red\" ti \"Lim1 [#{@lim1}]\"")
        add_series(plot, [0, filtered_last], [@lim2, @lim2],
                   "lines lt rgb \"red\" ti \"Lim2 [ #{@lim2}]\"")
      end
    end
  end

  # Converts +regions+ into x/y arrays drawing one rectangular pulse per
  # region (0 just before the start, the score over the span, 0 just after the
  # end). Returns [[0], [0]] when there are no regions. +total_length+ is
  # accepted for compatibility and not used.
  def regions_to_graph_data(regions, total_length)
    x = []
    y = []

    regions.each do |r|
      x.push(r['start'] - 1, r['start'], r['end'], r['end'] + 1)
      y.push(0, r['score'], r['score'], 0)
    end

    x.push 0 if x.empty?
    y.push 0 if y.empty?

    [x, y]
  end

  # Single-plot variant of #graph: original data, filtered data and both
  # limits, without legends.
  def graph2(file_name = nil)
    Gnuplot.open do |gp|
      Gnuplot::Plot.new(gp) do |plot|
        unless file_name.nil?
          plot.terminal "png size #{@fft.filtered_data.length},600"
          plot.output "#{file_name}"
        end

        plot.title "Filter Base: #{@fft.filter_base} , skip: #{@fft.skip}"
        plot.ylabel "f"
        plot.xlabel "x"

        original_x = (0..@fft.original_data.length - 1).to_a
        add_series(plot, original_x, @fft.original_data.to_a, "lines lt rgb \"green\"", false)

        filtered_length = @fft.filtered_data.length
        filtered_x = (0..filtered_length - 1).to_a
        add_series(plot, filtered_x, @fft.filtered_data.to_a, "lines lt rgb \"blue\"", false)

        # one y value per x value (the limit lines used to be one item longer)
        add_series(plot, filtered_x, Array.new(filtered_length, @lim1), "lines lt rgb \"red\"", false)
        add_series(plot, filtered_x, Array.new(filtered_length, @lim2), "lines lt rgb \"red\"", false)
      end
    end
  end

  private

  # Sets the end of +region+, and appends it to +regions+ if it is valid.
  def close_region(regions, region, region_end, comp, only_single_points, mandatory_data)
    region['end'] = region_end

    add_region(regions, region) if valid_region(region, comp, only_single_points, mandatory_data)
  end

  # Appends +group+ to +regions+ as a region hash when it spans more than one
  # position. A nil group is ignored.
  def store_group(regions, group)
    return if group.nil? || group[:start] >= group[:end]

    regions.push('start' => group[:start],
                 'end' => group[:end],
                 'score' => group[:score].to_f / group[:size].to_f)
  end

  # Adds an x/y data set drawn with the gnuplot +style+ string to +plot+;
  # the legend entry is suppressed unless +titled+.
  def add_series(plot, x, y, style, titled = true)
    dataset = Gnuplot::DataSet.new([x, y]) do |ds|
      ds.with = style
      ds.notitle unless titled
    end

    plot.data << dataset
  end

end
@@ -0,0 +1,33 @@
1
+ require 'base_function'
2
+
3
# Scoring function that evaluates, at each position, the base-2 entropy of the
# character frequencies stored in the frequency table ('-' entries excluded).
class EntropyFunction < BaseFunction

  # Base-2 logarithm of +v+, computed as a ratio of base-10 logarithms.
  def log2(v)
    Math.log10(v) / Math.log10(2)
  end

  # Entropy at position +i+: the sum of -p * log2(p) over every character
  # other than '-', where p is the character's count divided by the table's
  # +nseq(i)+. Returns 0 when +nseq(i)+ is nil or not positive.
  def evaluate_pos(i)
    nseq = @freq_table.nseq(i) || 0

    return 0 unless nseq > 0

    total = nseq.to_f

    @freq_table.frequency_table.inject(0) do |entropy, (char, counts)|
      next entropy if char == '-'

      share = (counts[i] || 0).to_f / total

      # characters that are absent at this position contribute nothing
      share > 0 ? entropy + (-1 * share * log2(share)) : entropy
    end
  end

end
@@ -0,0 +1,258 @@
1
# Per-position character statistics for a set of sequences.
#
# * +frequency_table+ maps each character to an array indexed by position
#   holding how many sequences have that character there (nil when none).
# * +position_table+ maps each character to an array indexed by position
#   holding the 1-based indexes (into +sequences+) of the sequences that have
#   that character there.
# * +sequences+ is the list of added sequences as {:name, :fasta} hashes.
class FrequencyTable

  # Character left out of every count (presumably the alignment gap symbol).
  GAP = '-'

  # Characters left out of the "valid" counts (presumably gap and padding
  # symbols - confirm against the ACE format used by scbi_ace).
  IGNORED = [GAP, '*'].freeze

  attr_accessor :frequency_table, :position_table, :sequences

  def initialize
    @frequency_table = {}
    @position_table = {}
    @sequences = []
  end

  # Character whose count array is the longest (the first one on ties), or
  # nil when the table is empty.
  def max_length_key
    longest_entry.first
  end

  # Counts the effective sequences at position +pos+: the sum of the counts
  # of every character except '-'.
  def nseq(pos)
    column_total(pos, [GAP])
  end

  # Counts the valid sequences at position +pos+: the sum of the counts of
  # every character except '-' and '*'.
  def nseq_valid(pos)
    column_total(pos, IGNORED)
  end

  # Most frequent character at position +pos+, ignoring '-' and '*'.
  #
  # Returns [character, count]. On ties the character added last to the table
  # wins; [nil, 0] is returned when no character is present at +pos+.
  def consensus_freq(pos)
    best_char = nil
    best_count = 0

    @frequency_table.each do |char, counts|
      count = counts[pos]
      next if count.nil? || IGNORED.include?(char)

      if count >= best_count
        best_count = count
        best_char = char
      end
    end

    [best_char, best_count]
  end

  # Length of the longest count array, i.e. the number of positions covered
  # by the table (0 when it is empty).
  def max_length
    longest_entry.last
  end

  # Adds an object answering +name+ and +fasta+.
  def add_read(read)
    add_fasta(read.name, read.fasta)
  end

  # Adds an object answering +seq_name+ and +seq_fasta+.
  def add_sequence(seq)
    add_fasta(seq.seq_name, seq.seq_fasta)
  end

  # Registers the sequence +fasta+ under +name+ and updates both tables with
  # each of its characters.
  def add_fasta(name, fasta)
    @sequences.push(:name => name, :fasta => fasta)

    # sequences are referenced by their 1-based index
    index = @sequences.length

    fasta.each_char.with_index do |c, i|
      counts = (@frequency_table[c] ||= [])
      counts[i] = (counts[i] || 0) + 1

      owners = (@position_table[c] ||= [])
      (owners[i] ||= []).push(index)
    end
  end

  # Hash character => count with the characters (other than '-' and '*')
  # present at position +pos+.
  def extract_col(pos)
    @frequency_table.each_with_object({}) do |(char, counts), column|
      count = counts[pos]
      next if count.nil? || IGNORED.include?(char)

      column[char] = count if count > 0
    end
  end

  # A valid SNP is a change in at least two valid sequences: the consensus
  # character must be missing from two or more valid sequences and some other
  # character must appear at least twice.
  def valid_snp(pos)
    total_seqs = nseq_valid(pos)
    consensus_base, consensus_frequency = consensus_freq(pos)

    return false unless consensus_frequency <= total_seqs - 2

    extract_col(pos).any? { |char, count| (char != consensus_base) && (count >= 2) }
  end

  # Text dump with the first 31 positions of the frequency table and the
  # first 11 positions of the position table (sequence names).
  def inspect
    res = ''

    @frequency_table.each do |c, v|
      res += c + ':' + counts_text(v, 0, 30) + "...more.\n"
    end

    @position_table.each do |c, v|
      cells = v[0..10].map do |owners|
        owners ? '[' + owners.map { |i| @sequences[i - 1][:name] }.join(",") + ']' : "[]"
      end

      res += c + ':' + cells.join(" - ") + "...more.\n"
    end

    res
  end

  # Text dump of the frequency table between positions +pos_start+ and
  # +pos_end+ (inclusive). Characters whose count array does not reach
  # +pos_start+ produce an empty column list instead of raising.
  def inspect_pos(pos_start, pos_end)
    res = ''

    @frequency_table.each do |c, v|
      res += c + ':' + counts_text(v, pos_start, pos_end) + "...more.\n"
    end

    res
  end

  private

  # [character, length] of the longest count array; [nil, 0] for an empty
  # table. Only a strictly longer array replaces the current best.
  def longest_entry
    @frequency_table.inject([nil, 0]) do |(best_char, best_length), (char, counts)|
      counts.length > best_length ? [char, counts.length] : [best_char, best_length]
    end
  end

  # Sum of the counts at +pos+ of every character not listed in +ignored+.
  def column_total(pos, ignored)
    @frequency_table.inject(0) do |total, (char, counts)|
      count = counts[pos]
      (count.nil? || ignored.include?(char)) ? total : total + count
    end
  end

  # Counts between +first+ and +last+ joined as text, printing 0 for missing
  # entries. Array#[] returns nil when +first+ is past the end of the array,
  # hence the fallback to an empty list.
  def counts_text(counts, first, last)
    (counts[first..last] || []).map { |n| n || 0 }.join("")
  end

end
@@ -0,0 +1,84 @@
1
+ require "narray"
2
+ require "numru/fftw3"
3
+ include NumRu
4
+ require "gnuplot"
5
+
6
# Dispersion helpers added to NArray.
class NArray

  # Mean absolute deviation around the mean.
  #
  # Returns [deviation, mean].
  def mad_mean
    center = mean

    [(self - center).abs.mean, center]
  end

  # Median absolute deviation around the median.
  #
  # Returns [deviation, median].
  def mad_median
    center = median

    [(self - center).abs.median, center]
  end

end
24
+
25
# Smooths a numeric series by scaling its Fourier coefficients.
#
# * +original_data+ - the input converted to an NArray.
# * +filtered_data+ - real part of the transform of the scaled coefficients.
# * +skip+          - number of leading coefficients left untouched
#                     (a quarter of the coefficients unless set beforehand).
# * +filter_base+   - exponent applied to the scaling ramp (8).
class LowPassFilter

  attr_accessor :original_data, :filtered_data, :skip, :filter_base

  # Builds the filter for +data+ and runs it immediately.
  def initialize(data)
    @original_data = NArray.to_na(data)
    @filtered_data = nil
    @skip = nil
    @filter_base = 8

    run
  end

  # Computes +filtered_data+ from +original_data+.
  def run
    # coefficients normalised by the number of samples
    # (direction -1; presumably the forward transform - see ruby-fftw3)
    spectrum = FFTW3.fft(@original_data, -1, 0) / @original_data.length

    @skip = spectrum.length / 4 if @skip.nil?

    step = 2.0 / (spectrum.length - @skip)
    ramp = 1

    # the first @skip coefficients are kept intact; the filter reduces the
    # high-frequency components among the remaining ones
    (@skip...spectrum.length).each do |i|
      spectrum[i] = spectrum[i] * (ramp ** @filter_base)

      # index advances by 1 (it was 2 in the Perl version, which does not use
      # complex numbers)
      ramp -= step
    end

    @filtered_data = FFTW3.fft(spectrum, 1, 0).real

    # both ends are pinned to the original values
    last_filtered = @filtered_data.length - 1
    last_original = @original_data.length - 1

    @filtered_data[0] = @original_data[0]
    @filtered_data[last_filtered] = @original_data[last_original]
  end

  # Lower and upper limits of the filtered data: median -/+ 1.4826 * MAD,
  # sorted ascending. When the median absolute deviation is 0 the mean
  # absolute deviation is used instead (still centred on the median).
  def limits
    mad, median = @filtered_data.mad_median

    mad, _mean = @filtered_data.mad_mean if mad == 0

    spread = 1.4826 * mad

    [median - spread, median + spread].sort
  end

end
@@ -0,0 +1,36 @@
1
+
2
+ require 'frequency_table.rb'
3
+ require 'entropy_function'
4
+
5
+
6
# Computes the cominer statistics of one contig: a frequency table is built
# from the contig reads and analysed with an EntropyFunction.
#
# * +regions+, +snps+, +single_points+ - results taken from the function.
# * +position_table+ - the frequency table's position table, without the
#   '-' entry.
# * +freq_table+     - the FrequencyTable itself.
class Cominer

  attr_accessor :regions, :snps, :position_table, :freq_table, :single_points

  # contig:: object whose +reads+ yields (name, read) pairs; every read is
  #          added to the frequency table.
  def initialize(contig)
    @freq_table = FrequencyTable.new

    contig.reads.each { |_name, read| @freq_table.add_read(read) }

    entropy = EntropyFunction.new(@freq_table)

    @regions, @snps, @single_points = entropy.regions, entropy.snps, entropy.single_points

    # the '-' entry is removed from the table itself, not from a copy
    @position_table = @freq_table.position_table.tap { |table| table.delete('-') }
  end

end
@@ -0,0 +1,12 @@
1
# Put this directory first in the load path, unless it is already listed
# (either as given or in expanded form).
lib_dir = File.dirname(__FILE__)

unless $LOAD_PATH.include?(lib_dir) || $LOAD_PATH.include?(File.expand_path(lib_dir))
  $LOAD_PATH.unshift(lib_dir)
end

# Append the directory named after this file and its 'classes' subdirectory,
# so the gem sources can be required by bare file name.
gem_dir = File.join(lib_dir, File.basename(__FILE__, File.extname(__FILE__)))

$LOAD_PATH.push(gem_dir)
$LOAD_PATH.push(File.join(gem_dir, 'classes'))

# puts $LOAD_PATH
require 'cominer.rb'

module ScbiCominer
  VERSION = '0.0.2'
end
data/script/console ADDED
@@ -0,0 +1,10 @@
1
#!/usr/bin/env ruby
# File: script/console
#
# Starts irb with completion and the scbi_cominer library preloaded.

# irb.bat is used when RUBY_PLATFORM contains "mswin" or "mingw".
# The group is non-capturing: "(?:...)"; the former "(:?...)" was a typo that
# matched an optional colon instead.
irb = RUBY_PLATFORM =~ /(?:mswin|mingw)/ ? 'irb.bat' : 'irb'

libs = " -r irb/completion"
# Perhaps use a console_lib to store any extra methods I may want available in the console
# libs << " -r #{File.dirname(__FILE__) + '/../lib/console_lib/console_logger.rb'}"
libs << " -r #{File.dirname(__FILE__) + '/../lib/scbi_cominer.rb'}"
puts "Loading scbi_cominer gem"
exec "#{irb} #{libs} --simple-prompt"
data/script/destroy ADDED
@@ -0,0 +1,14 @@
1
#!/usr/bin/env ruby
# Runs RubiGen's destroy script with the command-line arguments.

# Parent directory of the directory holding this script.
APP_ROOT = File.expand_path('..', File.dirname(__FILE__))

# rubigen is first required directly; rubygems is loaded only when that fails
begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/destroy'

# a leading help flag is dropped from the arguments
help_flags = ['--help', '-h']
ARGV.shift if help_flags.include?(ARGV[0])

RubiGen::Base.use_component_sources! [:rubygems, :newgem, :newgem_theme, :test_unit]
RubiGen::Scripts::Destroy.new.run(ARGV)
data/script/generate ADDED
@@ -0,0 +1,14 @@
1
#!/usr/bin/env ruby
# Runs RubiGen's generate script with the command-line arguments.

# Parent directory of the directory holding this script.
APP_ROOT = File.expand_path('..', File.dirname(__FILE__))

# rubigen is first required directly; rubygems is loaded only when that fails
begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/generate'

# a leading help flag is dropped from the arguments
help_flags = ['--help', '-h']
ARGV.shift if help_flags.include?(ARGV[0])

RubiGen::Base.use_component_sources! [:rubygems, :newgem, :newgem_theme, :test_unit]
RubiGen::Scripts::Generate.new.run(ARGV)
@@ -0,0 +1,3 @@
1
+ require 'stringio'
2
+ require 'test/unit'
3
+ require File.dirname(__FILE__) + '/../lib/scbi_cominer'
@@ -0,0 +1,48 @@
1
+ require File.dirname(__FILE__) + '/test_helper.rb'
2
+
3
+ require 'scbi_ace'
4
+
5
# Smoke test: runs Cominer over every contig of test/test.ace, prints the
# results and checks the type of each result.
class TestScbiCominer < Test::Unit::TestCase

  def test_truth
    filename = File.join(File.dirname(__FILE__), 'test.ace')

    # open ace file with parser
    ace = AceParser.new(filename)

    begin
      # iterate over all contigs in ace file
      ace.each_contig do |contig|

        # puts contig name
        puts contig.name

        # calculate cominer stats with this contig
        cominer_stats = Cominer.new(contig)

        # get all reads with orientation and align clips
        contig.reads.each do |name, read|
          puts ">#{read.name} #{read.orientation} #{read.align_clip_start} #{read.align_clip_end}"
          puts read.fasta
        end

        # get position_table
        puts cominer_stats.position_table.to_json

        # get regions
        puts cominer_stats.regions.to_json

        # get snps
        puts cominer_stats.snps.to_json

        # get single points
        puts cominer_stats.single_points.to_json

        # sanity checks, so the test fails on malformed results instead of
        # only printing them
        assert_kind_of Hash, cominer_stats.position_table
        assert_kind_of Array, cominer_stats.regions
        assert_kind_of Array, cominer_stats.snps
        assert_kind_of Array, cominer_stats.single_points
      end
    ensure
      # close the parser even when the analysis or an assertion fails
      ace.close
    end
  end

end
metadata ADDED
@@ -0,0 +1,128 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: scbi_cominer
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.0.2
6
+ platform: ruby
7
+ authors:
8
+ - Dario Guerrero
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2012-07-25 00:00:00 Z
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: scbi_ace
17
+ prerelease: false
18
+ requirement: &id001 !ruby/object:Gem::Requirement
19
+ none: false
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.5
24
+ type: :runtime
25
+ version_requirements: *id001
26
+ - !ruby/object:Gem::Dependency
27
+ name: narray
28
+ prerelease: false
29
+ requirement: &id002 !ruby/object:Gem::Requirement
30
+ none: false
31
+ requirements:
32
+ - - ">="
33
+ - !ruby/object:Gem::Version
34
+ version: 0.5.9.8
35
+ type: :runtime
36
+ version_requirements: *id002
37
+ - !ruby/object:Gem::Dependency
38
+ name: ruby-fftw3
39
+ prerelease: false
40
+ requirement: &id003 !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ version: "0.4"
46
+ type: :runtime
47
+ version_requirements: *id003
48
+ - !ruby/object:Gem::Dependency
49
+ name: gnuplot
50
+ prerelease: false
51
+ requirement: &id004 !ruby/object:Gem::Requirement
52
+ none: false
53
+ requirements:
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ version: 2.3.4
57
+ type: :runtime
58
+ version_requirements: *id004
59
+ - !ruby/object:Gem::Dependency
60
+ name: hoe
61
+ prerelease: false
62
+ requirement: &id005 !ruby/object:Gem::Requirement
63
+ none: false
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ version: 2.8.0
68
+ type: :development
69
+ version_requirements: *id005
70
+ description: scbi_cominer is a ruby gem to calculate some interesting regions and statistics from contigs
71
+ email:
72
+ - dariogf@gmail.com
73
+ executables: []
74
+
75
+ extensions: []
76
+
77
+ extra_rdoc_files:
78
+ - History.txt
79
+ - Manifest.txt
80
+ - PostInstall.txt
81
+ files:
82
+ - History.txt
83
+ - lib/scbi_cominer/classes/base_function.rb
84
+ - lib/scbi_cominer/classes/entropy_function.rb
85
+ - lib/scbi_cominer/classes/frequency_table.rb
86
+ - lib/scbi_cominer/classes/low_pass_filter.rb
87
+ - lib/scbi_cominer/cominer.rb
88
+ - lib/scbi_cominer.rb
89
+ - Manifest.txt
90
+ - PostInstall.txt
91
+ - Rakefile
92
+ - README.rdoc
93
+ - script/console
94
+ - script/destroy
95
+ - script/generate
96
+ - test/test_helper.rb
97
+ - test/test_scbi_cominer.rb
98
+ homepage: http://www.scbi.uma.es/downloads
99
+ licenses: []
100
+
101
+ post_install_message: PostInstall.txt
102
+ rdoc_options:
103
+ - --main
104
+ - README.rdoc
105
+ require_paths:
106
+ - lib
107
+ required_ruby_version: !ruby/object:Gem::Requirement
108
+ none: false
109
+ requirements:
110
+ - - ">="
111
+ - !ruby/object:Gem::Version
112
+ version: "0"
113
+ required_rubygems_version: !ruby/object:Gem::Requirement
114
+ none: false
115
+ requirements:
116
+ - - ">="
117
+ - !ruby/object:Gem::Version
118
+ version: "0"
119
+ requirements: []
120
+
121
+ rubyforge_project: scbi_cominer
122
+ rubygems_version: 1.8.24
123
+ signing_key:
124
+ specification_version: 3
125
+ summary: scbi_cominer is a ruby gem to calculate some interesting regions and statistics from contigs
126
+ test_files:
127
+ - test/test_helper.rb
128
+ - test/test_scbi_cominer.rb