scbi_cominer 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.txt ADDED
@@ -0,0 +1,8 @@
1
+ === 0.0.2 2012-07-25
2
+
3
+ Added SNPs and single_points
4
+
5
+ === 0.0.1 2012-07-23
6
+
7
+ * 1 major enhancement:
8
+ * Initial release
data/Manifest.txt ADDED
@@ -0,0 +1,16 @@
1
+ History.txt
2
+ lib/scbi_cominer/classes/base_function.rb
3
+ lib/scbi_cominer/classes/entropy_function.rb
4
+ lib/scbi_cominer/classes/frequency_table.rb
5
+ lib/scbi_cominer/classes/low_pass_filter.rb
6
+ lib/scbi_cominer/cominer.rb
7
+ lib/scbi_cominer.rb
8
+ Manifest.txt
9
+ PostInstall.txt
10
+ Rakefile
11
+ README.rdoc
12
+ script/console
13
+ script/destroy
14
+ script/generate
15
+ test/test_helper.rb
16
+ test/test_scbi_cominer.rb
data/PostInstall.txt ADDED
@@ -0,0 +1,7 @@
1
+
2
+ For more information on scbi_cominer, see http://scbi_cominer.rubyforge.org
3
+
4
+ NOTE: Change this information in PostInstall.txt
5
+ You can also delete it if you don't want it.
6
+
7
+
data/README.rdoc ADDED
@@ -0,0 +1,92 @@
1
+ = scbi_cominer
2
+
3
+ * http://www.scbi.uma.es/downloads
4
+
5
+ == DESCRIPTION:
6
+
7
+ scbi_cominer is a ruby gem to calculate some interesting regions and statistics from contigs
8
+
9
+ == FEATURES/PROBLEMS:
10
+
11
+ * Find low covered regions in contigs
12
+ * Find putative snps in contigs
13
+ * Calculates a frequency and position table for each nucleotide
14
+
15
+ == SYNOPSIS:
16
+
17
+
18
+ require 'scbi_ace'
19
+ require 'scbi_cominer'
20
+
21
+ filename=File.join(File.dirname(__FILE__),'test.ace')
22
+
23
+ # open ace file with parser
24
+ ace=AceParser.new(filename)
25
+
26
+ # iterate over all contigs in ace file
27
+ ace.each_contig do |contig|
28
+
29
+ # puts contig name
30
+ puts contig.name
31
+
32
+ # calculate cominer stats with this contig
33
+ cominer_stats=Cominer.new(contig)
34
+
35
+
36
+ # get all reads with orientation and align clips
37
+ contig.reads.each do |name,read|
38
+ puts ">#{read.name} #{read.orientation} #{read.align_clip_start} #{read.align_clip_end}"
39
+ puts read.fasta
40
+ end
41
+
42
+ # get position_table
43
+ puts cominer_stats.position_table.to_json
44
+
45
+ # get regions
46
+ puts cominer_stats.regions.to_json
47
+
48
+ # get snps
49
+ puts cominer_stats.snps.to_json
50
+
51
+ puts cominer_stats.single_points.to_json
52
+
53
+ end
54
+
55
+ ace.close
56
+
57
+
58
+ == REQUIREMENTS:
59
+
60
+ * Needs fftw3 library installed on your operating system.
61
+ * Uses gems scbi_ace, narray, ruby-fftw3, gnuplot. They are installed automatically.
62
+
63
+ == INSTALL:
64
+
65
+ NOTE: You may need to install fftw3 library for your operating system prior to installing scbi_cominer.
66
+
67
+ * gem install scbi_cominer
68
+
69
+ == LICENSE:
70
+
71
+ (The MIT License)
72
+
73
+ Copyright (c) 2010 Dario Guerrero
74
+
75
+ Permission is hereby granted, free of charge, to any person obtaining
76
+ a copy of this software and associated documentation files (the
77
+ 'Software'), to deal in the Software without restriction, including
78
+ without limitation the rights to use, copy, modify, merge, publish,
79
+ distribute, sublicense, and/or sell copies of the Software, and to
80
+ permit persons to whom the Software is furnished to do so, subject to
81
+ the following conditions:
82
+
83
+ The above copyright notice and this permission notice shall be
84
+ included in all copies or substantial portions of the Software.
85
+
86
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
87
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
88
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
89
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
90
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
91
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
92
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1,31 @@
1
+ require 'rubygems'
2
+ gem 'hoe', '>= 2.1.0'
3
+ require 'hoe'
4
+ require 'fileutils'
5
+ require './lib/scbi_cominer'
6
+
7
+ Hoe.plugin :newgem
8
+ # Hoe.plugin :website
9
+ # Hoe.plugin :cucumberfeatures
10
+
11
+ # Generate all the Rake tasks
12
+ # Run 'rake -T' to see list of generated tasks (from gem root directory)
13
+ $hoe = Hoe.spec 'scbi_cominer' do
14
+ self.developer 'Dario Guerrero', 'dariogf@gmail.com'
15
+ self.post_install_message = 'PostInstall.txt' # TODO remove if post-install message not required
16
+ self.rubyforge_name = self.name # TODO this is default value
17
+ self.extra_deps = [['scbi_ace','>= 0.0.5'],
18
+ ['narray','>= 0.5.9.8'],
19
+ ['ruby-fftw3', '>= 0.4'],
20
+ ['gnuplot', '>= 2.3.4']
21
+ ]
22
+
23
+
24
+ end
25
+
26
+ require 'newgem/tasks'
27
+ Dir['tasks/**/*.rake'].each { |t| load t }
28
+
29
+ # TODO - want other tests/tasks run by default? Add them to the list
30
+ # remove_task :default
31
+ # task :default => [:spec, :features]
@@ -0,0 +1,521 @@
1
+ require 'json'
2
+ require 'narray'
3
+
4
+ require "numru/fftw3"
5
+ include NumRu
6
+ require "gnuplot"
7
+
8
+ require 'low_pass_filter'
9
+
10
+
11
+ class BaseFunction
12
+
13
+ attr_accessor :regions,:single_points, :freq_table, :values, :fft, :snps
14
+
15
+ def initialize(freq_table)
16
+ @freq_table = freq_table
17
+
18
+ @values = []
19
+ @fft = nil
20
+ @lim1 = 0
21
+ @lim2 = 0
22
+
23
+ calculate
24
+
25
+ #puts @freq_table.inspect_pos(86,100)
26
+ #puts @fft.original_data[86].to_json
27
+ over_lim2 = lambda {|v| v > @lim2}
28
+ below_lim1 = lambda {|v| v < @lim1}
29
+
30
+ #puts @lim1,@lim2
31
+
32
+ single_points = filter_regions(@fft.original_data, over_lim2, true)
33
+ #puts single_points.to_json
34
+ # @regions=filter_regions(@fft.filtered_data, over_lim2, false , @values)
35
+ # @regions=filter_regions(@values, over_lim2, false , @values)
36
+
37
+ @regions=group_regions(single_points)
38
+ #@regions_below=filter_regions(@fft.filtered_data, below_lim1, false, @values)
39
+
40
+ @single_points = purge_regions(single_points,@regions)
41
+
42
+ # repeat snps that are already in a region
43
+ @snps = purge_snps(single_points)
44
+
45
+ # do not repeat snps that are in a region
46
+ # @snps = purge_snps(@single_points)
47
+ #puts @snps.to_json
48
+ #puts @regions.to_json
49
+ #puts @single_points.to_json
50
+ # puts @single_points.join(',')
51
+ #graph
52
+
53
+ # puts @values.to_json
54
+ end
55
+
56
+ def calculate
57
+ values = []
58
+ length = @freq_table.max_length
59
+
60
+ # evaluate freq table
61
+ length.times do |i|
62
+
63
+ val = evaluate_pos(i)
64
+
65
+ values.push val
66
+
67
+ end
68
+
69
+ @values = values
70
+
71
+ @fft = LowPassFilter.new(@values)
72
+
73
+ @lim1,@lim2 = @fft.limits
74
+
75
+ end
76
+
77
+ def evaluate_pos(i)
78
+ raise "You must create a child class to override this method"
79
+ end
80
+
81
# Returns the entries of +regions1+ that do not overlap any entry of
# +regions2+.
#
# Both arguments are collections of hashes with integer 'start' and 'end'
# keys (inclusive bounds). Two entries overlap when their closed intervals
# share at least one position, so intervals that merely touch count as
# overlapping. Neither argument is modified.
def purge_regions(regions1, regions2)
  regions1.reject do |candidate|
    regions2.any? do |region|
      candidate['start'] <= region['end'] && region['start'] <= candidate['end']
    end
  end
end
101
+
102
# Returns the entries of +regions+ that qualify as SNPs.
#
# An entry qualifies when it covers exactly one position
# ('start' == 'end') and @freq_table.valid_snp accepts that position.
# The input collection is not modified.
def purge_snps(regions)
  regions.select do |region|
    position = region['start']
    position == region['end'] && @freq_table.valid_snp(position)
  end
end
124
+
125
+
126
# Decides whether +region+ (a hash with integer 'start' and 'end' keys)
# should be kept by filter_regions.
#
# * When +only_single_points+ is truthy, or no +mandatory_data+ is given,
#   any region whose end is not before its start is accepted.
# * Otherwise the region must span more than one position, and running
#   filter_regions with +comp+ over the matching slice of +mandatory_data+
#   must yield more than one inner region.
#
# Returns true or false.
def valid_region(region, comp, only_single_points, mandatory_data)
  region_start = region['start']
  span = region['end'] - region_start

  return span >= 0 if only_single_points || mandatory_data.nil?

  # With mandatory data the region needs more than one position.
  # (Original note: negate the next check to ignore wide regions that
  # contain no SNPs.)
  return false unless span > 0

  # Look for inner regions inside this range of the mandatory data;
  # the region is valid only when there is more than one of them.
  inner = filter_regions(mandatory_data[region_start, span + 1], comp, nil)
  inner.count > 1
end
167
+
168
+ def group_regions(data)
169
+
170
+ max_separation = 15
171
+ last_end = 0
172
+
173
+ group_start = 0
174
+ group_end = 0
175
+ group_score = 0
176
+ group_size = 0
177
+
178
+
179
+ regions = []
180
+
181
+ if !data.empty?
182
+ region = {}
183
+ region['start'] = data[0]['start']
184
+ region['end'] = data[0]['end']
185
+ region['score'] = data[0]['score']
186
+
187
+ # filter regions
188
+ data.each do |r|
189
+
190
+ if r['start'] < last_end+max_separation
191
+ # group
192
+ group_score += r['score']
193
+ group_end = r['end']
194
+ group_size += 1
195
+ else
196
+ #close previous group, start new one
197
+ region = {}
198
+ region['start'] = group_start
199
+ region['end'] = group_end
200
+ region['score'] = group_score.to_f/group_size.to_f
201
+
202
+ #save region
203
+ if region['start']<region['end']
204
+ regions.push region
205
+ end
206
+
207
+ # init new one
208
+ group_start = r['start']
209
+ group_end = r['end']
210
+ group_score = r['score']
211
+ group_size = 1
212
+
213
+ end
214
+
215
+ last_end = r['end']
216
+
217
+ end
218
+ end
219
+
220
+ return regions
221
+
222
+ end
223
+
224
# Scans +data+ and collects every maximal run of consecutive values that
# +comp+ accepts.
#
# data::               any object whose +each+ yields the values in order.
# comp::               callable; a value belongs to a run when
#                      comp.call(value) is truthy.
# only_single_points:: forwarded to valid_region.
# mandatory_data::     forwarded to valid_region.
#
# Each run becomes a hash with 'start' and 'end' (inclusive, 0-based
# positions) and 'score' (sum of the run's values). Runs accepted by
# valid_region are handed to add_region, which stores them in the result.
#
# A run that reaches the end of the data ends at the last index
# (pos - 1), exactly like a run closed inside the loop.
#
# Returns the array of stored regions.
def filter_regions(data, comp, only_single_points = false, mandatory_data = nil)
  regions = []
  region = nil # run being annotated; nil while outside a run
  pos = 0

  # Ends the open run at +last_pos+ and stores it when it is valid.
  close_region = lambda do |last_pos|
    region['end'] = last_pos
    if valid_region(region, comp, only_single_points, mandatory_data)
      add_region(regions, region)
    end
    region = nil
  end

  data.each do |v|
    if comp.call(v)
      if region
        region['score'] += v
      else
        region = { 'start' => pos, 'end' => 0, 'score' => v }
      end
    elsif region
      # the run actually finished at the previous position
      close_region.call(pos - 1)
    end

    pos += 1
  end

  # a run still open here covers the last element, whose index is pos - 1
  close_region.call(pos - 1) if region

  regions
end
295
+
296
# Appends region +r+ to +regions+ after turning its 'score' into a mean.
#
# The width is r['end'] - r['start'] + 1. When it is positive, r['score']
# is replaced in place by score / width (as a Float) and +r+ is pushed onto
# +regions+. Regions with a non-positive width are ignored.
#
# Returns +regions+ when the region was stored, nil otherwise.
def add_region(regions, r)
  width = (r['end'] - r['start']) + 1
  return unless width > 0

  r['score'] = r['score'].to_f / width.to_f
  regions.push r
end
305
+
306
# Draws a two-panel gnuplot chart: the detected regions and single points
# in one panel, and the original signal, the filtered signal and both
# limits in the other.
#
# file_name:: when given, the chart is rendered as a PNG into this file,
#             with an image width equal to the number of filtered samples.
#             When nil, no terminal or output is configured here.
#
# The x series are built with Range#to_a. A block-less collect returns an
# Enumerator rather than an Array on Ruby 1.9 and later.
def graph(file_name=nil)

  Gnuplot.open do |gp|
    # first panel: regions and single points
    Gnuplot::Plot.new( gp ) do |plot|

      if !file_name.nil?
        plot.terminal "png size #{@fft.filtered_data.length},600"
        plot.output "#{file_name}"
      end

      plot.set "multiplot layout 2,1 upwards"

      plot.xrange("[0:#{@fft.original_data.length-1}]")

      plot.set "tmargin 0.0"

      plot.title ""
      plot.ylabel "Region"
      plot.xlabel "Nucleotide"

      if !@regions.empty?
        x, y = regions_to_graph_data(@regions, @fft.original_data.length-1)

        plot.data << Gnuplot::DataSet.new( [x, y] ) do |ds|
          ds.with = "lines lt rgb \"red\" ti \"Regions #{x.length}\""
        end
      end

      if !@single_points.empty?
        x, y = regions_to_graph_data(@single_points, @fft.original_data.length-1)

        plot.data << Gnuplot::DataSet.new( [x, y] ) do |ds|
          ds.with = "lines lt rgb \"blue\" ti \"Points #{x.length}\""
        end
      end

    end

    # second panel: signals and limits
    Gnuplot::Plot.new( gp ) do |plot|
      plot.title "Filter Base: #{fft.filter_base} , skip: #{fft.skip}"

      plot.set "bmargin 0.0"
      plot.set "tmargin 2"

      plot.xrange("[0:#{@fft.original_data.length-1}]")

      plot.ylabel "f"
      plot.xlabel ''
      plot.noxtics

      x = (0..@fft.original_data.length-1).to_a
      y = @fft.original_data.to_a

      plot.data << Gnuplot::DataSet.new( [x, y] ) do |ds|
        ds.with = "lines lt rgb \"green\" ti \"Original data\""
      end

      x = (0..@fft.filtered_data.length-1).to_a
      y = @fft.filtered_data.to_a

      plot.data << Gnuplot::DataSet.new( [x, y] ) do |ds|
        ds.with = "lines lt rgb \"blue\" ti \"Filtered data\""
      end

      # each limit is a horizontal line drawn between its two end points
      x = [0, @fft.filtered_data.length-1]
      y = [@lim1, @lim1]

      plot.data << Gnuplot::DataSet.new( [x, y] ) do |ds|
        ds.with = "lines lt rgb \"red\" ti \"Lim1 [#{@lim1}]\""
      end

      x = [0, @fft.filtered_data.length-1]
      y = [@lim2, @lim2]

      plot.data << Gnuplot::DataSet.new( [x, y] ) do |ds|
        ds.with = "lines lt rgb \"red\" ti \"Lim2 [ #{@lim2}]\""
      end

    end

  end

end
423
+
424
+
425
# Converts +regions+ into the x and y series of a step-shaped line.
#
# Every region contributes four points: one at height 0 just before its
# 'start', two at the region's 'score' (at 'start' and at 'end'), and one
# at height 0 just after its 'end'. When there are no regions, each series
# holds a single 0.
#
# +total_length+ is accepted but not used.
#
# Returns [x, y].
def regions_to_graph_data(regions,total_length)
  xs = []
  ys = []

  regions.each do |region|
    first = region['start']
    last = region['end']
    height = region['score']

    xs.push(first - 1, first, last, last + 1)
    ys.push(0, height, height, 0)
  end

  xs.push(0) if xs.empty?
  ys.push(0) if ys.empty?

  [xs, ys]
end
466
+
467
# Draws the original signal, the filtered signal and both limits on a
# single gnuplot chart.
#
# file_name:: when given, the chart is rendered as a PNG into this file,
#             with an image width equal to the number of filtered samples.
#             When nil, no terminal or output is configured here.
#
# The x series are built with Range#to_a. A block-less collect returns an
# Enumerator rather than an Array on Ruby 1.9 and later. Each limit series
# has exactly one value per x position.
def graph2(file_name = nil)

  Gnuplot.open do |gp|
    Gnuplot::Plot.new( gp ) do |plot|

      if !file_name.nil?
        plot.terminal "png size #{@fft.filtered_data.length},600"
        plot.output "#{file_name}"
      end

      plot.title "Filter Base: #{@fft.filter_base} , skip: #{@fft.skip}"
      plot.ylabel "f"
      plot.xlabel "x"

      x = (0..@fft.original_data.length-1).to_a
      y = @fft.original_data.to_a

      plot.data << Gnuplot::DataSet.new( [x, y] ) do |ds|
        ds.with = "lines lt rgb \"green\""
        ds.notitle
      end

      filtered_length = @fft.filtered_data.length

      x = (0..filtered_length-1).to_a
      y = @fft.filtered_data.to_a

      plot.data << Gnuplot::DataSet.new( [x, y] ) do |ds|
        ds.with = "lines lt rgb \"blue\""
        ds.notitle
      end

      # lower limit: a constant series, one value per x position
      x = (0..filtered_length-1).to_a
      y = Array.new(filtered_length, @lim1)

      plot.data << Gnuplot::DataSet.new( [x, y] ) do |ds|
        ds.with = "lines lt rgb \"red\""
        ds.notitle
      end

      # upper limit
      x = (0..filtered_length-1).to_a
      y = Array.new(filtered_length, @lim2)

      plot.data << Gnuplot::DataSet.new( [x, y] ) do |ds|
        ds.with = "lines lt rgb \"red\""
        ds.notitle
      end

    end

  end
end
520
+
521
+ end
@@ -0,0 +1,33 @@
1
+ require 'base_function'
2
+
3
+ class EntropyFunction < BaseFunction
4
+
5
# Base-2 logarithm of +v+.
#
# Uses Math.log2 directly rather than dividing two base-10 logarithms,
# which adds a rounding step to every result.
def log2(v)
  Math.log2(v)
end
8
+
9
+ def evaluate_pos(i)
10
+ res = 0
11
+
12
+ nseq = @freq_table.nseq(i) || 0
13
+
14
+ if nseq>0
15
+ @freq_table.frequency_table.keys.each do |k|
16
+
17
+ if k!='-'
18
+ freq = @freq_table.frequency_table[k][i] || 0
19
+
20
+ co = freq.to_f/nseq.to_f
21
+ if (co > 0)
22
+ res += ((-1*co*log2(co)));
23
+ end
24
+ end
25
+ end
26
+
27
+ end
28
+
29
+ return res
30
+
31
+ end
32
+
33
+ end
@@ -0,0 +1,258 @@
1
# Column-wise symbol statistics for a set of sequences.
#
# * +frequency_table+ maps each symbol to an array whose element +i+ is the
#   number of added sequences showing that symbol at column +i+ (nil when
#   there are none).
# * +position_table+ maps each symbol to an array whose element +i+ lists
#   the 1-based indexes (into +sequences+) of the sequences showing that
#   symbol at column +i+ (nil when there are none).
# * +sequences+ holds one {:name, :fasta} hash per added sequence, in
#   insertion order.
class FrequencyTable
  # Symbols left out of the totals returned by #nseq.
  NSEQ_EXCLUDED = ['-'].freeze

  # Symbols left out of #nseq_valid, #consensus_freq and #extract_col.
  VALID_EXCLUDED = ['-', '*'].freeze

  attr_accessor :frequency_table, :position_table, :sequences

  def initialize
    @frequency_table = {}
    @position_table = {}
    @sequences = []
  end

  # Symbol whose count array is the longest (the first one found on ties),
  # or nil when no symbol has any column.
  def max_length_key
    longest_entry[0]
  end

  # Number of sequences counted at column +pos+, ignoring the symbols in
  # NSEQ_EXCLUDED.
  def nseq(pos)
    total = 0
    each_count_at(pos, NSEQ_EXCLUDED) { |_symbol, count| total += count }
    total
  end

  # Number of sequences counted at column +pos+, ignoring the symbols in
  # VALID_EXCLUDED.
  def nseq_valid(pos)
    total = 0
    each_count_at(pos, VALID_EXCLUDED) { |_symbol, count| total += count }
    total
  end

  # Most frequent symbol at column +pos+ among those outside
  # VALID_EXCLUDED.
  #
  # Returns [symbol, count]; on ties the symbol visited last wins. Returns
  # [nil, 0] when the column has no counted symbol.
  def consensus_freq(pos)
    best_symbol = nil
    best_count = 0

    each_count_at(pos, VALID_EXCLUDED) do |symbol, count|
      if count >= best_count
        best_count = count
        best_symbol = symbol
      end
    end

    [best_symbol, best_count]
  end

  # Length of the longest count array, 0 when the table is empty.
  def max_length
    longest_entry[1]
  end

  # Adds an object responding to +name+ and +fasta+.
  def add_read(read)
    add_fasta(read.name, read.fasta)
  end

  # Adds an object responding to +seq_name+ and +seq_fasta+.
  def add_sequence(seq)
    add_fasta(seq.seq_name, seq.seq_fasta)
  end

  # Registers the sequence +fasta+ under +name+ and updates both tables
  # with each of its characters, column by column.
  def add_fasta(name, fasta)
    @sequences.push({:name => name, :fasta => fasta})
    # sequences are referenced by their 1-based position in @sequences
    index = @sequences.length

    column = 0
    fasta.each_char do |symbol|
      counts = (@frequency_table[symbol] ||= [])
      counts[column] = (counts[column] || 0) + 1

      members = (@position_table[symbol] ||= [])
      (members[column] ||= []).push(index)

      column += 1
    end
  end

  # Hash of symbol => count for column +pos+, limited to symbols outside
  # VALID_EXCLUDED whose count is positive.
  def extract_col(pos)
    column = {}
    each_count_at(pos, VALID_EXCLUDED) do |symbol, count|
      column[symbol] = count if count > 0
    end
    column
  end

  # Tells whether column +pos+ holds a valid SNP: at least two counted
  # sequences differ from the consensus symbol, and some non-consensus
  # symbol appears in at least two sequences.
  def valid_snp(pos)
    total_seqs = nseq_valid(pos)
    consensus_base, consensus_frequency = consensus_freq(pos)

    return false unless consensus_frequency <= total_seqs - 2

    extract_col(pos).any? do |symbol, count|
      symbol != consensus_base && count >= 2
    end
  end

  # Text dump of the first 31 columns of the frequency table followed by
  # the first 11 columns of the position table (sequence names per
  # column), one line per symbol.
  def inspect
    lines = @frequency_table.map do |symbol, counts|
      "#{symbol}:#{counts[0..30].map { |count| count || 0 }.join("")}...more.\n"
    end

    lines += @position_table.map do |symbol, members|
      cells = members[0..10].map do |indexes|
        if indexes
          "[#{indexes.map { |i| @sequences[i - 1][:name] }.join(",")}]"
        else
          "[]"
        end
      end
      "#{symbol}:#{cells.join(" - ")}...more.\n"
    end

    lines.join
  end

  # Text dump of the frequency table between columns +pos_start+ and
  # +pos_end+ (inclusive), one line per symbol.
  def inspect_pos(pos_start, pos_end)
    lines = @frequency_table.map do |symbol, counts|
      # Array#[] returns nil when the range starts past the end of a
      # shorter count array; treat that as "no columns".
      window = counts[pos_start..pos_end] || []
      "#{symbol}:#{window.map { |count| count || 0 }.join("")}...more.\n"
    end

    lines.join
  end

  private

  # Returns [symbol, length] for the longest count array; [nil, 0] when
  # there is none. The first symbol found wins on ties.
  def longest_entry
    best_symbol = nil
    best_length = 0

    @frequency_table.each do |symbol, counts|
      next unless counts.length > best_length

      best_length = counts.length
      best_symbol = symbol
    end

    [best_symbol, best_length]
  end

  # Yields (symbol, count) for every symbol outside +excluded+ that has a
  # recorded count at column +pos+.
  def each_count_at(pos, excluded)
    @frequency_table.each do |symbol, counts|
      count = counts[pos]
      next if count.nil? || excluded.include?(symbol)

      yield symbol, count
    end
  end
end
@@ -0,0 +1,84 @@
1
+ require "narray"
2
+ require "numru/fftw3"
3
+ include NumRu
4
+ require "gnuplot"
5
+
6
+ class NArray
7
+
8
+ def mad_mean
9
+ me = self.mean
10
+ mad = ((self - me ).abs).mean
11
+
12
+ return [mad,me]
13
+
14
+ end
15
+
16
+ def mad_median
17
+ me = self.median
18
+ mad = ((self - me).abs).median
19
+
20
+ return [mad,me]
21
+ end
22
+
23
+ end
24
+
25
+ class LowPassFilter
26
+
27
+ attr_accessor :original_data, :filtered_data, :skip, :filter_base
28
+
29
+ def initialize(data)
30
+ @original_data = NArray.to_na(data)
31
+ @filtered_data = nil
32
+ @skip = nil
33
+ @filter_base = 8
34
+
35
+ run
36
+ end
37
+
38
+ def run
39
+ coef = FFTW3.fft(@original_data, -1,0)/@original_data.length
40
+
41
+ if @skip.nil?
42
+ @skip = (coef.length/4)
43
+ end
44
+
45
+ inc = (2.0 / (coef.length - @skip));
46
+
47
+ x = 1;
48
+
49
+ # keep intact first components of coefs
50
+ i=@skip
51
+ while i<coef.length
52
+ # el filtro reduce los componentes de alta frecuencia
53
+ f = x ** @filter_base
54
+
55
+ coef[i]=coef[i]*f
56
+
57
+ x = x - inc;
58
+ i +=1# era 2 porque en perl no usa num complejos
59
+ end
60
+
61
+ filtered = FFTW3.fft(coef, 1,0)
62
+
63
+ @filtered_data=filtered.real
64
+
65
+ @filtered_data[0] = @original_data[0]
66
+ @filtered_data[@filtered_data.length-1] = @original_data[@original_data.length-1]
67
+
68
+ end
69
+
70
+ def limits
71
+
72
+ mad,median = @filtered_data.mad_median
73
+
74
+ if mad == 0
75
+ mad,mean = @filtered_data.mad_mean
76
+ end
77
+
78
+ desv = 1.4826 * mad
79
+
80
+ return [median - desv, median + desv].sort
81
+
82
+ end
83
+
84
+ end
@@ -0,0 +1,36 @@
1
+
2
+ require 'frequency_table.rb'
3
+ require 'entropy_function'
4
+
5
+
6
+ class Cominer
7
+
8
+ attr_accessor :regions,:snps,:position_table, :freq_table, :single_points
9
+
10
+ def initialize(contig)
11
+
12
+ @freq_table = FrequencyTable.new
13
+
14
+ contig.reads.each do |name,read|
15
+ # puts ">#{read.name} #{read.orientation} #{read.align_clip_start} #{read.align_clip_end}"
16
+ # puts read.fasta
17
+ @freq_table.add_read(read)
18
+ end
19
+
20
+ # puts freq_table.inspect
21
+
22
+ ef = EntropyFunction.new(@freq_table)
23
+
24
+ @regions=ef.regions
25
+ @snps=ef.snps
26
+ @single_points=ef.single_points
27
+
28
+ @freq_table.position_table.delete('-')
29
+
30
+ @position_table = @freq_table.position_table
31
+
32
+ end
33
+
34
+
35
+
36
+ end
@@ -0,0 +1,12 @@
1
+ $:.unshift(File.dirname(__FILE__)) unless
2
+ $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
3
+
4
+ $: << File.join(File.dirname(__FILE__),File.basename(__FILE__,File.extname(__FILE__)))
5
+ $: << File.join(File.dirname(__FILE__),File.basename(__FILE__,File.extname(__FILE__)),'classes')
6
+
7
+ # puts $:
8
+ require 'cominer.rb'
9
+
10
+ module ScbiCominer
11
+ VERSION = '0.0.2'
12
+ end
data/script/console ADDED
@@ -0,0 +1,10 @@
1
#!/usr/bin/env ruby
# File: script/console
#
# Starts an irb session with tab completion and the scbi_cominer library
# preloaded.

# Use the irb.bat launcher when RUBY_PLATFORM names mswin or mingw.
# (?: ... ) is a non-capturing group.
irb = RUBY_PLATFORM =~ /(?:mswin|mingw)/ ? 'irb.bat' : 'irb'

libs = " -r irb/completion"
# Perhaps use a console_lib to store any extra methods I may want available in the console
# libs << " -r #{File.dirname(__FILE__) + '/../lib/console_lib/console_logger.rb'}"
libs << " -r #{File.dirname(__FILE__) + '/../lib/scbi_cominer.rb'}"
puts "Loading scbi_cominer gem"
exec "#{irb} #{libs} --simple-prompt"
data/script/destroy ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+ APP_ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..'))
3
+
4
+ begin
5
+ require 'rubigen'
6
+ rescue LoadError
7
+ require 'rubygems'
8
+ require 'rubigen'
9
+ end
10
+ require 'rubigen/scripts/destroy'
11
+
12
+ ARGV.shift if ['--help', '-h'].include?(ARGV[0])
13
+ RubiGen::Base.use_component_sources! [:rubygems, :newgem, :newgem_theme, :test_unit]
14
+ RubiGen::Scripts::Destroy.new.run(ARGV)
data/script/generate ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+ APP_ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..'))
3
+
4
+ begin
5
+ require 'rubigen'
6
+ rescue LoadError
7
+ require 'rubygems'
8
+ require 'rubigen'
9
+ end
10
+ require 'rubigen/scripts/generate'
11
+
12
+ ARGV.shift if ['--help', '-h'].include?(ARGV[0])
13
+ RubiGen::Base.use_component_sources! [:rubygems, :newgem, :newgem_theme, :test_unit]
14
+ RubiGen::Scripts::Generate.new.run(ARGV)
@@ -0,0 +1,3 @@
1
+ require 'stringio'
2
+ require 'test/unit'
3
+ require File.dirname(__FILE__) + '/../lib/scbi_cominer'
@@ -0,0 +1,48 @@
1
+ require File.dirname(__FILE__) + '/test_helper.rb'
2
+
3
+ require 'scbi_ace'
4
+
5
+ class TestScbiCominer < Test::Unit::TestCase
6
+
7
+ def setup
8
+ end
9
+
10
+ def test_truth
11
+ filename=File.join(File.dirname(__FILE__),'test.ace')
12
+
13
+ # open ace file with parser
14
+ ace=AceParser.new(filename)
15
+
16
+ # iterate over all contigs in ace file
17
+ ace.each_contig do |contig|
18
+
19
+ # puts contig name
20
+ puts contig.name
21
+
22
+ # calculate cominer stats with this contig
23
+ cominer_stats=Cominer.new(contig)
24
+
25
+
26
+ # get all reads with orientation and align clips
27
+ contig.reads.each do |name,read|
28
+ puts ">#{read.name} #{read.orientation} #{read.align_clip_start} #{read.align_clip_end}"
29
+ puts read.fasta
30
+ end
31
+
32
+ # get position_table
33
+ puts cominer_stats.position_table.to_json
34
+
35
+ # get regions
36
+ puts cominer_stats.regions.to_json
37
+
38
+ # get snps
39
+ puts cominer_stats.snps.to_json
40
+
41
+ puts cominer_stats.single_points.to_json
42
+
43
+ end
44
+
45
+ ace.close
46
+
47
+ end
48
+ end
metadata ADDED
@@ -0,0 +1,128 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: scbi_cominer
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.0.2
6
+ platform: ruby
7
+ authors:
8
+ - Dario Guerrero
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2012-07-25 00:00:00 Z
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: scbi_ace
17
+ prerelease: false
18
+ requirement: &id001 !ruby/object:Gem::Requirement
19
+ none: false
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.5
24
+ type: :runtime
25
+ version_requirements: *id001
26
+ - !ruby/object:Gem::Dependency
27
+ name: narray
28
+ prerelease: false
29
+ requirement: &id002 !ruby/object:Gem::Requirement
30
+ none: false
31
+ requirements:
32
+ - - ">="
33
+ - !ruby/object:Gem::Version
34
+ version: 0.5.9.8
35
+ type: :runtime
36
+ version_requirements: *id002
37
+ - !ruby/object:Gem::Dependency
38
+ name: ruby-fftw3
39
+ prerelease: false
40
+ requirement: &id003 !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ version: "0.4"
46
+ type: :runtime
47
+ version_requirements: *id003
48
+ - !ruby/object:Gem::Dependency
49
+ name: gnuplot
50
+ prerelease: false
51
+ requirement: &id004 !ruby/object:Gem::Requirement
52
+ none: false
53
+ requirements:
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ version: 2.3.4
57
+ type: :runtime
58
+ version_requirements: *id004
59
+ - !ruby/object:Gem::Dependency
60
+ name: hoe
61
+ prerelease: false
62
+ requirement: &id005 !ruby/object:Gem::Requirement
63
+ none: false
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ version: 2.8.0
68
+ type: :development
69
+ version_requirements: *id005
70
+ description: scbi_cominer is a ruby gem to calculate some interesting regions and statistics from contigs
71
+ email:
72
+ - dariogf@gmail.com
73
+ executables: []
74
+
75
+ extensions: []
76
+
77
+ extra_rdoc_files:
78
+ - History.txt
79
+ - Manifest.txt
80
+ - PostInstall.txt
81
+ files:
82
+ - History.txt
83
+ - lib/scbi_cominer/classes/base_function.rb
84
+ - lib/scbi_cominer/classes/entropy_function.rb
85
+ - lib/scbi_cominer/classes/frequency_table.rb
86
+ - lib/scbi_cominer/classes/low_pass_filter.rb
87
+ - lib/scbi_cominer/cominer.rb
88
+ - lib/scbi_cominer.rb
89
+ - Manifest.txt
90
+ - PostInstall.txt
91
+ - Rakefile
92
+ - README.rdoc
93
+ - script/console
94
+ - script/destroy
95
+ - script/generate
96
+ - test/test_helper.rb
97
+ - test/test_scbi_cominer.rb
98
+ homepage: http://www.scbi.uma.es/downloads
99
+ licenses: []
100
+
101
+ post_install_message: PostInstall.txt
102
+ rdoc_options:
103
+ - --main
104
+ - README.rdoc
105
+ require_paths:
106
+ - lib
107
+ required_ruby_version: !ruby/object:Gem::Requirement
108
+ none: false
109
+ requirements:
110
+ - - ">="
111
+ - !ruby/object:Gem::Version
112
+ version: "0"
113
+ required_rubygems_version: !ruby/object:Gem::Requirement
114
+ none: false
115
+ requirements:
116
+ - - ">="
117
+ - !ruby/object:Gem::Version
118
+ version: "0"
119
+ requirements: []
120
+
121
+ rubyforge_project: scbi_cominer
122
+ rubygems_version: 1.8.24
123
+ signing_key:
124
+ specification_version: 3
125
+ summary: scbi_cominer is a ruby gem to calculate some interesting regions and statistics from contigs
126
+ test_files:
127
+ - test/test_helper.rb
128
+ - test/test_scbi_cominer.rb