cwords 0.1.10 → 0.1.11
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/cwords_mkdb +2 -12
- data/lib/wordRS-lib.rb +1 -1
- data/scripts/cwords_mkdb.rb +4 -4
- data/scripts/cwords_mkdb_worker.rb +99 -0
- metadata +4 -3
data/bin/cwords_mkdb
CHANGED
@@ -1,16 +1,6 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
scriptdir = File.dirname(__FILE__) + "/../scripts/"
|
3
3
|
|
4
|
-
|
5
|
-
|
6
|
-
# start lightweight
|
7
|
-
exec "jruby --client " + scriptdir + "cwords_mkdb.rb " + ARGV.join(" ")
|
8
|
-
end
|
9
|
-
|
10
|
-
mems = ARGV.join(" ").match(/M=(\w+)/)
|
11
|
-
mem = mems ? mems[1] : '4096m'
|
12
|
-
argv = ARGV.select{|x| not x=~ /^M=\w+$/}.join(' ')
|
13
|
-
puts "Starting cwords_mkdb with max heap size " + mem + " ...\n"
|
14
|
-
|
15
|
-
cmd = "jruby --server --fast -J-Xmx#{mem} " + scriptdir + "cwords_mkdb.rb " + argv
|
4
|
+
argv = ARGV.join(" ")
|
5
|
+
cmd = "ruby " + scriptdir + "cwords_mkdb.rb " + argv
|
16
6
|
exec cmd
|
data/lib/wordRS-lib.rb
CHANGED
data/scripts/cwords_mkdb.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#!/usr/bin/env
|
1
|
+
#!/usr/bin/env ruby
|
2
2
|
|
3
3
|
srcdir = File.dirname(__FILE__)
|
4
4
|
basedir = srcdir + "/../"
|
@@ -25,13 +25,13 @@ options[:seqfile] = nil
|
|
25
25
|
options[:partitions] = 1
|
26
26
|
options[:stats] = ['p'] # p=p
|
27
27
|
options[:ruby]='jruby --fast -J-Xmx1024m'
|
28
|
-
options[:shuffles]=
|
28
|
+
options[:shuffles]=500
|
29
29
|
options[:bg]=1 #mononucleotide shuffling
|
30
30
|
|
31
31
|
$coptions = OptionParser.new do |opts|
|
32
32
|
opts.on("-w", "--wordsize ARG", "wordsize") { |o| options[:wordsize] = o.split(",").map{|x| x.to_i}}
|
33
33
|
opts.on("-s", "--seqfile ARG", "sequence file") {|o| options[:seqfile] = o}
|
34
|
-
opts.on("-
|
34
|
+
opts.on("-t", "--threads ARG", "number of concurrent processes") {|o| options[:partitions] = o.to_i}
|
35
35
|
opts.on("-a", "--stats ARG", "sequence file") {|o| options[:stats] = o.split('')}
|
36
36
|
opts.on("-u", "--shuffle ARG", "number of shuffles") {|o| options[:shuffles] = o.to_i}
|
37
37
|
opts.on("--ruby ARG", "ruby interpreter") {|o| options[:ruby] = o}
|
@@ -72,7 +72,7 @@ end
|
|
72
72
|
|
73
73
|
puts "starting #{n} processes ..."
|
74
74
|
|
75
|
-
cmd = "#{options[:ruby]} #{basedir}/scripts/
|
75
|
+
cmd = "#{options[:ruby]} #{basedir}/scripts/cwords_mkdb_worker.rb"
|
76
76
|
cmd += " -w #{options[:wordsize].join(',')} -s #{options[:seqfile]} -a #{options[:stats].join(",")} -u #{options[:shuffles]} --bg #{options[:bg]}"
|
77
77
|
|
78
78
|
stamp = Time.now.to_i
|
@@ -0,0 +1,99 @@
|
|
1
|
+
|
2
|
+
srcdir = File.dirname(__FILE__)
|
3
|
+
basedir = srcdir + "/../"
|
4
|
+
libdir = basedir + 'lib/'
|
5
|
+
$LOAD_PATH << libdir
|
6
|
+
|
7
|
+
require 'wordRS-lib.rb'
|
8
|
+
require 'rubygems'
|
9
|
+
require 'progressbar'
|
10
|
+
require 'optparse'
|
11
|
+
require 'fileutils'
|
12
|
+
require 'java'
|
13
|
+
require libdir + 'ushuffle.jar'
|
14
|
+
java_import 'UShuffle'
|
15
|
+
us = UShuffle.new
|
16
|
+
|
17
|
+
###
|
18
|
+
### Main
|
19
|
+
###
|
20
|
+
|
21
|
+
|
22
|
+
#default options
|
23
|
+
options = Hash.new
|
24
|
+
options[:wordsize] = [7]
|
25
|
+
options[:seqfile] = nil
|
26
|
+
options[:partition] = nil
|
27
|
+
options[:stats] = ['p'] # p=p-value, z=z-score
|
28
|
+
options[:shuffles]=1000
|
29
|
+
options[:bg]=1 #mononucleotide shuffling
|
30
|
+
|
31
|
+
$coptions = OptionParser.new do |opts|
|
32
|
+
opts.on("-w", "--wordsize ARG", "wordsize") { |o| options[:wordsize] = o.split(",").map{|x| x.to_i}.sort}
|
33
|
+
opts.on("-s", "--seqfile ARG", "sequence file (FASTA format)") {|o| options[:seqfile] = o}
|
34
|
+
opts.on("-p", "--partition ARG", "only process a partition (i.e. 5-10) of sequences") {|o| options[:partition] = o}
|
35
|
+
opts.on("-a", "--stats ARG", "sequence file") {|o| options[:stats] = o.split(',')}
|
36
|
+
opts.on("-u", "--shuffle ARG", "number of shuffles") {|o| options[:shuffles] = o.to_i}
|
37
|
+
opts.on("-b", "--bg ARG", "background nucleotide model") {|o| options[:bg] = o.to_i}
|
38
|
+
end
|
39
|
+
|
40
|
+
def show_help(msg="", code=0, io=STDOUT)
|
41
|
+
io.puts "#{msg}\n#{$coptions}"
|
42
|
+
exit(code)
|
43
|
+
end
|
44
|
+
|
45
|
+
$coptions.parse!(ARGV)
|
46
|
+
#mandatory parameters
|
47
|
+
[:seqfile].each{ |p| show_help("option '#{p}' mandatory") if options[p].nil?}
|
48
|
+
|
49
|
+
exit("seqfile must have fasta-format") if !options[:seqfile].match(/.fa$/)
|
50
|
+
dbdir = basedir + "/db/" + File.basename(options[:seqfile],'.fa') + "_bg#{options[:bg]}"
|
51
|
+
FileUtils.mkdir_p dbdir # create dir if it does not exist
|
52
|
+
|
53
|
+
decimals = 6
|
54
|
+
bg = options[:bg]
|
55
|
+
|
56
|
+
# word id's
|
57
|
+
@wid = Hash.new
|
58
|
+
i = 0
|
59
|
+
options[:wordsize].each do |ws|
|
60
|
+
['a','g','c','t'].rep_perm(ws) {|seqa| @wid[seqa.join('')]=i ; i+=1 }
|
61
|
+
end
|
62
|
+
|
63
|
+
@seqs = IO.readlines(options[:seqfile],">")[1..-1]
|
64
|
+
if options[:partition]
|
65
|
+
puts "partition #{options[:partition]}"
|
66
|
+
(pstart,pstop) = options[:partition].split('-')
|
67
|
+
@seqs = @seqs[pstart.to_i-1..pstop.to_i-1]
|
68
|
+
end
|
69
|
+
|
70
|
+
puts "computing statistics for #{@seqs.size} sequences"
|
71
|
+
pbar = ProgressBar.new("seqs",@seqs.size)
|
72
|
+
|
73
|
+
@seqs.each do |s|
|
74
|
+
ff = s.split("\n")
|
75
|
+
id = ff.shift
|
76
|
+
seq = ff[0..-2].join('').downcase.gsub('u','t') # last field is ">"
|
77
|
+
# next if not nucleotide sequence, i.e. "unavailable"
|
78
|
+
next if (seq.split('').uniq - ['a','c','g','t']).size > 0
|
79
|
+
|
80
|
+
#observed word counts
|
81
|
+
@observed = Array.new(@wid.size,0)
|
82
|
+
options[:wordsize].each{|ws| (0..seq.size-ws).each{|i| wid = @wid[seq[i, ws]]; @observed[wid] += 1 if not wid.nil?}}
|
83
|
+
|
84
|
+
#expected word counts
|
85
|
+
@expected = Array.new(@wid.size) {Array.new(options[:shuffles],0).to_statarray}
|
86
|
+
us.init_shuffle(seq,bg)
|
87
|
+
options[:shuffles].times do |si|
|
88
|
+
seqsh = us.shuffle
|
89
|
+
options[:wordsize].each{|ws| (0..seq.size-ws).each{|i| wid = @wid[seqsh[i, ws]]; @expected[wid][si] += 1 if not wid.nil?}}
|
90
|
+
end
|
91
|
+
|
92
|
+
#store results
|
93
|
+
@wid.each do |w,wid|
|
94
|
+
obs = @observed[wid]
|
95
|
+
File.open("#{dbdir}/#{w}.rnk", 'a') {|f| f.puts [id,obs,@expected[wid].select{|x| x>=obs}.size,@expected[wid].to_statarray.mean].join("\t")}
|
96
|
+
end
|
97
|
+
pbar.inc
|
98
|
+
end
|
99
|
+
pbar.finish
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
- 10
|
9
|
-
version: 0.1.10
|
8
|
+
- 11
|
9
|
+
version: 0.1.11
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Anders Jacobsen
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-
|
17
|
+
date: 2010-04-09 00:00:00 +02:00
|
18
18
|
default_executable:
|
19
19
|
- bin/cwords
|
20
20
|
dependencies:
|
@@ -52,6 +52,7 @@ files:
|
|
52
52
|
- resources/word_annotation.tsv
|
53
53
|
- scripts/cwords.rb
|
54
54
|
- scripts/cwords_mkdb.rb
|
55
|
+
- scripts/cwords_mkdb_worker.rb
|
55
56
|
- scripts/cluster_words.rb
|
56
57
|
- scripts/complementary_words.rb
|
57
58
|
has_rdoc: true
|