cwords 0.1.10 → 0.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/cwords_mkdb +2 -12
- data/lib/wordRS-lib.rb +1 -1
- data/scripts/cwords_mkdb.rb +4 -4
- data/scripts/cwords_mkdb_worker.rb +99 -0
- metadata +4 -3
data/bin/cwords_mkdb
CHANGED
@@ -1,16 +1,6 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
scriptdir = File.dirname(__FILE__) + "/../scripts/"
|
3
3
|
|
4
|
-
|
5
|
-
|
6
|
-
# start lightweight
|
7
|
-
exec "jruby --client " + scriptdir + "cwords_mkdb.rb " + ARGV.join(" ")
|
8
|
-
end
|
9
|
-
|
10
|
-
mems = ARGV.join(" ").match(/M=(\w+)/)
|
11
|
-
mem = mems ? mems[1] : '4096m'
|
12
|
-
argv = ARGV.select{|x| not x=~ /^M=\w+$/}.join(' ')
|
13
|
-
puts "Starting cwords_mkdb with max heap size " + mem + " ...\n"
|
14
|
-
|
15
|
-
cmd = "jruby --server --fast -J-Xmx#{mem} " + scriptdir + "cwords_mkdb.rb " + argv
|
4
|
+
argv = ARGV.join(" ")
|
5
|
+
cmd = "ruby " + scriptdir + "cwords_mkdb.rb " + argv
|
16
6
|
exec cmd
|
data/lib/wordRS-lib.rb
CHANGED
data/scripts/cwords_mkdb.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#!/usr/bin/env
|
1
|
+
#!/usr/bin/env ruby
|
2
2
|
|
3
3
|
srcdir = File.dirname(__FILE__)
|
4
4
|
basedir = srcdir + "/../"
|
@@ -25,13 +25,13 @@ options[:seqfile] = nil
|
|
25
25
|
options[:partitions] = 1
|
26
26
|
options[:stats] = ['p'] # p=p
|
27
27
|
options[:ruby]='jruby --fast -J-Xmx1024m'
|
28
|
-
options[:shuffles]=
|
28
|
+
options[:shuffles]=500
|
29
29
|
options[:bg]=1 #mononucleotide shuffling
|
30
30
|
|
31
31
|
$coptions = OptionParser.new do |opts|
|
32
32
|
opts.on("-w", "--wordsize ARG", "wordsize") { |o| options[:wordsize] = o.split(",").map{|x| x.to_i}}
|
33
33
|
opts.on("-s", "--seqfile ARG", "sequence file") {|o| options[:seqfile] = o}
|
34
|
-
opts.on("-
|
34
|
+
opts.on("-t", "--threads ARG", "number of concurrent processes") {|o| options[:partitions] = o.to_i}
|
35
35
|
opts.on("-a", "--stats ARG", "sequence file") {|o| options[:stats] = o.split('')}
|
36
36
|
opts.on("-u", "--shuffle ARG", "number of shuffles") {|o| options[:shuffles] = o.to_i}
|
37
37
|
opts.on("--ruby ARG", "ruby interpreter") {|o| options[:ruby] = o}
|
@@ -72,7 +72,7 @@ end
|
|
72
72
|
|
73
73
|
puts "starting #{n} processes ..."
|
74
74
|
|
75
|
-
cmd = "#{options[:ruby]} #{basedir}/scripts/
|
75
|
+
cmd = "#{options[:ruby]} #{basedir}/scripts/cwords_mkdb_worker.rb"
|
76
76
|
cmd += " -w #{options[:wordsize].join(',')} -s #{options[:seqfile]} -a #{options[:stats].join(",")} -u #{options[:shuffles]} --bg #{options[:bg]}"
|
77
77
|
|
78
78
|
stamp = Time.now.to_i
|
@@ -0,0 +1,99 @@
|
|
1
|
+
|
2
|
+
srcdir = File.dirname(__FILE__)
|
3
|
+
basedir = srcdir + "/../"
|
4
|
+
libdir = basedir + 'lib/'
|
5
|
+
$LOAD_PATH << libdir
|
6
|
+
|
7
|
+
require 'wordRS-lib.rb'
|
8
|
+
require 'rubygems'
|
9
|
+
require 'progressbar'
|
10
|
+
require 'optparse'
|
11
|
+
require 'fileutils'
|
12
|
+
require 'java'
|
13
|
+
require libdir + 'ushuffle.jar'
|
14
|
+
java_import 'UShuffle'
|
15
|
+
us = UShuffle.new
|
16
|
+
|
17
|
+
###
|
18
|
+
### Main
|
19
|
+
###
|
20
|
+
|
21
|
+
|
22
|
+
#default options
|
23
|
+
options = Hash.new
|
24
|
+
options[:wordsize] = [7]
|
25
|
+
options[:seqfile] = nil
|
26
|
+
options[:partition] = nil
|
27
|
+
options[:stats] = ['p'] # p=p-value, z=z-score
|
28
|
+
options[:shuffles]=1000
|
29
|
+
options[:bg]=1 #mononucleotide shuffling
|
30
|
+
|
31
|
+
$coptions = OptionParser.new do |opts|
|
32
|
+
opts.on("-w", "--wordsize ARG", "wordsize") { |o| options[:wordsize] = o.split(",").map{|x| x.to_i}.sort}
|
33
|
+
opts.on("-s", "--seqfile ARG", "sequence file (FASTA format)") {|o| options[:seqfile] = o}
|
34
|
+
opts.on("-p", "--partition ARG", "only process a partition (i.e. 5-10) of sequences") {|o| options[:partition] = o}
|
35
|
+
opts.on("-a", "--stats ARG", "statistics to compute, comma-separated (e.g. p,z)") {|o| options[:stats] = o.split(',')} # help text previously said "sequence file" (copied from -s)
|
36
|
+
opts.on("-u", "--shuffle ARG", "number of shuffles") {|o| options[:shuffles] = o.to_i}
|
37
|
+
opts.on("-b", "--bg ARG", "background nucleotide model") {|o| options[:bg] = o.to_i}
|
38
|
+
end
|
39
|
+
|
40
|
+
def show_help(msg="", code=0, io=STDOUT)
|
41
|
+
io.puts "#{msg}\n#{$coptions}"
|
42
|
+
exit(code)
|
43
|
+
end
|
44
|
+
|
45
|
+
$coptions.parse!(ARGV)
|
46
|
+
#mandatory parameters
|
47
|
+
[:seqfile].each{ |p| show_help("option '#{p}' mandatory", 1, STDERR) if options[p].nil?} # a missing mandatory option is an error: report on stderr and exit non-zero
|
48
|
+
|
49
|
+
abort("seqfile must have fasta-format") unless options[:seqfile].match(/\.fa\z/) # Kernel#exit takes a status, not a message; abort prints to stderr and exits 1. Dot escaped so only a literal ".fa" suffix passes
|
50
|
+
dbdir = basedir + "/db/" + File.basename(options[:seqfile],'.fa') + "_bg#{options[:bg]}"
|
51
|
+
FileUtils.mkdir_p dbdir # create dir if it does not exist
|
52
|
+
|
53
|
+
decimals = 6
|
54
|
+
bg = options[:bg]
|
55
|
+
|
56
|
+
# word id's
|
57
|
+
@wid = Hash.new
|
58
|
+
i = 0
|
59
|
+
options[:wordsize].each do |ws|
|
60
|
+
['a','g','c','t'].rep_perm(ws) {|seqa| @wid[seqa.join('')]=i ; i+=1 }
|
61
|
+
end
|
62
|
+
|
63
|
+
@seqs = IO.readlines(options[:seqfile],">")[1..-1]
|
64
|
+
if options[:partition]
|
65
|
+
puts "partition #{options[:partition]}"
|
66
|
+
(pstart,pstop) = options[:partition].split('-')
|
67
|
+
@seqs = @seqs[pstart.to_i-1..pstop.to_i-1]
|
68
|
+
end
|
69
|
+
|
70
|
+
puts "computing statistics for #{@seqs.size} sequences"
|
71
|
+
pbar = ProgressBar.new("seqs",@seqs.size)
|
72
|
+
|
73
|
+
@seqs.each do |s|
|
74
|
+
ff = s.split("\n")
|
75
|
+
id = ff.shift
|
76
|
+
seq = ff.join('').chomp('>').downcase.gsub('u','t') # strip the trailing ">" separator only when present; the last record has none, so ff[0..-2] dropped its final sequence line
|
77
|
+
# next if not nucleotide sequence, i.e. "unavailable"
|
78
|
+
next if (seq.split('').uniq - ['a','c','g','t']).size > 0
|
79
|
+
|
80
|
+
#observed word counts
|
81
|
+
@observed = Array.new(@wid.size,0)
|
82
|
+
options[:wordsize].each{|ws| (0..seq.size-ws).each{|i| wid = @wid[seq[i, ws]]; @observed[wid] += 1 if not wid.nil?}}
|
83
|
+
|
84
|
+
#expected word counts
|
85
|
+
@expected = Array.new(@wid.size) {Array.new(options[:shuffles],0).to_statarray}
|
86
|
+
us.init_shuffle(seq,bg)
|
87
|
+
options[:shuffles].times do |si|
|
88
|
+
seqsh = us.shuffle
|
89
|
+
options[:wordsize].each{|ws| (0..seq.size-ws).each{|i| wid = @wid[seqsh[i, ws]]; @expected[wid][si] += 1 if not wid.nil?}}
|
90
|
+
end
|
91
|
+
|
92
|
+
#store results
|
93
|
+
@wid.each do |w,wid|
|
94
|
+
obs = @observed[wid]
|
95
|
+
File.open("#{dbdir}/#{w}.rnk", 'a') {|f| f.puts [id,obs,@expected[wid].select{|x| x>=obs}.size,@expected[wid].to_statarray.mean].join("\t")}
|
96
|
+
end
|
97
|
+
pbar.inc
|
98
|
+
end
|
99
|
+
pbar.finish
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
-
|
9
|
-
version: 0.1.
|
8
|
+
- 11
|
9
|
+
version: 0.1.11
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Anders Jacobsen
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-
|
17
|
+
date: 2010-04-09 00:00:00 +02:00
|
18
18
|
default_executable:
|
19
19
|
- bin/cwords
|
20
20
|
dependencies:
|
@@ -52,6 +52,7 @@ files:
|
|
52
52
|
- resources/word_annotation.tsv
|
53
53
|
- scripts/cwords.rb
|
54
54
|
- scripts/cwords_mkdb.rb
|
55
|
+
- scripts/cwords_mkdb_worker.rb
|
55
56
|
- scripts/cluster_words.rb
|
56
57
|
- scripts/complementary_words.rb
|
57
58
|
has_rdoc: true
|