cwords 0.1.10 → 0.1.11

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,16 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
  scriptdir = File.dirname(__FILE__) + "/../scripts/"
3
3
 
4
- helps = ARGV.join(" ").match(/--help/)
5
- if (helps)
6
- # start lightweight
7
- exec "jruby --client " + scriptdir + "cwords_mkdb.rb " + ARGV.join(" ")
8
- end
9
-
10
- mems = ARGV.join(" ").match(/M=(\w+)/)
11
- mem = mems ? mems[1] : '4096m'
12
- argv = ARGV.select{|x| not x=~ /^M=\w+$/}.join(' ')
13
- puts "Starting cwords_mkdb with max heap size " + mem + " ...\n"
14
-
15
- cmd = "jruby --server --fast -J-Xmx#{mem} " + scriptdir + "cwords_mkdb.rb " + argv
4
+ argv = ARGV.join(" ")
5
+ cmd = "ruby " + scriptdir + "cwords_mkdb.rb " + argv
16
6
  exec cmd
@@ -5,7 +5,7 @@ class Array
5
5
  def threach(n = 1, &b)
6
6
  return [] if n == 0 or size == 0
7
7
  result = Array.new(size)
8
- self.send(:each,&b) if n == 1
8
+ return self.send(:each,&b) if n == 1 # trying return here
9
9
 
10
10
  n = [n,size].min
11
11
 
@@ -1,4 +1,4 @@
1
- #!/usr/bin/env jruby
1
+ #!/usr/bin/env ruby
2
2
 
3
3
  srcdir = File.dirname(__FILE__)
4
4
  basedir = srcdir + "/../"
@@ -25,13 +25,13 @@ options[:seqfile] = nil
25
25
  options[:partitions] = 1
26
26
  options[:stats] = ['p'] # p=p
27
27
  options[:ruby]='jruby --fast -J-Xmx1024m'
28
- options[:shuffles]=5000
28
+ options[:shuffles]=500
29
29
  options[:bg]=1 #mononucleotide shuffling
30
30
 
31
31
  $coptions = OptionParser.new do |opts|
32
32
  opts.on("-w", "--wordsize ARG", "wordsize") { |o| options[:wordsize] = o.split(",").map{|x| x.to_i}}
33
33
  opts.on("-s", "--seqfile ARG", "sequence file") {|o| options[:seqfile] = o}
34
- opts.on("-p", "--partitions ARG", "number of sequence partitions") {|o| options[:partitions] = o.to_i}
34
+ opts.on("-t", "--threads ARG", "number of concurrent processes") {|o| options[:partitions] = o.to_i}
35
35
  opts.on("-a", "--stats ARG", "sequence file") {|o| options[:stats] = o.split('')}
36
36
  opts.on("-u", "--shuffle ARG", "number of shuffles") {|o| options[:shuffles] = o.to_i}
37
37
  opts.on("--ruby ARG", "ruby interpreter") {|o| options[:ruby] = o}
@@ -72,7 +72,7 @@ end
72
72
 
73
73
  puts "starting #{n} processes ..."
74
74
 
75
- cmd = "#{options[:ruby]} #{basedir}/scripts/wordsrus_mkdb.rb"
75
+ cmd = "#{options[:ruby]} #{basedir}/scripts/cwords_mkdb_worker.rb"
76
76
  cmd += " -w #{options[:wordsize].join(',')} -s #{options[:seqfile]} -a #{options[:stats].join(",")} -u #{options[:shuffles]} --bg #{options[:bg]}"
77
77
 
78
78
  stamp = Time.now.to_i
@@ -0,0 +1,99 @@
1
+
2
+ srcdir = File.dirname(__FILE__)
3
+ basedir = srcdir + "/../"
4
+ libdir = basedir + 'lib/'
5
+ $LOAD_PATH << libdir
6
+
7
+ require 'wordRS-lib.rb'
8
+ require 'rubygems'
9
+ require 'progressbar'
10
+ require 'optparse'
11
+ require 'fileutils'
12
+ require 'java'
13
+ require libdir + 'ushuffle.jar'
14
+ java_import 'UShuffle'
15
+ us = UShuffle.new
16
+
17
+ ###
18
+ ### Main
19
+ ###
20
+
21
+
22
+ #default options
23
+ options = Hash.new
24
+ options[:wordsize] = [7]
25
+ options[:seqfile] = nil
26
+ options[:partition] = nil
27
+ options[:stats] = ['p'] # p=p-value, z=z-score
28
+ options[:shuffles]=1000
29
+ options[:bg]=1 #mononucleotide shuffling
30
+
31
+ $coptions = OptionParser.new do |opts|
32
+ opts.on("-w", "--wordsize ARG", "wordsize") { |o| options[:wordsize] = o.split(",").map{|x| x.to_i}.sort}
33
+ opts.on("-s", "--seqfile ARG", "sequence file (FASTA format)") {|o| options[:seqfile] = o}
34
+ opts.on("-p", "--partition ARG", "only process a partition (i.e. 5-10) of sequences") {|o| options[:partition] = o}
35
+ opts.on("-a", "--stats ARG", "statistics to compute, comma-separated (p=p-value, z=z-score)") {|o| options[:stats] = o.split(',')} # help text previously read "sequence file" (copied from -s)
36
+ opts.on("-u", "--shuffle ARG", "number of shuffles") {|o| options[:shuffles] = o.to_i}
37
+ opts.on("-b", "--bg ARG", "background nucleotide model") {|o| options[:bg] = o.to_i}
38
+ end
39
+
40
+ def show_help(msg="", code=0, io=STDOUT)
41
+ io.puts "#{msg}\n#{$coptions}"
42
+ exit(code)
43
+ end
44
+
45
+ $coptions.parse!(ARGV)
46
+ #mandatory parameters
47
+ [:seqfile].each{ |p| show_help("option '#{p}' mandatory") if options[p].nil?}
48
+
49
+ abort("seqfile must have fasta-format") unless options[:seqfile].match(/\.fa$/) # Kernel#exit takes a status, not a message; abort prints the message to STDERR and exits with status 1
50
+ dbdir = basedir + "/db/" + File.basename(options[:seqfile],'.fa') + "_bg#{options[:bg]}"
51
+ FileUtils.mkdir_p dbdir # create dir if it does not exist
52
+
53
+ decimals = 6
54
+ bg = options[:bg]
55
+
56
+ # word id's
57
+ @wid = Hash.new
58
+ i = 0
59
+ options[:wordsize].each do |ws|
60
+ ['a','g','c','t'].rep_perm(ws) {|seqa| @wid[seqa.join('')]=i ; i+=1 }
61
+ end
62
+
63
+ @seqs = IO.readlines(options[:seqfile],">")[1..-1]
64
+ if options[:partition]
65
+ puts "partition #{options[:partition]}"
66
+ (pstart,pstop) = options[:partition].split('-')
67
+ @seqs = @seqs[pstart.to_i-1..pstop.to_i-1]
68
+ end
69
+
70
+ puts "computing statistics for #{@seqs.size} sequences"
71
+ pbar = ProgressBar.new("seqs",@seqs.size)
72
+
73
+ @seqs.each do |s|
74
+ ff = s.split("\n")
75
+ id = ff.shift
76
+ seq = ff.reject{|l| l.strip == '>'}.join('').downcase.gsub('u','t') # drop only the ">" separator field; the final record has none, so ff[0..-2] lost its last sequence line
77
+ # next if not nucleotide sequence, i.e. "unavailable"
78
+ next if (seq.split('').uniq - ['a','c','g','t']).size > 0
79
+
80
+ #observed word counts
81
+ @observed = Array.new(@wid.size,0)
82
+ options[:wordsize].each{|ws| (0..seq.size-ws).each{|i| wid = @wid[seq[i, ws]]; @observed[wid] += 1 if not wid.nil?}}
83
+
84
+ #expected word counts
85
+ @expected = Array.new(@wid.size) {Array.new(options[:shuffles],0).to_statarray}
86
+ us.init_shuffle(seq,bg)
87
+ options[:shuffles].times do |si|
88
+ seqsh = us.shuffle
89
+ options[:wordsize].each{|ws| (0..seq.size-ws).each{|i| wid = @wid[seqsh[i, ws]]; @expected[wid][si] += 1 if not wid.nil?}}
90
+ end
91
+
92
+ #store results
93
+ @wid.each do |w,wid|
94
+ obs = @observed[wid]
95
+ File.open("#{dbdir}/#{w}.rnk", 'a') {|f| f.puts [id,obs,@expected[wid].select{|x| x>=obs}.size,@expected[wid].to_statarray.mean].join("\t")}
96
+ end
97
+ pbar.inc
98
+ end
99
+ pbar.finish
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 10
9
- version: 0.1.10
8
+ - 11
9
+ version: 0.1.11
10
10
  platform: ruby
11
11
  authors:
12
12
  - Anders Jacobsen
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-03-12 00:00:00 +01:00
17
+ date: 2010-04-09 00:00:00 +02:00
18
18
  default_executable:
19
19
  - bin/cwords
20
20
  dependencies:
@@ -52,6 +52,7 @@ files:
52
52
  - resources/word_annotation.tsv
53
53
  - scripts/cwords.rb
54
54
  - scripts/cwords_mkdb.rb
55
+ - scripts/cwords_mkdb_worker.rb
55
56
  - scripts/cluster_words.rb
56
57
  - scripts/complementary_words.rb
57
58
  has_rdoc: true