RubyGems - cwords - Versions diffs - 0.1.10 → 0.1.11 - Mend

cwords 0.1.10 → 0.1.11

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (5)

data/bin/cwords_mkdb +2 -12
data/lib/wordRS-lib.rb +1 -1
data/scripts/cwords_mkdb.rb +4 -4
data/scripts/cwords_mkdb_worker.rb +99 -0
metadata +4 -3

data/bin/cwords_mkdb CHANGED

@@ -1,16 +1,6 @@
 #!/usr/bin/env ruby
 scriptdir = File.dirname(__FILE__) + "/../scripts/"
-helps = ARGV.join(" ").match(/--help/)
-if (helps)
-  # start lightweight
-  exec "jruby --client " + scriptdir + "cwords_mkdb.rb " + ARGV.join(" ")
-end
-mems = ARGV.join(" ").match(/M=(\w+)/)
-mem = mems ? mems[1] : '4096m'
-argv = ARGV.select{|x| not x=~ /^M=\w+$/}.join(' ')
-puts "Starting cwords_mkdb with max heap size " + mem + " ...\n"
-cmd = "jruby --server --fast -J-Xmx#{mem} " + scriptdir + "cwords_mkdb.rb " + argv
+argv = ARGV.join(" ")
+cmd = "ruby " + scriptdir + "cwords_mkdb.rb " + argv
 exec cmd

data/lib/wordRS-lib.rb CHANGED

@@ -5,7 +5,7 @@ class Array
   def threach(n = 1, &b)
     return [] if n == 0 or size == 0
     result = Array.new(size)
-    self.send(:each,&b) if n == 1
+    return self.send(:each,&b) if n == 1 # trying return here
     n = [n,size].min

data/scripts/cwords_mkdb.rb CHANGED

@@ -1,4 +1,4 @@
-#!/usr/bin/env jruby
+#!/usr/bin/env ruby
 srcdir = File.dirname(__FILE__)
 basedir = srcdir + "/../"
@@ -25,13 +25,13 @@ options[:seqfile] = nil
 options[:partitions] = 1
 options[:stats] = ['p'] # p=p
 options[:ruby]='jruby --fast -J-Xmx1024m'
-options[:shuffles]=5000
+options[:shuffles]=500
 options[:bg]=1 #mononucleotide shuffling
 $coptions = OptionParser.new do |opts|
   opts.on("-w", "--wordsize ARG", "wordsize") { |o| options[:wordsize] = o.split(",").map{|x| x.to_i}}
   opts.on("-s", "--seqfile ARG", "sequence file") {|o| options[:seqfile] = o}
-  opts.on("-p", "--partitions ARG", "number of sequence partitions") {|o| options[:partitions] = o.to_i}
+  opts.on("-t", "--threads ARG", "number of concurrent processes") {|o| options[:partitions] = o.to_i}
   opts.on("-a", "--stats ARG", "sequence file") {|o| options[:stats] = o.split('')}
   opts.on("-u", "--shuffle ARG", "number of shuffles") {|o| options[:shuffles] = o.to_i}
   opts.on("--ruby ARG", "ruby interpreter") {|o| options[:ruby] = o}
@@ -72,7 +72,7 @@ end
 puts "starting #{n} processes ..."
-cmd = "#{options[:ruby]} #{basedir}/scripts/wordsrus_mkdb.rb"
+cmd = "#{options[:ruby]} #{basedir}/scripts/cwords_mkdb_worker.rb"
 cmd += " -w #{options[:wordsize].join(',')} -s #{options[:seqfile]} -a #{options[:stats].join(",")} -u #{options[:shuffles]} --bg #{options[:bg]}"
 stamp = Time.now.to_i

data/scripts/cwords_mkdb_worker.rb ADDED

@@ -0,0 +1,99 @@
+srcdir = File.dirname(__FILE__)
+basedir = srcdir + "/../"
+libdir = basedir + 'lib/'
+$LOAD_PATH << libdir
+require 'wordRS-lib.rb'
+require 'rubygems'
+require 'progressbar'
+require 'optparse'
+require 'fileutils'
+require 'java'
+require libdir + 'ushuffle.jar'
+java_import 'UShuffle'
+us = UShuffle.new
+###
+### Main
+###
+#default options
+options = Hash.new
+options[:wordsize] = [7]
+options[:seqfile] = nil
+options[:partition] = nil
+options[:stats] = ['p'] # p=p-value, z=z-score
+options[:shuffles]=1000
+options[:bg]=1 #mononucleotide shuffling
+$coptions = OptionParser.new do |opts|
+  opts.on("-w", "--wordsize ARG", "wordsize") { |o| options[:wordsize] = o.split(",").map{|x| x.to_i}.sort}
+  opts.on("-s", "--seqfile ARG", "sequence file (FASTA format)") {|o| options[:seqfile] = o}
+  opts.on("-p", "--partition ARG", "only process a partition (i.e. 5-10) of sequences") {|o| options[:partition] = o}
+  opts.on("-a", "--stats ARG", "sequence file") {|o| options[:stats] = o.split(',')}
+  opts.on("-u", "--shuffle ARG", "number of shuffles") {|o| options[:shuffles] = o.to_i}
+  opts.on("-b", "--bg ARG", "background nucleotide model") {|o| options[:bg] = o.to_i}
+end
+def show_help(msg="", code=0, io=STDOUT)
+  io.puts "#{msg}\n#{$coptions}"
+  exit(code)
+end
+$coptions.parse!(ARGV)
+#mandatory parameters
+[:seqfile].each{ |p| show_help("option '#{p}' mandatory") if options[p].nil?}
+exit("seqfile must have fasta-format") if !options[:seqfile].match(/.fa$/)
+dbdir = basedir + "/db/" + File.basename(options[:seqfile],'.fa') + "_bg#{options[:bg]}"
+FileUtils.mkdir_p dbdir # create dir if it does not exist
+decimals = 6
+bg = options[:bg]
+# word id's
+@wid = Hash.new
+i = 0
+options[:wordsize].each do |ws|
+  ['a','g','c','t'].rep_perm(ws) {|seqa| @wid[seqa.join('')]=i ; i+=1 }
+end
+@seqs = IO.readlines(options[:seqfile],">")[1..-1]
+if options[:partition]
+  puts "partition #{options[:partition]}"
+  (pstart,pstop) = options[:partition].split('-')
+  @seqs = @seqs[pstart.to_i-1..pstop.to_i-1]
+end
+puts "computing statistics for #{@seqs.size} sequences"
+pbar = ProgressBar.new("seqs",@seqs.size)
+@seqs.each do |s|
+  ff = s.split("\n")
+  id = ff.shift
+  seq = ff[0..-2].join('').downcase.gsub('u','t') # last field is ">"
+  # next if not nucleotide sequence, i.e. "unavailable"
+  next if (seq.split('').uniq - ['a','c','g','t']).size > 0
+  #observed word counts
+  @observed = Array.new(@wid.size,0)
+  options[:wordsize].each{|ws| (0..seq.size-ws).each{|i| wid = @wid[seq[i, ws]]; @observed[wid] += 1 if not wid.nil?}}
+  #expected word counts
+  @expected = Array.new(@wid.size) {Array.new(options[:shuffles],0).to_statarray}
+  us.init_shuffle(seq,bg)
+  options[:shuffles].times do |si|
+    seqsh = us.shuffle
+    options[:wordsize].each{|ws| (0..seq.size-ws).each{|i| wid = @wid[seqsh[i, ws]]; @expected[wid][si] += 1 if not wid.nil?}}
+  end
+  #store results
+  @wid.each do |w,wid|
+   obs = @observed[wid]
+   File.open("#{dbdir}/#{w}.rnk", 'a') {|f| f.puts [id,obs,@expected[wid].select{|x| x>=obs}.size,@expected[wid].to_statarray.mean].join("\t")}
+  end
+  pbar.inc
+end
+pbar.finish

metadata CHANGED

@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
   segments:
   - 0
   - 1
-  - 10
-  version: 0.1.10
+  - 11
+  version: 0.1.11
 platform: ruby
 authors:
 - Anders Jacobsen
@@ -14,7 +14,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-03-12 00:00:00 +01:00
+date: 2010-04-09 00:00:00 +02:00
 default_executable:
 - bin/cwords
 dependencies:
@@ -52,6 +52,7 @@ files:
 - resources/word_annotation.tsv
 - scripts/cwords.rb
 - scripts/cwords_mkdb.rb
+- scripts/cwords_mkdb_worker.rb
 - scripts/cluster_words.rb
 - scripts/complementary_words.rb
 has_rdoc: true