cwords 0.1.10 → 0.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,16 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
  scriptdir = File.dirname(__FILE__) + "/../scripts/"
3
3
 
4
- helps = ARGV.join(" ").match(/--help/)
5
- if (helps)
6
- # start lightweight
7
- exec "jruby --client " + scriptdir + "cwords_mkdb.rb " + ARGV.join(" ")
8
- end
9
-
10
- mems = ARGV.join(" ").match(/M=(\w+)/)
11
- mem = mems ? mems[1] : '4096m'
12
- argv = ARGV.select{|x| not x=~ /^M=\w+$/}.join(' ')
13
- puts "Starting cwords_mkdb with max heap size " + mem + " ...\n"
14
-
15
- cmd = "jruby --server --fast -J-Xmx#{mem} " + scriptdir + "cwords_mkdb.rb " + argv
4
+ argv = ARGV.join(" ")
5
+ cmd = "ruby " + scriptdir + "cwords_mkdb.rb " + argv
16
6
  exec cmd
@@ -5,7 +5,7 @@ class Array
5
5
  def threach(n = 1, &b)
6
6
  return [] if n == 0 or size == 0
7
7
  result = Array.new(size)
8
- self.send(:each,&b) if n == 1
8
+ return self.send(:each,&b) if n == 1 # trying return here
9
9
 
10
10
  n = [n,size].min
11
11
 
@@ -1,4 +1,4 @@
1
- #!/usr/bin/env jruby
1
+ #!/usr/bin/env ruby
2
2
 
3
3
  srcdir = File.dirname(__FILE__)
4
4
  basedir = srcdir + "/../"
@@ -25,13 +25,13 @@ options[:seqfile] = nil
25
25
  options[:partitions] = 1
26
26
  options[:stats] = ['p'] # p=p
27
27
  options[:ruby]='jruby --fast -J-Xmx1024m'
28
- options[:shuffles]=5000
28
+ options[:shuffles]=500
29
29
  options[:bg]=1 #mononucleotide shuffling
30
30
 
31
31
  $coptions = OptionParser.new do |opts|
32
32
  opts.on("-w", "--wordsize ARG", "wordsize") { |o| options[:wordsize] = o.split(",").map{|x| x.to_i}}
33
33
  opts.on("-s", "--seqfile ARG", "sequence file") {|o| options[:seqfile] = o}
34
- opts.on("-p", "--partitions ARG", "number of sequence partitions") {|o| options[:partitions] = o.to_i}
34
+ opts.on("-t", "--threads ARG", "number of concurrent processes") {|o| options[:partitions] = o.to_i}
35
35
  opts.on("-a", "--stats ARG", "sequence file") {|o| options[:stats] = o.split('')}
36
36
  opts.on("-u", "--shuffle ARG", "number of shuffles") {|o| options[:shuffles] = o.to_i}
37
37
  opts.on("--ruby ARG", "ruby interpreter") {|o| options[:ruby] = o}
@@ -72,7 +72,7 @@ end
72
72
 
73
73
  puts "starting #{n} processes ..."
74
74
 
75
- cmd = "#{options[:ruby]} #{basedir}/scripts/wordsrus_mkdb.rb"
75
+ cmd = "#{options[:ruby]} #{basedir}/scripts/cwords_mkdb_worker.rb"
76
76
  cmd += " -w #{options[:wordsize].join(',')} -s #{options[:seqfile]} -a #{options[:stats].join(",")} -u #{options[:shuffles]} --bg #{options[:bg]}"
77
77
 
78
78
  stamp = Time.now.to_i
@@ -0,0 +1,99 @@
1
+
2
+ srcdir = File.dirname(__FILE__)
3
+ basedir = srcdir + "/../"
4
+ libdir = basedir + 'lib/'
5
+ $LOAD_PATH << libdir
6
+
7
+ require 'wordRS-lib.rb'
8
+ require 'rubygems'
9
+ require 'progressbar'
10
+ require 'optparse'
11
+ require 'fileutils'
12
+ require 'java'
13
+ require libdir + 'ushuffle.jar'
14
+ java_import 'UShuffle'
15
+ us = UShuffle.new
16
+
17
+ ###
18
+ ### Main
19
+ ###
20
+
21
+
22
+ #default options
23
+ options = Hash.new
24
+ options[:wordsize] = [7]
25
+ options[:seqfile] = nil
26
+ options[:partition] = nil
27
+ options[:stats] = ['p'] # p=p-value, z=z-score
28
+ options[:shuffles]=1000
29
+ options[:bg]=1 #mononucleotide shuffling
30
+
31
+ $coptions = OptionParser.new do |opts|
32
+ opts.on("-w", "--wordsize ARG", "wordsize") { |o| options[:wordsize] = o.split(",").map{|x| x.to_i}.sort}
33
+ opts.on("-s", "--seqfile ARG", "sequence file (FASTA format)") {|o| options[:seqfile] = o}
34
+ opts.on("-p", "--partition ARG", "only process a partition (i.e. 5-10) of sequences") {|o| options[:partition] = o}
35
+ opts.on("-a", "--stats ARG", "sequence file") {|o| options[:stats] = o.split(',')}
36
+ opts.on("-u", "--shuffle ARG", "number of shuffles") {|o| options[:shuffles] = o.to_i}
37
+ opts.on("-b", "--bg ARG", "background nucleotide model") {|o| options[:bg] = o.to_i}
38
+ end
39
+
40
+ def show_help(msg="", code=0, io=STDOUT)
41
+ io.puts "#{msg}\n#{$coptions}"
42
+ exit(code)
43
+ end
44
+
45
+ $coptions.parse!(ARGV)
46
+ #mandatory parameters
47
+ [:seqfile].each{ |p| show_help("option '#{p}' mandatory") if options[p].nil?}
48
+
49
+ exit("seqfile must have fasta-format") if !options[:seqfile].match(/.fa$/)
50
+ dbdir = basedir + "/db/" + File.basename(options[:seqfile],'.fa') + "_bg#{options[:bg]}"
51
+ FileUtils.mkdir_p dbdir # create dir if it does not exist
52
+
53
+ decimals = 6
54
+ bg = options[:bg]
55
+
56
+ # word id's
57
+ @wid = Hash.new
58
+ i = 0
59
+ options[:wordsize].each do |ws|
60
+ ['a','g','c','t'].rep_perm(ws) {|seqa| @wid[seqa.join('')]=i ; i+=1 }
61
+ end
62
+
63
+ @seqs = IO.readlines(options[:seqfile],">")[1..-1]
64
+ if options[:partition]
65
+ puts "partition #{options[:partition]}"
66
+ (pstart,pstop) = options[:partition].split('-')
67
+ @seqs = @seqs[pstart.to_i-1..pstop.to_i-1]
68
+ end
69
+
70
+ puts "computing statistics for #{@seqs.size} sequences"
71
+ pbar = ProgressBar.new("seqs",@seqs.size)
72
+
73
+ @seqs.each do |s|
74
+ ff = s.split("\n")
75
+ id = ff.shift
76
+ seq = ff[0..-2].join('').downcase.gsub('u','t') # last field is ">"
77
+ # next if not nucleotide sequence, i.e. "unavailable"
78
+ next if (seq.split('').uniq - ['a','c','g','t']).size > 0
79
+
80
+ #observed word counts
81
+ @observed = Array.new(@wid.size,0)
82
+ options[:wordsize].each{|ws| (0..seq.size-ws).each{|i| wid = @wid[seq[i, ws]]; @observed[wid] += 1 if not wid.nil?}}
83
+
84
+ #expected word counts
85
+ @expected = Array.new(@wid.size) {Array.new(options[:shuffles],0).to_statarray}
86
+ us.init_shuffle(seq,bg)
87
+ options[:shuffles].times do |si|
88
+ seqsh = us.shuffle
89
+ options[:wordsize].each{|ws| (0..seq.size-ws).each{|i| wid = @wid[seqsh[i, ws]]; @expected[wid][si] += 1 if not wid.nil?}}
90
+ end
91
+
92
+ #store results
93
+ @wid.each do |w,wid|
94
+ obs = @observed[wid]
95
+ File.open("#{dbdir}/#{w}.rnk", 'a') {|f| f.puts [id,obs,@expected[wid].select{|x| x>=obs}.size,@expected[wid].to_statarray.mean].join("\t")}
96
+ end
97
+ pbar.inc
98
+ end
99
+ pbar.finish
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 10
9
- version: 0.1.10
8
+ - 11
9
+ version: 0.1.11
10
10
  platform: ruby
11
11
  authors:
12
12
  - Anders Jacobsen
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-03-12 00:00:00 +01:00
17
+ date: 2010-04-09 00:00:00 +02:00
18
18
  default_executable:
19
19
  - bin/cwords
20
20
  dependencies:
@@ -52,6 +52,7 @@ files:
52
52
  - resources/word_annotation.tsv
53
53
  - scripts/cwords.rb
54
54
  - scripts/cwords_mkdb.rb
55
+ - scripts/cwords_mkdb_worker.rb
55
56
  - scripts/cluster_words.rb
56
57
  - scripts/complementary_words.rb
57
58
  has_rdoc: true